admin管理员组

文章数量:1037775

ATACseq 数据分析全流程实战(三):课后习题

本帖子学习资源:/

续上前面的实战:

在R语言中的 ATACseq 数据分析全流程实战(一)

在R语言中的 ATACseq 数据分析全流程实战(二)

6.2 课后习题

题目:.html

练习数据:

今天的练习数据来自 Christina Leslie’s lab,样本为 T-regulatory cells的ATAC-seq数据,下载地址:

代码语言:javascript代码运行次数:0运行复制
read1:/@@download/ENCFF175VOD.fastq.gz
read2:/@@download/ENCFF447BGX.fastq.gz
bam:/@@download/ENCFF053CGD.bam

当然我们也可以用前面的数据进行分析:ATACseq_50k_Rep2 sample GEO - GSM1155958

ATAC-seq预处理

1.从read1和read2分别随机读取10000个reads,绘制Q值的箱线图

使用 FastqSampler这个函数进行读取:

代码语言:javascript代码运行次数:0运行复制
library(GenomicRanges)
library(Rsamtools)
library(rtracklayer)
library(GenomicAlignments)
library(ShortRead)
library(ggplot2)

# 随机读取read1和read2,n设置为10000
f1 <- FastqSampler("./ENCSR724UJS/ENCFF175VOD.fastq.gz", n=10000)
f2 <- FastqSampler("./ENCSR724UJS/ENCFF447BGX.fastq.gz", n=10000)
# 设置随机种子
set.seed(123456)
p1 <- yield(f1)
p2 <- yield(f2)

# 绘制read1
allQuals <- quality(p1)
forBox <- as(allQuals,"matrix")
dim(forBox)
# 每一行代表一个reads,每一列代表一个碱基位置
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read1")

# 绘制read2
allQuals <- quality(p2)
forBox <- as(allQuals,"matrix")
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read2")

结果如下:

2.绘制read1和read2的ATGCN含量
代码语言:javascript代码运行次数:0运行复制
# 获取read1的atgcn
alpByCyle <- alphabetByCycle(sread(p1))
alpByCyleFilt <-  alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p1))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p1))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p1))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p1))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p1))))
myFrameRead1 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead1$Read <- "Read1"

# 获取read2的atgcn
alpByCyle <- alphabetByCycle(sread(p2))
alpByCyleFilt <-  alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p2))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p2))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p2))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p2))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p2))))
myFrameRead2 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead2$Read <- "Read2"
myFrame <- rbind(myFrameRead1,myFrameRead2)

# 绘图
ggplot(myFrame,aes(x=Cycle,y=Freq,colour=Base))+geom_line()+theme_bw()+facet_grid(~Read)

结果如下:

image-20250315205919666
3.对fq进行比对,并对生成的bam进行排序并构建索引

这个在前面已经做过了,比较简单:

代码语言:javascript代码运行次数:0运行复制
# 提取参考基因组fa
library(BSgenome.Hsapiens.UCSC.hg38)
mainChromosomes <- paste0("chr",c(1:21,"X","Y","M"))
mainChrSeq <- lapply(mainChromosomes,
                     function(x)BSgenome.Hsapiens.UCSC.hg38[[x]])
names(mainChrSeq) <- mainChromosomes
mainChrSeqSet <- DNAStringSet(mainChrSeq)
writeXStringSet(mainChrSeqSet,"BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa")

# 对参考基因组构建索引,memory单位为M,可以设置大一点
library(Rsubread)
buildindex("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
           "BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa",
           indexSplit = TRUE,
           memory = 50000) # 大约48G

# 比对
read1 <- "./ENCSR724UJS/ENCFF175VOD.fastq.gz"
read2 <- "./ENCSR724UJS/ENCFF447BGX.fastq.gz"
outBAM <- "TcellReg_ATAC.bam"

# nthreads使用线程数
align("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
      readfile1=read1,readfile2=read2,
      output_file = outBAM,
      nthreads=64,type=1,
      unique=TRUE,maxFragLength = 2000)
      
# 对bam进行排序并构建索引
library(Rsamtools)
sortBam(outBAM,"Sorted_TcellReg_ATAC")
indexBam("Sorted_TcellReg_ATAC.bam")
4.使用bam文件,提取10号染色体上的fragment,并用ggplot2绘制其长度分布
代码语言:javascript代码运行次数:0运行复制
library(Rsamtools)
library(ggplot2)

# bam构建索引,可跳过
indexBam("Sorted_TcellReg_ATAC.bam")

library(GenomicAlignments)
toReviewLenOfChrom <- idxstatsBam("Sorted_TcellReg_ATAC.bam")
myParam=ScanBamParam(flag=scanBamFlag(isProperPair =TRUE),
                   what=c("qname","mapq","isize"),
                   which=GRanges("chr10", IRanges(1,130694993)))

myPairs <- readGAlignmentPairs("Sorted_TcellReg_ATAC.bam",param = myParam)

IS <- abs(mcols(second(myPairs))$isize)
tableOfIS <- table(IS)
toPlot <- data.frame(InsertSize=as.numeric(names(tableOfIS)),
                            Count=as.numeric(tableOfIS))
                            
# 绘图
fragLenPlot <- ggplot(toPlot,aes(x=InsertSize,y=Count)) + 
  geom_line() + 
  scale_y_continuous(trans='log2') + 
  theme_minimal()
fragLenPlot

结果如下:

5.使用这篇文献中的定义:nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437),绘制每种类型中fragment的数量直方图

文献:Transposition of native chromatin for multimodal regulatory analysis and personal epigenomics Jason D. Buenrostro, Paul G. Giresi, Lisa C. Zaba, Howard Y. Chang, and William J. Greenleaf.

代码语言:javascript代码运行次数:0运行复制
atacReads_Open <- myPairs[IS < 100,]
atacReads_MonoNuc <- myPairs[IS > 180 & IS < 247,]
atacReads_diNuc <- myPairs[IS > 315 & IS < 437,]

toPlot <- data.frame(Fraction=c("NucleosomeFree","MonoNucleosome","DiNucleosome"),
                     Total=c(length(atacReads_Open), length(atacReads_MonoNuc),length(atacReads_diNuc))
                     )

ggplot(toPlot,aes(x=Fraction,y=Total,fill=Fraction)) +
  geom_bar(stat="identity") +
  theme_bw()

结果如下:

6.分别对nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437) fractions的部分生成一个bigwig文件并导入IGV可视化
代码语言:javascript代码运行次数:0运行复制
# nucleosome free (< 100bp)
atacFragments_Open <- granges(atacReads_Open)
myCoverage_Open <- coverage(atacFragments_Open)
export.bw(myCoverage_Open,"NucleosomeFree.bw")

# mono-nucleosome (180bp-247bp)
atacFragments_MonoNuc <- granges(atacReads_MonoNuc)
myCoverage_MonoNuc <- coverage(atacFragments_MonoNuc)
export.bw(myCoverage_MonoNuc,"MonoNucleosome.bw")

# di-nucleosome (315-437) fractions
atacFragments_diNuc <- granges(atacReads_diNuc)
myCoverage_diNuc <- coverage(atacFragments_diNuc)
export.bw(myCoverage_diNuc,"DiNucleosome.bw")

结果如下:

(未完待续~)

本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。原始发表:2025-03-15,如有侵权请联系 cloudcommunity@tencent 删除索引数据分析cycle排序数据

ATACseq 数据分析全流程实战(三):课后习题

本帖子学习资源:/

续上前面的实战:

在R语言中的 ATACseq 数据分析全流程实战(一)

在R语言中的 ATACseq 数据分析全流程实战(二)

6.2 课后习题

题目:.html

练习数据:

今天的练习数据来自 Christina Leslie’s lab,样本为 T-regulatory cells的ATAC-seq数据,下载地址:

代码语言:javascript代码运行次数:0运行复制
read1:/@@download/ENCFF175VOD.fastq.gz
read2:/@@download/ENCFF447BGX.fastq.gz
bam:/@@download/ENCFF053CGD.bam

当然我们也可以用前面的数据进行分析:ATACseq_50k_Rep2 sample GEO - GSM1155958

ATAC-seq预处理

1.从read1和read2分别随机读取10000个reads,绘制Q值的箱线图

使用 FastqSampler这个函数进行读取:

代码语言:javascript代码运行次数:0运行复制
library(GenomicRanges)
library(Rsamtools)
library(rtracklayer)
library(GenomicAlignments)
library(ShortRead)
library(ggplot2)

# 随机读取read1和read2,n设置为10000
f1 <- FastqSampler("./ENCSR724UJS/ENCFF175VOD.fastq.gz", n=10000)
f2 <- FastqSampler("./ENCSR724UJS/ENCFF447BGX.fastq.gz", n=10000)
# 设置随机种子
set.seed(123456)
p1 <- yield(f1)
p2 <- yield(f2)

# 绘制read1
allQuals <- quality(p1)
forBox <- as(allQuals,"matrix")
dim(forBox)
# 每一行代表一个reads,每一列代表一个碱基位置
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read1")

# 绘制read2
allQuals <- quality(p2)
forBox <- as(allQuals,"matrix")
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read2")

结果如下:

2.绘制read1和read2的ATGCN含量
代码语言:javascript代码运行次数:0运行复制
# 获取read1的atgcn
alpByCyle <- alphabetByCycle(sread(p1))
alpByCyleFilt <-  alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p1))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p1))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p1))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p1))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p1))))
myFrameRead1 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead1$Read <- "Read1"

# 获取read2的atgcn
alpByCyle <- alphabetByCycle(sread(p2))
alpByCyleFilt <-  alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p2))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p2))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p2))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p2))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p2))))
myFrameRead2 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead2$Read <- "Read2"
myFrame <- rbind(myFrameRead1,myFrameRead2)

# 绘图
ggplot(myFrame,aes(x=Cycle,y=Freq,colour=Base))+geom_line()+theme_bw()+facet_grid(~Read)

结果如下:

image-20250315205919666
3.对fq进行比对,并对生成的bam进行排序并构建索引

这个在前面已经做过了,比较简单:

代码语言:javascript代码运行次数:0运行复制
# 提取参考基因组fa
library(BSgenome.Hsapiens.UCSC.hg38)
mainChromosomes <- paste0("chr",c(1:21,"X","Y","M"))
mainChrSeq <- lapply(mainChromosomes,
                     function(x)BSgenome.Hsapiens.UCSC.hg38[[x]])
names(mainChrSeq) <- mainChromosomes
mainChrSeqSet <- DNAStringSet(mainChrSeq)
writeXStringSet(mainChrSeqSet,"BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa")

# 对参考基因组构建索引,memory单位为M,可以设置大一点
library(Rsubread)
buildindex("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
           "BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa",
           indexSplit = TRUE,
           memory = 50000) # 大约48G

# 比对
read1 <- "./ENCSR724UJS/ENCFF175VOD.fastq.gz"
read2 <- "./ENCSR724UJS/ENCFF447BGX.fastq.gz"
outBAM <- "TcellReg_ATAC.bam"

# nthreads使用线程数
align("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
      readfile1=read1,readfile2=read2,
      output_file = outBAM,
      nthreads=64,type=1,
      unique=TRUE,maxFragLength = 2000)
      
# 对bam进行排序并构建索引
library(Rsamtools)
sortBam(outBAM,"Sorted_TcellReg_ATAC")
indexBam("Sorted_TcellReg_ATAC.bam")
4.使用bam文件,提取10号染色体上的fragment,并用ggplot2绘制其长度分布
代码语言:javascript代码运行次数:0运行复制
library(Rsamtools)
library(ggplot2)

# bam构建索引,可跳过
indexBam("Sorted_TcellReg_ATAC.bam")

library(GenomicAlignments)
toReviewLenOfChrom <- idxstatsBam("Sorted_TcellReg_ATAC.bam")
myParam=ScanBamParam(flag=scanBamFlag(isProperPair =TRUE),
                   what=c("qname","mapq","isize"),
                   which=GRanges("chr10", IRanges(1,130694993)))

myPairs <- readGAlignmentPairs("Sorted_TcellReg_ATAC.bam",param = myParam)

IS <- abs(mcols(second(myPairs))$isize)
tableOfIS <- table(IS)
toPlot <- data.frame(InsertSize=as.numeric(names(tableOfIS)),
                            Count=as.numeric(tableOfIS))
                            
# 绘图
fragLenPlot <- ggplot(toPlot,aes(x=InsertSize,y=Count)) + 
  geom_line() + 
  scale_y_continuous(trans='log2') + 
  theme_minimal()
fragLenPlot

结果如下:

5.使用这篇文献中的定义:nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437),绘制每种类型中fragment的数量直方图

文献:Transposition of native chromatin for multimodal regulatory analysis and personal epigenomics Jason D. Buenrostro, Paul G. Giresi, Lisa C. Zaba, Howard Y. Chang, and William J. Greenleaf.

代码语言:javascript代码运行次数:0运行复制
atacReads_Open <- myPairs[IS < 100,]
atacReads_MonoNuc <- myPairs[IS > 180 & IS < 247,]
atacReads_diNuc <- myPairs[IS > 315 & IS < 437,]

toPlot <- data.frame(Fraction=c("NucleosomeFree","MonoNucleosome","DiNucleosome"),
                     Total=c(length(atacReads_Open), length(atacReads_MonoNuc),length(atacReads_diNuc))
                     )

ggplot(toPlot,aes(x=Fraction,y=Total,fill=Fraction)) +
  geom_bar(stat="identity") +
  theme_bw()

结果如下:

6.分别对nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437) fractions的部分生成一个bigwig文件并导入IGV可视化
代码语言:javascript代码运行次数:0运行复制
# nucleosome free (< 100bp)
atacFragments_Open <- granges(atacReads_Open)
myCoverage_Open <- coverage(atacFragments_Open)
export.bw(myCoverage_Open,"NucleosomeFree.bw")

# mono-nucleosome (180bp-247bp)
atacFragments_MonoNuc <- granges(atacReads_MonoNuc)
myCoverage_MonoNuc <- coverage(atacFragments_MonoNuc)
export.bw(myCoverage_MonoNuc,"MonoNucleosome.bw")

# di-nucleosome (315-437) fractions
atacFragments_diNuc <- granges(atacReads_diNuc)
myCoverage_diNuc <- coverage(atacFragments_diNuc)
export.bw(myCoverage_diNuc,"DiNucleosome.bw")

结果如下:

(未完待续~)

本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。原始发表:2025-03-15,如有侵权请联系 cloudcommunity@tencent 删除索引数据分析cycle排序数据

本文标签: ATACseq 数据分析全流程实战(三)课后习题