admin管理员组文章数量:1037775
ATACseq 数据分析全流程实战(三):课后习题
本帖子学习资源:/
续上前面的实战:
在R语言中的 ATACseq 数据分析全流程实战(一)
在R语言中的 ATACseq 数据分析全流程实战(二)
6.2 课后习题
题目:.html
练习数据:
今天的练习数据来自 Christina Leslie’s lab,样本为 T-regulatory cells的ATAC-seq数据,下载地址:
代码语言:javascript代码运行次数:0运行复制read1:/@@download/ENCFF175VOD.fastq.gz
read2:/@@download/ENCFF447BGX.fastq.gz
bam:/@@download/ENCFF053CGD.bam
当然我们也可以用前面的数据进行分析:ATACseq_50k_Rep2 sample GEO - GSM1155958
ATAC-seq预处理
1.从read1和read2分别随机读取10000个reads,绘制Q值的箱线图
使用 FastqSampler
这个函数进行读取:
library(GenomicRanges)
library(Rsamtools)
library(rtracklayer)
library(GenomicAlignments)
library(ShortRead)
library(ggplot2)
# 随机读取read1和read2,n设置为10000
f1 <- FastqSampler("./ENCSR724UJS/ENCFF175VOD.fastq.gz", n=10000)
f2 <- FastqSampler("./ENCSR724UJS/ENCFF447BGX.fastq.gz", n=10000)
# 设置随机种子
set.seed(123456)
p1 <- yield(f1)
p2 <- yield(f2)
# 绘制read1
allQuals <- quality(p1)
forBox <- as(allQuals,"matrix")
dim(forBox)
# 每一行代表一个reads,每一列代表一个碱基位置
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read1")
# 绘制read2
allQuals <- quality(p2)
forBox <- as(allQuals,"matrix")
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read2")
结果如下:
2.绘制read1和read2的ATGCN含量
代码语言:javascript代码运行次数:0运行复制# 获取read1的atgcn
alpByCyle <- alphabetByCycle(sread(p1))
alpByCyleFilt <- alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p1))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p1))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p1))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p1))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p1))))
myFrameRead1 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead1$Read <- "Read1"
# 获取read2的atgcn
alpByCyle <- alphabetByCycle(sread(p2))
alpByCyleFilt <- alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p2))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p2))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p2))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p2))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p2))))
myFrameRead2 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead2$Read <- "Read2"
myFrame <- rbind(myFrameRead1,myFrameRead2)
# 绘图
ggplot(myFrame,aes(x=Cycle,y=Freq,colour=Base))+geom_line()+theme_bw()+facet_grid(~Read)
结果如下:
3.对fq进行比对,并对生成的bam进行排序并构建索引
这个在前面已经做过了,比较简单:
代码语言:javascript代码运行次数:0运行复制# 提取参考基因组fa
library(BSgenome.Hsapiens.UCSC.hg38)
mainChromosomes <- paste0("chr",c(1:21,"X","Y","M"))
mainChrSeq <- lapply(mainChromosomes,
function(x)BSgenome.Hsapiens.UCSC.hg38[[x]])
names(mainChrSeq) <- mainChromosomes
mainChrSeqSet <- DNAStringSet(mainChrSeq)
writeXStringSet(mainChrSeqSet,"BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa")
# 对参考基因组构建索引,memory单位为M,可以设置大一点
library(Rsubread)
buildindex("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
"BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa",
indexSplit = TRUE,
memory = 50000) # 大约48G
# 比对
read1 <- "./ENCSR724UJS/ENCFF175VOD.fastq.gz"
read2 <- "./ENCSR724UJS/ENCFF447BGX.fastq.gz"
outBAM <- "TcellReg_ATAC.bam"
# nthreads使用线程数
align("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
readfile1=read1,readfile2=read2,
output_file = outBAM,
nthreads=64,type=1,
unique=TRUE,maxFragLength = 2000)
# 对bam进行排序并构建索引
library(Rsamtools)
sortBam(outBAM,"Sorted_TcellReg_ATAC")
indexBam("Sorted_TcellReg_ATAC.bam")
4.使用bam文件,提取10号染色体上的fragment,并用ggplot2绘制其长度分布
代码语言:javascript代码运行次数:0运行复制library(Rsamtools)
library(ggplot2)
# bam构建索引,可跳过
indexBam("Sorted_TcellReg_ATAC.bam")
library(GenomicAlignments)
toReviewLenOfChrom <- idxstatsBam("Sorted_TcellReg_ATAC.bam")
myParam=ScanBamParam(flag=scanBamFlag(isProperPair =TRUE),
what=c("qname","mapq","isize"),
which=GRanges("chr10", IRanges(1,130694993)))
myPairs <- readGAlignmentPairs("Sorted_TcellReg_ATAC.bam",param = myParam)
IS <- abs(mcols(second(myPairs))$isize)
tableOfIS <- table(IS)
toPlot <- data.frame(InsertSize=as.numeric(names(tableOfIS)),
Count=as.numeric(tableOfIS))
# 绘图
fragLenPlot <- ggplot(toPlot,aes(x=InsertSize,y=Count)) +
geom_line() +
scale_y_continuous(trans='log2') +
theme_minimal()
fragLenPlot
结果如下:
5.使用这篇文献中的定义:nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437),绘制每种类型中fragment的数量直方图
文献:Transposition of native chromatin for multimodal regulatory analysis and personal epigenomics Jason D. Buenrostro, Paul G. Giresi, Lisa C. Zaba, Howard Y. Chang, and William J. Greenleaf.
代码语言:javascript代码运行次数:0运行复制atacReads_Open <- myPairs[IS < 100,]
atacReads_MonoNuc <- myPairs[IS > 180 & IS < 247,]
atacReads_diNuc <- myPairs[IS > 315 & IS < 437,]
toPlot <- data.frame(Fraction=c("NucleosomeFree","MonoNucleosome","DiNucleosome"),
Total=c(length(atacReads_Open), length(atacReads_MonoNuc),length(atacReads_diNuc))
)
ggplot(toPlot,aes(x=Fraction,y=Total,fill=Fraction)) +
geom_bar(stat="identity") +
theme_bw()
结果如下:
6.分别对nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437) fractions的部分生成一个bigwig文件并导入IGV可视化
代码语言:javascript代码运行次数:0运行复制# nucleosome free (< 100bp)
atacFragments_Open <- granges(atacReads_Open)
myCoverage_Open <- coverage(atacFragments_Open)
export.bw(myCoverage_Open,"NucleosomeFree.bw")
# mono-nucleosome (180bp-247bp)
atacFragments_MonoNuc <- granges(atacReads_MonoNuc)
myCoverage_MonoNuc <- coverage(atacFragments_MonoNuc)
export.bw(myCoverage_MonoNuc,"MonoNucleosome.bw")
# di-nucleosome (315-437) fractions
atacFragments_diNuc <- granges(atacReads_diNuc)
myCoverage_diNuc <- coverage(atacFragments_diNuc)
export.bw(myCoverage_diNuc,"DiNucleosome.bw")
结果如下:
(未完待续~)
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。原始发表:2025-03-15,如有侵权请联系 cloudcommunity@tencent 删除索引数据分析cycle排序数据ATACseq 数据分析全流程实战(三):课后习题
本帖子学习资源:/
续上前面的实战:
在R语言中的 ATACseq 数据分析全流程实战(一)
在R语言中的 ATACseq 数据分析全流程实战(二)
6.2 课后习题
题目:.html
练习数据:
今天的练习数据来自 Christina Leslie’s lab,样本为 T-regulatory cells的ATAC-seq数据,下载地址:
代码语言:javascript代码运行次数:0运行复制read1:/@@download/ENCFF175VOD.fastq.gz
read2:/@@download/ENCFF447BGX.fastq.gz
bam:/@@download/ENCFF053CGD.bam
当然我们也可以用前面的数据进行分析:ATACseq_50k_Rep2 sample GEO - GSM1155958
ATAC-seq预处理
1.从read1和read2分别随机读取10000个reads,绘制Q值的箱线图
使用 FastqSampler
这个函数进行读取:
library(GenomicRanges)
library(Rsamtools)
library(rtracklayer)
library(GenomicAlignments)
library(ShortRead)
library(ggplot2)
# 随机读取read1和read2,n设置为10000
f1 <- FastqSampler("./ENCSR724UJS/ENCFF175VOD.fastq.gz", n=10000)
f2 <- FastqSampler("./ENCSR724UJS/ENCFF447BGX.fastq.gz", n=10000)
# 设置随机种子
set.seed(123456)
p1 <- yield(f1)
p2 <- yield(f2)
# 绘制read1
allQuals <- quality(p1)
forBox <- as(allQuals,"matrix")
dim(forBox)
# 每一行代表一个reads,每一列代表一个碱基位置
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read1")
# 绘制read2
allQuals <- quality(p2)
forBox <- as(allQuals,"matrix")
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read2")
结果如下:
2.绘制read1和read2的ATGCN含量
代码语言:javascript代码运行次数:0运行复制# 获取read1的atgcn
alpByCyle <- alphabetByCycle(sread(p1))
alpByCyleFilt <- alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p1))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p1))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p1))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p1))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p1))))
myFrameRead1 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead1$Read <- "Read1"
# 获取read2的atgcn
alpByCyle <- alphabetByCycle(sread(p2))
alpByCyleFilt <- alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p2))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p2))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p2))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p2))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p2))))
myFrameRead2 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead2$Read <- "Read2"
myFrame <- rbind(myFrameRead1,myFrameRead2)
# 绘图
ggplot(myFrame,aes(x=Cycle,y=Freq,colour=Base))+geom_line()+theme_bw()+facet_grid(~Read)
结果如下:
3.对fq进行比对,并对生成的bam进行排序并构建索引
这个在前面已经做过了,比较简单:
代码语言:javascript代码运行次数:0运行复制# 提取参考基因组fa
library(BSgenome.Hsapiens.UCSC.hg38)
mainChromosomes <- paste0("chr",c(1:21,"X","Y","M"))
mainChrSeq <- lapply(mainChromosomes,
function(x)BSgenome.Hsapiens.UCSC.hg38[[x]])
names(mainChrSeq) <- mainChromosomes
mainChrSeqSet <- DNAStringSet(mainChrSeq)
writeXStringSet(mainChrSeqSet,"BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa")
# 对参考基因组构建索引,memory单位为M,可以设置大一点
library(Rsubread)
buildindex("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
"BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa",
indexSplit = TRUE,
memory = 50000) # 大约48G
# 比对
read1 <- "./ENCSR724UJS/ENCFF175VOD.fastq.gz"
read2 <- "./ENCSR724UJS/ENCFF447BGX.fastq.gz"
outBAM <- "TcellReg_ATAC.bam"
# nthreads使用线程数
align("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
readfile1=read1,readfile2=read2,
output_file = outBAM,
nthreads=64,type=1,
unique=TRUE,maxFragLength = 2000)
# 对bam进行排序并构建索引
library(Rsamtools)
sortBam(outBAM,"Sorted_TcellReg_ATAC")
indexBam("Sorted_TcellReg_ATAC.bam")
4.使用bam文件,提取10号染色体上的fragment,并用ggplot2绘制其长度分布
代码语言:javascript代码运行次数:0运行复制library(Rsamtools)
library(ggplot2)
# bam构建索引,可跳过
indexBam("Sorted_TcellReg_ATAC.bam")
library(GenomicAlignments)
toReviewLenOfChrom <- idxstatsBam("Sorted_TcellReg_ATAC.bam")
myParam=ScanBamParam(flag=scanBamFlag(isProperPair =TRUE),
what=c("qname","mapq","isize"),
which=GRanges("chr10", IRanges(1,130694993)))
myPairs <- readGAlignmentPairs("Sorted_TcellReg_ATAC.bam",param = myParam)
IS <- abs(mcols(second(myPairs))$isize)
tableOfIS <- table(IS)
toPlot <- data.frame(InsertSize=as.numeric(names(tableOfIS)),
Count=as.numeric(tableOfIS))
# 绘图
fragLenPlot <- ggplot(toPlot,aes(x=InsertSize,y=Count)) +
geom_line() +
scale_y_continuous(trans='log2') +
theme_minimal()
fragLenPlot
结果如下:
5.使用这篇文献中的定义:nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437),绘制每种类型中fragment的数量直方图
文献:Transposition of native chromatin for multimodal regulatory analysis and personal epigenomics Jason D. Buenrostro, Paul G. Giresi, Lisa C. Zaba, Howard Y. Chang, and William J. Greenleaf.
代码语言:javascript代码运行次数:0运行复制atacReads_Open <- myPairs[IS < 100,]
atacReads_MonoNuc <- myPairs[IS > 180 & IS < 247,]
atacReads_diNuc <- myPairs[IS > 315 & IS < 437,]
toPlot <- data.frame(Fraction=c("NucleosomeFree","MonoNucleosome","DiNucleosome"),
Total=c(length(atacReads_Open), length(atacReads_MonoNuc),length(atacReads_diNuc))
)
ggplot(toPlot,aes(x=Fraction,y=Total,fill=Fraction)) +
geom_bar(stat="identity") +
theme_bw()
结果如下:
6.分别对nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437) fractions的部分生成一个bigwig文件并导入IGV可视化
代码语言:javascript代码运行次数:0运行复制# nucleosome free (< 100bp)
atacFragments_Open <- granges(atacReads_Open)
myCoverage_Open <- coverage(atacFragments_Open)
export.bw(myCoverage_Open,"NucleosomeFree.bw")
# mono-nucleosome (180bp-247bp)
atacFragments_MonoNuc <- granges(atacReads_MonoNuc)
myCoverage_MonoNuc <- coverage(atacFragments_MonoNuc)
export.bw(myCoverage_MonoNuc,"MonoNucleosome.bw")
# di-nucleosome (315-437) fractions
atacFragments_diNuc <- granges(atacReads_diNuc)
myCoverage_diNuc <- coverage(atacFragments_diNuc)
export.bw(myCoverage_diNuc,"DiNucleosome.bw")
结果如下:
(未完待续~)
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。原始发表:2025-03-15,如有侵权请联系 cloudcommunity@tencent 删除索引数据分析cycle排序数据本文标签: ATACseq 数据分析全流程实战(三)课后习题
版权声明:本文标题:ATACseq 数据分析全流程实战(三):课后习题 内容由热心网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://it.en369.cn/jiaocheng/1748248283a2275058.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论