library(maftools)
#path to TCGA LAML MAF file
laml.maf = system.file('extdata', 'tcga_laml.maf.gz', package = 'maftools')
#clinical information containing survival information and histology. This is optional
laml.clin = system.file('extdata', 'tcga_laml_annot.tsv', package = 'maftools')
laml = read.maf(maf = laml.maf, clinicalData = laml.clin)maftools
1 基本介紹
1.1 什麼是 maf 格式 (Mutation Annotation Format)?
MAF(Mutation Annotation Format)是一種廣泛使用的突變註解檔格式,常見於 TCGA、ICGC 與各大癌症基因體研究專案。
每一列代表一個 somatic variant,並包含以下主要欄位:
- Hugo_Symbol:基因名稱
- Entrez_Gene_Id:Entrez 基因 ID
- Chromosome / Start_Position / End_Position:變異所在的基因組座標
- Variant_Classification:變異功能類型(Missense, Nonsense, Splice_Site…)
- Variant_Type:SNP / INS / DEL
- Tumor_Seq_Allele1 / Tumor_Seq_Allele2:腫瘤樣本的等位基因
- Tumor_Sample_Barcode:樣本 ID
- Protein_Change:蛋白層級變異(如 p.R1517H)
- i_TumorVAF_WU:突變的 variant allele frequency
1.2 如何產生 maf
- 原檔為
VCF可以使用 vcf2maf 將 vcf 轉 maf - GATK的 funcotator
- ANNOVAR 註釋後,可使用
maftools中的annovarToMaf函數,將表格形式的 ANNOVAR 輸出轉換為 MAF。
1.3 什麼是 maftools?
maftools 是一個用於分析並視覺化 MAF 檔案的套件。
1.4 參考資料
以下指令是根據官方文檔範例撰寫,並新增個人的筆記內容,所有相關著作權屬於原作者(如下):
Mayakonda A, Lin DC, Assenov Y, Plass C, Koeffler HP. 2018. Maftools: efficient and comprehensive analysis of somatic variants in cancer. Genome Research. PMID: 30341162
2 下載 maftools 套件
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (!requireNamespace("maftools", quietly = TRUE)) {
BiocManager::install("maftools", update = FALSE, ask = FALSE)
}
# create `results/` folder for output data
if (!dir.exists("results")) dir.create("results", recursive = TRUE)
3 讀取 maf 檔
使用 read.maf 函數
3.1 查看 MAF object
#Typing laml shows basic summary of MAF file.
laml3.2 匯出 maf summary 檔案
#Shows sample summry.
getSampleSummary(laml)
#Shows gene summary.
getGeneSummary(laml)
#shows clinical data associated with samples
getClinicalData(laml)
#Shows all fields in MAF
getFields(laml)
#Writes maf summary to an output file with basename laml.
write.mafSummary(maf = laml, basename = 'results/laml')4 視覺化
4.1 summary plot
plotmafSummary 可以繪製整體資料的摘要,用於快速釐清資料
plotmafSummary(maf = laml, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE)4.2 bar plot
mafbarplot 用於繪製選定基因變異種類之比率
mafbarplot(maf = laml, genes = c("DNMT3A", "IDH1", "IDH2"))4.3 oncoplot plot
oncoplot 繪製樣本中的變異基因與變異種類 (Multi_Hit 是指在同一樣本中發生多次突變的基因)
#oncoplot for top ten mutated genes.
oncoplot(maf = laml, top = 10)4.4 Transition and Transversions
在突變類型中,單一鹼基替換(Single Nucleotide Variant, SNV)可依照其替換的化學性質分為兩大類:
1. Transition (Ti) Purine ↔︎ Purine 或 Pyrimidine ↔︎ Pyrimidine 的替換,包括: - A ↔︎ G(兩者皆為 purine) - C ↔︎ T(兩者皆為 pyrimidine)
Transition 牽涉的化學變化較小,因此通常比 transversion 更常見。
2. Transversion (Tv) Purine ↔︎ Pyrimidine 的替換,包括: - A ↔︎ C - A ↔︎ T - G ↔︎ C - G ↔︎ T
由於涉及較大的化學結構變化,發生頻率通常較低。
maftools 提供 titv() 函式來計算 Transition 與 Transversion 的數量與比例,
並可利用 plotTiTv() 將結果視覺化。
laml.titv = titv(maf = laml, plot = FALSE, useSyn = TRUE)
#plot titv summary
plotTiTv(res = laml.titv)4.5 胺基酸變化的 Lollipop plots
#lollipop plot for DNMT3A, which is one of the most frequent mutated gene in Leukemia.
lollipopPlot(
maf = laml,
gene = 'DNMT3A',
AACol = 'Protein_Change',
showMutationRate = TRUE,
labelPos = 882
)4.5.1 自訂資料作為輸入
資料應包含變異位置(pos)與變異數量(count)兩個欄位
#example data
my_data = data.frame(pos = sample.int(912, 15, replace = TRUE), count = sample.int(30, 15, replace = TRUE))
head(my_data)
lollipopPlot(data = my_data, gene = "DNMT3A")
plotProtein(gene = "TP53", refSeqID = "NM_000546")4.6 Rainfall plots
rainfallPlot 繪製 Rainfall plots 觀察 mutation hot spot - Kataegis 定義為包含六個或更多連續突變的基因組片段,平均突變間距離小於或等於1,00 bp
brca <- system.file("extdata", "brca.maf.gz", package = "maftools")
brca = read.maf(maf = brca, verbose = FALSE)
rainfallPlot(maf = brca, detectChangePoints = TRUE, pointSize = 0.4)
file.rename(list.files(pattern="\\.tsv$"), file.path("results", list.files(pattern="\\.tsv$")))4.7 與 33個 TCGA cohorts 比較 mutation load
laml.mutload = tcgaCompare(maf = laml, cohortName = 'Example-LAML', logscale = TRUE, capture_size = 50)4.8 VAF plot
plotVaf 繪製 Variant Allele Frequencies,快速估計突變最嚴重的基因的克隆狀態
plotVaf(maf = laml, vafCol = 'i_TumorVAF_WU')5 處理 copy-number 資料
5.1 讀取 GISTIC 的輸出檔
gistic_res_folder <- system.file("extdata", package = "maftools")
laml.gistic = readGistic(gisticDir = gistic_res_folder, isTCGA = TRUE)
#GISTIC object
laml.gistic5.2 視覺化 GISTIC 結果
5.2.1 繪製 GISTIC Chrom Plot
gisticChromPlot(gistic = laml.gistic, markBands = "all")
# Co-gisticChromPlot
coGisticChromPlot(gistic1 = laml.gistic, gistic2 = laml.gistic, g1Name = "AML-1", g2Name = "AML-2", type = 'Amp')5.2.2 繪製 Bubble Plot
gisticBubblePlot(gistic = laml.gistic)5.2.3 繪製 Onco Plot
gisticOncoPlot(gistic = laml.gistic, clinicalData = getClinicalData(x = laml), clinicalFeatures = 'FAB_classification', sortByAnnotation = TRUE, top = 10)5.3 CBS segments
5.3.1 總結 chromosomal arm level CN
laml.seg <- system.file("extdata", "LAML_CBS_segments.tsv.gz", package = "maftools")
segSummarize_results = segSummarize(seg = laml.seg)5.3.2 視覺化 CBS segments
tcga.ab.009.seg <- system.file("extdata", "TCGA.AB.3009.hg19.seg.txt", package = "maftools")
plotCBSsegments(cbsFile = tcga.ab.009.seg)6 綜合分析
6.1 體細胞相互作用
#exclusive/co-occurance event analysis on top 10 mutated genes.
somaticInteractions(maf = laml, top = 25, pvalue = c(0.05, 0.1))6.2 使用 cluster 偵測 cancer driver genes
laml.sig = oncodrive(maf = laml, AACol = 'Protein_Change', minMut = 5, pvalMethod = 'zscore')
head(laml.sig)
plotOncodrive(res = laml.sig, fdrCutOff = 0.1, useFraction = TRUE, labelSize = 0.5)6.3 新增 pfam Domains
laml.pfam = pfamDomains(maf = laml, AACol = 'Protein_Change', top = 10)
#Protein summary (Printing first 7 columns for display convenience)
laml.pfam$proteinSummary[,1:7, with = FALSE]
#Domain summary (Printing first 3 columns for display convenience)
laml.pfam$domainSummary[,1:3, with = FALSE]6.4 Survival 分析
6.4.1 單一基因
#Survival analysis based on grouping of DNMT3A mutation status
mafSurvival(maf = laml, genes = 'DNMT3A', time = 'days_to_last_followup', Status = 'Overall_Survival_Status', isTCGA = TRUE)6.4.2 找出導致存活率低的基因集
#Using top 20 mutated genes to identify a set of genes (of size 2) to predict poor prognostic groups
prog_geneset = survGroup(maf = laml, top = 20, geneSetSize = 2, time = "days_to_last_followup", Status = "Overall_Survival_Status", verbose = FALSE)
print(prog_geneset)
mafSurvGroup(maf = laml, geneSet = c("DNMT3A", "FLT3"), time = "days_to_last_followup", Status = "Overall_Survival_Status")6.5 比較兩個 cohort
#Primary APL MAF
primary.apl = system.file("extdata", "APL_primary.maf.gz", package = "maftools")
primary.apl = read.maf(maf = primary.apl)
#Relapse APL MAF
relapse.apl = system.file("extdata", "APL_relapse.maf.gz", package = "maftools")
relapse.apl = read.maf(maf = relapse.apl)
#Considering only genes which are mutated in at-least in 5 samples in one of the cohort to avoid bias due to genes mutated in single sample.
pt.vs.rt <- mafCompare(m1 = primary.apl, m2 = relapse.apl, m1Name = 'Primary', m2Name = 'Relapse', minMut = 5)
print(pt.vs.rt)6.5.1 森林圖
forestPlot(mafCompareRes = pt.vs.rt, pVal = 0.1)6.5.2 並排繪製兩個腫瘤圖
genes = c("PML", "RARA", "RUNX1", "ARID1B", "FLT3")
coOncoplot(m1 = primary.apl, m2 = relapse.apl, m1Name = 'PrimaryAPL', m2Name = 'RelapseAPL', genes = genes, removeNonMutated = TRUE)6.5.3 並排繪製兩個長條圖
coBarplot(m1 = primary.apl, m2 = relapse.apl, m1Name = "Primary", m2Name = "Relapse")6.5.4 合併兩個lollipop Plot
lollipopPlot2(m1 = primary.apl, m2 = relapse.apl, gene = "PML", AACol1 = "amino_acid_change", AACol2 = "amino_acid_change", m1_name = "Primary", m2_name = "Relapse")6.6 臨床富集分析
fab.ce = clinicalEnrichment(maf = laml, clinicalFeature = 'FAB_classification')
#Results are returned as a list. Significant associations p-value < 0.05
fab.ce$groupwise_comparision[p_value < 0.05]
plotEnrichmentResults(enrich_res = fab.ce, pVal = 0.05, geneFontSize = 0.5, annoFontSize = 0.6)6.7 藥物-基因交互作用
dgi = drugInteractions(maf = laml, fontSize = 0.75)
dnmt3a.dgi = drugInteractions(genes = "DNMT3A", drugs = TRUE)
#Printing selected columns.
dnmt3a.dgi[,.(Gene, interaction_types, drug_name, drug_claim_name)]6.8 致癌 pathway
pws = pathways(maf = laml, plotType = 'treemap')
plotPathways(maf = laml, pathlist = pws)6.9 腫瘤異質性分析
6.9.1 需要 mclust 套件
if (!requireNamespace("mclust", quietly = TRUE)) {
BiocManager::install("mclust", update = FALSE, ask = FALSE)
}
#Heterogeneity in sample TCGA.AB.2972
library("mclust")6.9.2 腫瘤樣本的異質性
tcga.ab.2972.het = inferHeterogeneity(maf = laml, tsb = 'TCGA-AB-2972', vafCol = 'i_TumorVAF_WU')
print(tcga.ab.2972.het$clusterMeans)
#Visualizing results
plotClusters(clusters = tcga.ab.2972.het)6.9.3 忽略拷貝數變異區域的變異
seg = system.file('extdata', 'TCGA.AB.3009.hg19.seg.txt', package = 'maftools')
tcga.ab.3009.het = inferHeterogeneity(maf = laml, tsb = 'TCGA-AB-3009', segFile = seg, vafCol = 'i_TumorVAF_WU')
#Visualizing results. Highlighting those variants on copynumber altered variants.
plotClusters(clusters = tcga.ab.3009.het, genes = 'CN_altered', showCNvars = TRUE)6.10 突變特徵
6.10.1 需要 BSgenome.Hsapiens.UCSC.hg19
if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)) {
BiocManager::install("BSgenome.Hsapiens.UCSC.hg19", update = FALSE, ask = FALSE)
}
library("BSgenome.Hsapiens.UCSC.hg19")
laml.tnm = trinucleotideMatrix(maf = laml, prefix = 'chr', add = TRUE, ref_genome = "BSgenome.Hsapiens.UCSC.hg19")6.10.2 APOBEC富集樣品與未富集樣品之間的差異
plotApobecDiff(tnm = laml.tnm, maf = laml, pVal = 0.2)6.10.3 簽名分析
packages <- c("NMF", "pheatmap")
for (pkg in packages) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg)
}
}
library('NMF')
laml.sign = estimateSignatures(mat = laml.tnm, nTry = 6, pConstant=0.01)
plotCophenetic(res = laml.sign)
laml.sig = extractSignatures(mat = laml.tnm, n = 3, pConstant=0.01)
#Compate against original 30 signatures
laml.og30.cosm = compareSignatures(nmfRes = laml.sig, sig_db = "legacy")
#Compate against updated version3 60 signatures
laml.v3.cosm = compareSignatures(nmfRes = laml.sig, sig_db = "SBS")
library('pheatmap')
pheatmap::pheatmap(mat = laml.og30.cosm$cosine_similarities, cluster_rows = FALSE, main = "cosine similarity against validated signatures")
maftools::plotSignatures(nmfRes = laml.sig, title_size = 1.2, sig_db = "SBS")
# # 3D bar plot
# library("barplot3d")
# #Visualize first signature
# sig1 = laml.sig$signatures[,1]
# barplot3d::legoplot3d(contextdata = sig1, labels = FALSE, scalexy = 0.01, sixcolors = "sanger", alpha = 0.5)7 變體註釋
7.1 將 annovar 輸出轉換為 MAF
var.annovar = system.file("extdata", "variants.hg19_multianno.txt", package = "maftools")
var.annovar.maf = annovarToMaf(annovar = var.annovar, Center = 'CSI-NUS', refBuild = 'hg19',
tsbCol = 'Tumor_Sample_Barcode', table = 'ensGene')7.2 將 ICGC Simpale Somatic Mutation Format 轉換為 MAF
#Read sample ICGC data for ESCA
esca.icgc <- system.file("extdata", "simple_somatic_mutation.open.ESCA-CN.sample.tsv.gz", package = "maftools")
esca.maf <- icgcSimpleMutationToMAF(icgc = esca.icgc, addHugoSymbol = TRUE)
#Printing first 16 columns for display convenience.
print(esca.maf[1:5,1:16, with = FALSE])7.3 準備用於 MutSigCV 分析的 MAF 文件
laml.mutsig.corrected = prepareMutSig(maf = laml)
# Converting gene names for 1 variants from 1 genes
# Hugo_Symbol MutSig_Synonym N
# 1: ARHGAP35 GRLF1 1
# Original symbols are preserved under column OG_Hugo_Symbol.8 集合運算
8.1 子集 MAF
#Extract data for samples 'TCGA.AB.3009' and 'TCGA.AB.2933' (Printing just 5 rows for display convenience)
subsetMaf(maf = laml, tsb = c('TCGA-AB-3009', 'TCGA-AB-2933'), mafObj = FALSE)[1:5]
##Same as above but return output as an MAF object (Default behaviour)
subsetMaf(maf = laml, tsb = c('TCGA-AB-3009', 'TCGA-AB-2933'))8.2 指定查詢和控制輸出欄位
#Select all Splice_Site mutations from DNMT3A and NPM1
subsetMaf(maf = laml, genes = c('DNMT3A', 'NPM1'), mafObj = FALSE,query = "Variant_Classification == 'Splice_Site'")
#Same as above but include only 'i_transcript_name' column in the output
subsetMaf(maf = laml, genes = c('DNMT3A', 'NPM1'), mafObj = FALSE, query = "Variant_Classification == 'Splice_Site'", fields = 'i_transcript_name')8.3 按臨床資料進行子集劃分
#Select all samples with FAB clasification M4 in clinical data
laml_m4 = subsetMaf(maf = laml, clinQuery = "FAB_classification %in% 'M4'")9 樣品交換識別
#Path to BAM files
bams = c(
"DBW-40-N.bam",
"DBW-40-1T.bam",
"DBW-40-2T.bam",
"DBW-40-3T.bam",
"DBW-43-N.bam",
"DBW-43-1T.bam"
)
res = maftools::sampleSwaps(bams = bams, build = "hg19")
# Fetching readcounts from BAM files..
# Summarizing allele frequncy table..
# Performing pairwise comparison..
# Done!
res$pairwise_comparisonThe returned results is a list containing:
- a
matrixof allele frequency table for every genotyped SNP - a
data.frameof readcounts for ref and alt allele for every genotyped SNP - a table with pair-wise concordance among samples
- a list with potentially matched samples
res$pairwise_comparison# X_bam Y_bam concordant_snps discordant_snps fract_concordant_snps cor_coef XY_possibly_paired
# 1: DBW-40-1T DBW-40-2T 5488 571 0.9057600 0.9656484 Yes
# 2: DBW-40-1T DBW-40-3T 5793 266 0.9560984 0.9758083 Yes
# 3: DBW-40-1T DBW-43-N 5534 525 0.9133520 0.9667620 Yes
# 4: DBW-40-2T DBW-40-3T 5853 206 0.9660010 0.9817475 Yes
# 5: DBW-40-2T DBW-43-N 5131 928 0.8468394 0.9297096 Yes
# 6: DBW-40-3T DBW-43-N 5334 725 0.8803433 0.9550670 Yes
# 7: DBW-40-N DBW-43-1T 5709 350 0.9422347 0.9725684 Yes
# 8: DBW-40-1T DBW-40-N 2829 3230 0.4669087 0.3808831 No
# 9: DBW-40-1T DBW-43-1T 2796 3263 0.4614623 0.3755364 No
# 10: DBW-40-2T DBW-40-N 2760 3299 0.4555207 0.3641647 No
# 11: DBW-40-2T DBW-43-1T 2736 3323 0.4515597 0.3579747 No
# 12: DBW-40-3T DBW-40-N 2775 3284 0.4579964 0.3770581 No
# 13: DBW-40-3T DBW-43-1T 2753 3306 0.4543654 0.3721022 No
# 14: DBW-40-N DBW-43-N 2965 3094 0.4893547 0.3839140 No
# 15: DBW-43-1T DBW-43-N 2876 3183 0.4746658 0.3797829 Nores$BAM_matches# [[1]]
# [1] "DBW-40-1T" "DBW-40-2T" "DBW-40-3T" "DBW-43-N"
#
# [[2]]
# [1] "DBW-40-2T" "DBW-40-3T" "DBW-43-N"
#
# [[3]]
# [1] "DBW-40-3T" "DBW-43-N"
#
# [[4]]
# [1] "DBW-40-N" "DBW-43-1T"Results can be visualized with the correlation plot.
cor_table = cor(res$AF_table)pheatmap::pheatmap(cor_table, breaks = seq(0, 1, 0.01))10 TCGA cohort
10.1 可用 cohort
tcga_avail = tcgaAvailable()
head(tcga_avail, 3)10.2 載入 TCGA 佇列
# By default MAF from MC3 project will be loaded
laml_mc3 = tcgaLoad(study = "LAML")
laml_mc3
# Change the source to Firehose
laml_fh = tcgaLoad(study = "LAML", source = "Firehose")
laml_fhBiocManager::install(pkgs = "PoisonAlien/TCGAmutations")11 多重檢測實驗
laml_mae = maf2mae(m = laml)
laml_mae12 套件資訊
sessionInfo()