RGBM
RGBM is a LS-TreeBoost and LAD-TreeBoost for Gene Regulatory Network Reconstruction.
It Provides an implementation of Regularized LS-TreeBoost & LAD-TreeBoost algorithm for Regulatory Network inference from any type of expression data (Microarray/RNA-seq etc).
Availability: RGBM is also available as
RGBM Tutorial
Raghvendra Mall and Khalid Kunji
April 25, 2018
Steps For Inferring GRN using RGBM and perform MRA using FGSEA
This is a tutorial for using RGBM (for inferring gene regulatory network) and performing Master Regulator Analysis.
You will need to install the following packages: RGBM, doMC, foreach, igraph, gplots from CRAN. You will also need the fgsea package from bioconductor. You can register as many processes as available on your resource with registerDoMC(no_of_threads).
library(RGBM)
## Loading required package: foreach
## Loading required package: plyr
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(doMC)
library(foreach)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(fgsea)
## Loading required package: Rcpp
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
registerDoMC(20)
Loading Data and Necessary Utility Functions
You can source all utility functions for MRA and loading data
source('gene-reverse-network.R')
source('loadData.R')
source('get_functions.R')
Loading the log2-transformed quantile normalized data file (the data should be quantile normalized followed by log2 transformation, if the raw data is RNA-Seq counts).
load("Dataset/gep_1250_agi_rnaseq-dup.RData")
boxplot(z[,1:10])
Loading the Mechanistic TF-target regulations file
load("Dataset/me.net.Rdata")
rownames(me.net)
## [1] "ALX3" "ARNTL" "ARX" "AR" "ATF4" "ATF7" "BCL6B"
## [8] "CEBPB" "CEBPD" "CEBPG" "CENPB" "CLOCK" "CPEB1" "CREB3L1"
## [15] "CREB3" "CTCF" "CREB3L2" "CREB5" "DBP" "DLX1" "DLX2"
## [22] "DLX5" "DLX6" "E2F1" "E2F2" "E2F3" "E2F4" "E2F7"
## [29] "E2F8" "EBF1" "EGR1" "EGR2" "EGR3" "ELF1" "ELF3"
## [36] "ELF4" "ELK1" "ELK3" "ELK4" "EMX1" "EMX2" "EN1"
## [43] "EN2" "ERF" "ERG" "ESR1" "ESRRA" "ESRRG" "ETS1"
## [50] "ETV1" "ETV3" "ETV4" "ETV5" "ETV6" "FLI1" "FOXC1"
## [57] "FOXD2" "FOXD3" "FOXG1" "FOXJ2" "FOXJ3" "FOXK1" "FOXO1"
## [64] "FOXO3" "FOXO4" "GABPA" "GATA4" "GBX2" "GLI2" "GLIS1"
## [71] "GLIS2" "GLIS3" "GMEB2" "GRHL1" "GSC" "GSX1" "GSX2"
## [78] "HES5" "HESX1" "HEY1" "HEY2" "HIC2" "HLF" "HMBOX1"
## [85] "HNF1A" "HOMEZ" "HOXA10" "HOXA13" "HOXA1" "HOXA2" "HOXB13"
## [92] "HOXB2" "HOXB3" "HOXC10" "HOXC13" "HOXD11" "HOXD13" "HOXD8"
## [99] "HSF1" "HSF2" "HSF4" "HIC1" "HOXA11" "HOXD3" "HOXD9"
## [106] "ID4" "IRF3" "IRF5" "IRF7" "IRF8" "IRF9" "IRX2"
## [113] "IRX5" "IRX3" "JDP2" "KLF13" "KLF16" "KLF12" "LEF1"
## [120] "LHX2" "LHX6" "LHX9" "MAFF" "MAFG" "MAFK" "MAX"
## [127] "MEF2A" "MEF2B" "MEF2D" "MEIS1" "MEIS2" "MEIS3" "MEOX2"
## [134] "MESP1" "MGA" "MLXIPL" "MLX" "MNT" "MSC" "MSX1"
## [141] "MSX2" "MTF1" "MYBL1" "MYBL2" "MAFB" "NEUROD2" "NFAT5"
## [148] "NFATC1" "NFIA" "NFIB" "NFIL3" "NFIX" "NFKB1" "NFKB2"
## [155] "NHLH1" "NKX3-2" "NKX6-2" "NR2C2" "NR2E1" "NR2F1" "NR2F6"
## [162] "NR3C1" "NR3C2" "NR4A2" "NRF1" "NRL" "OLIG1" "OLIG2"
## [169] "ONECUT2" "OTX1" "OTX2" "PAX1" "PAX3" "PAX6" "PAX7"
## [176] "PITX1" "PKNOX1" "PKNOX2" "POU2F1" "POU2F2" "POU3F1" "POU3F2"
## [183] "POU3F3" "POU4F1" "PRDM1" "PRDM4" "PROX1" "PRRX1" "RARA"
## [190] "RARB" "RARG" "RFX2" "RFX3" "RFX4" "RFX5" "RORA"
## [197] "RUNX2" "RUNX3" "RXRA" "RXRB" "RXRG" "SCRT1" "SCRT2"
## [204] "SHOX2" "SMAD3" "SNAI2" "SOX10" "SOX15" "SOX18" "SOX21"
## [211] "SOX2" "SOX4" "SOX7" "SOX8" "SOX9" "SP1" "SP3"
## [218] "SP4" "SP8" "SPI1" "SREBF2" "SRF" "SOX11" "SOX17"
## [225] "SOX1" "SOX3" "SREBF1" "TBR1" "TBX15" "TBX19" "TBX1"
## [232] "TBX2" "TBX5" "TCF3" "TCF4" "TCF7L1" "TEAD1" "TEAD3"
## [239] "TEAD4" "TEF" "TFAP2A" "TFAP2B" "TFAP4" "TFCP2" "TFE3"
## [246] "TFEB" "TFEC" "TGIF1" "TGIF2" "THRA" "THRB" "TP63"
## [253] "TCF7" "TP53" "TP73" "USF1" "VAX2" "VDR" "VENTX"
## [260] "XBP1" "YY1" "ZBTB7A" "ZBTB7B" "ZBTB7C" "ZIC1" "ZIC3"
## [267] "ZIC4" "ZNF143" "ZNF232" "ZNF238" "ZNF282" "ZNF410" "ZNF524"
## [274] "ZNF740" "ZNF75A" "ZNF784" "DBX2" "HDX" "HOXA3" "HOXA4"
## [281] "HOXA5" "HOXA7" "HOXA9" "HOXB4" "HOXB7" "HOXC4" "HOXC6"
## [288] "HOXD10" "HOXD1" "LHX5" "NKX2-2" "NKX2-5" "OTP" "PBX1"
## [295] "POU6F1" "SIX1" "SIX4" "SIX6" "ELF2" "YAP1" "ARID3A"
## [302] "ARID5A" "ATF1" "BBX" "FOXJ1" "GMEB1" "HBP1" "IRF6"
## [309] "KLF7" "NR2F2" "OSR1" "OSR2" "PLAGL1" "SOX12" "SOX13"
## [316] "SOX5" "SP100" "TBP" "TCF7L2" "ZBTB12" "ZBTB3" "ZFP161"
## [323] "ZIC2" "RUNX1" "ARNT" "AHR" "CREB1" "DDIT3" "CEBPA"
## [330] "FOXF2" "GATA2" "KLF4" "FOXQ1" "IRF1" "IRF2" "MZF1"
## [337] "MYC" "NFYA" "PPARG" "RREB1" "NFE2L1" "TAL1" "JUN"
## [344] "FOS" "REL" "ZEB1" "MYCN" "RELA" "HLTF" "NR1H2"
## [351] "ZNF423" "NFIC" "TLX1" "ZNF354C" "STAT1" "REST" "STAT3"
## [358] "ZFX" "NFE2L2" "NFATC2" "INSM1" "PLAG1" "ESR2" "HIF1A"
## [365] "BATF" "BCL6" "E2F6" "FOSL1" "FOSL2" "FOXP1" "HNF4G"
## [372] "JUNB" "JUND" "NR1H3" "MEF2C" "MYOD1" "MAF" "NFYB"
## [379] "NR5A2" "RFX1" "SMAD2" "SMAD4" "SOX6" "SP2" "STAT2"
## [386] "STAT4" "STAT5A" "STAT5B" "STAT6" "TCF12" "USF2" "ZBTB33"
## [393] "ZNF263" "BACH1" "FOXP2" "THAP1" "KLF5" "ARID3B" "ATF3"
## [400] "CREM" "ID2" "LIN54" "MITF" "MLXIP" "NFATC3" "NPAS2"
## [407] "TCFL5" "HES1" "ARNT2" "ATF2" "ATF5" "BPTF" "BRCA1"
## [414] "CDC5L" "CEBPZ" "CXXC1" "E2F5" "E4F1" "EPAS1" "ETS2"
## [421] "ETV7" "FOSB" "FOXF1" "FOXM1" "FUBP1" "GLI1" "GLI3"
## [428] "HMGA1" "HMGA2" "IKZF1" "KLF15" "KLF3" "KLF6" "KLF8"
## [435] "MAZ" "MBD2" "MECP2" "NFYC" "NR0B1" "NR1D1" "NR2C1"
## [442] "NR4A1" "NR4A3" "PAX8" "PBX2" "PBX3" "PPARA" "PPARD"
## [449] "PURA" "RELB" "SMAD1" "SNAI1" "TBX3" "TFDP1" "ZBTB4"
## [456] "ZBTB6" "ZFHX3"
sum(me.net>0)
## [1] 1874570
Loading the phenotype information file
load("Dataset/casesTable.RData")
str(casesTable)
## 'data.frame': 1060 obs. of 51 variables:
## $ Case : chr "TCGA-CS-4938" "TCGA-CS-4941" "TCGA-CS-4942" "TCGA-CS-4943" ...
## $ Tissue.source.site : chr "Thomas Jefferson University" "Thomas Jefferson University" "Thomas Jefferson University" "Thomas Jefferson University" ...
## $ Study : chr "Brain Lower Grade Glioma" "Brain Lower Grade Glioma" "Brain Lower Grade Glioma" "Brain Lower Grade Glioma" ...
## $ BCR : chr "IGC" "IGC" "IGC" "IGC" ...
## $ Whole.exome : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Whole.genome : chr "Yes" "Yes" "No" "No" ...
## $ RNAseq : chr "Yes" "Yes" "Yes" "Yes" ...
## $ SNP6 : chr "Yes" "Yes" "Yes" "Yes" ...
## $ U133a : chr "No" "No" "No" "No" ...
## $ HM450 : chr "Yes" "Yes" "Yes" "Yes" ...
## $ HM27 : chr "No" "No" "No" "No" ...
## $ RPPA : chr "Yes" "No" "Yes" "Yes" ...
## $ Histology : chr "astrocytoma" "astrocytoma" "astrocytoma" "astrocytoma" ...
## $ Grade : chr "G2" "G3" "G3" "G3" ...
## $ Age..years.at.diagnosis. : int 31 67 44 37 50 47 39 40 43 53 ...
## $ Gender : chr "female" "male" "female" "male" ...
## $ Survival..months. : num 4.7 7.69 43.86 18.14 10.61 ...
## $ Vital.status..1.dead. : int 0 1 1 0 0 0 0 0 1 0 ...
## $ Karnofsky.Performance.Score : int 90 90 90 50 90 100 100 NA 90 90 ...
## $ Mutation.Count : int 15 50 24 30 20 32 26 22 39 26 ...
## $ Percent.aneuploidy : num 0.0694 0.2241 0.0937 0.1724 0.0603 ...
## $ IDH.status : chr "Mutant" "WT" "Mutant" "Mutant" ...
## $ X1p.19q.codeletion : chr "non-codel" "non-codel" "non-codel" "non-codel" ...
## $ IDH.codel.subtype : chr "IDHmut-non-codel" "IDHwt" "IDHmut-non-codel" "IDHmut-non-codel" ...
## $ MGMT.promoter.status : chr "Unmethylated" "Methylated" "Unmethylated" "Methylated" ...
## $ Chr.7.gain.Chr.10.loss : chr "No combined CNA" "Gain chr 7 & loss chr 10" "No combined CNA" "No combined CNA" ...
## $ Chr.19.20.co.gain : chr "No chr 19/20 gain" "No chr 19/20 gain" "No chr 19/20 gain" "No chr 19/20 gain" ...
## $ TERT.promoter.status : chr "WT" "Mutant" "WT" "WT" ...
## $ TERT.expression..log2. : num 0 3.81 0 0 2.58 ...
## $ TERT.expression.status : chr "Not expressed" "Expressed" "Not expressed" "Not expressed" ...
## $ ATRX.status : chr "Mutant" "WT" "Mutant" "Mutant" ...
## $ DAXX.status : chr "WT" "WT" "WT" "WT" ...
## $ Telomere.Maintenance : chr "ATRX" "TERT" "ATRX" "ATRX" ...
## $ BRAF.V600E.status : chr "WT" "WT" "WT" "WT" ...
## $ BRAF.KIAA1549.fusion : chr "WT" "WT" "WT" "WT" ...
## $ ABSOLUTE.purity : num 0.79 0.61 0.76 0.83 0.74 0.85 0.53 0.92 0.81 0.89 ...
## $ ABSOLUTE.ploidy : num 2 2.05 2.05 3.92 1.94 1.97 4.15 1.95 2.05 1.97 ...
## $ ESTIMATE.stromal.score : num 422 707 563 460 701 ...
## $ ESTIMATE.immune.score : num 805 2283 2076 819 1282 ...
## $ ESTIMATE.combined.score : num 1227 2990 2640 1279 1983 ...
## $ Original.Subtype : chr "IDHmut-non-codel" "IDHwt" "IDHmut-non-codel" "IDHmut-non-codel" ...
## $ Transcriptome.Subtype : chr NA "CL" "PN" "PN" ...
## $ Pan.Glioma.RNA.Expression.Cluster : chr "LGr3" "LGr4" "LGr3" "LGr3" ...
## $ IDH.specific.RNA.Expression.Cluster : chr "IDHmut-R3" "IDHwt-R2" "IDHmut-R3" "IDHmut-R3" ...
## $ Pan.Glioma.DNA.Methylation.Cluster : chr "LGm2" "LGm5" "LGm2" "LGm2" ...
## $ IDH.specific.DNA.Methylation.Cluster : chr "IDHmut-K2" "IDHwt-K2" "IDHmut-K2" "IDHmut-K2" ...
## $ Supervised.DNA.Methylation.Cluster : chr "G-CIMP-high" "Mesenchymal-like" "G-CIMP-high" "G-CIMP-high" ...
## $ Random.Forest.Sturm.Cluster : chr "IDH" "Mesenchymal" "IDH" "IDH" ...
## $ RPPA.cluster : chr "K2" NA "K2" "K2" ...
## $ Telomere.length.estimate.in.blood.normal..Kb.: num 8.11 1.42 NA NA 3.26 ...
## $ Telomere.length.estimate.in.tumor..Kb. : num 5.19 1.61 NA NA 1.5 ...
Loading the list of TFs
tfs <- as.character(as.vector(read.csv("Dataset/ColNames_TF.csv",header=TRUE)$x))
Perform Initial Pre-Processing to obtain Expression Matrix, Perturbation Matrix [Matrix of Zeros], Mechanistic Matrix, TFs and Target Genes
#Here we consider me.net is the matrix containing prior set of possible interactions
out <- initial_preprocessing(z,me.net)
#Some targets might be mismatching with mechanistic network, to make things consistent
#we do the pre-processing. Here we don't consider any Knockout or Knockdown information
#is available.
mod_z <- out[[1]]
K <- out[[2]]
M <- out[[3]]
tfs <- out[[4]]
targets <- out[[5]]
Specify the path to save results obtained from RGBM
filename <- "GLIOMA"
outputpath <- paste0("Results/",filename)
sample_name <- paste0(filename,"_Full_")
Run the RGBM algorithm to reverse-engineer the regulatory network
V_final <- RGBM(E = mod_z , K = K, g_M = M, tfs = tfs, targets = targets, M=5000, nu = 0.001,
s_f = 0.3, lf = 1, no_iterations = 3, mink = 0, experimentid = 1, outputpath
= outputpath, sample_type = sample_name, real = 1);
write.table(V_final,file=paste("Results/",filename,"/Adjacency_Matrix/",sample_name,
"Final_Adjacency_Matrix.csv",sep=""),row.names = T, col.names = T)
net <- read.table(paste0("Results/",filename,"/Adjacency_Matrix/",sample_name,
"Final_Adjacency_Matrix.csv"),header=TRUE,sep=" ")
Make average size of regulon less than 100
#Make average regulon size to less than 100
while((sum(net>0)/length(tfs))>100)
{
median_weight <- median(net[net>0]);
net[net<median_weight] <- 0;
}
Make the correlation matrix to get signed regulation information
#Create correlation matrix
corr_matrix <- cross.cor(t(mod_z)[tfs,],t(mod_z),met="spearman",ncore = 16)
corr_matrix[net==0] <- 0;
load(paste0(outputpath,"/Adjacency_Matrix/",sample_name,"Correlation_matrix.Rdata"))
Get all the regulons for all the TFs
regulon_sets <- get_regulons(net,minsize=10)
Create the activity matrix
amat <- activity_mc(mexp = t(mod_z), cormat = corr_matrix, tflist = rownames(net), tau = 0.0)
Get phenotype from case table
glioma_cases <- casesTable$Case;
idh_mutant_cases <- which(casesTable$IDH.status=="Mutant");
idh_wildtype_cases <- which(casesTable$IDH.status=="WT")
tcga_mutant_sample_names <- glioma_cases[idh_mutant_cases];
tcga_wildtype_sample_names <- glioma_cases[idh_wildtype_cases];
mutant_indices <- which(rownames(mod_z) %in% tcga_mutant_sample_names);
wildtype_indices <- which(rownames(mod_z) %in% tcga_wildtype_sample_names)
To perform MRA using FGSEA need to see difference in expression for all targets
#Get difference in expression of target genes using mutant vs wildtype info
pheno_t <- c()
pv_t <- c()
for (j in 1:ncol(mod_z)){
mutant_expr <- mean(mod_z[mutant_indices,j]);
wildtype_expr <- mean(mod_z[wildtype_indices,j]);
pheno_t <- c(pheno_t,mutant_expr-wildtype_expr);
pv_t <- c(pv_t,wilcox.test(mod_z[mutant_indices,j],mod_z[wildtype_indices,j],
exact=F)$p.value)
}
padj_t = p.adjust(pv_t,method="fdr")
names(pheno_t) <- colnames(mod_z);
sorted_pheno_t <- sort(pheno_t,decreasing = TRUE)
Perform Gene Set Enrichment
fgseaRes <- fgsea(regulon_sets,sorted_pheno_t,minSize=10,maxSize=max(unlist(lapply(
regulon_sets,length))), nperm=100000)
topPathwaysUp <- fgseaRes[ES > 0][head(order(pval), n=20), pathway]
topPathwaysDown <- fgseaRes[ES < 0][head(order(pval), n=20), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
Plot the most enriched TFs with target info
plotGseaTable(regulon_sets[topPathways], sorted_pheno_t, fgseaRes, gseaParam = 0.5)
Identify top MRs
mra=as.data.frame(fgseaRes[order(pval)])
mra$leadingEdge=as.character(mra$leadingEdge)
topmr = mra$pathway[mra$padj<0.01]
topmr_info = mra[mra$padj<0.01,]
indices = which(mra$padj<0.01);
mra_table <- data.frame(topmr=mra$pathway[indices],padj=mra$padj[indices],
NES=mra$NES[indices])
Build the activity matrix based on top MRs
amat.mr <- amat[topmr,]
amat.mr.mutant <- amat.mr[,mutant_indices];
amat.mr.wildtype <- amat.mr[,wildtype_indices]
wilcox_test_info <- perform_wilcox_test(amat.mr.mutant,amat.mr.wildtype)
wilcox_test_info <- wilcox_test_info[order(wilcox_test_info$FC_Mean,decreasing=T),]
new_mr_activity_matrix <- as.matrix(amat[rownames(wilcox_test_info),
c(mutant_indices,wildtype_indices)])
rownames(new_mr_activity_matrix)
## [1] "SOX1" "SOX8" "THRA" "FOXO4" "SOX6" "SCRT1" "TEF"
## [8] "ZNF238" "MYOD1" "MYCN" "TFAP4" "NR0B1" "HLF" "HEY2"
## [15] "KLF12" "ARNT2" "ZBTB3" "ZBTB4" "CTCF" "BPTF" "PAX1"
## [22] "GLIS2" "HIC2" "TBR1" "ID4" "EMX1" "ZNF263" "THRB"
## [29] "NEUROD2" "SOX10" "LHX6" "MEF2C" "STAT4" "IRF8" "MAFF"
## [36] "JUNB" "IRF5" "E2F2" "RUNX2" "BCL6B" "PRDM1" "STAT3"
## [43] "E2F1" "SPI1" "FOSL2" "BATF" "FOXM1" "HIC1" "KLF6"
## [50] "CEBPB" "IRF7" "RELB" "VDR" "ARID5A" "MBD2" "RUNX1"
## [57] "STAT1" "IRF1" "TGIF1" "FOSL1" "MEOX2" "ETV7" "CEBPD"
## [64] "TEAD3" "HOXC10" "HOXC6"
colnames(new_mr_activity_matrix) <- NULL
colcol <- rep("white",ncol(new_mr_activity_matrix));
colcol[1:length(mutant_indices)] <- "yellow"
colcol[(length(wildtype_indices)+1):length(colcol)] <- "blue"
hc.cols <- hclust(dist(t(new_mr_activity_matrix),method="manhattan"),method="ward.D2")
new_mr_activity_matrix[new_mr_activity_matrix <= quantile(new_mr_activity_matrix, 0.05)
] <- quantile(new_mr_activity_matrix, 0.05)
new_mr_activity_matrix[new_mr_activity_matrix >= quantile(new_mr_activity_matrix, 0.95)
] <- quantile(new_mr_activity_matrix, 0.95)
#Here yellow colour is for mutant case and blue colour is for wildtype
par(mar=c(1,1,1,1))
heatmap.2(x=new_mr_activity_matrix, Rowv = F, Colv = as.dendrogram(hc.cols), col =
greenred(50), dendrogram = "column",trace="none", ColSideColors = colcol)