#V3: redo after updating diff. exp. testing with voom quality weights strategy and changing model #the only workable model I can come up with is analyzing GHneg and GHpos samples seperately by HIVe status regardless of age. #This avoids including two samples from the same infant in the model library(org.Hs.eg.db) library(DOSE) library(pathview) library(clusterProfiler) library(AnnotationHub) library(ensembldb) library(tidyverse) library(SPIA) library(annotables) #Set working dir setwd("/Users/katielennard/Documents/Katie/IDM_projects/Gray") ## Explore the mouse annotation table loaded by the annotables library grch38 grch37 ##Load DE results of interest #GHneg samples HEU vs. HUU: not used # GHneg_hiv <- read.csv2(paste0(getwd(),"/outlier_samples_removed_voomWQW_GHneg_hivE_full_table.csv"), sep = ",", stringsAsFactors = F) # dim(GHneg_hiv)#11727 8 #OR #GHneg samples HEU vs. HUU with age groups specified in model (used for publication) GHneg_hiv <- read.csv2(paste0(getwd(),"/outlier_samples_removed_voomWQW_GHneg_hivE_age_full_table.csv"), sep = ",", stringsAsFactors = F) dim(GHneg_hiv)#11727 8 #GHpos samples HEU vs. HUU: not used # GHpos_hiv <- read.csv2(paste0(getwd(),"/outlier_samples_removed_voomWQW_GHpos_hivE_full_table.csv"), sep = ",", stringsAsFactors = F) # dim(GHpos_hiv)#9497 8 #OR #GHpos samples HEU vs. HUU with age groups specified in model (used for publication) GHpos_hiv <- read.csv2(paste0(getwd(),"/outlier_samples_removed_voomWQW_GHpos_hivE_age_full_table.csv"), sep = ",", stringsAsFactors = F) dim(GHpos_hiv)#9497 8 grch37 <- data.frame(grch37) grch37_GHneg <- grch37[grch37$ensgene %in% GHneg_hiv$X,] grch37_GHpos <- grch37[grch37$ensgene %in% GHpos_hiv$X,] dim(grch37_GHneg)#12686 some duplicates as shown in example below temp <- which(duplicated(grch37_GHneg$ensgene)) head(temp) grch37_GHneg[45:46,]#duplicate example - they have different entrez gene symbols for a given ensgene symbol # ensgene entrez symbol chr start end strand biotype # 68 ENSG00000004866 93655 ST7 7 116593292 116870157 1 protein_coding # 69 ENSG00000004866 7982 ST7 7 116593292 116870157 1 protein_coding # ## Merge the IDs with the results res_ids_GHneg <- inner_join(GHneg_hiv, grch37_GHneg, by=c("genes"="symbol")) head(res_ids_GHneg) dim(res_ids_GHneg)#12700 16 # res_ids_GHpos <- inner_join(GHpos_hiv, grch37_GHpos, by=c("genes"="symbol")) head(res_ids_GHpos) dim(res_ids_GHpos)#10250 16 # #--- ## Remove any entrez NA values for SPIA analysis, which accepts entrez gene IDs res_entrez_GHneg_spia <- dplyr::filter(res_ids_GHneg, entrez != "NA") dim(res_entrez_GHneg_spia)#11674 16 #There will still be ensemble gene ID duplicates here (same ENS maps to multiple entrez gene IDs). Need to use #only 1 so as not to bias pathway analysis length(unique(res_entrez_GHneg_spia$ensgene))#10712 #remove rows with duplicate ensemble gene IDs res_entrez_GHneg_spia <- res_entrez_GHneg_spia %>% dplyr::distinct(ensgene, .keep_all = TRUE) dim(res_entrez_GHneg_spia)#10712 #remove rows with duplicate entrez gene IDs res_entrez_GHneg_spia <- res_entrez_GHneg_spia %>% dplyr::distinct(entrez, .keep_all = TRUE) dim(res_entrez_GHneg_spia)#10627 # res_entrez_GHpos_spia <- dplyr::filter(res_ids_GHpos, entrez != "NA") length(unique(res_entrez_GHpos_spia$ensgene))#8985 dim(res_entrez_GHpos_spia)#9736 16 res_entrez_GHpos_spia <- res_entrez_GHpos_spia %>% dplyr::distinct(ensgene, .keep_all = TRUE) dim(res_entrez_GHpos_spia)#8985 res_entrez_GHpos_spia <- res_entrez_GHpos_spia %>% dplyr::distinct(entrez, .keep_all = TRUE) dim(res_entrez_GHpos_spia)#8931 ## Significant genes is a vector of fold changes where the names are ENTREZ gene IDs. #The background set is a vector of all the genes used for differential expression testing https://www.biostars.org/p/188571/ #GHneg sig_res_entrez_GHneg <- res_entrez_GHneg_spia[which(res_entrez_GHneg_spia$P.Value < 0.05), ] sig_entrez_GHneg <- as.numeric(sig_res_entrez_GHneg$logFC) names(sig_entrez_GHneg) <- sig_res_entrez_GHneg$entrez head(sig_entrez_GHneg) length(sig_entrez_GHneg)#928 | 636 if including age reference <- as.character(res_entrez_GHneg_spia$entrez) spia_result_GHneg <- spia(de=sig_entrez_GHneg, all=reference, organism="hsa") sig_spia_GHneg <- spia_result_GHneg[spia_result_GHneg$pG < 0.05,] dim(sig_spia_GHneg)#13 12 |6 12 if including age in the model head(sig_spia_GHneg) # write.csv(sig_spia_GHneg,"SPIA_vWQW_GHneg_HEU_vs_HUU_p_0_05_outlier_samples_removed.csv") write.csv(sig_spia_GHneg,"SPIA_vWQW_GHneg_age_HEU_vs_HUU_p_0_05_outlier_samples_removed.csv") #GHpos sig_res_entrez_GHpos <- res_entrez_GHpos_spia[which(res_entrez_GHpos_spia$P.Value < 0.05), ] sig_entrez_GHpos <- as.numeric(sig_res_entrez_GHpos$logFC) names(sig_entrez_GHpos) <- sig_res_entrez_GHpos$entrez head(sig_entrez_GHpos) length(sig_entrez_GHpos)#513 | 588 if including age reference <- as.character(res_entrez_GHpos_spia$entrez) spia_result_GHpos <- spia(de=sig_entrez_GHpos, all=reference, organism="hsa") sig_spia_GHpos <- spia_result_GHpos[spia_result_GHpos$pG < 0.05,] dim(sig_spia_GHpos)#4 | 2 if age is included in the model head(sig_spia_GHpos)# # write.csv(sig_spia_GHpos,"SPIA_vWQW_GHpos_HEU_vs_HUU_p_0_05_outlier_samples_removed.csv") write.csv(sig_spia_GHpos,"SPIA_vWQW_GHpos_age_HEU_vs_HUU_p_0_05_outlier_samples_removed.csv") #----------- #ClusterProfiler GO enrichment analysis ## Run GO enrichment analysis GHneg HEU vs HUU ego <- enrichGO(gene = as.character(sig_res_entrez_GHneg$entrez), universe = as.character(res_entrez_GHneg$entrez), keyType = "ENTREZID", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.1, readable = TRUE) head(data.frame(ego))#0 | 0 ego <- enrichGO(gene = as.character(sig_res_entrez_GHneg$ensgene), universe = as.character(res_entrez_GHneg$ensgene), keyType = "ENSEMBL", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.1, readable = TRUE) dim(data.frame(ego))#19 9 | 0 9 if including age in the model #NB if we use entrez gene ids we get no significant hits; if we use ensemble we do.. cluster_summary_GHneg <- data.frame(ego)#Note that most of these are protein coding ribosomal proteins that remain after filtering in the pipeline cluster_summary_GHneg[1,] write.csv(cluster_summary_GHneg, "clusterProfiler_enrichGO_vWQW_GHneg_HUE_vs_HUU_outlier_removed.csv") # #GENE SET ENRICHMENT GO head(sig_entrez_GHneg) GHneg_sorted <- sort(sig_entrez_GHneg, decreasing = TRUE) head(GHneg_sorted) set.seed(1234) gse_GO <- gseGO(gene = GHneg_sorted,#named FC list OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05, nPerm = 10000, seed=TRUE ) dim(gse_GO)#321 | 0 if including age in the model head(gse_GO) write.csv(gse_GO, "clusterProfiler_GO_GSEA_vWQW_GHneg_HUE_vs_HUU_outlier_removed.csv") #---------- #what happens if we use ensemble gene IDs instead of entrez gene IDs sig_res_ens_GHneg <- res_entrez_GHneg_spia[which(res_entrez_GHneg_spia$P.Value < 0.05), ] sig_ens_GHneg <- as.numeric(sig_res_ens_GHneg$logFC) names(sig_ens_GHneg) <- sig_res_ens_GHneg$ensgene head(sig_ens_GHneg) GHneg_sorted <- sort(sig_ens_GHneg, decreasing = TRUE) head(GHneg_sorted) set.seed(1234) gse_GO_ens <- gseGO(gene = GHneg_sorted,#named FC list OrgDb = org.Hs.eg.db, ont = "BP", keyType = "ENSEMBL", pAdjustMethod = "BH", pvalueCutoff = 0.05, nPerm = 10000, seed=TRUE ) dim(gse_GO_ens)#296 | 0 if including age head(gse_GO_ens) str(gse_GO_ens) gse_GO_ens <- data.frame(gse_GO_ens) gse_GO <- data.frame(gse_GO) length(intersect(rownames(gse_GO_ens),rownames(gse_GO)))#285 so very similar between entrez vs. ensemble temp <- setdiff(rownames(gse_GO),rownames(gse_GO_ens)) gse_GO[temp,] write.csv(gse_GO_ens, "clusterProfiler_GO_GSEA_vWQW_GHneg_HUE_vs_HUU_outlier_removed_ensembleIDs.csv") # # kk <- enrichKEGG(gene = as.character(sig_res_entrez_GHneg$entrez), universe = as.character(res_entrez_GHneg$entrez), organism = 'hsa', pvalueCutoff = 0.05) dim(kk)#1 9 | 0 if including age in the model data.frame(kk) #Note: ensemble gene IDs not supported for hsa # #--------------------- # GO enrichment, GHpos HEU vs. HUU ego_GHpos <- enrichGO(gene = as.character(sig_res_entrez_GHpos$entrez), universe = as.character(res_entrez_GHpos$entrez), keyType = "ENTREZID", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.1, readable = TRUE) dim(ego_GHpos)#0 | 0 #different results when we use ENSEMBL vs ENTREZ - multimapping issue, creates artifact, stick to ENSEMBL ego_GHpos <- enrichGO(gene = as.character(sig_res_entrez_GHpos$ensgene), universe = as.character(res_entrez_GHpos$ensgene), keyType = "ENSEMBL", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", qvalueCutoff = 0.1, readable = TRUE) # cluster_summary_GHpos <- data.frame(ego_GHpos) dim(cluster_summary_GHpos)#0 | 0 head(cluster_summary_GHpos)# # kk_pos <- enrichKEGG(gene = as.character(sig_res_entrez_GHpos$entrez), universe = as.character(res_entrez_GHpos$entrez), organism = 'hsa', pvalueCutoff = 0.05) dim(kk_pos)#1 or 0 if including age in the model #---- GHpos_sorted <- sort(sig_entrez_GHpos, decreasing = TRUE) head(GHpos_sorted) set.seed(1234) gse_GO_pos <- gseGO(gene = GHpos_sorted,#named FC list OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", nPerm = 10000, pvalueCutoff = 0.05,seed = TRUE) head(gse_GO_pos) dim(gse_GO_pos)#0 | 0 if including age in the model #-------- #If we use ensemble instead sig_res_ens_GHpos <- res_entrez_GHpos_spia[which(res_entrez_GHpos_spia$P.Value < 0.05), ] sig_ens_GHpos <- as.numeric(sig_res_ens_GHpos$logFC) names(sig_ens_GHpos) <- sig_res_ens_GHpos$ensgene head(sig_ens_GHpos) GHpos_sorted <- sort(sig_ens_GHpos, decreasing = TRUE) head(GHpos_sorted) set.seed(1234) gse_GO_ens <- gseGO(gene = GHpos_sorted,#named FC list OrgDb = org.Hs.eg.db, ont = "BP", keyType = "ENSEMBL", pAdjustMethod = "BH", pvalueCutoff = 0.05, nPerm = 10000, seed=TRUE ) dim(gse_GO_ens)#0 #-------- #GSEA KEGG GHpos_sorted <- sort(sig_entrez_GHpos, decreasing = TRUE) set.seed(1234) gse_k_pos <- gseKEGG(gene = GHpos_sorted,#named FC list organism = "hsa", pAdjustMethod = "BH", pvalueCutoff = 0.05, seed = TRUE,nPerm = 10000) head(gse_k_pos) dim(gse_k_pos)#0 | 0 if including age in the model #------------------------------ #On recommendation from Susan Holmes, co-author and statistician at Stanford, we will also use the #R package Bionet library(BioNet) library(DLBCL) library(ALL) data("dataLym") data("interactome") data(exprLym) data(ALL) interactome # A graphNEL graph with undirected edges # Number of Nodes = 9386 # Number of Edges = 36504 head(dataLym) pvals <- cbind(t = dataLym$t.pval, s = dataLym$s.pval) rownames(pvals) <- dataLym$label pval <- aggrPvals(pvals, order = 2, plot = FALSE) head(pvals) dim(pvals) #3583 subnet <- subNetwork(dataLym$label, interactome) subnet <- rmSelfLoops(subnet) subnet #-------------- #remove duplicates use <- res_entrez_GHneg[!duplicated(res_entrez_GHneg$genes),] dim(use)#10069 head(use) rownames(use) <- use$genes #Now make list of interactome nodes minus node number to match to gene symbol in use interactome_symbols <- unlist(nodes(interactome)) head(interactome_symbols) inter_df <- cbind(interactome_symbols,interactome_symbols) head(inter_df) colnames(inter_df)[2] <- c("nodes") inter_df[,1] <- gsub("\\(.*","",inter_df[,1]) head(inter_df) rownames(inter_df) <- inter_df[,1] find_nodes <- intersect(rownames(use),rownames(inter_df)) length(find_nodes)#4910 # find_nodes <- lapply(use$genes,function(x) nodes(interactome)[grep(x,nodes(interactome))]) head(find_nodes) use <- use[find_nodes,] dim(use)#4910 use$node <- inter_df[rownames(use),"nodes"] head(use) network <- subNetwork(use$node, interactome) network # A graphNEL graph with undirected edges # Number of Nodes = 4910 # Number of Edges = 16584 #--------------------------- #Repeat for GH pos dataset use_pos <- res_entrez_GHpos[!duplicated(res_entrez_GHpos$genes),] dim(use_pos)#8562 head(use_pos) rownames(use_pos) <- use_pos$genes #Now make list of interactome nodes minus node number to match to gene symbol in use find_nodes <- intersect(rownames(use_pos),rownames(inter_df)) length(find_nodes)#4390 head(find_nodes) use_pos <- use_pos[find_nodes,] dim(use_pos)#4390 use_pos$node <- inter_df[rownames(use_pos),"nodes"] head(use_pos) network_pos <- subNetwork(use_pos$node, interactome) network # A graphNEL graph with undirected edges # Number of Nodes = 4910 # Number of Edges = 16584 length(intersect(nodes(network),nodes(network_pos))) #4327 pos_pval <- data.frame(rownames(use_pos),use_pos[,"P.Value"]) head(pos_pval) pos_pval <- data.frame(pos_pval[,2]) rownames(pos_pval) <- use_pos[,"node"] colnames(pos_pval) <- "p_value" pos_pval <- as.vector(pos_pval) pos_pval <- as.numeric(as.character(unlist(pos_pval))) names(pos_pval) <- use_pos[,"node"] fb <- fitBumModel(pos_pval, plot = TRUE) scores <- scoreNodes(network = network_pos, fb = fb, fdr = 0.05) scores max(scores) min(scores) hotSub = runFastHeinz(network_pos, scores) hotSub #--------------------------- #Make dataframe of corresponding p-values common <- intersect(rownames(use),rownames(use_pos)) length(common)#4327 pvals <- cbind(use[common,"P.Value"],use_pos[common,"P.Value"]) head(pvals) colnames(pvals) <- c("GHneg_pval","GHpos_pval") rownames(pvals) <- use[common,"node"] pvals <- data.frame(pvals) pvals[,1] <- as.numeric(as.character(unlist(pvals[,1]))) pvals[,2] <- as.numeric(as.character(unlist(pvals[,2]))) str(pvals) head(pvals) pval <- aggrPvals(as.matrix(pvals), order = 2, plot = FALSE) head(pval) network_joint <- subNetwork(rownames(pvals), interactome) network_joint # A graphNEL graph with undirected edges # Number of Nodes = 4327 # Number of Edges = 14013 # Now we can use the aggregated p-values to fit the Beta-uniform mixture model # to the distribution. The following plot shows the fitted Beta-uniform mixture # model in a histogram of the p-values. # fb <- fitBumModel(pval, plot = TRUE) # fb # scores <- scoreNodes(network = network_joint, fb = fb, fdr = 0.001) # # In the next step the network with the scores and edges is written to a file and # # the heinz algorithm is used to calculate the maximum-scoring subnetwork. In # # order to run heinz self-loops have to be removed from the network # network_2 <- largestComp(network_joint) # network_2 <- rmSelfLoops(network_2) # network_2 # # A graphNEL graph with undirected edges # # Number of Nodes = 3709 # # Number of Edges = 12929 # writeHeinzEdges(network = network_2, file = "GHpos_neg_edges_001",use.score = FALSE) # writeHeinzNodes(network = network_2, file = "GHpos_neg_nodes_001",node.scores = scores) #///////////////////////// el <- cbind(c("a", "b", "c", "d", "e", "f", "d"), c("b", "c", "d", "e", "f", "a", "b")) graph <- graph.edgelist(el, directed=TRUE) node.list <- c("a", "b", "c") graph2 <- subNetwork(nodeList=node.list, network=graph) data(colorectalcancer) str(DE_Colorectal) head(DE_Colorectal) A <- c(9,5) B <- c(5,9) C <- c(7,8) D <- data.frame(A,B,C) D chisq.test(t(D)) entrez_test <- c("84872","554313","121504","8370","8368","8366","8367","8365","8364","8362","8363","8361","8360","8359","8294") temp <- grch37[grch37$entrez %in% entrez_test,] unique(temp$description) entrez_test <- c("10553","127428","55756","9869","5990","339318","554313","121504", "8370","8368","8366","8367","8365","8364","8362","8363","79724","6941","3280","27005","57585", "51360","1277","653361","339344","2334","126375","54971","22976","9400","51003","11267","10623", "2302","91120","7733","86","58486","9139","92105","84503") temp <- sig_res_entrez_GHpos[sig_res_entrez_GHpos$entrez %in% entrez_test,] unique(temp$description) na.omit(grch37[grch37$entrez == "8370",])