#V3: redo after updating diff. exp. testing with voom quality weights strategy and changing model
#the only workable model I can come up with is analyzing GHneg and GHpos samples seperately by HIVe status regardless of age.
#This avoids including two samples from the same infant in the model
library(org.Hs.eg.db)
library(DOSE)
library(pathview)
library(clusterProfiler)
library(AnnotationHub)
library(ensembldb)
library(tidyverse)
library(SPIA)
library(annotables)
#Set working dir
setwd("/Users/katielennard/Documents/Katie/IDM_projects/Gray")

## Explore the mouse annotation table loaded by the annotables library
grch38
grch37

##Load DE results of interest
#GHneg samples HEU vs. HUU: not used
# GHneg_hiv <- read.csv2(paste0(getwd(),"/outlier_samples_removed_voomWQW_GHneg_hivE_full_table.csv"), sep = ",", stringsAsFactors = F)
# dim(GHneg_hiv)#11727 8
#OR
#GHneg samples HEU vs. HUU with age groups specified in model (used for publication)
GHneg_hiv <- read.csv2(paste0(getwd(),"/outlier_samples_removed_voomWQW_GHneg_hivE_age_full_table.csv"), sep = ",", stringsAsFactors = F)
dim(GHneg_hiv)#11727 8

#GHpos samples HEU vs. HUU: not used
# GHpos_hiv <- read.csv2(paste0(getwd(),"/outlier_samples_removed_voomWQW_GHpos_hivE_full_table.csv"), sep = ",", stringsAsFactors = F)
# dim(GHpos_hiv)#9497 8
#OR
#GHpos samples HEU vs. HUU with age groups specified in model (used for publication)
GHpos_hiv <- read.csv2(paste0(getwd(),"/outlier_samples_removed_voomWQW_GHpos_hivE_age_full_table.csv"), sep = ",", stringsAsFactors = F)
dim(GHpos_hiv)#9497 8

grch37 <- data.frame(grch37)
grch37_GHneg <- grch37[grch37$ensgene %in% GHneg_hiv$X,]
grch37_GHpos <- grch37[grch37$ensgene %in% GHpos_hiv$X,]

dim(grch37_GHneg)#12686 some duplicates as shown in example below
temp <- which(duplicated(grch37_GHneg$ensgene))
head(temp)
grch37_GHneg[45:46,]#duplicate example - they have different entrez gene symbols for a given ensgene symbol
# ensgene entrez symbol chr     start       end strand        biotype
# 68 ENSG00000004866  93655    ST7   7 116593292 116870157      1 protein_coding
# 69 ENSG00000004866   7982    ST7   7 116593292 116870157      1 protein_coding

#
## Merge the IDs with the results 
res_ids_GHneg <- inner_join(GHneg_hiv, grch37_GHneg, by=c("genes"="symbol"))         
head(res_ids_GHneg)
dim(res_ids_GHneg)#12700   16
#
res_ids_GHpos <- inner_join(GHpos_hiv, grch37_GHpos, by=c("genes"="symbol"))         
head(res_ids_GHpos)
dim(res_ids_GHpos)#10250 16
#
#---
## Remove any entrez NA values for SPIA analysis, which accepts entrez gene IDs
res_entrez_GHneg_spia <- dplyr::filter(res_ids_GHneg, entrez != "NA")
dim(res_entrez_GHneg_spia)#11674 16
#There will still be ensemble gene ID duplicates here (same ENS maps to multiple entrez gene IDs). Need to use
#only 1 so as not to bias pathway analysis
length(unique(res_entrez_GHneg_spia$ensgene))#10712
#remove rows with duplicate ensemble gene IDs
res_entrez_GHneg_spia <- res_entrez_GHneg_spia %>% dplyr::distinct(ensgene, .keep_all = TRUE)
dim(res_entrez_GHneg_spia)#10712
#remove rows with duplicate entrez gene IDs
res_entrez_GHneg_spia <- res_entrez_GHneg_spia %>% dplyr::distinct(entrez, .keep_all = TRUE)
dim(res_entrez_GHneg_spia)#10627
#
res_entrez_GHpos_spia <- dplyr::filter(res_ids_GHpos, entrez != "NA")
length(unique(res_entrez_GHpos_spia$ensgene))#8985
dim(res_entrez_GHpos_spia)#9736 16
res_entrez_GHpos_spia <- res_entrez_GHpos_spia %>% dplyr::distinct(ensgene, .keep_all = TRUE)
dim(res_entrez_GHpos_spia)#8985
res_entrez_GHpos_spia <- res_entrez_GHpos_spia %>% dplyr::distinct(entrez, .keep_all = TRUE)
dim(res_entrez_GHpos_spia)#8931

## Significant genes is a vector of fold changes where the names are ENTREZ gene IDs. 
#The background set is a vector of all the genes used for differential expression testing https://www.biostars.org/p/188571/

#GHneg
sig_res_entrez_GHneg <- res_entrez_GHneg_spia[which(res_entrez_GHneg_spia$P.Value < 0.05), ]
sig_entrez_GHneg <- as.numeric(sig_res_entrez_GHneg$logFC)
names(sig_entrez_GHneg) <- sig_res_entrez_GHneg$entrez
head(sig_entrez_GHneg)
length(sig_entrez_GHneg)#928 | 636 if including age
reference <- as.character(res_entrez_GHneg_spia$entrez)
spia_result_GHneg <- spia(de=sig_entrez_GHneg, all=reference, organism="hsa")
sig_spia_GHneg <- spia_result_GHneg[spia_result_GHneg$pG < 0.05,]
dim(sig_spia_GHneg)#13 12 |6  12 if including age in the model
head(sig_spia_GHneg)
# write.csv(sig_spia_GHneg,"SPIA_vWQW_GHneg_HEU_vs_HUU_p_0_05_outlier_samples_removed.csv")
write.csv(sig_spia_GHneg,"SPIA_vWQW_GHneg_age_HEU_vs_HUU_p_0_05_outlier_samples_removed.csv")

#GHpos
sig_res_entrez_GHpos <- res_entrez_GHpos_spia[which(res_entrez_GHpos_spia$P.Value < 0.05), ]
sig_entrez_GHpos <- as.numeric(sig_res_entrez_GHpos$logFC)
names(sig_entrez_GHpos) <- sig_res_entrez_GHpos$entrez
head(sig_entrez_GHpos)
length(sig_entrez_GHpos)#513 | 588 if including age
reference <- as.character(res_entrez_GHpos_spia$entrez)
spia_result_GHpos <- spia(de=sig_entrez_GHpos, all=reference, organism="hsa")
sig_spia_GHpos <- spia_result_GHpos[spia_result_GHpos$pG < 0.05,]
dim(sig_spia_GHpos)#4 | 2 if age is included in the model
head(sig_spia_GHpos)#
# write.csv(sig_spia_GHpos,"SPIA_vWQW_GHpos_HEU_vs_HUU_p_0_05_outlier_samples_removed.csv")
write.csv(sig_spia_GHpos,"SPIA_vWQW_GHpos_age_HEU_vs_HUU_p_0_05_outlier_samples_removed.csv")

#-----------
#ClusterProfiler GO enrichment analysis
## Run GO enrichment analysis GHneg HEU vs HUU

ego <- enrichGO(gene = as.character(sig_res_entrez_GHneg$entrez),
                universe = as.character(res_entrez_GHneg$entrez),
                keyType = "ENTREZID",
                OrgDb = org.Hs.eg.db,
                ont = "BP",
                pAdjustMethod = "BH",
                qvalueCutoff = 0.1,
                readable = TRUE)
head(data.frame(ego))#0 | 0
ego <- enrichGO(gene = as.character(sig_res_entrez_GHneg$ensgene), 
                universe = as.character(res_entrez_GHneg$ensgene),
                keyType = "ENSEMBL",
                OrgDb = org.Hs.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                qvalueCutoff = 0.1, 
                readable = TRUE)
dim(data.frame(ego))#19 9 | 0 9 if including age in the model
#NB if we use entrez gene ids we get no significant hits; if we use ensemble we do..
cluster_summary_GHneg <- data.frame(ego)#Note that most of these are protein coding ribosomal proteins that remain after filtering in the pipeline
cluster_summary_GHneg[1,]

write.csv(cluster_summary_GHneg, "clusterProfiler_enrichGO_vWQW_GHneg_HUE_vs_HUU_outlier_removed.csv")

#
#GENE SET ENRICHMENT GO
head(sig_entrez_GHneg)
GHneg_sorted <- sort(sig_entrez_GHneg, decreasing = TRUE)
head(GHneg_sorted)
set.seed(1234)
gse_GO <- gseGO(gene = GHneg_sorted,#named FC list
                          OrgDb = org.Hs.eg.db, 
                          ont = "BP", 
                          pAdjustMethod = "BH", 
                          pvalueCutoff = 0.05,
                          nPerm = 10000,
                           seed=TRUE )

dim(gse_GO)#321 | 0 if including age in the model
head(gse_GO)
write.csv(gse_GO, "clusterProfiler_GO_GSEA_vWQW_GHneg_HUE_vs_HUU_outlier_removed.csv")
#----------
#what happens if we use ensemble gene IDs instead of entrez gene IDs
sig_res_ens_GHneg <- res_entrez_GHneg_spia[which(res_entrez_GHneg_spia$P.Value < 0.05), ]
sig_ens_GHneg <- as.numeric(sig_res_ens_GHneg$logFC)
names(sig_ens_GHneg) <- sig_res_ens_GHneg$ensgene
head(sig_ens_GHneg)
GHneg_sorted <- sort(sig_ens_GHneg, decreasing = TRUE)
head(GHneg_sorted)
set.seed(1234)
gse_GO_ens <- gseGO(gene = GHneg_sorted,#named FC list
                OrgDb = org.Hs.eg.db, 
                ont = "BP", 
                keyType = "ENSEMBL",
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05,
                nPerm = 10000,
                seed=TRUE )

dim(gse_GO_ens)#296 | 0 if including age
head(gse_GO_ens)
str(gse_GO_ens)
gse_GO_ens <- data.frame(gse_GO_ens)
gse_GO <- data.frame(gse_GO)
length(intersect(rownames(gse_GO_ens),rownames(gse_GO)))#285 so very similar between entrez vs. ensemble
temp <- setdiff(rownames(gse_GO),rownames(gse_GO_ens))
gse_GO[temp,]
write.csv(gse_GO_ens, "clusterProfiler_GO_GSEA_vWQW_GHneg_HUE_vs_HUU_outlier_removed_ensembleIDs.csv")

#
#
kk <- enrichKEGG(gene = as.character(sig_res_entrez_GHneg$entrez),
                 universe = as.character(res_entrez_GHneg$entrez),
                 organism  = 'hsa',
                 pvalueCutoff = 0.05)
dim(kk)#1 9 | 0 if including age in the model
data.frame(kk)
#Note: ensemble gene IDs not supported for hsa
#
#---------------------
# GO enrichment, GHpos HEU vs. HUU
ego_GHpos <- enrichGO(gene = as.character(sig_res_entrez_GHpos$entrez),
                universe = as.character(res_entrez_GHpos$entrez),
                keyType = "ENTREZID",
                OrgDb = org.Hs.eg.db,
                ont = "BP",
                pAdjustMethod = "BH",
                qvalueCutoff = 0.1,
                readable = TRUE)
dim(ego_GHpos)#0 | 0
#different results when we use ENSEMBL vs ENTREZ - multimapping issue, creates artifact, stick to ENSEMBL
ego_GHpos <- enrichGO(gene = as.character(sig_res_entrez_GHpos$ensgene), 
                      universe = as.character(res_entrez_GHpos$ensgene),
                      keyType = "ENSEMBL",
                      OrgDb = org.Hs.eg.db, 
                      ont = "BP", 
                      pAdjustMethod = "BH", 
                      qvalueCutoff = 0.1, 
                      readable = TRUE)
#
cluster_summary_GHpos <- data.frame(ego_GHpos)
dim(cluster_summary_GHpos)#0 | 0
head(cluster_summary_GHpos)#

#
kk_pos <- enrichKEGG(gene = as.character(sig_res_entrez_GHpos$entrez),
                 universe = as.character(res_entrez_GHpos$entrez),
                 organism  = 'hsa',
                 pvalueCutoff = 0.05)
dim(kk_pos)#1 or 0 if including age in the model
#----
GHpos_sorted <- sort(sig_entrez_GHpos, decreasing = TRUE)
head(GHpos_sorted)
set.seed(1234)
gse_GO_pos <- gseGO(gene = GHpos_sorted,#named FC list
                OrgDb = org.Hs.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                nPerm = 10000,
                pvalueCutoff = 0.05,seed = TRUE)
head(gse_GO_pos)
dim(gse_GO_pos)#0 | 0 if including age in the model
#--------

#If we use ensemble instead
sig_res_ens_GHpos <- res_entrez_GHpos_spia[which(res_entrez_GHpos_spia$P.Value < 0.05), ]
sig_ens_GHpos <- as.numeric(sig_res_ens_GHpos$logFC)
names(sig_ens_GHpos) <- sig_res_ens_GHpos$ensgene
head(sig_ens_GHpos)
GHpos_sorted <- sort(sig_ens_GHpos, decreasing = TRUE)
head(GHpos_sorted)
set.seed(1234)
gse_GO_ens <- gseGO(gene = GHpos_sorted,#named FC list
                    OrgDb = org.Hs.eg.db, 
                    ont = "BP", 
                    keyType = "ENSEMBL",
                    pAdjustMethod = "BH", 
                    pvalueCutoff = 0.05,
                    nPerm = 10000,
                    seed=TRUE )
dim(gse_GO_ens)#0
#--------
#GSEA KEGG
GHpos_sorted <- sort(sig_entrez_GHpos, decreasing = TRUE)
set.seed(1234)
gse_k_pos <- gseKEGG(gene = GHpos_sorted,#named FC list
                    organism = "hsa",
                    pAdjustMethod = "BH", 
                    pvalueCutoff = 0.05, seed = TRUE,nPerm = 10000)
head(gse_k_pos)
dim(gse_k_pos)#0 | 0 if including age in the model


#------------------------------
#On recommendation from Susan Holmes, co-author and statistician at Stanford, we will also use the
#R package Bionet
library(BioNet)
library(DLBCL)
library(ALL)
data("dataLym")
data("interactome")
data(exprLym)
data(ALL)
interactome
# A graphNEL graph with undirected edges
# Number of Nodes = 9386 
# Number of Edges = 36504
head(dataLym)
pvals <- cbind(t = dataLym$t.pval, s = dataLym$s.pval)
rownames(pvals) <- dataLym$label
pval <- aggrPvals(pvals, order = 2, plot = FALSE)
head(pvals)
dim(pvals)
#3583
subnet <- subNetwork(dataLym$label, interactome)
subnet <- rmSelfLoops(subnet)
subnet
#--------------
#remove duplicates
use <- res_entrez_GHneg[!duplicated(res_entrez_GHneg$genes),]
dim(use)#10069
head(use)
rownames(use) <- use$genes
#Now make list of interactome nodes minus node number to match to gene symbol in use
interactome_symbols <- unlist(nodes(interactome))
head(interactome_symbols)
inter_df <- cbind(interactome_symbols,interactome_symbols)
head(inter_df)
colnames(inter_df)[2] <- c("nodes")
inter_df[,1] <- gsub("\\(.*","",inter_df[,1])
head(inter_df)
rownames(inter_df) <- inter_df[,1]
find_nodes <- intersect(rownames(use),rownames(inter_df))
length(find_nodes)#4910
# find_nodes <- lapply(use$genes,function(x) nodes(interactome)[grep(x,nodes(interactome))])
head(find_nodes)
use <- use[find_nodes,]
dim(use)#4910
use$node <- inter_df[rownames(use),"nodes"]
head(use)
network <- subNetwork(use$node, interactome)
network
# A graphNEL graph with undirected edges
# Number of Nodes = 4910 
# Number of Edges = 16584 
#---------------------------
#Repeat for GH pos dataset
use_pos <- res_entrez_GHpos[!duplicated(res_entrez_GHpos$genes),]
dim(use_pos)#8562
head(use_pos)
rownames(use_pos) <- use_pos$genes
#Now make list of interactome nodes minus node number to match to gene symbol in use
find_nodes <- intersect(rownames(use_pos),rownames(inter_df))
length(find_nodes)#4390
head(find_nodes)
use_pos <- use_pos[find_nodes,]
dim(use_pos)#4390
use_pos$node <- inter_df[rownames(use_pos),"nodes"]
head(use_pos)
network_pos <- subNetwork(use_pos$node, interactome)
network
# A graphNEL graph with undirected edges
# Number of Nodes = 4910 
# Number of Edges = 16584 

length(intersect(nodes(network),nodes(network_pos)))
#4327
pos_pval <- data.frame(rownames(use_pos),use_pos[,"P.Value"])
head(pos_pval)
pos_pval <- data.frame(pos_pval[,2])
rownames(pos_pval) <- use_pos[,"node"]
colnames(pos_pval) <- "p_value"
pos_pval <- as.vector(pos_pval)
pos_pval <- as.numeric(as.character(unlist(pos_pval)))
names(pos_pval) <- use_pos[,"node"]
fb <- fitBumModel(pos_pval, plot = TRUE)
scores <- scoreNodes(network = network_pos, fb = fb, fdr = 0.05)
scores
max(scores)
min(scores)
hotSub  =  runFastHeinz(network_pos, scores)
hotSub
#---------------------------
#Make dataframe of corresponding p-values
common <- intersect(rownames(use),rownames(use_pos))
length(common)#4327
pvals <- cbind(use[common,"P.Value"],use_pos[common,"P.Value"])
head(pvals)
colnames(pvals) <- c("GHneg_pval","GHpos_pval")
rownames(pvals) <- use[common,"node"]
pvals <- data.frame(pvals)
pvals[,1] <- as.numeric(as.character(unlist(pvals[,1])))
pvals[,2] <- as.numeric(as.character(unlist(pvals[,2])))
str(pvals)
head(pvals)
pval <- aggrPvals(as.matrix(pvals), order = 2, plot = FALSE)
head(pval)
network_joint <- subNetwork(rownames(pvals), interactome)
network_joint
# A graphNEL graph with undirected edges
# Number of Nodes = 4327 
# Number of Edges = 14013
# Now we can use the aggregated p-values to fit the Beta-uniform mixture model
# to the distribution. The following plot shows the fitted Beta-uniform mixture
# model in a histogram of the p-values.
# fb <- fitBumModel(pval, plot = TRUE)
# fb
# scores <- scoreNodes(network = network_joint, fb = fb, fdr = 0.001)
# # In the next step the network with the scores and edges is written to a file and
# # the heinz algorithm is used to calculate the maximum-scoring subnetwork. In
# # order to run heinz self-loops have to be removed from the network
# network_2 <- largestComp(network_joint)
# network_2 <- rmSelfLoops(network_2)
# network_2
# # A graphNEL graph with undirected edges
# # Number of Nodes = 3709 
# # Number of Edges = 12929 
# writeHeinzEdges(network = network_2, file = "GHpos_neg_edges_001",use.score = FALSE)
# writeHeinzNodes(network = network_2, file = "GHpos_neg_nodes_001",node.scores = scores)
#/////////////////////////

el <- cbind(c("a", "b", "c", "d", "e", "f", "d"), c("b", "c", "d", "e", "f", "a", "b"))
graph <- graph.edgelist(el, directed=TRUE)

node.list <- c("a", "b", "c")
graph2 <- subNetwork(nodeList=node.list, network=graph)
data(colorectalcancer)
str(DE_Colorectal)
head(DE_Colorectal)

A <- c(9,5)
B <- c(5,9)
C <- c(7,8)
D <- data.frame(A,B,C)
D
chisq.test(t(D))

entrez_test <- c("84872","554313","121504","8370","8368","8366","8367","8365","8364","8362","8363","8361","8360","8359","8294")
temp <- grch37[grch37$entrez %in% entrez_test,]
unique(temp$description)
entrez_test <- c("10553","127428","55756","9869","5990","339318","554313","121504",
                 "8370","8368","8366","8367","8365","8364","8362","8363","79724","6941","3280","27005","57585",
                 "51360","1277","653361","339344","2334","126375","54971","22976","9400","51003","11267","10623",
                 "2302","91120","7733","86","58486","9139","92105","84503")
        
temp <- sig_res_entrez_GHpos[sig_res_entrez_GHpos$entrez %in% entrez_test,]
unique(temp$description)
na.omit(grch37[grch37$entrez == "8370",])