#RNAseqs differential expression testing using edgeR
#Set the working directory to where your input files are located:
#V4 specific: rerun with voomWithQualityWeights (given variable RIN scores) AND advice from Susan Holmes 
#Note that it is not possible to include infant ID in the model (two samples per infant) -  not enough degrees of
#freedom. I therefore analysed GHneg and GHpos samples seperately (combining birth and W15 sample) to test HEU vs. HUU
# setwd("/Users/katielennard/Documents/Academic/IDM_projects/Gray/")
setwd("/Users/katielennard/Documents/Katie/IDM_projects/Gray")

#Load the relevant packages, install as necessary with install.packages() or Bioconductor 'BiocManager'
library(edgeR)
library(NMF)
library(biomaRt)#gene annotation
library(ggfortify)#PCA plot
library(magrittr)#%>%
library(dplyr)
library(tibble)  # for `rownames_to_column` and `column_to_rownames`
library(statmod)#for estimateDisp with robust=TRUE
library(GO.db)
library(org.Hs.eg.db)
library(gplots)
library(vegan)#permanova (adonis)
library(ggplot2)
#import counts:
Rcounts <- read.csv(paste0(getwd(),"/RNAseq_Ilifu_run/merged_gene_counts.txt"), sep = "\t", row.names = 1)
dim(Rcounts)#63677 21
head(Rcounts)
#Clean sample names:
colnames(Rcounts)
colnames(Rcounts) <- sub(".fwAligned.sortedByCoord.out.bam","",colnames(Rcounts))
colnames(Rcounts)
colnames(Rcounts) <- sapply(strsplit(colnames(Rcounts), "_S"), `[`, 1)
colnames(Rcounts)
colnames(Rcounts) <- sub("X","",colnames(Rcounts))

colnames(Rcounts) <- sub("\\.","-",colnames(Rcounts))
colnames(Rcounts) <- sub("\\.","-",colnames(Rcounts))
colnames(Rcounts)#clean
#Find empty rows
RS <- sort(rowSums(Rcounts[,-1]))
head(RS)
length(which(RS==0))#remove 10136 transcripts with only zero values
#Remove empty rows
Rcounts <- Rcounts[-which(rowSums(Rcounts[,-1])==0),]
dim(Rcounts)#23034
# write.csv2(Rcounts, file = "Sonwabile_Rcounts_unfiltered.csv")
#import sample metadata
meta <- read.csv2(paste0(getwd(),"/from_sonwabile/mdata.csv"), sep =  ";", row.names = 1)
#import RIN scores
RINs <- read.csv2("RINs_collated_from_QC_reports-KSL_forR.csv", row.names = 1, stringsAsFactors = F)
identical(rownames(meta),rownames(RINs))#TRUE
RINs
meta$RIN <- as.numeric(RINs$RIN)
#check that sampleIDs overlap between metadata and counts table
all(rownames(meta) %in% colnames(Rcounts))#TRUE
head(meta)
length(unique(meta$Infant_ID))#only 8 infants, with two different time points
table(meta$gut_homing)
# Neg Pos 
# 10  10 
table(meta$Time)
# birth week15 
# 10     10 
#Make sure sample order is same between sev_meta and counts
tmp <- Rcounts[,rownames(meta)]
Rcounts2 <- data.frame(Rcounts[,1],tmp, check.names = F)
colnames(Rcounts2)[1] <- "gene"
identical(rownames(meta),colnames(Rcounts2)[-1])#TRUE
#santity check
Rcounts[1:10,"20028R-01-03"]
Rcounts2[1:10,"20028R-01-03"]
#same
#---------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------
#Make DGElist object for edgeR use
d <- DGEList(counts=as.matrix(Rcounts2[,-1]),genes = Rcounts2[,1], group=factor(meta$gut_homing))
d
#-------------------EXPLORATORY QC----------------------------------------------------
df <- data.frame(t(Rcounts2[,-1]),meta)
df_counts <- df[,1:23034]
# pca_res <- prcomp(df_counts, scale. = TRUE)
# pdf("MDS_hiv_age.pdf", 4,3)
# autoplot(pca_res, data = df, colour = 'hiv', shape = 'Time')+theme_light()
# dev.off()
# #
# pdf("MDS_gut_homing_age.pdf", 4,3)
# autoplot(pca_res, data = df, colour = 'gut_homing', shape = 'Time')+theme_light()
# dev.off()
#

#Two outliers  - both from the same infant C-0088-B (20028R-01-03 and 20028R-01-04); both have significantly lower counts than the rest of the samples (poorer assignment)
sampleQC <- data.frame(colSums(d$counts), meta$gut_homing) 
sampleQC %>%
  rownames_to_column('sample') %>%
  arrange(.,colSums.d.counts.) %>%
  column_to_rownames('sample')
# colSums.d.counts. meta.gut_homing
# 20028R-01-04           3509430             Neg
# 20028R-01-03           7766737             Pos
# 20028R-01-09          12848739             Pos
# 20028R-01-19          15671557             Pos
# 20028R-01-37          16760425             Pos
# 20028R-01-21          16877973             Pos
# 20028R-01-11          19001689             Pos
# 20028R-01-10          20317721             Neg
# 20028R-01-17          20602737             Pos
# 20028R-01-18          20649613             Neg
# 20028R-01-08          21137498             Neg
# 20028R-01-38          21677469             Neg
# 20028R-01-22          22828992             Neg
# 20028R-01-06          24094352             Neg
# 20028R-01-05          25158132             Pos
# 20028R-01-39          26705488             Pos
# 20028R-01-12          26784120             Neg
# 20028R-01-40          27251114             Neg
# 20028R-01-07          27316135             Pos
# 20028R-01-20          28427178             Neg
#---------------------------------------------------------
#Remove outliers and redo MDS plot:
grep("20028R-01-04",colnames(Rcounts2))#3
grep("20028R-01-03",colnames(Rcounts2))#2
grep("20028R-01-04",rownames(meta))#2
grep("20028R-01-03",rownames(meta))#1
meta.sub <- meta[-(1:2),]
df <- data.frame(t(Rcounts2[,-(1:3)]),meta.sub)
df_counts <- df[,1:23034]
df_counts <- df_counts[,-which(colSums(df_counts)==0)]
dim(df_counts)#18 22702
#NB, when we remove those two outlier samples there are again genes with only zeros across all remaining samples:
pca_res <- prcomp(df_counts, scale. = TRUE)
# pdf("MDS_age_HIVstatus_outliers_removed.pdf",4,3)
# autoplot(pca_res, data = df, colour = 'hiv', shape = 'Time')+theme_light()
# dev.off()
# pdf("MDS_age_gut_homing_outliers_removed.pdf",4,3)
# autoplot(pca_res, data = df, colour = 'gut_homing', shape = 'Time')+theme_light()
# dev.off()
#-------------------EXPLORATORY QC----------------------------------------------------

#---------------------GENE ANNOTATION-----------------------------------------
#Lets start by filtering out any remaining rRNA in this dataset
# mart = useMart('ensembl')
# # list all the ensembl database of organisms
# listDatasets(mart)  
# #choose database of your interest ; in this case its "cfamiliaris_gene_ensembl" I guess
# ensembl = useMart( "ensembl", dataset = "hsapiens_gene_ensembl" , host = "grch37.ensembl.org")  
# # choose attributes of your interest
# listAttributes(ensembl)
# gene <- getBM( attributes = c("ensembl_gene_id","external_gene_name","description", "percentage_gene_gc_content"),values = rownames(Rcounts2),mart = ensembl)  
# head(gene)
# rownames(gene) <- gene$ensembl_gene_id
# dim(gene)#63677 for gchr37
# length(intersect(rownames(gene), rownames(Rcounts2)))#53541 for gchr37 (USED)
# saveRDS(gene,"severin_ensembl_gchr37_annotation.rds")
gene <- readRDS("severin_ensembl_gchr37_annotation.rds")#to reload
#---------------------GENE ANNOTATION------------------------------------------------

#---------------------DESIGN MATRIX--------------------------------------------------
#COMPARISONS TO BE MADE:
# Treg pos HEU vs HUU birth 
# Treg neg HEU vs HUU birth
# Treg pos HEU vs HUU 15 weeks
# Treg neg HEU vs HUU 15 weeks
# HEU vs HUU birth
# HEU vs HUU 15 weeks
# 
# birth vs 15 weeks HUU
# birth vs 15 weeks HEU

#However, for most of these the group sizes are N=2: see table below
#------
#Add unique group specifier to meta so we can use contrasts
meta.sub$Group <- rep(NA,nrow(meta.sub))
meta.sub$Group[meta.sub$Time=="week15" & meta.sub$gut_homing=="Pos" & meta.sub$hiv == "HUU"]   <- "w15_GHpos_HUU"  
meta.sub$Group[meta.sub$Time=="week15" & meta.sub$gut_homing=="Pos" & meta.sub$hiv == "HEU"]   <- "w15_GHpos_HEU" 
meta.sub$Group[meta.sub$Time=="week15" & meta.sub$gut_homing=="Neg" & meta.sub$hiv == "HUU"]   <- "w15_GHneg_HUU"     
meta.sub$Group[meta.sub$Time=="week15" & meta.sub$gut_homing=="Neg" & meta.sub$hiv == "HEU"]   <- "w15_GHneg_HEU"   
meta.sub$Group[meta.sub$Time=="birth" & meta.sub$gut_homing=="Pos" & meta.sub$hiv == "HUU"]   <- "b_GHpos_HUU"     
meta.sub$Group[meta.sub$Time=="birth" & meta.sub$gut_homing=="Pos" & meta.sub$hiv == "HEU"]   <- "b_GHpos_HEU"  
meta.sub$Group[meta.sub$Time=="birth" & meta.sub$gut_homing=="Neg" & meta.sub$hiv == "HUU"]   <- "b_GHneg_HUU"   
meta.sub$Group[meta.sub$Time=="birth" & meta.sub$gut_homing=="Neg" & meta.sub$hiv == "HEU"]   <- "b_GHneg_HEU"  
meta.sub$Group <- as.factor(meta.sub$Group)

table(meta.sub$Group)
# b_GHneg_HEU   b_GHneg_HUU   b_GHpos_HEU   b_GHpos_HUU w15_GHneg_HEU w15_GHneg_HUU w15_GHpos_HEU w15_GHpos_HUU 
# 2             2             2             2             3             2             3             2 
#------------------------------------------------------------------------------------
#HEATMAP ALL samples including outliers
d <- DGEList(counts=as.matrix(Rcounts2[,-1]),genes = Rcounts2[,1], group=factor(meta$hiv))
y <- calcNormFactors(d,log=TRUE)
logCPM <- cpm(y, prior.count=2, log=TRUE)#prior.count is to avoid undefined values and to reduce
norm_counts <- data.frame(logCPM)
# t.track <- meta[,c(3:6)]
# pdf("heatmap_log2_CPM.pdf")#From this heatmap we see the effect of RIN score on the profiles with higher RIN samples having more comprehensive gene coverage
# aheatmap(norm_counts, annCol = t.track)
# dev.off()
#--------
#Filter by expression
# design <- model.matrix(~0+hiv, data=meta)#force intercept through zero so as not to select any one group as 'reference'
# keep <- filterByExpr(d, design = design, group = factor(meta$hiv), min.count = 10)
# length(which(keep==TRUE))#10493
# y <- d[keep, , keep.lib.sizes=FALSE]
# logCPM <- cpm(y, prior.count=2, log=TRUE)
# norm_counts <- data.frame(logCPM)
# t.track <- meta[,c(3:6)]
# pdf("heatmap_log2_CPM_filtered.pdf")#From this heatmap we see the effect of RIN score on the profiles with higher RIN samples having more comprehensive gene coverage
# aheatmap(norm_counts, annCol = t.track)
# dev.off()
#
#+++++++++++++++++++++++++++++++++++++++
#+++++++++++++++++++++++++++++++++++++++
#voomwithqualityweights differential expression testing section
#Note that there are not enough samples (degrees of freedom to include infant ID in the model so we have to split by gut homing
#status, because we cant include two samples from one infant in the model)
dim(meta)#20
#All gut homing negative samples together, irrespective of age
GHneg <- meta.sub[grep("neg",meta.sub$Group),]
table(GHneg$Group)
table(GHneg$hiv)
# HEU HUU 
# 5   4 
#---------------
#---------------
GHpos <- meta.sub[grep("pos",meta.sub$Group),]
table(GHpos$hiv)
# HEU HUU 
# 5   4 
#----------------
df_counts <- t(df_counts)#2 outlier removed before
head(df_counts)
dim(df_counts)
gene_symbols <- gene[rownames(df_counts),]
head(gene_symbols)
identical(rownames(meta.sub),colnames(df_counts))#T

GHneg_counts <- df_counts[,rownames(GHneg)]
dim(GHneg_counts)#22702 9
GHpos_counts <- df_counts[,rownames(GHpos)]
dim(GHpos_counts)#22702 9
#-------------------------
#Faceted NMDS plots of GH neg and GH pos 
# MDS with Bray-Curtis instead of PCA Euclidean
colour_palette <- c("red","blue")
set.seed(2)
nmds <- metaMDS(t(df_counts),k=2, trymax = 500)
stressplot(nmds)
data.scores <- as.data.frame(scores(nmds))  #Using the scores function from vegan to extract the sample scores and convert to a data.frame
data.scores$samples <- rownames(data.scores)  #create a column of site names, from the rownames of data.scores
data.scores[,"HIV"] <- meta.sub[,"hiv"]
data.scores[,"Age"] <- meta.sub[,"Time"]
data.scores[,"gut_homing"] <- meta.sub[,"gut_homing"]
title = c("NMDS by gut homing status")
colour = data.scores$HIV
shape = data.scores$Age
c.lab = "HIV exposure"
s.lab = "Age"
p <- ggplot() + 
  geom_point(data=data.scores,aes(x=NMDS1,y=NMDS2,shape=shape,colour=colour),size=1.5) + # add the point markers
  scale_colour_manual(values=colour_palette)+
  #NB - EDIT TITLE AS NEEDED
  ggtitle(title)+
  coord_equal() +
  theme_bw() + 
  labs(color=c.lab, shape = s.lab)+
  theme(plot.title = element_text(hjust = 0.5))+
theme(panel.background = element_blank(), 
      panel.grid.major = element_blank(),  #remove major-grid labels
      panel.grid.minor = element_blank(),  #remove minor-grid labels
      plot.background = element_blank())
p <- p + facet_grid(cols = vars(gut_homing))
ggsave(p,filename = "NMDS_bray_curtis_age_facet_HIV-col",device = "tiff",width = 7, height = 7)
tiff("NMDS_bray_curtis_age_facet_HIV-col", unit = "cm", width = 10,height = 10, res = 300)
p
dev.off()
#---------------------------------
#GHneg
#---------------------------------
d <- DGEList(counts=as.matrix(GHneg_counts),genes = gene_symbols$external_gene_name, group=factor(GHneg$hiv))
keep <- filterByExpr(d, group=factor(GHneg$hiv))
y <- d[keep, , keep.lib.sizes=FALSE]#filter option
y <- edgeR::calcNormFactors(y)
print(table(keep))
# keep
# FALSE  TRUE 
# 10975 11727
#PERMANOVA:
grouping <- GHneg$hiv
diss1 <- vegdist(t(y$counts),method = "bray")#
betadine <- betadisper(diss1, group=grouping)#Where ‘diss1’ is your distance matrix
anova(betadine)#If NS then adonis meets assumptions of homogeneous dispersion - not sure what group to specify if you're testing multiple effects with adonis though..
#P=0.6
counts <- cpm(y, log=FALSE)#prior.count is to avoid undefined values and to reduce 
#I think it is better to use Bray here because this data is a bit more like 16S in that there are many zero entries (sparsity related to low coverage & low RINs)
#Adonis 2 where order does not matter and we look at marginal probabilities, each time conrolling for all other factors
set.seed(6)
a1=adonis2(t(counts) ~  hiv+Time+RIN , data = GHneg, method="bray", by="margin")
a1#NB note: in the version 1 script I didn't use a start seed so results vary slightly from below
# Df SumOfSqs      R2      F Pr(>F)
# hiv       1 0.022304 0.11978 0.9502  0.546
# Time      1 0.019608 0.10530 0.8354  0.661
# RIN       1 0.020222 0.10860 0.8615  0.647
# Residual  5 0.117363 0.63027              
# Total     8 0.186211 1.00000    
#-------
#Final model selected for publication:
#-------
set.seed(6)
a2=adonis(t(counts) ~  Time+hiv+RIN , data = GHneg, method="bray")
a2
# Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
# Time       1  0.026064 0.026065 1.11042 0.13997  0.320
# hiv        1  0.022561 0.022561 0.96118 0.12116  0.522
# RIN        1  0.020222 0.020222 0.86153 0.10860  0.631
# Residuals  5  0.117363 0.023473         0.63027       
# Total      8  0.186211                  1.00000 
#-------
set.seed(6)
a3=adonis(t(counts) ~  hiv+Time+RIN , data = GHneg, method="bray")
a3
# Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
# hiv        1  0.022982 0.022982 0.97909 0.12342  0.501
# Time       1  0.025644 0.025644 1.09252 0.13772  0.338
# RIN        1  0.020222 0.020222 0.86153 0.10860  0.647
# Residuals  5  0.117363 0.023473         0.63027       
# Total      8  0.186211                  1.00000        
# ---
set.seed(6)
a4=adonis(t(counts) ~  hiv+Time+RIN, data = GHneg, method="bray")
a4
# Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
# hiv        1  0.022982 0.022982 0.97909 0.12342  0.501
# Time       1  0.025644 0.025644 1.09252 0.13772  0.338
# RIN        1  0.020222 0.020222 0.86153 0.10860  0.647
# Residuals  5  0.117363 0.023473         0.63027       
# Total      8  0.186211                  1.00000    
set.seed(6)
a5=adonis(t(counts) ~  RIN+hiv+Time, data = GHneg, method="bray")
a5
# Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
# RIN        1  0.026265 0.026265 1.11895 0.14105  0.323
# hiv        1  0.022975 0.022975 0.97881 0.12338  0.501
# Time       1  0.019608 0.019608 0.83538 0.10530  0.661
# Residuals  5  0.117363 0.023473         0.63027       
# Total      8  0.186211                  1.00000  
#-----
#Specify design
# design <- model.matrix(~0+hiv,data=GHneg)
GHneg$Group <- as.factor(as.character(unlist(GHneg$Group)))
design <- model.matrix(~0+Group,data=GHneg)#including age covariate
colnames(design) <- sub("Group","",colnames(design))
# design <- model.matrix(~0+hiv,data=GHneg)#hiv only

v <- voom(y, design = design)
aw <- arrayWeights(v, design = design, method = "genebygene", maxiter = 50, 
                   tol = 1e-05)
aw
# 1         2         3         4         5         6         7         8         9 
# 0.3658284 2.0283134 0.5525365 0.6917837 1.0215903 2.2208566 1.8537101 1.9753257 0.4244029 
qual <- data.frame(arrayweight = aw,RIN = GHneg$RIN, reads = y$samples$lib.size, edgeR_norm_factor = y$samples$norm.factors)
rownames(qual) <-rownames(y$samples)
write.csv(qual, file = "sample_quality_metrics_GHneg_minus_outlier_samples.csv", row.names=T)
pdf("GHneg+quality_metrics_heatmap.pdf",7,5)
aheatmap(t(qual), scale = "row", cexRow = 0.25)
dev.off()
VQW <- voomWithQualityWeights(y, design=design, normalize.method="none", 
                              plot=TRUE,method="genebygene")
saveRDS(VQW,file = "VQW_counts_GHneg_outliers_removed.rds")#hiv + age in design
saveRDS(VQW,file = "VQW_counts_GHneg_hiv_design_outliers_removed.rds")

pdf("GHnegexpression_boxplots_after_voomWQW_and_filtering.pdf",10,7)
par(mar = c(10,5,5,5))
boxplot(VQW$E, las = 2)
dev.off()

vfit <- lmFit(VQW, design)
contr.matrix = makeContrasts(#See https://www.biostars.org/p/435540/ for more on complex contrasts
  hiv_exposure=(b_GHneg_HEU+w15_GHneg_HEU)-(b_GHneg_HUU+w15_GHneg_HUU),
  levels = colnames(design))
#or HIV only
# contr.matrix = makeContrasts(#See https://www.biostars.org/p/435540/ for more on complex contrasts
#   hiv_exposure=hivHEU-hivHUU,
#   levels = colnames(design))

vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)#empirical Bayes moderation is carried out by borrowing information across all the genes to obtain more precise estimates of gene-wise variability
plotSA(efit, main="Final model: Mean-variance trend")
summary(decideTests(efit,p.value = 0.05))#GHneg
# hiv_exposure
# Down              0
# NotSig        11727
# Up                0
hivE <- topTreat(efit, n=Inf)#
head(hivE)#GHneg
# genes       logFC      AveExpr         t      P.Value adj.P.Val         B
# ENSG00000112137 PHACTR1 -17.9849427  0.966167983 -5.092831 0.0001298288 0.7592297 -4.591433
# ENSG00000211459 MT-RNR1  -0.9660598 11.481368608 -4.855067 0.0002063292 0.7592297 -3.838716
# ENSG00000169554    ZEB2  -7.5123294  4.725513983 -4.327066 0.0005902181 0.7592297 -4.578196
# ENSG00000124491   F13A1 -13.9912460  0.001734975 -4.256654 0.0006803949 0.7592297 -4.591905
# ENSG00000109971   HSPA8   1.6185846 10.076426276  4.239249 0.0007047769 0.7592297 -4.295128
# ENSG00000152240   HAUS1  17.2087789 -0.532570425  4.200540 0.0007622495 0.7592297 -4.593537
write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHneg_hivE_age_full_table.csv", row.names = T)

#Or with only HIV, ignoring age:
# genes     logFC      AveExpr         t      P.Value adj.P.Val          B
# ENSG00000140564   FURIN -1.992993  6.657183143 -7.008480 9.842485e-06 0.1154228  3.8148108
# ENSG00000107263 RAPGEF1 -1.222185  7.914476555 -6.214754 3.316827e-05 0.1491656  2.4745489
# ENSG00000124491   F13A1 -8.534395  0.001734975 -5.949393 5.075220e-05 0.1491656 -0.6556639
# ENSG00000229164    TRAC  1.227566  8.556926867  5.474973 1.112645e-04 0.1491656  1.1988014
# ENSG00000158427 TMSB15B  8.261919 -0.938983494  5.408162 1.245837e-04 0.1491656 -1.7374527
# ENSG00000170525  PFKFB3 -1.478686  7.613763293 -5.393902 1.276370e-04 0.1491656  1.1428482
write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHneg_hivE_full_table.csv", row.names = T)

#-----------------------------------
#---------------------------------
#OR GHpos
d <- DGEList(counts=as.matrix(GHpos_counts),genes = gene_symbols$external_gene_name, group=factor(GHpos$hiv))
keep <- filterByExpr(d, group=factor(GHpos$hiv))
print(table(keep))
# keep
# FALSE  TRUE 
# 13205  9497 
#
y <- d[keep, , keep.lib.sizes=FALSE]#filter option
y <- edgeR::calcNormFactors(y)
counts <- cpm(y, log=FALSE)
#---
diss1 <- vegdist(t(y$counts),method = "bray")#
betadine <- betadisper(diss1, group=grouping)#Where ‘diss1’ is your distance matrix
anova(betadine)#p=0.3
#---
set.seed(6)
a1=adonis2(t(counts) ~  hiv+Time+RIN , data = GHpos, method="bray", by="margin")
a1
# Df SumOfSqs      R2      F Pr(>F)  
# hiv       1  0.03395 0.10309 0.9748  0.398  
# Time      1  0.04403 0.13370 1.2642  0.134  
# RIN       1  0.06804 0.20661 1.9536  0.073 .
# Residual  5  0.17413 0.52877                
# Total     8  0.32930 1.00000   
set.seed(6)
a2=adonis(t(counts) ~  Time+hiv+RIN , data = GHpos, method="bray")
a2
# Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)  
# Time       1   0.04303 0.043034  1.2357 0.13068  0.103  
# hiv        1   0.04411 0.044106  1.2665 0.13394  0.096 .
# RIN        1   0.06804 0.068037  1.9536 0.20661  0.073 .
# Residuals  5   0.17413 0.034825         0.52877         
# Total      8   0.32930                  1.00000  
set.seed(6)
a2=adonis(t(counts) ~  RIN+hiv+Time , data = GHpos, method="bray")
a2
# Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)  
# RIN        1   0.07556 0.075564  2.1698 0.22947  0.084 .
# hiv        1   0.03559 0.035586  1.0218 0.10806  0.354  
# Time       1   0.04403 0.044026  1.2642 0.13370  0.134  
# Residuals  5   0.17413 0.034825         0.52877         
# Total      8   0.32930                  1.00000
set.seed(6)
a4=adonis2(t(counts) ~  hiv+Time+RIN , data = GHpos, method="bray")
a4
# adonis2(formula = t(counts) ~ hiv + Time + RIN, data = GHpos, method = "bray")
# Df SumOfSqs      R2      F Pr(>F)  
# hiv       1  0.04690 0.14241 1.3466  0.047 *
#   Time      1  0.04024 0.12220 1.1555  0.242  
# RIN       1  0.06804 0.20661 1.9536  0.073 .
# Residual  5  0.17413 0.52877                
# Total     8  0.32930 1.00000                
# ---
#   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------
# design <- model.matrix(~0+hiv,data=GHpos)
GHpos$Group <- as.factor(as.character(unlist(GHpos$Group)))
design <- model.matrix(~0+Group,data=GHpos)#including age covariate
colnames(design) <- sub("Group","",colnames(design))
#---------
#quality factors profile
v <- voom(y, design = design)
aw <- arrayWeights(v, design = design, method = "genebygene", maxiter = 50, 
                   tol = 1e-05)
aw#this is for design with HIV exposure and age
# 1         2         3         4         5         6         7         8         9 
# 0.5742654 1.1690314 1.2991858 0.9834691 0.9380155 1.3745857 1.1495239 1.1630325 0.6762970
GHpos$RIN

#or GHpos
qual <- data.frame(arrayweight = aw,RIN = GHpos$RIN, reads = y$samples$lib.size, edgeR_norm_factor = y$samples$norm.factors)

rownames(qual) <-rownames(y$samples)
write.csv(qual, file = "sample_quality_metrics_GHpos_minus_outlier_samples.csv", row.names=T)

pdf("GHpos+quality_metrics_heatmap.pdf",7,5)
aheatmap(t(qual), scale = "row", cexRow = 0.25)
dev.off()
#----------
VQW <- voomWithQualityWeights(y, design=design, normalize.method="none", 
                              plot=TRUE,method="genebygene")

saveRDS(VQW,file = "VQW_counts_GHpos_outliers_removed.rds")#hiv + age design
saveRDS(VQW,file = "VQW_counts_GHpos_hiv_design_outliers_removed.rds")

pdf("GHposexpression_boxplots_after_voomWQW_and_filtering.pdf",10,7)
par(mar = c(10,5,5,5))
boxplot(VQW$E, las = 2)
dev.off()

# pdf("MDS_filtered_TMMnormalized_voomwithqualityweights.pdf", 10,10)#same because quality weights only incorporated in model?
# plotMDS(VQW$E, col=colors[meta$hiv], pch=pch[meta$Time], labels = rownames(y$samples))
# legend("topleft", legend=levels(meta$hiv), pch=pch, col=colors, ncol=2)
# dev.off()

vfit <- lmFit(VQW, design)
# contr.matrix = makeContrasts(#See https://www.biostars.org/p/435540/ for more on complex contrasts
#   hiv_exposure=hivHEU-hivHUU,
#   levels = colnames(design))
#OR 
contr.matrix = makeContrasts(#See https://www.biostars.org/p/435540/ for more on complex contrasts
  hiv_exposure=(b_GHpos_HEU+w15_GHpos_HEU)-(b_GHpos_HUU+w15_GHpos_HUU),
  levels = colnames(design))
#
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)#empirical Bayes moderation is carried out by borrowing information across all the genes to obtain more precise estimates of gene-wise variability
plotSA(efit, main="Final model: Mean-variance trend")
summary(decideTests(efit,p.value = 0.05))#GHneg
# hiv_exposure
# Down              0
# NotSig        11727
# Up                0
summary(decideTests(efit,p.value = 0.05))#GHpos
# hiv_exposure
# Down              0
# NotSig        11727
# Up                0
summary(decideTests(efit,p.value = 0.05))#(b_GHneg_HEU+w15_GHneg_HEU)-(b_GHneg_HUU+w15_GHneg_HUU)
# hiv_exposure
# Down              0
# NotSig         9497
# Up                0
summary(decideTests(efit,p.value = 0.05))#(b_GHpos_HEU+w15_GHpos_HEU)-(b_GHpos_HUU+w15_GHpos_HUU)
# hiv_exposure
# Down              0
# NotSig         9497
# Up                0
hivE <- topTreat(efit, n=Inf)#
head(hivE)#GHneg
head(hivE)#GHpos (with age)
# genes     logFC    AveExpr         t      P.Value adj.P.Val         B
# ENSG00000161921    CXCL16 -20.51375 -0.1260973 -5.579889 6.585667e-05 0.6254408 -2.955051
# ENSG00000106266      SNX8 -18.22758 -0.9661042 -4.918088 2.213090e-04 0.7553206 -3.290720
# ENSG00000188825 LINC00910 -18.29395 -1.3108051 -4.780268 2.868754e-04 0.7553206 -3.389920
# ENSG00000116586   LAMTOR2 -18.10961 -1.0850030 -4.544925 4.491482e-04 0.7553206 -3.456200
# ENSG00000138073      PREB  17.66504  0.3200658  4.473071 5.156697e-04 0.7553206 -3.469776
# ENSG00000140511    HAPLN3  12.54819  3.5169033  4.419684 5.716001e-04 0.7553206 -2.815706
head(hivE)#GHpos without age
# genes      logFC    AveExpr         t      P.Value adj.P.Val           B
# ENSG00000161921    CXCL16 -10.245448 -0.1260973 -5.898592 2.110782e-05 0.2004609  0.01865015
# ENSG00000106266      SNX8  -9.205317 -0.9661042 -5.186982 8.573379e-05 0.4071069 -0.85044926
# ENSG00000188825 LINC00910  -9.076590 -1.3108051 -4.902008 1.529126e-04 0.4299683 -1.20075994
# ENSG00000138073      PREB   8.890520  0.3200658  4.737566 2.143875e-04 0.4299683 -1.26928535
# ENSG00000109854   HTATIP2   8.645864 -0.2706302  4.566386 3.056367e-04 0.4299683 -1.54605008
# ENSG00000116586   LAMTOR2  -8.935969 -1.0850030 -4.554578 3.132365e-04 0.4299683 -1.49729011
#---------------------
hivE <- topTreat(efit, n=Inf)#
head(hivE)

#----------------------
# write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHneg_hivE_full_table.csv", row.names = T)
write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHneg_hivE_age_full_table.csv", row.names = T)
#OR
# write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHpos_hivE_full_table.csv", row.names = T)
write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHpos_hivE_age_full_table.csv", row.names = T)
#--------------
#VISUALISING PATHWAY ANALYSIS RESULTS (HIV + AGE SPIA, for selected pathways of interest)
#For GHneg, do heatmaps fo SPIA pathway results (of potential interest) that had unadjusted p-values < 0.05
#These include: TGF-B signalling pathway, carbohydrate digestion and absorption, wnt signalling pathway
#For GHpos: antigen processing and presentation
#--------
#including hiv and age in the model as specified above:
#GHpos:
antigen <- c("972","4801","1385","1514","3106","3134","3326")
#GHneg:
b_cell <- c("1147","7409","23533","5295","3635","3636","4773","5880","118788")
tgfb <- c("8454","4088","4609","5933","5516","9475","9241")
carbs <- c("476","481","2542","3098","23533","5295")
wnt  <- c("4088","4773","818","5880","9475","81839","4609","51176","8454","6907","5516","5567","4040")
# length(htlv1)#18
library(annotables)
grch37 <- data.frame(grch37)#
#for antigen processing
#
keep<- grch37[grch37$entrez %in% antigen,]
dim(keep)#18
length(unique(keep$symbol))#7
unique(keep$symbol)
# [1] "CD74"     "HSP90AB1" "CREB1"    "NFYB"     "CTSL"     "HLA-F"    "HLA-B"
#---
#OR b cell
keep<- grch37[grch37$entrez %in% b_cell,]
dim(keep)#9
length(unique(keep$symbol))#7
unique(keep$symbol)
# [1] "NFATC2"  "RAC2"    "PIK3R5"  "VAV1"    "PIK3R1"  "PIK3AP1" "INPPL1"  "INPP5D"  "CHUK"  
#
#---
#OR TGFB
keep<- grch37[grch37$entrez %in% tgfb,]
dim(keep)#7
length(unique(keep$symbol))#7
unique(keep$symbol)
# [1] "CUL1"   "RBL1"   "PPP2CB" "ROCK2"  "MYC"    "SMAD3"  "NOG"
#----
#OR carbs
keep<- grch37[grch37$entrez %in% carbs,]
dim(keep)#7 9
length(unique(keep$symbol))#6
unique(keep$symbol)
# [1] "SLC37A4" "PIK3R5"  "ATP1B1"  "PIK3R1"  "HK1"     "ATP1A1" 
#----
#----
#OR WNT signaling
keep<- grch37[grch37$entrez %in% wnt,]
dim(keep)#7 9
length(unique(keep$symbol))#6
unique(keep$symbol)
# [1] "CUL1"   "LRP6"   "NFATC2" "TBL1X"  "PPP2CB" "RAC2"   "ROCK2"  "MYC"    "LEF1"   "PRKACB"
# [11] "CAMK2G" "SMAD3"  "VANGL1"
#----
samples <- GHpos
#OR
samples <- GHneg
#sort by HEU status
samples <- samples %>% 
  rownames_to_column('ID') %>%
  arrange(.,hiv) %>%
  column_to_rownames('ID')
#load normalised counts
#GHneg, HIV+ age
#Note that these counts are based on the voom function which outputs:
# a numeric matrix of normalized expression values on the log2 scale (Transform count data to log2-counts per million (logCPM), estimate the mean-variance relationship and use this to compute appropriate observation-level weights. The data are then ready for linear modelling)
VQWneg <- readRDS(file = "VQW_counts_GHneg_outliers_removed.rds")
# VQWneg_simple <- readRDS(file = "VQW_counts_GHneg_hiv_design_outliers_removed.rds")
# identical(VQWneg_simple$E,VQWneg$E)#Same regardless of design
#GHpos, HIV+age
VQWpos <- readRDS(file = "VQW_counts_GHpos_outliers_removed.rds")
VQWpos$design

# find_ids <- rownames(VQWneg$genes)[VQWneg$genes$genes %in% t_cell]
# find_ids
# toplot <- VQWneg$E[find_ids,rownames(samples)]

#OR
#GHpos
find_ids <- rownames(VQWpos$genes)[VQWpos$genes$genes %in% unique(keep$symbol)]
toplot <- VQWpos$E[find_ids,rownames(samples)]
#---
#GHneg
#OR for b cell signaling
find_ids <- rownames(VQWneg$genes)[VQWneg$genes$genes %in% unique(keep$symbol)]
toplot <- VQWneg$E[find_ids,rownames(samples)]

rownames(toplot) <- gene[rownames(toplot),"external_gene_name"]
t.track <- samples[,c(4,5)]
colnames(t.track) <- c("HIV exposure","age")
rownames(t.track) <- rownames(samples)

# colnames(t.track) <- "HIV exposure"
cols <- c("HEU" = "red", "HUU" = "blue")
age.cols <- c("birth" = "darkolivegreen4","week15" = "deepskyblue4")
HEU.cols <- list("HIV exposure" = cols, "age" = age.cols)
#
tiff("Heatmap_GHneg_HEU_vs_HUU_antigen_presentation_ordered.tiff",6,4, units = "in", res =200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols,cluster_cols = FALSE)
dev.off()
tiff("Heatmap_GHneg_HEU_vs_HUU_antigen_presentation.tiff",6,4, units = "in", res =200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols)
dev.off()
#
tiff("Heatmap_GHneg_HEU_vs_HUU_b_cell_differentiation.tiff",6,4, units = "in", res =200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols)
dev.off()
tiff("Heatmap_GHneg_HEU_vs_HUU_b_cell_differentiation_ordered.tiff",6,4, units = "in", res =200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols, cluster_cols = FALSE)
dev.off()
#
tiff("Heatmap_GHneg_HEU_vs_HUU_tgfb_signaling.tiff",6,4, units = "in", res =200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols)
dev.off()
tiff("Heatmap_GHneg_HEU_vs_HUU_tgfb_signaling_ordered.tiff",6,4, units = "in", res =200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols, cluster_cols = FALSE)
dev.off()
tiff("Heatmap_GHneg_HEU_vs_HUU_wnt_signaling_ordered.tiff",6,4, units = "in", res =200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols, cluster_cols = FALSE)
dev.off()
tiff("Heatmap_GHneg_HEU_vs_HUU_wnt_signaling.tiff",6,4, units = "in", res =200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols)
dev.off()
#
tiff("Heatmap_GHneg_HEU_vs_HUU_carbohydrate_digestion_absorption_ordered.tiff",6,4, units = "in", res =200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols, cluster_cols = FALSE)
dev.off()
tiff("Heatmap_GHneg_HEU_vs_HUU_carbohydrate_digestion_absorption.tiff",6,4, units = "in",res = 200)
pheatmap(toplot[,rownames(t.track)],  
         annotation_col = t.track,annotation_colors = HEU.cols)
dev.off()
#--------------
#////
#--------------
#Plotting various normalisation methods (this was for GHneg)
raw <- as.matrix(d$counts[rownames(counts),])
dim(raw)#11727 9
# colnames(raw) <- colnames(d$counts)
raw <- log2(raw + 1)
raw <- reshape2::melt(raw, id = rownames(raw))
names(raw)[1:2] <- c("id", "sample")
raw$method <- rep("raw",nrow(raw))
#
cpm <- log2(cpm(y)+1)
cpm <- reshape2::melt(cpm, id = rownames(y$counts))
names(cpm)[1:2] <- c ("id", "sample")
cpm$method <- rep("cpm", nrow(cpm))
#
voom <- as.matrix(v$E)
voom <- reshape2::melt(v$E, id = rownames(v$E))
names(voom)[1:2] <- c("id", "sample")
voom$method <- rep("voom", nrow(voom))
#
df_allnorm <- rbind(raw,cpm, voom)
df_allnorm$method <- factor(df_allnorm$method,
                            levels = c("raw","cpm", "voom"))

p <- ggplot(data=df_allnorm, aes(x=sample, y=value, fill=method))
p <- p + geom_boxplot()  
p <- p + theme_bw()
p <- p + ggtitle("Boxplots of normalized pseudo counts\n
for all samples by normalization methods")
p <- p + facet_grid(. ~ method) 
p <- p + ylab(expression(log[2] ~ (normalized ~ count + 1))) + xlab("")
p <- p + theme(title = element_text(size=10), axis.text.x = element_blank(), 
               axis.ticks.x = element_blank())
p
ggsave(p, "boxplots_different_normalisations.pdf", device = "pdf")
#//////
#
pdf("Heatmap_GHneg_HEU_vs_HUU_t_cell_differentiation.pdf",6,6)
aheatmap(toplot[,rownames(t.track)],annCol = t.track,annColors = HEU.cols,cexRow = 0.2,scale="row")
dev.off()
#
# pdf(paste0("heatmap_voom_with_weights_hivE_FDR_0_05_rowscale.pdf"),7,5)
# aheatmap(to_plot[,rownames(t.track)], annCol = t.track[3:5], Colv = NA, cexRow = 0.6, scale = "row")
# dev.off()
# #outliers removed:
# #with red/blue for HUE/HUU at sonwabile's request
# t.track <- meta.sub[,c(4,5)]
# t.track <- t.track %>% arrange(hiv)
# colnames(t.track) <- c("HIV exposure","Age")
# cols <- c("birth" = "seagreen1", "week15" = "darkgoldenrod1")
# cols2 <- c("HUU"= "steelblue","HEU" = "chocolate")
# age.cols <- list("Age" = cols, "HIV exposure" = cols2)
# pdf(paste0("heatmap_voom_with_weights_hivE_FDR_0_05_outlier_samples_removed.pdf"),7,5)
# aheatmap(to_plot[,rownames(t.track)], annCol = t.track,annColors = age.cols ,Colv = NA, cexRow = 0.6)
# dev.off()
# pdf(paste0("heatmap_voom_with_weights_hivE_FDR_0_05_outlier_samples_removed_clustered.pdf"),7,5)
# aheatmap(to_plot[,rownames(t.track)], annCol = t.track, annColors = age.cols,cexRow = 1)
# dev.off()
#ignore this section
#Heatmap T cell differentiation set from pathway analysis GHneg
# t_cell <- c("IL7R","LEF1","CD74","CHD7","RELB","ADAM17","TCF7","RPL22","BCL11B","LCK","CD3E","IL18R1","PTPRC","CD3D","PRELID1",
#   "PRDM1","LY9","JMJD6","BCL3","EGR1","SART1","ANXA1","THEMIS","SRF","ZMIZ1","NKAP","FANCA","SASH3","SOD1")
# il_6 <- c("ENSG00000169403","ENSG00000159674","ENSG00000090376","ENSG00000153064","ENSG00000211578",
#           "ENSG00000011600","ENSG00000181104")
# htlv1 <- c("916","917","3106","22800","7417","1385","10393","9184","5971","2874","2113",
#            "4801","3122","2648","3932","4603","7538","472")
#--------