#RNAseqs differential expression testing using edgeR #Set the working directory to where your input files are located: #V4 specific: rerun with voomWithQualityWeights (given variable RIN scores) AND advice from Susan Holmes #Note that it is not possible to include infant ID in the model (two samples per infant) - not enough degrees of #freedom. I therefore analysed GHneg and GHpos samples seperately (combining birth and W15 sample) to test HEU vs. HUU # setwd("/Users/katielennard/Documents/Academic/IDM_projects/Gray/") setwd("/Users/katielennard/Documents/Katie/IDM_projects/Gray") #Load the relevant packages, install as necessary with install.packages() or Bioconductor 'BiocManager' library(edgeR) library(NMF) library(biomaRt)#gene annotation library(ggfortify)#PCA plot library(magrittr)#%>% library(dplyr) library(tibble) # for `rownames_to_column` and `column_to_rownames` library(statmod)#for estimateDisp with robust=TRUE library(GO.db) library(org.Hs.eg.db) library(gplots) library(vegan)#permanova (adonis) library(ggplot2) #import counts: Rcounts <- read.csv(paste0(getwd(),"/RNAseq_Ilifu_run/merged_gene_counts.txt"), sep = "\t", row.names = 1) dim(Rcounts)#63677 21 head(Rcounts) #Clean sample names: colnames(Rcounts) colnames(Rcounts) <- sub(".fwAligned.sortedByCoord.out.bam","",colnames(Rcounts)) colnames(Rcounts) colnames(Rcounts) <- sapply(strsplit(colnames(Rcounts), "_S"), `[`, 1) colnames(Rcounts) colnames(Rcounts) <- sub("X","",colnames(Rcounts)) colnames(Rcounts) <- sub("\\.","-",colnames(Rcounts)) colnames(Rcounts) <- sub("\\.","-",colnames(Rcounts)) colnames(Rcounts)#clean #Find empty rows RS <- sort(rowSums(Rcounts[,-1])) head(RS) length(which(RS==0))#remove 10136 transcripts with only zero values #Remove empty rows Rcounts <- Rcounts[-which(rowSums(Rcounts[,-1])==0),] dim(Rcounts)#23034 # write.csv2(Rcounts, file = "Sonwabile_Rcounts_unfiltered.csv") #import sample metadata meta <- read.csv2(paste0(getwd(),"/from_sonwabile/mdata.csv"), sep = ";", row.names = 1) #import RIN scores RINs <- read.csv2("RINs_collated_from_QC_reports-KSL_forR.csv", row.names = 1, stringsAsFactors = F) identical(rownames(meta),rownames(RINs))#TRUE RINs meta$RIN <- as.numeric(RINs$RIN) #check that sampleIDs overlap between metadata and counts table all(rownames(meta) %in% colnames(Rcounts))#TRUE head(meta) length(unique(meta$Infant_ID))#only 8 infants, with two different time points table(meta$gut_homing) # Neg Pos # 10 10 table(meta$Time) # birth week15 # 10 10 #Make sure sample order is same between sev_meta and counts tmp <- Rcounts[,rownames(meta)] Rcounts2 <- data.frame(Rcounts[,1],tmp, check.names = F) colnames(Rcounts2)[1] <- "gene" identical(rownames(meta),colnames(Rcounts2)[-1])#TRUE #santity check Rcounts[1:10,"20028R-01-03"] Rcounts2[1:10,"20028R-01-03"] #same #--------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------- #Make DGElist object for edgeR use d <- DGEList(counts=as.matrix(Rcounts2[,-1]),genes = Rcounts2[,1], group=factor(meta$gut_homing)) d #-------------------EXPLORATORY QC---------------------------------------------------- df <- data.frame(t(Rcounts2[,-1]),meta) df_counts <- df[,1:23034] # pca_res <- prcomp(df_counts, scale. = TRUE) # pdf("MDS_hiv_age.pdf", 4,3) # autoplot(pca_res, data = df, colour = 'hiv', shape = 'Time')+theme_light() # dev.off() # # # pdf("MDS_gut_homing_age.pdf", 4,3) # autoplot(pca_res, data = df, colour = 'gut_homing', shape = 'Time')+theme_light() # dev.off() # #Two outliers - both from the same infant C-0088-B (20028R-01-03 and 20028R-01-04); both have significantly lower counts than the rest of the samples (poorer assignment) sampleQC <- data.frame(colSums(d$counts), meta$gut_homing) sampleQC %>% rownames_to_column('sample') %>% arrange(.,colSums.d.counts.) %>% column_to_rownames('sample') # colSums.d.counts. meta.gut_homing # 20028R-01-04 3509430 Neg # 20028R-01-03 7766737 Pos # 20028R-01-09 12848739 Pos # 20028R-01-19 15671557 Pos # 20028R-01-37 16760425 Pos # 20028R-01-21 16877973 Pos # 20028R-01-11 19001689 Pos # 20028R-01-10 20317721 Neg # 20028R-01-17 20602737 Pos # 20028R-01-18 20649613 Neg # 20028R-01-08 21137498 Neg # 20028R-01-38 21677469 Neg # 20028R-01-22 22828992 Neg # 20028R-01-06 24094352 Neg # 20028R-01-05 25158132 Pos # 20028R-01-39 26705488 Pos # 20028R-01-12 26784120 Neg # 20028R-01-40 27251114 Neg # 20028R-01-07 27316135 Pos # 20028R-01-20 28427178 Neg #--------------------------------------------------------- #Remove outliers and redo MDS plot: grep("20028R-01-04",colnames(Rcounts2))#3 grep("20028R-01-03",colnames(Rcounts2))#2 grep("20028R-01-04",rownames(meta))#2 grep("20028R-01-03",rownames(meta))#1 meta.sub <- meta[-(1:2),] df <- data.frame(t(Rcounts2[,-(1:3)]),meta.sub) df_counts <- df[,1:23034] df_counts <- df_counts[,-which(colSums(df_counts)==0)] dim(df_counts)#18 22702 #NB, when we remove those two outlier samples there are again genes with only zeros across all remaining samples: pca_res <- prcomp(df_counts, scale. = TRUE) # pdf("MDS_age_HIVstatus_outliers_removed.pdf",4,3) # autoplot(pca_res, data = df, colour = 'hiv', shape = 'Time')+theme_light() # dev.off() # pdf("MDS_age_gut_homing_outliers_removed.pdf",4,3) # autoplot(pca_res, data = df, colour = 'gut_homing', shape = 'Time')+theme_light() # dev.off() #-------------------EXPLORATORY QC---------------------------------------------------- #---------------------GENE ANNOTATION----------------------------------------- #Lets start by filtering out any remaining rRNA in this dataset # mart = useMart('ensembl') # # list all the ensembl database of organisms # listDatasets(mart) # #choose database of your interest ; in this case its "cfamiliaris_gene_ensembl" I guess # ensembl = useMart( "ensembl", dataset = "hsapiens_gene_ensembl" , host = "grch37.ensembl.org") # # choose attributes of your interest # listAttributes(ensembl) # gene <- getBM( attributes = c("ensembl_gene_id","external_gene_name","description", "percentage_gene_gc_content"),values = rownames(Rcounts2),mart = ensembl) # head(gene) # rownames(gene) <- gene$ensembl_gene_id # dim(gene)#63677 for gchr37 # length(intersect(rownames(gene), rownames(Rcounts2)))#53541 for gchr37 (USED) # saveRDS(gene,"severin_ensembl_gchr37_annotation.rds") gene <- readRDS("severin_ensembl_gchr37_annotation.rds")#to reload #---------------------GENE ANNOTATION------------------------------------------------ #---------------------DESIGN MATRIX-------------------------------------------------- #COMPARISONS TO BE MADE: # Treg pos HEU vs HUU birth # Treg neg HEU vs HUU birth # Treg pos HEU vs HUU 15 weeks # Treg neg HEU vs HUU 15 weeks # HEU vs HUU birth # HEU vs HUU 15 weeks # # birth vs 15 weeks HUU # birth vs 15 weeks HEU #However, for most of these the group sizes are N=2: see table below #------ #Add unique group specifier to meta so we can use contrasts meta.sub$Group <- rep(NA,nrow(meta.sub)) meta.sub$Group[meta.sub$Time=="week15" & meta.sub$gut_homing=="Pos" & meta.sub$hiv == "HUU"] <- "w15_GHpos_HUU" meta.sub$Group[meta.sub$Time=="week15" & meta.sub$gut_homing=="Pos" & meta.sub$hiv == "HEU"] <- "w15_GHpos_HEU" meta.sub$Group[meta.sub$Time=="week15" & meta.sub$gut_homing=="Neg" & meta.sub$hiv == "HUU"] <- "w15_GHneg_HUU" meta.sub$Group[meta.sub$Time=="week15" & meta.sub$gut_homing=="Neg" & meta.sub$hiv == "HEU"] <- "w15_GHneg_HEU" meta.sub$Group[meta.sub$Time=="birth" & meta.sub$gut_homing=="Pos" & meta.sub$hiv == "HUU"] <- "b_GHpos_HUU" meta.sub$Group[meta.sub$Time=="birth" & meta.sub$gut_homing=="Pos" & meta.sub$hiv == "HEU"] <- "b_GHpos_HEU" meta.sub$Group[meta.sub$Time=="birth" & meta.sub$gut_homing=="Neg" & meta.sub$hiv == "HUU"] <- "b_GHneg_HUU" meta.sub$Group[meta.sub$Time=="birth" & meta.sub$gut_homing=="Neg" & meta.sub$hiv == "HEU"] <- "b_GHneg_HEU" meta.sub$Group <- as.factor(meta.sub$Group) table(meta.sub$Group) # b_GHneg_HEU b_GHneg_HUU b_GHpos_HEU b_GHpos_HUU w15_GHneg_HEU w15_GHneg_HUU w15_GHpos_HEU w15_GHpos_HUU # 2 2 2 2 3 2 3 2 #------------------------------------------------------------------------------------ #HEATMAP ALL samples including outliers d <- DGEList(counts=as.matrix(Rcounts2[,-1]),genes = Rcounts2[,1], group=factor(meta$hiv)) y <- calcNormFactors(d,log=TRUE) logCPM <- cpm(y, prior.count=2, log=TRUE)#prior.count is to avoid undefined values and to reduce norm_counts <- data.frame(logCPM) # t.track <- meta[,c(3:6)] # pdf("heatmap_log2_CPM.pdf")#From this heatmap we see the effect of RIN score on the profiles with higher RIN samples having more comprehensive gene coverage # aheatmap(norm_counts, annCol = t.track) # dev.off() #-------- #Filter by expression # design <- model.matrix(~0+hiv, data=meta)#force intercept through zero so as not to select any one group as 'reference' # keep <- filterByExpr(d, design = design, group = factor(meta$hiv), min.count = 10) # length(which(keep==TRUE))#10493 # y <- d[keep, , keep.lib.sizes=FALSE] # logCPM <- cpm(y, prior.count=2, log=TRUE) # norm_counts <- data.frame(logCPM) # t.track <- meta[,c(3:6)] # pdf("heatmap_log2_CPM_filtered.pdf")#From this heatmap we see the effect of RIN score on the profiles with higher RIN samples having more comprehensive gene coverage # aheatmap(norm_counts, annCol = t.track) # dev.off() # #+++++++++++++++++++++++++++++++++++++++ #+++++++++++++++++++++++++++++++++++++++ #voomwithqualityweights differential expression testing section #Note that there are not enough samples (degrees of freedom to include infant ID in the model so we have to split by gut homing #status, because we cant include two samples from one infant in the model) dim(meta)#20 #All gut homing negative samples together, irrespective of age GHneg <- meta.sub[grep("neg",meta.sub$Group),] table(GHneg$Group) table(GHneg$hiv) # HEU HUU # 5 4 #--------------- #--------------- GHpos <- meta.sub[grep("pos",meta.sub$Group),] table(GHpos$hiv) # HEU HUU # 5 4 #---------------- df_counts <- t(df_counts)#2 outlier removed before head(df_counts) dim(df_counts) gene_symbols <- gene[rownames(df_counts),] head(gene_symbols) identical(rownames(meta.sub),colnames(df_counts))#T GHneg_counts <- df_counts[,rownames(GHneg)] dim(GHneg_counts)#22702 9 GHpos_counts <- df_counts[,rownames(GHpos)] dim(GHpos_counts)#22702 9 #------------------------- #Faceted NMDS plots of GH neg and GH pos # MDS with Bray-Curtis instead of PCA Euclidean colour_palette <- c("red","blue") set.seed(2) nmds <- metaMDS(t(df_counts),k=2, trymax = 500) stressplot(nmds) data.scores <- as.data.frame(scores(nmds)) #Using the scores function from vegan to extract the sample scores and convert to a data.frame data.scores$samples <- rownames(data.scores) #create a column of site names, from the rownames of data.scores data.scores[,"HIV"] <- meta.sub[,"hiv"] data.scores[,"Age"] <- meta.sub[,"Time"] data.scores[,"gut_homing"] <- meta.sub[,"gut_homing"] title = c("NMDS by gut homing status") colour = data.scores$HIV shape = data.scores$Age c.lab = "HIV exposure" s.lab = "Age" p <- ggplot() + geom_point(data=data.scores,aes(x=NMDS1,y=NMDS2,shape=shape,colour=colour),size=1.5) + # add the point markers scale_colour_manual(values=colour_palette)+ #NB - EDIT TITLE AS NEEDED ggtitle(title)+ coord_equal() + theme_bw() + labs(color=c.lab, shape = s.lab)+ theme(plot.title = element_text(hjust = 0.5))+ theme(panel.background = element_blank(), panel.grid.major = element_blank(), #remove major-grid labels panel.grid.minor = element_blank(), #remove minor-grid labels plot.background = element_blank()) p <- p + facet_grid(cols = vars(gut_homing)) ggsave(p,filename = "NMDS_bray_curtis_age_facet_HIV-col",device = "tiff",width = 7, height = 7) tiff("NMDS_bray_curtis_age_facet_HIV-col", unit = "cm", width = 10,height = 10, res = 300) p dev.off() #--------------------------------- #GHneg #--------------------------------- d <- DGEList(counts=as.matrix(GHneg_counts),genes = gene_symbols$external_gene_name, group=factor(GHneg$hiv)) keep <- filterByExpr(d, group=factor(GHneg$hiv)) y <- d[keep, , keep.lib.sizes=FALSE]#filter option y <- edgeR::calcNormFactors(y) print(table(keep)) # keep # FALSE TRUE # 10975 11727 #PERMANOVA: grouping <- GHneg$hiv diss1 <- vegdist(t(y$counts),method = "bray")# betadine <- betadisper(diss1, group=grouping)#Where ‘diss1’ is your distance matrix anova(betadine)#If NS then adonis meets assumptions of homogeneous dispersion - not sure what group to specify if you're testing multiple effects with adonis though.. #P=0.6 counts <- cpm(y, log=FALSE)#prior.count is to avoid undefined values and to reduce #I think it is better to use Bray here because this data is a bit more like 16S in that there are many zero entries (sparsity related to low coverage & low RINs) #Adonis 2 where order does not matter and we look at marginal probabilities, each time conrolling for all other factors set.seed(6) a1=adonis2(t(counts) ~ hiv+Time+RIN , data = GHneg, method="bray", by="margin") a1#NB note: in the version 1 script I didn't use a start seed so results vary slightly from below # Df SumOfSqs R2 F Pr(>F) # hiv 1 0.022304 0.11978 0.9502 0.546 # Time 1 0.019608 0.10530 0.8354 0.661 # RIN 1 0.020222 0.10860 0.8615 0.647 # Residual 5 0.117363 0.63027 # Total 8 0.186211 1.00000 #------- #Final model selected for publication: #------- set.seed(6) a2=adonis(t(counts) ~ Time+hiv+RIN , data = GHneg, method="bray") a2 # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # Time 1 0.026064 0.026065 1.11042 0.13997 0.320 # hiv 1 0.022561 0.022561 0.96118 0.12116 0.522 # RIN 1 0.020222 0.020222 0.86153 0.10860 0.631 # Residuals 5 0.117363 0.023473 0.63027 # Total 8 0.186211 1.00000 #------- set.seed(6) a3=adonis(t(counts) ~ hiv+Time+RIN , data = GHneg, method="bray") a3 # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # hiv 1 0.022982 0.022982 0.97909 0.12342 0.501 # Time 1 0.025644 0.025644 1.09252 0.13772 0.338 # RIN 1 0.020222 0.020222 0.86153 0.10860 0.647 # Residuals 5 0.117363 0.023473 0.63027 # Total 8 0.186211 1.00000 # --- set.seed(6) a4=adonis(t(counts) ~ hiv+Time+RIN, data = GHneg, method="bray") a4 # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # hiv 1 0.022982 0.022982 0.97909 0.12342 0.501 # Time 1 0.025644 0.025644 1.09252 0.13772 0.338 # RIN 1 0.020222 0.020222 0.86153 0.10860 0.647 # Residuals 5 0.117363 0.023473 0.63027 # Total 8 0.186211 1.00000 set.seed(6) a5=adonis(t(counts) ~ RIN+hiv+Time, data = GHneg, method="bray") a5 # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # RIN 1 0.026265 0.026265 1.11895 0.14105 0.323 # hiv 1 0.022975 0.022975 0.97881 0.12338 0.501 # Time 1 0.019608 0.019608 0.83538 0.10530 0.661 # Residuals 5 0.117363 0.023473 0.63027 # Total 8 0.186211 1.00000 #----- #Specify design # design <- model.matrix(~0+hiv,data=GHneg) GHneg$Group <- as.factor(as.character(unlist(GHneg$Group))) design <- model.matrix(~0+Group,data=GHneg)#including age covariate colnames(design) <- sub("Group","",colnames(design)) # design <- model.matrix(~0+hiv,data=GHneg)#hiv only v <- voom(y, design = design) aw <- arrayWeights(v, design = design, method = "genebygene", maxiter = 50, tol = 1e-05) aw # 1 2 3 4 5 6 7 8 9 # 0.3658284 2.0283134 0.5525365 0.6917837 1.0215903 2.2208566 1.8537101 1.9753257 0.4244029 qual <- data.frame(arrayweight = aw,RIN = GHneg$RIN, reads = y$samples$lib.size, edgeR_norm_factor = y$samples$norm.factors) rownames(qual) <-rownames(y$samples) write.csv(qual, file = "sample_quality_metrics_GHneg_minus_outlier_samples.csv", row.names=T) pdf("GHneg+quality_metrics_heatmap.pdf",7,5) aheatmap(t(qual), scale = "row", cexRow = 0.25) dev.off() VQW <- voomWithQualityWeights(y, design=design, normalize.method="none", plot=TRUE,method="genebygene") saveRDS(VQW,file = "VQW_counts_GHneg_outliers_removed.rds")#hiv + age in design saveRDS(VQW,file = "VQW_counts_GHneg_hiv_design_outliers_removed.rds") pdf("GHnegexpression_boxplots_after_voomWQW_and_filtering.pdf",10,7) par(mar = c(10,5,5,5)) boxplot(VQW$E, las = 2) dev.off() vfit <- lmFit(VQW, design) contr.matrix = makeContrasts(#See https://www.biostars.org/p/435540/ for more on complex contrasts hiv_exposure=(b_GHneg_HEU+w15_GHneg_HEU)-(b_GHneg_HUU+w15_GHneg_HUU), levels = colnames(design)) #or HIV only # contr.matrix = makeContrasts(#See https://www.biostars.org/p/435540/ for more on complex contrasts # hiv_exposure=hivHEU-hivHUU, # levels = colnames(design)) vfit <- contrasts.fit(vfit, contrasts=contr.matrix) efit <- eBayes(vfit)#empirical Bayes moderation is carried out by borrowing information across all the genes to obtain more precise estimates of gene-wise variability plotSA(efit, main="Final model: Mean-variance trend") summary(decideTests(efit,p.value = 0.05))#GHneg # hiv_exposure # Down 0 # NotSig 11727 # Up 0 hivE <- topTreat(efit, n=Inf)# head(hivE)#GHneg # genes logFC AveExpr t P.Value adj.P.Val B # ENSG00000112137 PHACTR1 -17.9849427 0.966167983 -5.092831 0.0001298288 0.7592297 -4.591433 # ENSG00000211459 MT-RNR1 -0.9660598 11.481368608 -4.855067 0.0002063292 0.7592297 -3.838716 # ENSG00000169554 ZEB2 -7.5123294 4.725513983 -4.327066 0.0005902181 0.7592297 -4.578196 # ENSG00000124491 F13A1 -13.9912460 0.001734975 -4.256654 0.0006803949 0.7592297 -4.591905 # ENSG00000109971 HSPA8 1.6185846 10.076426276 4.239249 0.0007047769 0.7592297 -4.295128 # ENSG00000152240 HAUS1 17.2087789 -0.532570425 4.200540 0.0007622495 0.7592297 -4.593537 write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHneg_hivE_age_full_table.csv", row.names = T) #Or with only HIV, ignoring age: # genes logFC AveExpr t P.Value adj.P.Val B # ENSG00000140564 FURIN -1.992993 6.657183143 -7.008480 9.842485e-06 0.1154228 3.8148108 # ENSG00000107263 RAPGEF1 -1.222185 7.914476555 -6.214754 3.316827e-05 0.1491656 2.4745489 # ENSG00000124491 F13A1 -8.534395 0.001734975 -5.949393 5.075220e-05 0.1491656 -0.6556639 # ENSG00000229164 TRAC 1.227566 8.556926867 5.474973 1.112645e-04 0.1491656 1.1988014 # ENSG00000158427 TMSB15B 8.261919 -0.938983494 5.408162 1.245837e-04 0.1491656 -1.7374527 # ENSG00000170525 PFKFB3 -1.478686 7.613763293 -5.393902 1.276370e-04 0.1491656 1.1428482 write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHneg_hivE_full_table.csv", row.names = T) #----------------------------------- #--------------------------------- #OR GHpos d <- DGEList(counts=as.matrix(GHpos_counts),genes = gene_symbols$external_gene_name, group=factor(GHpos$hiv)) keep <- filterByExpr(d, group=factor(GHpos$hiv)) print(table(keep)) # keep # FALSE TRUE # 13205 9497 # y <- d[keep, , keep.lib.sizes=FALSE]#filter option y <- edgeR::calcNormFactors(y) counts <- cpm(y, log=FALSE) #--- diss1 <- vegdist(t(y$counts),method = "bray")# betadine <- betadisper(diss1, group=grouping)#Where ‘diss1’ is your distance matrix anova(betadine)#p=0.3 #--- set.seed(6) a1=adonis2(t(counts) ~ hiv+Time+RIN , data = GHpos, method="bray", by="margin") a1 # Df SumOfSqs R2 F Pr(>F) # hiv 1 0.03395 0.10309 0.9748 0.398 # Time 1 0.04403 0.13370 1.2642 0.134 # RIN 1 0.06804 0.20661 1.9536 0.073 . # Residual 5 0.17413 0.52877 # Total 8 0.32930 1.00000 set.seed(6) a2=adonis(t(counts) ~ Time+hiv+RIN , data = GHpos, method="bray") a2 # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # Time 1 0.04303 0.043034 1.2357 0.13068 0.103 # hiv 1 0.04411 0.044106 1.2665 0.13394 0.096 . # RIN 1 0.06804 0.068037 1.9536 0.20661 0.073 . # Residuals 5 0.17413 0.034825 0.52877 # Total 8 0.32930 1.00000 set.seed(6) a2=adonis(t(counts) ~ RIN+hiv+Time , data = GHpos, method="bray") a2 # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # RIN 1 0.07556 0.075564 2.1698 0.22947 0.084 . # hiv 1 0.03559 0.035586 1.0218 0.10806 0.354 # Time 1 0.04403 0.044026 1.2642 0.13370 0.134 # Residuals 5 0.17413 0.034825 0.52877 # Total 8 0.32930 1.00000 set.seed(6) a4=adonis2(t(counts) ~ hiv+Time+RIN , data = GHpos, method="bray") a4 # adonis2(formula = t(counts) ~ hiv + Time + RIN, data = GHpos, method = "bray") # Df SumOfSqs R2 F Pr(>F) # hiv 1 0.04690 0.14241 1.3466 0.047 * # Time 1 0.04024 0.12220 1.1555 0.242 # RIN 1 0.06804 0.20661 1.9536 0.073 . # Residual 5 0.17413 0.52877 # Total 8 0.32930 1.00000 # --- # Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #------------ # design <- model.matrix(~0+hiv,data=GHpos) GHpos$Group <- as.factor(as.character(unlist(GHpos$Group))) design <- model.matrix(~0+Group,data=GHpos)#including age covariate colnames(design) <- sub("Group","",colnames(design)) #--------- #quality factors profile v <- voom(y, design = design) aw <- arrayWeights(v, design = design, method = "genebygene", maxiter = 50, tol = 1e-05) aw#this is for design with HIV exposure and age # 1 2 3 4 5 6 7 8 9 # 0.5742654 1.1690314 1.2991858 0.9834691 0.9380155 1.3745857 1.1495239 1.1630325 0.6762970 GHpos$RIN #or GHpos qual <- data.frame(arrayweight = aw,RIN = GHpos$RIN, reads = y$samples$lib.size, edgeR_norm_factor = y$samples$norm.factors) rownames(qual) <-rownames(y$samples) write.csv(qual, file = "sample_quality_metrics_GHpos_minus_outlier_samples.csv", row.names=T) pdf("GHpos+quality_metrics_heatmap.pdf",7,5) aheatmap(t(qual), scale = "row", cexRow = 0.25) dev.off() #---------- VQW <- voomWithQualityWeights(y, design=design, normalize.method="none", plot=TRUE,method="genebygene") saveRDS(VQW,file = "VQW_counts_GHpos_outliers_removed.rds")#hiv + age design saveRDS(VQW,file = "VQW_counts_GHpos_hiv_design_outliers_removed.rds") pdf("GHposexpression_boxplots_after_voomWQW_and_filtering.pdf",10,7) par(mar = c(10,5,5,5)) boxplot(VQW$E, las = 2) dev.off() # pdf("MDS_filtered_TMMnormalized_voomwithqualityweights.pdf", 10,10)#same because quality weights only incorporated in model? # plotMDS(VQW$E, col=colors[meta$hiv], pch=pch[meta$Time], labels = rownames(y$samples)) # legend("topleft", legend=levels(meta$hiv), pch=pch, col=colors, ncol=2) # dev.off() vfit <- lmFit(VQW, design) # contr.matrix = makeContrasts(#See https://www.biostars.org/p/435540/ for more on complex contrasts # hiv_exposure=hivHEU-hivHUU, # levels = colnames(design)) #OR contr.matrix = makeContrasts(#See https://www.biostars.org/p/435540/ for more on complex contrasts hiv_exposure=(b_GHpos_HEU+w15_GHpos_HEU)-(b_GHpos_HUU+w15_GHpos_HUU), levels = colnames(design)) # vfit <- contrasts.fit(vfit, contrasts=contr.matrix) efit <- eBayes(vfit)#empirical Bayes moderation is carried out by borrowing information across all the genes to obtain more precise estimates of gene-wise variability plotSA(efit, main="Final model: Mean-variance trend") summary(decideTests(efit,p.value = 0.05))#GHneg # hiv_exposure # Down 0 # NotSig 11727 # Up 0 summary(decideTests(efit,p.value = 0.05))#GHpos # hiv_exposure # Down 0 # NotSig 11727 # Up 0 summary(decideTests(efit,p.value = 0.05))#(b_GHneg_HEU+w15_GHneg_HEU)-(b_GHneg_HUU+w15_GHneg_HUU) # hiv_exposure # Down 0 # NotSig 9497 # Up 0 summary(decideTests(efit,p.value = 0.05))#(b_GHpos_HEU+w15_GHpos_HEU)-(b_GHpos_HUU+w15_GHpos_HUU) # hiv_exposure # Down 0 # NotSig 9497 # Up 0 hivE <- topTreat(efit, n=Inf)# head(hivE)#GHneg head(hivE)#GHpos (with age) # genes logFC AveExpr t P.Value adj.P.Val B # ENSG00000161921 CXCL16 -20.51375 -0.1260973 -5.579889 6.585667e-05 0.6254408 -2.955051 # ENSG00000106266 SNX8 -18.22758 -0.9661042 -4.918088 2.213090e-04 0.7553206 -3.290720 # ENSG00000188825 LINC00910 -18.29395 -1.3108051 -4.780268 2.868754e-04 0.7553206 -3.389920 # ENSG00000116586 LAMTOR2 -18.10961 -1.0850030 -4.544925 4.491482e-04 0.7553206 -3.456200 # ENSG00000138073 PREB 17.66504 0.3200658 4.473071 5.156697e-04 0.7553206 -3.469776 # ENSG00000140511 HAPLN3 12.54819 3.5169033 4.419684 5.716001e-04 0.7553206 -2.815706 head(hivE)#GHpos without age # genes logFC AveExpr t P.Value adj.P.Val B # ENSG00000161921 CXCL16 -10.245448 -0.1260973 -5.898592 2.110782e-05 0.2004609 0.01865015 # ENSG00000106266 SNX8 -9.205317 -0.9661042 -5.186982 8.573379e-05 0.4071069 -0.85044926 # ENSG00000188825 LINC00910 -9.076590 -1.3108051 -4.902008 1.529126e-04 0.4299683 -1.20075994 # ENSG00000138073 PREB 8.890520 0.3200658 4.737566 2.143875e-04 0.4299683 -1.26928535 # ENSG00000109854 HTATIP2 8.645864 -0.2706302 4.566386 3.056367e-04 0.4299683 -1.54605008 # ENSG00000116586 LAMTOR2 -8.935969 -1.0850030 -4.554578 3.132365e-04 0.4299683 -1.49729011 #--------------------- hivE <- topTreat(efit, n=Inf)# head(hivE) #---------------------- # write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHneg_hivE_full_table.csv", row.names = T) write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHneg_hivE_age_full_table.csv", row.names = T) #OR # write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHpos_hivE_full_table.csv", row.names = T) write.csv(hivE,file = "outlier_samples_removed_voomWQW_GHpos_hivE_age_full_table.csv", row.names = T) #-------------- #VISUALISING PATHWAY ANALYSIS RESULTS (HIV + AGE SPIA, for selected pathways of interest) #For GHneg, do heatmaps fo SPIA pathway results (of potential interest) that had unadjusted p-values < 0.05 #These include: TGF-B signalling pathway, carbohydrate digestion and absorption, wnt signalling pathway #For GHpos: antigen processing and presentation #-------- #including hiv and age in the model as specified above: #GHpos: antigen <- c("972","4801","1385","1514","3106","3134","3326") #GHneg: b_cell <- c("1147","7409","23533","5295","3635","3636","4773","5880","118788") tgfb <- c("8454","4088","4609","5933","5516","9475","9241") carbs <- c("476","481","2542","3098","23533","5295") wnt <- c("4088","4773","818","5880","9475","81839","4609","51176","8454","6907","5516","5567","4040") # length(htlv1)#18 library(annotables) grch37 <- data.frame(grch37)# #for antigen processing # keep<- grch37[grch37$entrez %in% antigen,] dim(keep)#18 length(unique(keep$symbol))#7 unique(keep$symbol) # [1] "CD74" "HSP90AB1" "CREB1" "NFYB" "CTSL" "HLA-F" "HLA-B" #--- #OR b cell keep<- grch37[grch37$entrez %in% b_cell,] dim(keep)#9 length(unique(keep$symbol))#7 unique(keep$symbol) # [1] "NFATC2" "RAC2" "PIK3R5" "VAV1" "PIK3R1" "PIK3AP1" "INPPL1" "INPP5D" "CHUK" # #--- #OR TGFB keep<- grch37[grch37$entrez %in% tgfb,] dim(keep)#7 length(unique(keep$symbol))#7 unique(keep$symbol) # [1] "CUL1" "RBL1" "PPP2CB" "ROCK2" "MYC" "SMAD3" "NOG" #---- #OR carbs keep<- grch37[grch37$entrez %in% carbs,] dim(keep)#7 9 length(unique(keep$symbol))#6 unique(keep$symbol) # [1] "SLC37A4" "PIK3R5" "ATP1B1" "PIK3R1" "HK1" "ATP1A1" #---- #---- #OR WNT signaling keep<- grch37[grch37$entrez %in% wnt,] dim(keep)#7 9 length(unique(keep$symbol))#6 unique(keep$symbol) # [1] "CUL1" "LRP6" "NFATC2" "TBL1X" "PPP2CB" "RAC2" "ROCK2" "MYC" "LEF1" "PRKACB" # [11] "CAMK2G" "SMAD3" "VANGL1" #---- samples <- GHpos #OR samples <- GHneg #sort by HEU status samples <- samples %>% rownames_to_column('ID') %>% arrange(.,hiv) %>% column_to_rownames('ID') #load normalised counts #GHneg, HIV+ age #Note that these counts are based on the voom function which outputs: # a numeric matrix of normalized expression values on the log2 scale (Transform count data to log2-counts per million (logCPM), estimate the mean-variance relationship and use this to compute appropriate observation-level weights. The data are then ready for linear modelling) VQWneg <- readRDS(file = "VQW_counts_GHneg_outliers_removed.rds") # VQWneg_simple <- readRDS(file = "VQW_counts_GHneg_hiv_design_outliers_removed.rds") # identical(VQWneg_simple$E,VQWneg$E)#Same regardless of design #GHpos, HIV+age VQWpos <- readRDS(file = "VQW_counts_GHpos_outliers_removed.rds") VQWpos$design # find_ids <- rownames(VQWneg$genes)[VQWneg$genes$genes %in% t_cell] # find_ids # toplot <- VQWneg$E[find_ids,rownames(samples)] #OR #GHpos find_ids <- rownames(VQWpos$genes)[VQWpos$genes$genes %in% unique(keep$symbol)] toplot <- VQWpos$E[find_ids,rownames(samples)] #--- #GHneg #OR for b cell signaling find_ids <- rownames(VQWneg$genes)[VQWneg$genes$genes %in% unique(keep$symbol)] toplot <- VQWneg$E[find_ids,rownames(samples)] rownames(toplot) <- gene[rownames(toplot),"external_gene_name"] t.track <- samples[,c(4,5)] colnames(t.track) <- c("HIV exposure","age") rownames(t.track) <- rownames(samples) # colnames(t.track) <- "HIV exposure" cols <- c("HEU" = "red", "HUU" = "blue") age.cols <- c("birth" = "darkolivegreen4","week15" = "deepskyblue4") HEU.cols <- list("HIV exposure" = cols, "age" = age.cols) # tiff("Heatmap_GHneg_HEU_vs_HUU_antigen_presentation_ordered.tiff",6,4, units = "in", res =200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols,cluster_cols = FALSE) dev.off() tiff("Heatmap_GHneg_HEU_vs_HUU_antigen_presentation.tiff",6,4, units = "in", res =200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols) dev.off() # tiff("Heatmap_GHneg_HEU_vs_HUU_b_cell_differentiation.tiff",6,4, units = "in", res =200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols) dev.off() tiff("Heatmap_GHneg_HEU_vs_HUU_b_cell_differentiation_ordered.tiff",6,4, units = "in", res =200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols, cluster_cols = FALSE) dev.off() # tiff("Heatmap_GHneg_HEU_vs_HUU_tgfb_signaling.tiff",6,4, units = "in", res =200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols) dev.off() tiff("Heatmap_GHneg_HEU_vs_HUU_tgfb_signaling_ordered.tiff",6,4, units = "in", res =200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols, cluster_cols = FALSE) dev.off() tiff("Heatmap_GHneg_HEU_vs_HUU_wnt_signaling_ordered.tiff",6,4, units = "in", res =200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols, cluster_cols = FALSE) dev.off() tiff("Heatmap_GHneg_HEU_vs_HUU_wnt_signaling.tiff",6,4, units = "in", res =200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols) dev.off() # tiff("Heatmap_GHneg_HEU_vs_HUU_carbohydrate_digestion_absorption_ordered.tiff",6,4, units = "in", res =200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols, cluster_cols = FALSE) dev.off() tiff("Heatmap_GHneg_HEU_vs_HUU_carbohydrate_digestion_absorption.tiff",6,4, units = "in",res = 200) pheatmap(toplot[,rownames(t.track)], annotation_col = t.track,annotation_colors = HEU.cols) dev.off() #-------------- #//// #-------------- #Plotting various normalisation methods (this was for GHneg) raw <- as.matrix(d$counts[rownames(counts),]) dim(raw)#11727 9 # colnames(raw) <- colnames(d$counts) raw <- log2(raw + 1) raw <- reshape2::melt(raw, id = rownames(raw)) names(raw)[1:2] <- c("id", "sample") raw$method <- rep("raw",nrow(raw)) # cpm <- log2(cpm(y)+1) cpm <- reshape2::melt(cpm, id = rownames(y$counts)) names(cpm)[1:2] <- c ("id", "sample") cpm$method <- rep("cpm", nrow(cpm)) # voom <- as.matrix(v$E) voom <- reshape2::melt(v$E, id = rownames(v$E)) names(voom)[1:2] <- c("id", "sample") voom$method <- rep("voom", nrow(voom)) # df_allnorm <- rbind(raw,cpm, voom) df_allnorm$method <- factor(df_allnorm$method, levels = c("raw","cpm", "voom")) p <- ggplot(data=df_allnorm, aes(x=sample, y=value, fill=method)) p <- p + geom_boxplot() p <- p + theme_bw() p <- p + ggtitle("Boxplots of normalized pseudo counts\n for all samples by normalization methods") p <- p + facet_grid(. ~ method) p <- p + ylab(expression(log[2] ~ (normalized ~ count + 1))) + xlab("") p <- p + theme(title = element_text(size=10), axis.text.x = element_blank(), axis.ticks.x = element_blank()) p ggsave(p, "boxplots_different_normalisations.pdf", device = "pdf") #////// # pdf("Heatmap_GHneg_HEU_vs_HUU_t_cell_differentiation.pdf",6,6) aheatmap(toplot[,rownames(t.track)],annCol = t.track,annColors = HEU.cols,cexRow = 0.2,scale="row") dev.off() # # pdf(paste0("heatmap_voom_with_weights_hivE_FDR_0_05_rowscale.pdf"),7,5) # aheatmap(to_plot[,rownames(t.track)], annCol = t.track[3:5], Colv = NA, cexRow = 0.6, scale = "row") # dev.off() # #outliers removed: # #with red/blue for HUE/HUU at sonwabile's request # t.track <- meta.sub[,c(4,5)] # t.track <- t.track %>% arrange(hiv) # colnames(t.track) <- c("HIV exposure","Age") # cols <- c("birth" = "seagreen1", "week15" = "darkgoldenrod1") # cols2 <- c("HUU"= "steelblue","HEU" = "chocolate") # age.cols <- list("Age" = cols, "HIV exposure" = cols2) # pdf(paste0("heatmap_voom_with_weights_hivE_FDR_0_05_outlier_samples_removed.pdf"),7,5) # aheatmap(to_plot[,rownames(t.track)], annCol = t.track,annColors = age.cols ,Colv = NA, cexRow = 0.6) # dev.off() # pdf(paste0("heatmap_voom_with_weights_hivE_FDR_0_05_outlier_samples_removed_clustered.pdf"),7,5) # aheatmap(to_plot[,rownames(t.track)], annCol = t.track, annColors = age.cols,cexRow = 1) # dev.off() #ignore this section #Heatmap T cell differentiation set from pathway analysis GHneg # t_cell <- c("IL7R","LEF1","CD74","CHD7","RELB","ADAM17","TCF7","RPL22","BCL11B","LCK","CD3E","IL18R1","PTPRC","CD3D","PRELID1", # "PRDM1","LY9","JMJD6","BCL3","EGR1","SART1","ANXA1","THEMIS","SRF","ZMIZ1","NKAP","FANCA","SASH3","SOD1") # il_6 <- c("ENSG00000169403","ENSG00000159674","ENSG00000090376","ENSG00000153064","ENSG00000211578", # "ENSG00000011600","ENSG00000181104") # htlv1 <- c("916","917","3106","22800","7417","1385","10393","9184","5971","2874","2113", # "4801","3122","2648","3932","4603","7538","472") #--------