#V2-specific: Whereas V1 was exploratory in nature regarding dada2 trimming and taxonomic database, the V2 analysis uses the selected criteria (trimming default just primers and with refseq_rds tax database) library(dada2) source("/Users/katielennard/Documents/workspace/R_projects/R_projects/Microbiome_project/microbiome_custom_functions.R") #source("/Users/katielennard/Documents/Katie/IDM_projects/Levin/microbiome_custom_functions.R") #setwd("/Users/katielennard/Documents/Katie/IDM_projects/Levin") setwd("/Users/katielennard/Documents/Academic/IDM_projects/Levin/") inDir <- getwd() outDir <- getwd() #Read in metadata meta <- read.csv2(paste0(inDir,"/dada2_results/metadata_forR.txt"), sep = "\t", row.names=1) meta #31/10/2019 #Refine labels for publication based on info sent by Pieter # “Urban” should change to: URBAN fresh cow’s milk # “Rural” should change to: RURAL fresh cow’s milk # “Com_othando” should change to: Commercially fermented milk #1 # “Com_maass” should change to: Commercially fermented milk #2 # “Com_amyoli” should change to: Commercially fermented milk #3 # “Amazi” should change to: Home fermented milk (amasi) meta # Description # ZymoPC positive_control # Urbancow3 urban # Urbancow2 urban # Urbancow1 urban # Ruralcow2 rural # Ruralcow3 rural # HomemadeAmazi amazi # Com-othando com_othando # Com-amyoli com_amyoli # Com-maas com_maass meta$Description <- as.character(unlist(meta$Description)) meta[2,] <- "Urban fresh cow's milk" meta[3,] <- "Urban fresh cow's milk" meta[4,] <- "Urban fresh cow's milk" meta[5,] <- "Rural fresh cow's milk" meta[6,] <- "Rural fresh cow's milk" meta[7,] <- "Home fermented milk (amasi)" meta[8,] <- "Commercially fermented milk" meta[9,] <- "Commercially fermented milk" meta[10,] <- "Commercially fermented milk" meta$Description <- as.factor(meta$Description) #Add column that can be used for publication-ready sample names: meta$sample_names <- c("Positive control","Urban fresh cow's milk 1", "Urban fresh cow's milk 2", "Urban fresh cow's milk 3", "Rural fresh cow's milk 1","Rural fresh cow's milk 2","Home fermented milk (amasi)","Commercially fermented milk 1", "Commercially fermented milk 2", "Commercially fermented milk 3") meta #Read in OTU table ASVs <- readRDS(paste0(inDir,"/dada2_results/seqtab_final_refseq_rdp.RDS"))#Default options with primer trimming & new refseq_rdp DB dim(ASVs)#10 2968? #flip table so that taxa are rows ASVs <- t(ASVs) #Read in taxonomy table taxa <- readRDS(paste0(inDir,"/dada2_results/tax_final_refseq_rdp.RDS")) seqs <- rownames(ASVs) ASV.IDs <- paste0("ASV",c(1:length(seqs))) #Named vector: names(seqs) <- ASV.IDs head(seqs) seq_lens <- nchar(seqs) seq_lens plot(density(seq_lens)) phy <- phyloseq(otu_table(ASVs,taxa_are_rows=TRUE),tax_table(taxa)) identical(taxa_names(phy),rownames(ASVs))#TRUE taxa_names(phy) <- names(seqs) str(phy) #tax_df <- data.frame(tax_table(phy)) #tax_df$seq_len <- seq_lens #head(tax_df) #length(which(is.na(tax_df[,"Species"]))) #1462/2968 ~ 50% species-level assignments!! compared to only ~ 8% with GreenGenes........ #---- a = which(tax_table(phy)[,"Kingdom"]!="Bacteria") b = which(tax_table(phy)[,"Family"]=="mitochondria") c = which(tax_table(phy)[,"Class"]=="Chloroplast") d = c(a,b,c) d e = taxa_names(phy)[-d] #rename samples in phy to match metadata sample_names(phy) # [1] "Com-amyoli_S9_L001" "Com-maas_S10_L001" "Com-othando_S8_L001" "HomemadeAmazi_S7_L001" "Ruralcow2_S5_L001" "Ruralcow3_S6_L001" "Urbancow1_S4_L001" # [8] "Urbancow2_S3_L001" "Urbancow3_S2_L001" "ZymoPC_S1_L001" sample_names(phy) <- sapply(strsplit(sample_names(phy), "_"), `[`, 1) intersect(sample_names(phy),rownames(meta)) #Now add metadata sample_data(phy) <- meta #Publication ready sample names sample_names(phy) <- unlist(sample_data(phy)[,"sample_names"]) #Mild abundance filter M.new = prune_taxa(e,phy) ntaxa(M.new)#2916 | 2855 | 2952 (refseq_rdp) #standardize to total read counts total = median(sample_sums(M.new)) length(which(sample_sums(M.new)<5000))#0 standf = function(x, t=total) round(t * (x / sum(x))) M.std = transform_sample_counts(M.new, standf) M.std.f <- filter_taxa(M.std,function(x) sum(x > 10) > (0.1*length(x)) | sum(x) > 0.001*total, TRUE) ntaxa(M.std.f)#599 (refseq_rdp) #tax_df <- data.frame(tax_table(M.std.f)) #species <- tax_df[tax_df$Species != "" & !is.na(tax_df$Species), ] #dim(species)#367 #species[,3:7] #alpha diversity: p <- plot_richness(M.std.f,x = "Description",measures=c("Shannon"), title = paste0("Standardized to total reads, N=",nsamples(M.std)))+theme(axis.text=element_text(size=16, face="bold"), axis.title=element_text(size=16,face="bold"))+geom_point(size=5) p p = p+theme_bw()+theme(axis.text=element_text(size=14, face="bold"), axis.title=element_text(size=14,face="bold"))+theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank())+theme(axis.text.x = element_text(angle = 90)) pdf(paste0(outDir,"/Shannon_alpha_diversity_by_sample_refseq_rdp_v2.pdf")) p dev.off() #SIMPSON: p <- plot_richness(M.std.f,x = "Description",measures=c("Simpson"), title = paste0("Standardized to total reads, N=",nsamples(M.std)))+theme(axis.text=element_text(size=16, face="bold"), axis.title=element_text(size=16,face="bold"))+geom_point(size=5) p = p+theme_bw()+theme(axis.text=element_text(size=14, face="bold"), axis.title=element_text(size=14,face="bold"))+theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank())+theme(axis.text.x = element_text(angle = 90)) p pdf(paste0(outDir,"/Simpson_alpha_diversity_by_sample_refseq_rdp_v2.pdf")) p dev.off() #merge taxa at lowest available taxonomic level length(which(colSums(otu_table(M.std.f))==0)) M.merged <- tax_glom.kv(M.std.f) #heatmap filename <- c("heatmap_merged_taxa_refseq_rdp") main <- paste("Merged taxa, Bray-Curtis distance") f = paste0(outDir,"/dada2_results/",filename,".pdf") # Create distance matrix and calculate tree: set.seed(2) diss <- phyloseq::distance(M.merged,method = "bray", type = "samples") clust.res<-hclust(diss) sample.order = clust.res$order #Colour specifications Fcs = c("yes" = "darkgoldenrod3","no" = "mediumseagreen") Dcs = c("Commercially fermented milk" = "#CC79A7","Home fermented milk (amasi)" = "#56B4E9", "Rural fresh cow's milk" = "#F0E442", "Urban fresh cow's milk" = "darkolivegreen4", "positive_control" = "white") Cs = list(Fermented = Fcs, Description = Dcs) # Heatmap is output to file (the heatmap.k function can be found in the 'microbiome_custom_functions.R' script) hm = heatmap.k(physeq= M.merged, annot.cols = c(1), main = main,filename = f,Colv = sample.order,labrow = TRUE, cexCol=0.5, cexRow=0.1, pdf.h = 9, pdf.w = 7,colours = Cs) # merged.genus <- tax_glom(M.std.f, "Genus") ntaxa(merged.genus)#196 (refseq_rdp) merged.family <- tax_glom(M.std.f, "Family") ntaxa(merged.family)#108 (refseq_rdp) otu_table(merged.family) # #heatmap filename <- c("heatmap_family_taxa_refseq_rdp") main <- paste("Family-level taxa, Bray-Curtis distance") f = paste0(outDir,"/dada2_results/",filename,".pdf") # Create distance matrix and calculate tree: set.seed(2) diss <- phyloseq::distance(merged.family,method = "bray", type = "samples") clust.res<-hclust(diss) sample.order = clust.res$order # Heatmap is output to file (the heatmap.k function can be found in the 'microbiome_custom_functions.R' script) hm = heatmap.k(physeq= merged.family, merged = "Family",annot.cols = c(1), main = main,filename = f, colours = Cs, Colv = sample.order,labrow = TRUE, cexCol=0.7, cexRow=0.5, pdf.h = 9, pdf.w = 7) # #NMDS: #For some reason this fails if we have only one column in the sample data so lets add a dummy (think it has to do with psmelt) sample_data(M.std.f)[,2] <- rep("dummy",10) set.seed(2) color = "Description" #BC <- ordinate(M.std.f, "NMDS", "bray", k=1, trymax=100) # stress (nearly) zero, may have insufficient data if using k=2/3 # MDS/PCoA instead BC <- ordinate(M.std.f, "MDS", "bray", k=2, trymax=100) shape = c("Description") #title=c("NMDS of 16S microbiome,Bray-Curtis distance,k=1") title=c("PCoA of 16S microbiome,Bray-Curtis distance") MDS = plot_ordination(M.std.f, BC, shape = shape, title = title) #MDS$layers <- MDS$layers[-1] MDS MDS.1 = MDS +theme(axis.text=element_text(size=16, face="bold"), axis.title=element_text(size=18,face="bold"), legend.title=element_text(size=14))+ theme_bw()+labs(shape=shape)+geom_point(size=5, shape=21) MDS.1 #pdf(paste0(outDir,"/NMDS_default_dada2_Bray_Curtis.pdf"),8,5) pdf(paste0(outDir,"/dada2_results/PCoA_dada2_Bray_Curtis_refseq_rds.pdf"),8,5) MDS.1 dev.off() #+++++++++++++++++++++++++++++++ #merged = filter_taxa(merged.genus, function(x) sum(x > count) > (perc*length(x)), TRUE)#filter taxa for barplot display #ntaxa(merged) level = "Genus" count = 1500 perc = 0.1 #---Repeat at phylum level level = "Phylum" count = 500 perc = 0.1 # Barplot will be written to file (the bar.plots function can be found in the 'microbiome_custom_functions.R' script) #Change sample names to publication ready: sample_names(M.std) <- unlist(sample_data(M.std)[,"sample_names"]) barplot = bar.plots(physeq = M.std,cat = "Description",level = level, merge.samples = FALSE, count = count, perc = perc, outDir=outDir, filen = 'Barplots_per_sample_TEST', pdf.h = 7, pdf.w = 8) print(barplot) #Investigate suspect Staph in Urbancow3 - is it the same as ZymoPC staph?? Staph <- tax_table(M.merged)[grep("Staphylococcus",tax_table(M.merged)[,"Genus"]),] dim(Staph) Staph # Kingdom Phylum Class Order Family Genus Species # ASV8 "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" "Staphylococcus_aureus(D83355)" # ASV16 "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" "Staphylococcus_epidermidis(D83363)" # ASV68 "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" "Staphylococcus_chromogenes(D83360)" # ASV256 "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" "Staphylococcus_saprophyticus(AP008934)" # ASV21 "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" NA otu_table(M.merged)[rownames(Staph),"Urbancow3"] # Urbancow3 # ASV8 71 # ASV16 5766 # ASV68 95 # ASV256 134 # ASV21 5120 #So looks like a different Staph, probably from sample collector? #+++++++++++++++++++++++++++++++ #15/10/2019 Perform differential abundance testing lumping all fermented (amaas) vs. all unfermented samples together (milks) #Subset of samples for testing: all except ZymoPC control df.phy <- prune_samples(sample_names(M.new)[1:9],M.new) #Create two-class comparison of interest sample_data(df.phy)$Fermented <- as.factor(c(rep("yes",4),rep("no",5))) #Merge at lowest available taxonomic annotation raw.merged <- tax_glom.kv(df.phy)#1159 merged.total <- median(sample_sums(raw.merged))#50272 RMF <- filter_taxa(raw.merged,function(x) sum(x > 10) > (1/9*length(x)) | sum(x) > 0.001*merged.total, TRUE) ntaxa(RMF)#437 #colours, redefine without positive control Fcs = c("yes" = "darkgoldenrod3","no" = "mediumseagreen") Dcs = c("Commercially fermented milk" = "#CC79A7","Home fermented milk (amasi)" = "#56B4E9", "Rural fresh cow's milk" = "#F0E442", "Urban fresh cow's milk" = "darkolivegreen4") Cs = list(Fermented = Fcs, Description = Dcs) b = super.fitZig.kv(physeq = RMF,factor = "Fermented",outDir = outDir,FileName =c("1_5FC_0.2_Fermented_Unfermented"), heatmap.descriptor=c("tax_annot"), main=c("Fermented vs. Unfermented cow's milk"), subt=c("subt = FDR < 0.05,|coeff| >= 1.5, >60%+ in either group"), ordered=TRUE, p=0.05, FC = 1.5, perc=0.6, extra.cols = c("Description"),colours = Cs,cexRow = 0.8, pdf.h = 11, pdf.w = 10) #//////////////////////// if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("decontam") library(DECIPHER) library(phangorn) seqs <- getSequences(t(ASVs)) seqs names(seqs) <- seqs # This propagates to the tip labels of the tree alignment <- AlignSeqs(DNAStringSet(seqs), anchor=NA) writeXStringSet(alignment, "aligned_seqs.fasta") # TODO: optimize this, or maybe split into a second step? phang.align <- phyDat(as(alignment, "matrix"), type="DNA") #Compute pairwise distances based on DNA seqs dm <- dist.ml(phang.align) #Construct neighbour-joining tree based on pairwise distances treeNJ <- NJ(dm) # Note, tip order != sequence order #Compute likelihood of tree given sequence alignment and a model fit1 = pml(treeNJ, data=phang.align) out_dir = '/Users/katielennard/Documents/Katie/IDM_projects/Teaching/dada2/' write.tree(fit1$tree, file = paste0(out_dir,"/tree.newick")) ## negative edges length changed to 0! fitGTR <- update(fit1, k=4, inv=0.2) #KL: I don't know how these params were chosen (adds number of rate categories as 4 which is for A, C, T, G?) fitGTR # loglikelihood: -260144.2 # # unconstrained loglikelihood: -4681.278 # Proportion of invariant sites: 0.2 # Discrete gamma model # Number of rate categories: 4 # Shape parameter: 1 # Rate matrix: # a c g t # a 0 1 1 1 # c 1 0 1 1 # g 1 1 0 1 # t 1 1 1 0 # # Base frequencies: # 0.25 0.25 0.25 0.25 #Optimize model fit for GTR (general time reversible) model #fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE, # rearrangement = "stochastic", control = pml.control(trace = 0)) fitGTR #saveRDS(fitGTR, paste0(out_dir,"/phangorn.tree.RDS")) #write.tree(fitGTR$tree, paste0(outDir, file = "/tree.GTR.newick")) phangorn_tree <- readRDS(paste0(outDir,"/dada2_results/phangorn.tree.RDS")) str(phangorn_tree) phangorn_tree GTR_Tree <- read.tree(paste0(outDir,"/dada2_results/tree.GTR.newick")) str(GTR_Tree) class(GTR_Tree) head(GTR_Tree$tip.label) length(intersect(GTR_Tree$tip.label,seqs))#2968 length(seqs)#2968 length(GTR_Tree$tip.label)#2968 #Rename GTR_Tree tip labels with ASV.IDs for_tree.IDs <- seqs[match(GTR_Tree$tip.label,seqs)] head(for_tree.IDs) GTR_Tree$tip.label <- names(for_tree.IDs) str(phy) phy.test <- merge_phyloseq(phy,GTR_Tree) p.f <- filter_taxa(phy.test,function(x) sum(x > 100) > (0.1*length(x)) | sum(x) > 0.01*total, TRUE) ntaxa(p.f)#109 plot_tree(p.f,color = "Description",base.spacing=0.05,label.tips = taxa_names) str(phy.test) head(otu_table(phy.test)) BC.test <- phyloseq::ordinate(phy.test, "NMDS", "unifrac") plot_tree(phy.test) data(esophagus) plot_tree(esophagus) plot_tree(esophagus, color="Sample",base.spacing=0.03) #------------------------- setwd("/Users/katielennard/Downloads") Don <- load(paste0(getwd(),"/InfantDanish.RData")) str(Don) # phy.obj.Danish str(phy.obj.Danish) head(tax_table(phy.obj.Danish)) Don.tax <- data.frame(tax_table(phy.obj.Danish)) Don.merge <- tax_glom.kv(phy.obj.Danish)#402 merged taxa dups <- duplicated(tax_table(Don.merge)[,"Species"]) tax_table(Don.merge)[,"Species"][dups] # ASV39 "massiliensis" # ASV56 "massiliensis" # ASV299 "massiliensis" # ASV614 "massiliensis" # ASV677 "massiliensis" Don.tax[c("ASV39","ASV56","ASV299","ASV614","ASV677","ASV707"),] # Kingdom Phylum Class Order Family Genus Species # ASV39 Bacteria Firmicutes Clostridia Clostridiales Lachnospiraceae Blautia massiliensis # ASV56 Bacteria Actinobacteria Coriobacteriia Coriobacteriales Coriobacteriaceae Enorma massiliensis # ASV299 Bacteria Firmicutes Negativicutes Selenomonadales Acidaminococcaceae Acidaminococcus massiliensis # ASV614 Bacteria Firmicutes Negativicutes Selenomonadales Veillonellaceae Negativicoccus massiliensis # ASV677 Bacteria Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides massiliensis phylum.merge <- tax_glom(M.std,"Phylum") grep("Candidatus_Saccharibacteria",tax_table(phylum.merge)[,"Phylum"])#11 tax_table(phylum.merge)[11,] # Kingdom Phylum Class Order Family Genus Species # ASV342 "Bacteria" "Candidatus_Saccharibacteria" NA NA NA NA NA otu_table(phylum.merge)[11,] # Com-amyoli Com-maas Com-othando HomemadeAmazi Ruralcow2 Ruralcow3 Urbancow1 Urbancow2 Urbancow3 ZymoPC # ASV342 0 0 0 0 3 535 13 60 38 0 merged = filter_taxa(merged, function(x) sum(x > count) > (perc*length(x)), TRUE)#filter taxa for barplot display Fk <- function(x) { sum(x > count) >= (perc*length(x)) } Fk(otu_table(phylum.merge)[11,])