#V2-specific: Whereas V1 was exploratory in nature regarding dada2 trimming and taxonomic database, the V2 analysis uses the selected criteria (trimming default just primers and with refseq_rds tax database)
library(dada2)

source("/Users/katielennard/Documents/workspace/R_projects/R_projects/Microbiome_project/microbiome_custom_functions.R")
#source("/Users/katielennard/Documents/Katie/IDM_projects/Levin/microbiome_custom_functions.R")

#setwd("/Users/katielennard/Documents/Katie/IDM_projects/Levin")
setwd("/Users/katielennard/Documents/Academic/IDM_projects/Levin/")
inDir <- getwd()
outDir <- getwd()
#Read in metadata
meta <- read.csv2(paste0(inDir,"/dada2_results/metadata_forR.txt"), sep = "\t", row.names=1)
meta
#31/10/2019
#Refine labels for publication based on info sent by Pieter
# “Urban” should change to: URBAN fresh cow’s milk
# “Rural” should change to: RURAL fresh cow’s milk
# “Com_othando” should change to: Commercially fermented milk #1
# “Com_maass” should change to: Commercially fermented milk #2
# “Com_amyoli” should change to: Commercially fermented milk #3
# “Amazi” should change to: Home fermented milk (amasi)
meta
# Description
# ZymoPC        positive_control
# Urbancow3                urban
# Urbancow2                urban
# Urbancow1                urban
# Ruralcow2                rural
# Ruralcow3                rural
# HomemadeAmazi            amazi
# Com-othando        com_othando
# Com-amyoli          com_amyoli
# Com-maas             com_maass
meta$Description <- as.character(unlist(meta$Description))
meta[2,] <- "Urban fresh cow's milk"
meta[3,] <- "Urban fresh cow's milk"
meta[4,] <- "Urban fresh cow's milk"
meta[5,] <- "Rural fresh cow's milk"
meta[6,] <- "Rural fresh cow's milk"
meta[7,] <- "Home fermented milk (amasi)"
meta[8,] <- "Commercially fermented milk"
meta[9,] <- "Commercially fermented milk"
meta[10,] <- "Commercially fermented milk"
meta$Description <- as.factor(meta$Description)
#Add column that can be used for publication-ready sample names:
meta$sample_names <- c("Positive control","Urban fresh cow's milk 1", "Urban fresh cow's milk 2", "Urban fresh cow's milk 3",
                       "Rural fresh cow's milk 1","Rural fresh cow's milk 2","Home fermented milk (amasi)","Commercially fermented milk 1",
                       "Commercially fermented milk 2", "Commercially fermented milk 3")
meta
#Read in OTU table
ASVs <- readRDS(paste0(inDir,"/dada2_results/seqtab_final_refseq_rdp.RDS"))#Default options with primer trimming & new refseq_rdp DB
dim(ASVs)#10 2968?
#flip table so that taxa are rows
ASVs <- t(ASVs)
#Read in taxonomy table
taxa <- readRDS(paste0(inDir,"/dada2_results/tax_final_refseq_rdp.RDS"))

seqs <- rownames(ASVs)
ASV.IDs <- paste0("ASV",c(1:length(seqs)))
#Named vector:
names(seqs) <- ASV.IDs
head(seqs)
seq_lens <- nchar(seqs)
seq_lens
plot(density(seq_lens))
phy <- phyloseq(otu_table(ASVs,taxa_are_rows=TRUE),tax_table(taxa))
identical(taxa_names(phy),rownames(ASVs))#TRUE
taxa_names(phy) <- names(seqs)
str(phy)
#tax_df <- data.frame(tax_table(phy))
#tax_df$seq_len <- seq_lens
#head(tax_df)
#length(which(is.na(tax_df[,"Species"]))) #1462/2968 ~ 50% species-level assignments!! compared to only ~ 8% with GreenGenes........
#----
a = which(tax_table(phy)[,"Kingdom"]!="Bacteria")
b = which(tax_table(phy)[,"Family"]=="mitochondria")
c = which(tax_table(phy)[,"Class"]=="Chloroplast")
d = c(a,b,c)
d
e = taxa_names(phy)[-d]

#rename samples in phy to match metadata
sample_names(phy)
# [1] "Com-amyoli_S9_L001"    "Com-maas_S10_L001"     "Com-othando_S8_L001"   "HomemadeAmazi_S7_L001" "Ruralcow2_S5_L001"     "Ruralcow3_S6_L001"     "Urbancow1_S4_L001"    
# [8] "Urbancow2_S3_L001"     "Urbancow3_S2_L001"     "ZymoPC_S1_L001" 
sample_names(phy) <- sapply(strsplit(sample_names(phy), "_"), `[`, 1)
intersect(sample_names(phy),rownames(meta))

#Now add metadata
sample_data(phy) <- meta

#Publication ready sample names
sample_names(phy) <- unlist(sample_data(phy)[,"sample_names"])

#Mild abundance filter
M.new = prune_taxa(e,phy)
ntaxa(M.new)#2916 | 2855 | 2952 (refseq_rdp)
#standardize to total read counts

total = median(sample_sums(M.new))
length(which(sample_sums(M.new)<5000))#0
standf = function(x, t=total) round(t * (x / sum(x)))
M.std = transform_sample_counts(M.new, standf)
M.std.f <- filter_taxa(M.std,function(x) sum(x > 10) > (0.1*length(x)) | sum(x) > 0.001*total, TRUE)
ntaxa(M.std.f)#599 (refseq_rdp)
#tax_df <- data.frame(tax_table(M.std.f))
#species <- tax_df[tax_df$Species != "" & !is.na(tax_df$Species), ]
#dim(species)#367
#species[,3:7]
#alpha diversity:
p <- plot_richness(M.std.f,x = "Description",measures=c("Shannon"), 
                   title = paste0("Standardized to total reads, N=",nsamples(M.std)))+theme(axis.text=element_text(size=16, face="bold"),
                                                                                            axis.title=element_text(size=16,face="bold"))+geom_point(size=5)
p

p = p+theme_bw()+theme(axis.text=element_text(size=14, face="bold"),
                         axis.title=element_text(size=14,face="bold"))+theme(
                           panel.grid.major = element_blank(),
                           panel.grid.minor = element_blank())+theme(axis.text.x = element_text(angle = 90))

pdf(paste0(outDir,"/Shannon_alpha_diversity_by_sample_refseq_rdp_v2.pdf"))
p
dev.off()
#SIMPSON:
p <- plot_richness(M.std.f,x = "Description",measures=c("Simpson"), 
                   title = paste0("Standardized to total reads, N=",nsamples(M.std)))+theme(axis.text=element_text(size=16, face="bold"),
                                                                                            axis.title=element_text(size=16,face="bold"))+geom_point(size=5)
p = p+theme_bw()+theme(axis.text=element_text(size=14, face="bold"),
                       axis.title=element_text(size=14,face="bold"))+theme(
                         panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank())+theme(axis.text.x = element_text(angle = 90))
p

pdf(paste0(outDir,"/Simpson_alpha_diversity_by_sample_refseq_rdp_v2.pdf"))
p
dev.off()

#merge taxa at lowest available taxonomic level
length(which(colSums(otu_table(M.std.f))==0))
M.merged <- tax_glom.kv(M.std.f)

#heatmap
filename <- c("heatmap_merged_taxa_refseq_rdp")
main <- paste("Merged taxa, Bray-Curtis distance")
f = paste0(outDir,"/dada2_results/",filename,".pdf")

# Create distance matrix and calculate tree:
set.seed(2)
diss <- phyloseq::distance(M.merged,method = "bray", type = "samples")
clust.res<-hclust(diss)
sample.order = clust.res$order
#Colour specifications
Fcs = c("yes" = "darkgoldenrod3","no" = "mediumseagreen")
Dcs = c("Commercially fermented milk" = "#CC79A7","Home fermented milk (amasi)" = "#56B4E9", 
        "Rural fresh cow's milk" = "#F0E442", "Urban fresh cow's milk" = "darkolivegreen4", "positive_control" = "white")
Cs = list(Fermented = Fcs, Description = Dcs)
# Heatmap is output to file (the heatmap.k function can be found in the 'microbiome_custom_functions.R' script)
hm = heatmap.k(physeq= M.merged, annot.cols = c(1), main = main,filename = f,Colv = sample.order,labrow = TRUE, cexCol=0.5, cexRow=0.1, pdf.h = 9, pdf.w = 7,colours = Cs)  
#
merged.genus <- tax_glom(M.std.f, "Genus")
ntaxa(merged.genus)#196 (refseq_rdp)
merged.family <- tax_glom(M.std.f, "Family")
ntaxa(merged.family)#108 (refseq_rdp)
otu_table(merged.family)
#
#heatmap
filename <- c("heatmap_family_taxa_refseq_rdp")
main <- paste("Family-level taxa, Bray-Curtis distance")
f = paste0(outDir,"/dada2_results/",filename,".pdf")

# Create distance matrix and calculate tree:
set.seed(2)
diss <- phyloseq::distance(merged.family,method = "bray", type = "samples")
clust.res<-hclust(diss)
sample.order = clust.res$order
# Heatmap is output to file (the heatmap.k function can be found in the 'microbiome_custom_functions.R' script)
hm = heatmap.k(physeq= merged.family, merged = "Family",annot.cols = c(1), main = main,filename = f,
               colours = Cs, Colv = sample.order,labrow = TRUE, cexCol=0.7, cexRow=0.5, pdf.h = 9, pdf.w = 7)  
#

#NMDS:
#For some reason this fails if we have only one column in the sample data so lets add a dummy (think it has to do with psmelt)
sample_data(M.std.f)[,2] <- rep("dummy",10)
set.seed(2)
color = "Description"
#BC <- ordinate(M.std.f, "NMDS", "bray", k=1, trymax=100) # stress (nearly) zero, may have insufficient data if using k=2/3
# MDS/PCoA instead
BC <- ordinate(M.std.f, "MDS", "bray", k=2, trymax=100) 

shape = c("Description")
#title=c("NMDS of 16S microbiome,Bray-Curtis distance,k=1")
title=c("PCoA of 16S microbiome,Bray-Curtis distance")

MDS = plot_ordination(M.std.f, BC, shape = shape, 
                      title = title)
#MDS$layers <- MDS$layers[-1]
MDS
MDS.1  = MDS +theme(axis.text=element_text(size=16, face="bold"),
                    axis.title=element_text(size=18,face="bold"), legend.title=element_text(size=14))+
  theme_bw()+labs(shape=shape)+geom_point(size=5, shape=21)

MDS.1

#pdf(paste0(outDir,"/NMDS_default_dada2_Bray_Curtis.pdf"),8,5)
pdf(paste0(outDir,"/dada2_results/PCoA_dada2_Bray_Curtis_refseq_rds.pdf"),8,5)
MDS.1
dev.off()
#+++++++++++++++++++++++++++++++
#merged = filter_taxa(merged.genus, function(x) sum(x > count) > (perc*length(x)), TRUE)#filter taxa for barplot display
#ntaxa(merged)
level = "Genus"
count = 1500
perc = 0.1
#---Repeat at phylum level
level = "Phylum"
count = 500
perc = 0.1
# Barplot will be written to file (the bar.plots function can be found in the 'microbiome_custom_functions.R' script)
#Change sample names to publication ready:
sample_names(M.std) <- unlist(sample_data(M.std)[,"sample_names"])
barplot = bar.plots(physeq = M.std,cat = "Description",level = level, merge.samples = FALSE,
                    count = count, perc = perc, outDir=outDir, 
                    filen = 'Barplots_per_sample_TEST', pdf.h = 7, pdf.w = 8)
print(barplot)


#Investigate suspect Staph in Urbancow3 - is it the same as ZymoPC staph??
Staph <- tax_table(M.merged)[grep("Staphylococcus",tax_table(M.merged)[,"Genus"]),]
dim(Staph)
Staph
# Kingdom    Phylum       Class     Order        Family              Genus            Species                                 
# ASV8   "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" "Staphylococcus_aureus(D83355)"         
# ASV16  "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" "Staphylococcus_epidermidis(D83363)"    
# ASV68  "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" "Staphylococcus_chromogenes(D83360)"    
# ASV256 "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" "Staphylococcus_saprophyticus(AP008934)"
# ASV21  "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Staphylococcaceae" "Staphylococcus" NA 

otu_table(M.merged)[rownames(Staph),"Urbancow3"]
# Urbancow3
# ASV8          71
# ASV16       5766
# ASV68         95
# ASV256       134
# ASV21       5120

#So looks like a different Staph, probably from sample collector?
#+++++++++++++++++++++++++++++++

#15/10/2019 Perform differential abundance testing lumping all fermented (amaas) vs. all unfermented samples together (milks)
  
#Subset of samples for testing: all except ZymoPC control
df.phy <- prune_samples(sample_names(M.new)[1:9],M.new)
#Create two-class comparison of interest
sample_data(df.phy)$Fermented <- as.factor(c(rep("yes",4),rep("no",5)))

#Merge at lowest available taxonomic annotation
raw.merged <- tax_glom.kv(df.phy)#1159
merged.total <-  median(sample_sums(raw.merged))#50272
RMF <- filter_taxa(raw.merged,function(x) sum(x > 10) > (1/9*length(x)) | sum(x) > 0.001*merged.total, TRUE)
ntaxa(RMF)#437
#colours, redefine without positive control
Fcs = c("yes" = "darkgoldenrod3","no" = "mediumseagreen")
Dcs = c("Commercially fermented milk" = "#CC79A7","Home fermented milk (amasi)" = "#56B4E9", 
        "Rural fresh cow's milk" = "#F0E442", "Urban fresh cow's milk" = "darkolivegreen4")
Cs = list(Fermented = Fcs, Description = Dcs)
b = super.fitZig.kv(physeq = RMF,factor = "Fermented",outDir = outDir,FileName =c("1_5FC_0.2_Fermented_Unfermented"),
                    heatmap.descriptor=c("tax_annot"), main=c("Fermented vs. Unfermented cow's milk"), subt=c("subt = FDR < 0.05,|coeff| >= 1.5, >60%+ in either group"), 
                    ordered=TRUE, p=0.05, FC = 1.5, perc=0.6, extra.cols = c("Description"),colours = Cs,cexRow = 0.8, pdf.h = 11, pdf.w = 10)

#////////////////////////
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("decontam")
library(DECIPHER)
library(phangorn)
seqs <- getSequences(t(ASVs))
seqs
names(seqs) <- seqs # This propagates to the tip labels of the tree
alignment <- AlignSeqs(DNAStringSet(seqs),
                       anchor=NA)
writeXStringSet(alignment, "aligned_seqs.fasta")
# TODO: optimize this, or maybe split into a second step?
phang.align <- phyDat(as(alignment, "matrix"), type="DNA")
#Compute pairwise distances based on DNA seqs
dm <- dist.ml(phang.align)
#Construct neighbour-joining tree based on pairwise distances
treeNJ <- NJ(dm) # Note, tip order != sequence order
#Compute likelihood of tree given sequence alignment and a model
fit1 = pml(treeNJ, data=phang.align)
out_dir = '/Users/katielennard/Documents/Katie/IDM_projects/Teaching/dada2/'
write.tree(fit1$tree, file = paste0(out_dir,"/tree.newick"))
## negative edges length changed to 0!
fitGTR <- update(fit1, k=4, inv=0.2) #KL: I don't know how these params were chosen (adds number of rate categories as 4 which is for A, C, T, G?)
fitGTR
# loglikelihood: -260144.2 
# 
# unconstrained loglikelihood: -4681.278 
# Proportion of invariant sites: 0.2 
# Discrete gamma model
# Number of rate categories: 4 
# Shape parameter: 1 
# Rate matrix:
#   a c g t
# a 0 1 1 1
# c 1 0 1 1
# g 1 1 0 1
# t 1 1 1 0
# 
# Base frequencies:  
#   0.25 0.25 0.25 0.25 


#Optimize model fit for GTR (general time reversible) model
#fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE,
#                    rearrangement = "stochastic", control = pml.control(trace = 0))
fitGTR


#saveRDS(fitGTR, paste0(out_dir,"/phangorn.tree.RDS"))
#write.tree(fitGTR$tree, paste0(outDir, file = "/tree.GTR.newick"))
phangorn_tree <- readRDS(paste0(outDir,"/dada2_results/phangorn.tree.RDS"))
str(phangorn_tree)
phangorn_tree
GTR_Tree <- read.tree(paste0(outDir,"/dada2_results/tree.GTR.newick"))
str(GTR_Tree)
class(GTR_Tree)
head(GTR_Tree$tip.label)
length(intersect(GTR_Tree$tip.label,seqs))#2968
length(seqs)#2968
length(GTR_Tree$tip.label)#2968
#Rename GTR_Tree tip labels with ASV.IDs
for_tree.IDs <- seqs[match(GTR_Tree$tip.label,seqs)]
head(for_tree.IDs)
GTR_Tree$tip.label <- names(for_tree.IDs)
str(phy)
phy.test <- merge_phyloseq(phy,GTR_Tree)
p.f <- filter_taxa(phy.test,function(x) sum(x > 100) > (0.1*length(x)) | sum(x) > 0.01*total, TRUE)
ntaxa(p.f)#109
plot_tree(p.f,color = "Description",base.spacing=0.05,label.tips = taxa_names)


str(phy.test)
head(otu_table(phy.test))

BC.test <- phyloseq::ordinate(phy.test, "NMDS", "unifrac") 
plot_tree(phy.test)
data(esophagus)
plot_tree(esophagus)
plot_tree(esophagus, color="Sample",base.spacing=0.03)


#-------------------------
setwd("/Users/katielennard/Downloads")
Don <- load(paste0(getwd(),"/InfantDanish.RData"))
str(Don)
# phy.obj.Danish
str(phy.obj.Danish)
head(tax_table(phy.obj.Danish))
Don.tax <- data.frame(tax_table(phy.obj.Danish))
Don.merge <- tax_glom.kv(phy.obj.Danish)#402 merged taxa
dups <- duplicated(tax_table(Don.merge)[,"Species"])
tax_table(Don.merge)[,"Species"][dups]

# ASV39   "massiliensis"
# ASV56   "massiliensis"
# ASV299  "massiliensis"
# ASV614  "massiliensis"
# ASV677  "massiliensis"

Don.tax[c("ASV39","ASV56","ASV299","ASV614","ASV677","ASV707"),]
# Kingdom         Phylum          Class            Order             Family                   Genus      Species
# ASV39  Bacteria     Firmicutes     Clostridia    Clostridiales    Lachnospiraceae                 Blautia massiliensis
# ASV56  Bacteria Actinobacteria Coriobacteriia Coriobacteriales  Coriobacteriaceae                  Enorma massiliensis
# ASV299 Bacteria     Firmicutes  Negativicutes  Selenomonadales Acidaminococcaceae         Acidaminococcus massiliensis
# ASV614 Bacteria     Firmicutes  Negativicutes  Selenomonadales    Veillonellaceae          Negativicoccus massiliensis
# ASV677 Bacteria  Bacteroidetes    Bacteroidia    Bacteroidales     Bacteroidaceae             Bacteroides massiliensis

phylum.merge <- tax_glom(M.std,"Phylum")
grep("Candidatus_Saccharibacteria",tax_table(phylum.merge)[,"Phylum"])#11
tax_table(phylum.merge)[11,]
# Kingdom    Phylum                        Class Order Family Genus Species
# ASV342 "Bacteria" "Candidatus_Saccharibacteria" NA    NA    NA     NA    NA     
otu_table(phylum.merge)[11,]
# Com-amyoli Com-maas Com-othando HomemadeAmazi Ruralcow2 Ruralcow3 Urbancow1 Urbancow2 Urbancow3 ZymoPC
# ASV342          0        0           0             0         3       535        13        60        38      0

merged = filter_taxa(merged, function(x) sum(x > count) > (perc*length(x)), TRUE)#filter taxa for barplot display
Fk <- function(x) {
  sum(x > count) >= (perc*length(x))
}

Fk(otu_table(phylum.merge)[11,])