#V3 changes:This version deals specifically with the publication ready analyses where we're using ST303 vs. other (as opposed to B/A vs. D)
#V4 changes: Stefan, Chad Clinton changed their mind regarding the classification of 5 NF samples that cluster with ST303 (likely ST303) but that differ from the 
#ST303 type by one of the six markers
#Here, those 5 samples, instead of being included with the non-ST303s as before will now be included with the ST303s
#From Stefan: D01, D06, D13, D15, C11- differ at the acsA locus only
library(tidyverse)
library(ggtree)
library(ggimage)
library(GeneMates)#function heatMapPAM
library(ape)#drop.tip
library(metagenomeSeq)#fitPA
#import phylogenetic SNP tree
setwd("/Users/katielennard/Documents/Academic/IDM_projects/Nicol/Jan2019_single_isolate_Pseudomonas_project/SNPs_downstream/")
#source(paste0(getwd(),"/rscripts/plotSRST2data.R")) #from https://github.com/katholt/srst2#plotting-output-in-r

tree <- read.tree(paste0(getwd(),"/Ps_aerug_sample_and_PAO1_ref_genome_tree.core.tre"))
str(tree)
#Remove reference genome from tree as well as D18 (QC):
tree$tip.label
tree <- drop.tip(tree,"GCA_000006765_1_ASM676v1_genomic")
tree <- drop.tip(tree,"D18_S18_L001")
grep("D18",tree$tip.label)#gone
#clean sample labels
tree$tip.label <- sapply(strsplit(tree$tip.label, "_"), `[`, 1)
tree$tip.label

#import VF and AMR tables (80% gene length coverage, 90% identity)
amr <- read.delim(paste0(getwd(),"/Ps_AMR_cov_80_ID_90_report__compiledResults.txt"), stringsAsFactors = F, row.names = 1) #MLST and AMR genes
head(amr)
dim(amr)#96 13
identical(rownames(amr),tree$tip.label)#FALSE
#Remove D18
which(rownames(amr)=="D18")#90
amr <- amr[-90,]
amr <- amr[tree$tip.label,]
head(amr)
#Import VFs with >= 80% coverage of length of gene and at least 90% identity
mlst.vf <- read.delim(paste0(getwd(),"/Ps_VF_cov_80_ID_90_report__compiledResults.txt"), stringsAsFactors = F,row.names = 1) #MLST and VF genes
dim(mlst.vf)#96 416
#------------------------
#There are so many VFs (> 400) that we need to identify those present in all samples and filter these out to get the heatmap less cluttered
VFs_count <- apply(mlst.vf,2,function(x) length(which(x=="-")))
col_nums <- which(VFs_count==0)
toFilter <-  !colnames(mlst.vf) %in% names(col_nums)
filteredVFs <- mlst.vf[,toFilter]
dim(filteredVFs)#151
#------------------------
#Lets create a combined matrix with both VFs and AMRs
# identical(rownames(amr),rownames(filteredVFs))#TRUE
# mlst.vf.amr <- cbind(amr,filteredVFs)
# head(mlst.vf.amr)
# #manual spot checks:
# mlst.vf.amr["B23",1:10]
# # AacAad_AGly AadB_AGly  Aph3.IIb_AGly Arr_Rif   CatB7_Phe DfrB_Tmt OXA.1_Bla   OXA.50_Bla OXA.5_Bla StrA_AGly
# # B23           -         - Aph3-IIb_1503*       - CatB7_1178*        -         - OXA-50_1499*         -         -
# #Correct, checked against individual B23 file
# mlst.vf.amr["B23",20:30]
# # PA3144                    PA3157                  PA4298                   PA4541 PLES_19091 PLES_19101
# # B23 PA3144_CVF520_VFG014114? PA3157_CVF520_VFG014101*? PA4298_AI098_VFG042745? PA4541_TX184_VFG043810*?          -          -
# #   PLES_19111 PLES_19131 PLES_19141 PLES_19151 PLES_19161
# # B23          -          -          -          -          -
# #yes
# mlst.vf.amr["A10",1:10]
# # AacAad_AGly AadB_AGly  Aph3.IIb_AGly Arr_Rif   CatB7_Phe DfrB_Tmt OXA.1_Bla   OXA.50_Bla OXA.5_Bla StrA_AGly
# # A10           -         - Aph3-IIb_1503*       - CatB7_1178*        -         - OXA-50_1499*         -         -
# #yes
# mlst.vf.amr["A10",20:30]
# # PA3144 PA3157                 PA4298                  PA4541 PLES_19091 PLES_19101 PLES_19111 PLES_19131
# # A10 PA3144_CVF520_VFG014114*?      - PA4298_AI098_VFG042745 PA4541_TX184_VFG043810*          -          -          -          -
# #   PLES_19141 PLES_19151 PLES_19161
# # A10          -          -          -
# #yes

#-------------------------
VF.clean <- apply(filteredVFs, 1, function(x) sub("\\*","",x))
VF.clean <- apply(VF.clean, 1, function(x) sub("\\?","",x))
VF.clean <- apply(VF.clean, 1, function(x) sub("-","0",x))
VF.clean <- ifelse(VF.clean=="0","absent","present")
VF.clean <- t(VF.clean)
VF.clean <- data.frame(VF.clean)
str(VF.clean)
head(VF.clean)
dim(VF.clean)
VF.clean[10,1:20]
filteredVFs[10,1:20]

# metadata ----------------------------------------------------------------
#import metadata
meta <- read.csv2(paste0(getwd(),"/Metadata-KLeditSTs-v2StefanEdits.txt"), sep = "\t")
head(meta)
#match metadata sample IDs to data sample IDs
colnames(meta)

rownames(meta) <- meta$ID
#clean
rownames(meta) <- sapply(strsplit(rownames(meta), "_"), `[`, 1)
dim(meta)#97 12
setdiff(tree$tip.label,rownames(meta))

setdiff(rownames(meta),tree$tip.label)
# [1] "D18" "GCA"
length(intersect(rownames(meta),rownames(VF.clean)))#96
length(intersect(rownames(meta),tree$tip.label))#95

identical(rownames(meta),rownames(VF.clean))#FALSE
identical(tree$tip.label,rownames(meta))#FALSE
meta <- meta[tree$tip.label,]#order to match tree
rownames(meta)
VF.clean <- VF.clean[tree$tip.label,]#order to match tree
#Change IDs to PS isolate numbers as requested:
#30/8/2019 update: Stefan sent codes to replace current sample codes with isolate IDs
strain_IDs <- read.table(paste0(getwd(),"/Stefan_new_sample_codes_forR.txt"),stringsAsFactors = F, row.names = 1, header = TRUE)
head(strain_IDs)
#append 'PS' to strain IDs as requested
strain_IDs$Number <- paste0("PS",strain_IDs$Number)
length(intersect(rownames(strain_IDs),rownames(VF.clean)))#95
identical(rownames(meta),rownames(strain_IDs))#FASLE
length(intersect(rownames(meta),rownames(strain_IDs)))#95
strain_IDs <- strain_IDs[rownames(meta),]
names(strain_IDs) <- rownames(meta)
head(strain_IDs)
#Change meta IDs to PS numbers
rownames(meta) <- strain_IDs
head(meta)
#Change tree tip labels to PS numbers
tree$tip.label <- strain_IDs
#Change VF.clean rownames to PS numbers
rownames(VF.clean) <- strain_IDs
#Change AMR rownames to PS numbers
rownames(amr) <- strain_IDs


head(VF.clean)
meta <- cbind(rownames(meta),meta)#First column has to be sample IDs to join to ggtree for gheatmap
meta$Before.During.After <- as.character(unlist(meta$Before.During.After))#Get rid of PA01 level, no longer prsent
meta$Before.During.After <- as.factor(meta$Before.During.After)

#13/01/2019: Add-on request from Clinton: Instead of doing during vs. before/after do outbreakstrain ST303 vs. all other strains
#4/2/2020: after meeting Clinton/Stefan/Chad requested that the samples labeled as ST303? or ST303* are also classified as ST303 in the analyses downstream
head(meta)
meta$Strain
meta$ST303 <- as.character(unlist(meta$Strain))
ST303_grep <- grep("303",meta$ST303)
meta$ST303[ST303_grep]
# [1] "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303" 
# [20] "303"  "303*" "303"  "303"  "303"  "303"  "303"  "303"  "303?" "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303"  "303" 
# [39] "303?" "303"

#There are 3 samples that are still labelled as 'uncertain' ST303 because one marker remained ambiguous after manual curation. 
#They are more than likely ST303 though so we will include them as such in the analyses and note this in the figure legend
meta$ST303[ST303_grep] <- "ST303"
meta$ST303 <- ifelse(meta$ST303 == "ST303", "ST303", "other") #NB here 'NF' or not found are included in other
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#Here we change for the 5 samples D01, D06, D13, D15, C11 (need corresponding PS numbers here)
NF_5 <- strain_IDs[c("D01","D06","D13","D15","C11")]
# D01    D06    D13    D15    C11 
# "PS83" "PS87" "PS20" "PS43" "PS61" 
meta[c(NF_5),] 
# rownames(meta) Selected index B.D.A Before.During.After Community..Nosocomial..Enviromental           ID Mont.Year Outbreak.month.year.category Sample.Type Strain   ST
# PS83           PS83       NA    43     D              During                          Nosocomial  D01_S1_L001   12/2016                                    Blood     NF  NF*
#   PS87           PS87       NA    40     D              During                          Nosocomial  D06_S6_L001   05/2017                                    Blood     NF NF*?
# PS20           PS20       NA    39     D              During                           Community D13_S13_L001   01/2017                                    Blood     NF NF*?
# PS43           PS43       NA    37     D              During                           Community D15_S15_L001   06/2017                                    Blood     NF NF*?
# PS61           PS61       NA     7     D              During                           Community C11_S11_L001   03/2017                Feb17 - Mar17       Urine     NF NF*?
# Wild.type..Multi.drug.resistant ST303
# PS83                       Wild-type other
# PS87                       Wild-type other
# PS20                       Wild-type other
# PS43                       Wild-type other
# PS61                       Wild-type other
#----
meta[c(NF_5),"ST303"] <- "ST303"
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
meta$ST303 <- as.factor(meta$ST303)
levels(meta$ST303)
#[1] "other" "ST303"
#Let's flip the levels so that the analyses represent ST303 vs. other (as opposed to other vs. ST303)
meta$ST303 <- relevel(meta$ST303,"ST303")
table(meta$ST303)
# ST303 other 
# 45    50 
#-------------------------
#HEATMAP + GGTREE

length(intersect(tree$tip.label,rownames(VF.clean)))#95 (D18 removed)

#
#Classic rectangular layout, lets cluster VFs to order heatmap columns
#Distance matrix (need numeric matrix as input)
VF.mat <- ifelse(na.omit(VF.clean)=="present",1,0)
VF.dist <- dist(VF.mat)
VF.cluster <- hclust(VF.dist)
VF.cluster$order
VF.ordered <- VF.mat[VF.cluster$order,]
heatmap(VF.ordered)
VF.discrete <- ifelse(VF.ordered==1,"present","absent")
VF.discrete <- data.frame(VF.discrete)
VF.discrete <- VF.discrete[rownames(VF.clean),] #match sample order to tree

# heatMapPAM function -----------------------------------------------------
#the heatMapPAM function was taken from the R package GeneMates, which I couldn't install here
heatMapPAM <- function(p, data, col_colours = "black", null_colour = "grey90",
                       border_colour = "white",
                       cluster_cols = FALSE, cluster_method = "single",
                       cluster_distance = "binary", rev_cols = FALSE,
                       colnames = TRUE, colnames_position = "bottom", colnames_angle = 0,
                       colnames_level = NULL, set_label_colours = FALSE,
                       colnames_offset_x = 0, colnames_offset_y = 0,
                       font.size = 4, hjust = 0.5, offset = 0, width = 1,
                       show_legend = FALSE) {
  # The first two packages are dependencies of the package ggtree.
  require(ggplot2)
  require(tidyr)  # for the function gather
  require(tibble)
  require(magrittr)  # for operators "%<>%" and "%>%"(github.com/GuangchuangYu/ggtree/blob/master/R/operator.R)
  require(ggtree)

  colnames_position %<>% match.arg(c("bottom", "top"))
  variable <- value <- lab <- y <- NULL
  
  ## convert relative width of the PAM to width of each cell
  width <- width * (p$data$x %>% range(na.rm = TRUE) %>% diff) / ncol(data)
  isTip <- x <- y <- variable <- value <- from <- to <- NULL
  
  df <- p$data
  df <- df[df$isTip, ]
  start <- max(df$x, na.rm = TRUE) + offset
  
  # Column-wise clustering
  if (cluster_cols && ncol(data) > 2) {
    hc <- hclust(d = dist(t(data), method = cluster_distance),
                 method = cluster_method)
    data <- data[, hc$order]  # reorder columns
    clustered <- TRUE
  } else {
    clustered <- FALSE
  }
  
  # Inverse columns for improving visualisation
  if (rev_cols) {
    data <- data[, ncol(data) : 1]
  }
  
  # weight cells before converting the PAM (dd) into a data frame
  n_colours <- length(col_colours)
  is_multi_colour <- n_colours > 1 && !is.null(names(col_colours))
  if (is_multi_colour) {
    colours_uniq <- sort(unique(as.character(col_colours)), decreasing = FALSE)  # colours for positive values in PAM
    colour_codes <- 1 : length(colours_uniq)  # codes for colours
    names(colour_codes) <- colours_uniq  # e.g., c("red" = 1, "blue" = 2, ...)
    column_names <- colnames(data)  # The matrix product loses column names, that is, the allele names.
    if (clustered) {
      col_colours <- col_colours[column_names]
    } else if (n_colours > length(column_names))  {
      col_colours <- col_colours[column_names]
    }
    data <- data %*% diag(as.integer(colour_codes[as.character(col_colours)]))  # convert colour characters into integer codes
    colnames(data) <- column_names
    names(colours_uniq) <- as.character(colour_codes)
    colours_uniq <- append(c("0" = null_colour), colours_uniq)
    
    # Convert values in the matrix into factors so as to map colours to the levels
    dd <- as.data.frame(data)
    for (i in names(dd)) {
      dd[, i] <- factor(dd[, i], levels = c(0, as.integer(colour_codes)),
                        labels = c("0", as.character(colour_codes)))
    }
  } else {
    dd <- as.data.frame(data)
  }
  
  i <- order(df$y)
  
  ## handle collapsed tree
  ## https://github.com/GuangchuangYu/ggtree/issues/137
  i <- i[!is.na(df$y[i])]
  
  lab <- df$label[i]
  ## https://github.com/GuangchuangYu/ggtree/issues/182
  dd <- dd[match(lab, rownames(dd)), , drop = FALSE]
  dd$y <- sort(df$y)
  dd$lab <- lab
  
  # tibble::set_tidy_names(dd) solves the problem of the error: "Can't bind data because some arguments have the same name"
  # see https://github.com/tidyverse/tidyr/issues/472
  dd <- gather(data = tibble::set_tidy_names(dd), key = variable, value = value, -c(lab, y))
  
  i <- which(dd$value == "")
  if (length(i) > 0) {
    dd$value[i] <- NA
  }
  if (is.null(colnames_level)) {
    dd$variable <- factor(dd$variable, levels = colnames(data))
  } else {
    dd$variable <- factor(dd$variable, levels = colnames_level)
  }
  V2 <- start + as.numeric(dd$variable) * width
  
  # Create a data frame for label attributes
  mapping <- data.frame(from = as.character(dd$variable), to = V2, stringsAsFactors = FALSE)  # from: label texts
  mapping <- unique(mapping)
  paint_labels <- set_label_colours && is_multi_colour
  if (paint_labels) {
    mapping$col <- as.character(col_colours[mapping$from])  # variable label colour
    mapping$col <- factor(mapping$col, levels = as.character(colours_uniq),
                          labels = names(colours_uniq))
  }
  mapping$from = as.factor(mapping$from)  # change back to factors
  
  # Create the coloured tile matrix
  dd$x <- V2
  dd$width <- width
  dd[[".panel"]] <- factor("Tree")
  if (is.null(border_colour)) {
    p2 <- p + geom_tile(data = dd, aes(x, y, fill = value), width = width,
                        inherit.aes = FALSE)
  } else {
    p2 <- p + geom_tile(data = dd, aes(x, y, fill = value), width = width,
                        colour = border_colour, inherit.aes = FALSE)
  }
  
  # Print column names
  if (colnames) {
    if (colnames_position == "bottom") {
      y <- 0
    } else {
      y <- max(p$data$y) + 1
    }
    mapping$y <- y
    mapping[[".panel"]] <- factor("Tree")
    if (paint_labels) {
      p2 <- p2 + geom_text(data = mapping,
                           aes(x = to, y = y, label = from, colour = col),
                           size = font.size, inherit.aes = FALSE,
                           angle = colnames_angle, nudge_x = colnames_offset_x,
                           nudge_y = colnames_offset_y, hjust = hjust)
    } else {  # Labels are printed in black.
      p2 <- p2 + geom_text(data = mapping,
                           aes(x = to, y = y, label = from),
                           size = font.size, inherit.aes = FALSE,
                           angle = colnames_angle, nudge_x = colnames_offset_x,
                           nudge_y = colnames_offset_y, hjust = hjust)
    }
  }
  
  # Scale colours for both tiles and column labels
  if (is_multi_colour) {
    # This command does not scale label colours when paint_labels = FALSE.
    # Class: title of the legend
    # scale_fill_manual: for tile colours; scale_colour_manual: for label colours
    p2 <- p2 + scale_fill_manual(name = "Class",
                                 breaks = c("0", as.character(colour_codes)),
                                 values = colours_uniq, na.value = NA) +
      scale_colour_manual(name = "Class",
                          breaks = c("0", as.character(colour_codes)),
                          values = colours_uniq, na.value = NA)
  } else {
    p2 <- p2 + scale_fill_gradient(low = null_colour, high = col_colours,
                                   na.value = NA)
  }
  
  # Print legend
  if (show_legend) {
    p2 <- p2 + theme(legend.position = "right", legend.title = element_blank())
  }
  
  attr(p2, "mapping") <- mapping
  
  return(list(p = p2, mapping = mapping, data = dd))
}
  
mat.VF <- as.matrix(VF.ordered)


# Apply heatMapPAM function -----------------------------------------------

#------------------------------
#Let's identify VFs differentially present/absent between B&A vs. During
head(VF.ordered)
meta.ordered <- meta[rownames(VF.ordered),]
#10/2/2020
#ST303 vs. other comparison + compare to full genome assembly results
#
res = fitPA(t(VF.ordered),meta.ordered$ST303)
head(res)
res.sig = res[res$adjPvalues <= 0.05,]
dim(res.sig)#51
#----------------------------------
#Not coded here but I tested the setdiff between using during vs. before/after as opposed to ST303 vs other:
# > setdiff(rownames(BA_D_res.sig),rownames(res.sig))
# [1] "fimT"  "pilC"  "pilV"  "pilY2"
# > setdiff(rownames(res.sig),rownames(BA_D_res.sig))
# [1] "PA2372" "fpvA"   "vgrG3"  "xcpQ"
#--------------------------------
#Instead of sample names for the ggtree, lets have ST types (coloring by B/D/A)

levels(meta$Before.During.After)
newplot3 <- ggtree(tree) %<+% meta + geom_tiplab(aes(color=Before.During.After) ,align=T,linetype=1, linesize=.02,
                                                 size=2.5)  +
  scale_color_manual(values = c("dodgerblue3","seagreen3","red3"),labels = c("After","Before","During")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "right")

#Heatmap of significant VFs
VF.sig <- VF.ordered[,rownames(res.sig)]
dim(VF.sig)#95 51
# pdf(paste0(getwd(),"/srst2_ST303_heatMapPAM_VFs_rectangle_hclust_fisher_significant.pdf"), 10,10)
# 
# heatMapPAM(newplot3, VF.sig, col_colours = "black", null_colour = "white",
#            border_colour = NA,
#            cluster_cols = TRUE, cluster_method = "single",
#            cluster_distance = "binary", rev_cols = FALSE,
#            colnames = TRUE, colnames_position = "bottom", colnames_angle = 90,
#            colnames_level = NULL, set_label_colours = FALSE,
#            colnames_offset_x = 0, colnames_offset_y = 0,
#            font.size = 3, hjust = 0.5, offset = 0.05, width = 2,
#            show_legend = TRUE)
# dev.off()
#-----------
#Add AMR genes to significant VFs
rownames(amr)
rownames(VF.sig)
amr <- amr[rownames(VF.sig),]
amr.use <- amr
amr.use <- apply(amr.use, 2, function(x) sub("\\*","",x))
amr.use <- apply(amr.use, 2, function(x) sub("\\?","",x))
amr.use <- apply(amr.use, 2, function(x) sub("-","0",x))
amr.use <- ifelse(amr.use=="0",0,1)
amr.use[1:2,1:10]
#Cluster AMRs by sample
amr_hc <- hclust(d = dist(t(amr.use), method = "binary"),
             method = "single")
amr_clustered<- amr.use[,amr_hc$order]

identical(rownames(amr_clustered),rownames(VF.sig))#TRUE
#vf.amr <- cbind(amr_clustered,VF.sig)
#The problem is we only want to cluster the VFs not the AMRs, we want the AMRs static on the left of the heatmap and then clustering of VFs
#From the heatMapPAM function the code that clusters is:
hc <- hclust(d = dist(t(VF.sig), method = "binary"),
             method = "single")
data <- VF.sig[,hc$order]  # reorder columns
head(data)
colnames(VF.sig)
colnames(data)
#
vf.amr <- cbind(data,amr_clustered)
colnames(vf.amr)
pdf(paste0(getwd(),"/ST303heatMapPAM_VFs_rectangle_hclust_fisher_significant_plusAMR_TEST.pdf"), 10,10)
GeneMates::heatMapPAM(newplot3, vf.amr, col_colours = "black", null_colour = "white",
           presence_label = "Present", absence_label = "Absent",
           border_colour = NA,
           cluster_cols = FALSE, cluster_method = "single",
           cluster_distance = "binary", rev_cols = FALSE,
           colnames = TRUE, colnames_position = "bottom", colnames_angle = 90,
           colnames_level = NULL, set_label_colours = FALSE,
           colnames_offset_x = 0, colnames_offset_y = -8,
           font.size = 2.5, hjust = 0, offset = 0.07, width = 2.5,
           show_legend = TRUE)
dev.off()
#


newplot4 <- ggtree(tree) %<+% meta + geom_tiplab(aes(label = Strain) ,offset = 0.07,align=T,linetype=0,
                                                 size=2.5)  +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "right")

#Just to get a column with strain types for use in pptx editing after:
pdf(paste0(getwd(),"/ST303heatMapPAM_VFs_strains_column_v2.pdf"), 10,10)
newplot4
dev.off()

#////////////////////////

she <- c(143,145,159,176,178)
lennard <- c(198,179,174,183,177)
stage <- c(1,2,3,4,5)
toplot <- data.frame(stage = stage,NP = she,lennard = lennard)

ggplot(toplot, aes(x = stage))+
  geom_point(aes(y = NP, color = "she"), size = 2, alpha = 0.5)+
  geom_point(aes(y = lennard, color = "Lennard"), size = 2, alpha = 0.5)+
  geom_smooth(aes(y = sheppard), method = lm)+
  geom_smooth(aes(y = lennard), method = lm)


library(plotly)
x <- 1:10
y <- jitter(x^2)

DF <- data.frame(x, y)
DF