#V3 changes:This version deals specifically with the publication ready analyses where we're using ST303 vs. other (as opposed to B/A vs. D) #V4 changes: Stefan, Chad Clinton changed their mind regarding the classification of 5 NF samples that cluster with ST303 (likely ST303) but that differ from the #ST303 type by one of the six markers #Here, those 5 samples, instead of being included with the non-ST303s as before will now be included with the ST303s #From Stefan: D01, D06, D13, D15, C11- differ at the acsA locus only library(tidyverse) library(ggtree) library(ggimage) library(GeneMates)#function heatMapPAM library(ape)#drop.tip library(metagenomeSeq)#fitPA #import phylogenetic SNP tree setwd("/Users/katielennard/Documents/Academic/IDM_projects/Nicol/Jan2019_single_isolate_Pseudomonas_project/SNPs_downstream/") #source(paste0(getwd(),"/rscripts/plotSRST2data.R")) #from https://github.com/katholt/srst2#plotting-output-in-r tree <- read.tree(paste0(getwd(),"/Ps_aerug_sample_and_PAO1_ref_genome_tree.core.tre")) str(tree) #Remove reference genome from tree as well as D18 (QC): tree$tip.label tree <- drop.tip(tree,"GCA_000006765_1_ASM676v1_genomic") tree <- drop.tip(tree,"D18_S18_L001") grep("D18",tree$tip.label)#gone #clean sample labels tree$tip.label <- sapply(strsplit(tree$tip.label, "_"), `[`, 1) tree$tip.label #import VF and AMR tables (80% gene length coverage, 90% identity) amr <- read.delim(paste0(getwd(),"/Ps_AMR_cov_80_ID_90_report__compiledResults.txt"), stringsAsFactors = F, row.names = 1) #MLST and AMR genes head(amr) dim(amr)#96 13 identical(rownames(amr),tree$tip.label)#FALSE #Remove D18 which(rownames(amr)=="D18")#90 amr <- amr[-90,] amr <- amr[tree$tip.label,] head(amr) #Import VFs with >= 80% coverage of length of gene and at least 90% identity mlst.vf <- read.delim(paste0(getwd(),"/Ps_VF_cov_80_ID_90_report__compiledResults.txt"), stringsAsFactors = F,row.names = 1) #MLST and VF genes dim(mlst.vf)#96 416 #------------------------ #There are so many VFs (> 400) that we need to identify those present in all samples and filter these out to get the heatmap less cluttered VFs_count <- apply(mlst.vf,2,function(x) length(which(x=="-"))) col_nums <- which(VFs_count==0) toFilter <- !colnames(mlst.vf) %in% names(col_nums) filteredVFs <- mlst.vf[,toFilter] dim(filteredVFs)#151 #------------------------ #Lets create a combined matrix with both VFs and AMRs # identical(rownames(amr),rownames(filteredVFs))#TRUE # mlst.vf.amr <- cbind(amr,filteredVFs) # head(mlst.vf.amr) # #manual spot checks: # mlst.vf.amr["B23",1:10] # # AacAad_AGly AadB_AGly Aph3.IIb_AGly Arr_Rif CatB7_Phe DfrB_Tmt OXA.1_Bla OXA.50_Bla OXA.5_Bla StrA_AGly # # B23 - - Aph3-IIb_1503* - CatB7_1178* - - OXA-50_1499* - - # #Correct, checked against individual B23 file # mlst.vf.amr["B23",20:30] # # PA3144 PA3157 PA4298 PA4541 PLES_19091 PLES_19101 # # B23 PA3144_CVF520_VFG014114? PA3157_CVF520_VFG014101*? PA4298_AI098_VFG042745? PA4541_TX184_VFG043810*? - - # # PLES_19111 PLES_19131 PLES_19141 PLES_19151 PLES_19161 # # B23 - - - - - # #yes # mlst.vf.amr["A10",1:10] # # AacAad_AGly AadB_AGly Aph3.IIb_AGly Arr_Rif CatB7_Phe DfrB_Tmt OXA.1_Bla OXA.50_Bla OXA.5_Bla StrA_AGly # # A10 - - Aph3-IIb_1503* - CatB7_1178* - - OXA-50_1499* - - # #yes # mlst.vf.amr["A10",20:30] # # PA3144 PA3157 PA4298 PA4541 PLES_19091 PLES_19101 PLES_19111 PLES_19131 # # A10 PA3144_CVF520_VFG014114*? - PA4298_AI098_VFG042745 PA4541_TX184_VFG043810* - - - - # # PLES_19141 PLES_19151 PLES_19161 # # A10 - - - # #yes #------------------------- VF.clean <- apply(filteredVFs, 1, function(x) sub("\\*","",x)) VF.clean <- apply(VF.clean, 1, function(x) sub("\\?","",x)) VF.clean <- apply(VF.clean, 1, function(x) sub("-","0",x)) VF.clean <- ifelse(VF.clean=="0","absent","present") VF.clean <- t(VF.clean) VF.clean <- data.frame(VF.clean) str(VF.clean) head(VF.clean) dim(VF.clean) VF.clean[10,1:20] filteredVFs[10,1:20] # metadata ---------------------------------------------------------------- #import metadata meta <- read.csv2(paste0(getwd(),"/Metadata-KLeditSTs-v2StefanEdits.txt"), sep = "\t") head(meta) #match metadata sample IDs to data sample IDs colnames(meta) rownames(meta) <- meta$ID #clean rownames(meta) <- sapply(strsplit(rownames(meta), "_"), `[`, 1) dim(meta)#97 12 setdiff(tree$tip.label,rownames(meta)) setdiff(rownames(meta),tree$tip.label) # [1] "D18" "GCA" length(intersect(rownames(meta),rownames(VF.clean)))#96 length(intersect(rownames(meta),tree$tip.label))#95 identical(rownames(meta),rownames(VF.clean))#FALSE identical(tree$tip.label,rownames(meta))#FALSE meta <- meta[tree$tip.label,]#order to match tree rownames(meta) VF.clean <- VF.clean[tree$tip.label,]#order to match tree #Change IDs to PS isolate numbers as requested: #30/8/2019 update: Stefan sent codes to replace current sample codes with isolate IDs strain_IDs <- read.table(paste0(getwd(),"/Stefan_new_sample_codes_forR.txt"),stringsAsFactors = F, row.names = 1, header = TRUE) head(strain_IDs) #append 'PS' to strain IDs as requested strain_IDs$Number <- paste0("PS",strain_IDs$Number) length(intersect(rownames(strain_IDs),rownames(VF.clean)))#95 identical(rownames(meta),rownames(strain_IDs))#FASLE length(intersect(rownames(meta),rownames(strain_IDs)))#95 strain_IDs <- strain_IDs[rownames(meta),] names(strain_IDs) <- rownames(meta) head(strain_IDs) #Change meta IDs to PS numbers rownames(meta) <- strain_IDs head(meta) #Change tree tip labels to PS numbers tree$tip.label <- strain_IDs #Change VF.clean rownames to PS numbers rownames(VF.clean) <- strain_IDs #Change AMR rownames to PS numbers rownames(amr) <- strain_IDs head(VF.clean) meta <- cbind(rownames(meta),meta)#First column has to be sample IDs to join to ggtree for gheatmap meta$Before.During.After <- as.character(unlist(meta$Before.During.After))#Get rid of PA01 level, no longer prsent meta$Before.During.After <- as.factor(meta$Before.During.After) #13/01/2019: Add-on request from Clinton: Instead of doing during vs. before/after do outbreakstrain ST303 vs. all other strains #4/2/2020: after meeting Clinton/Stefan/Chad requested that the samples labeled as ST303? or ST303* are also classified as ST303 in the analyses downstream head(meta) meta$Strain meta$ST303 <- as.character(unlist(meta$Strain)) ST303_grep <- grep("303",meta$ST303) meta$ST303[ST303_grep] # [1] "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" # [20] "303" "303*" "303" "303" "303" "303" "303" "303" "303?" "303" "303" "303" "303" "303" "303" "303" "303" "303" "303" # [39] "303?" "303" #There are 3 samples that are still labelled as 'uncertain' ST303 because one marker remained ambiguous after manual curation. #They are more than likely ST303 though so we will include them as such in the analyses and note this in the figure legend meta$ST303[ST303_grep] <- "ST303" meta$ST303 <- ifelse(meta$ST303 == "ST303", "ST303", "other") #NB here 'NF' or not found are included in other #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #Here we change for the 5 samples D01, D06, D13, D15, C11 (need corresponding PS numbers here) NF_5 <- strain_IDs[c("D01","D06","D13","D15","C11")] # D01 D06 D13 D15 C11 # "PS83" "PS87" "PS20" "PS43" "PS61" meta[c(NF_5),] # rownames(meta) Selected index B.D.A Before.During.After Community..Nosocomial..Enviromental ID Mont.Year Outbreak.month.year.category Sample.Type Strain ST # PS83 PS83 NA 43 D During Nosocomial D01_S1_L001 12/2016 Blood NF NF* # PS87 PS87 NA 40 D During Nosocomial D06_S6_L001 05/2017 Blood NF NF*? # PS20 PS20 NA 39 D During Community D13_S13_L001 01/2017 Blood NF NF*? # PS43 PS43 NA 37 D During Community D15_S15_L001 06/2017 Blood NF NF*? # PS61 PS61 NA 7 D During Community C11_S11_L001 03/2017 Feb17 - Mar17 Urine NF NF*? # Wild.type..Multi.drug.resistant ST303 # PS83 Wild-type other # PS87 Wild-type other # PS20 Wild-type other # PS43 Wild-type other # PS61 Wild-type other #---- meta[c(NF_5),"ST303"] <- "ST303" #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! meta$ST303 <- as.factor(meta$ST303) levels(meta$ST303) #[1] "other" "ST303" #Let's flip the levels so that the analyses represent ST303 vs. other (as opposed to other vs. ST303) meta$ST303 <- relevel(meta$ST303,"ST303") table(meta$ST303) # ST303 other # 45 50 #------------------------- #HEATMAP + GGTREE length(intersect(tree$tip.label,rownames(VF.clean)))#95 (D18 removed) # #Classic rectangular layout, lets cluster VFs to order heatmap columns #Distance matrix (need numeric matrix as input) VF.mat <- ifelse(na.omit(VF.clean)=="present",1,0) VF.dist <- dist(VF.mat) VF.cluster <- hclust(VF.dist) VF.cluster$order VF.ordered <- VF.mat[VF.cluster$order,] heatmap(VF.ordered) VF.discrete <- ifelse(VF.ordered==1,"present","absent") VF.discrete <- data.frame(VF.discrete) VF.discrete <- VF.discrete[rownames(VF.clean),] #match sample order to tree # heatMapPAM function ----------------------------------------------------- #the heatMapPAM function was taken from the R package GeneMates, which I couldn't install here heatMapPAM <- function(p, data, col_colours = "black", null_colour = "grey90", border_colour = "white", cluster_cols = FALSE, cluster_method = "single", cluster_distance = "binary", rev_cols = FALSE, colnames = TRUE, colnames_position = "bottom", colnames_angle = 0, colnames_level = NULL, set_label_colours = FALSE, colnames_offset_x = 0, colnames_offset_y = 0, font.size = 4, hjust = 0.5, offset = 0, width = 1, show_legend = FALSE) { # The first two packages are dependencies of the package ggtree. require(ggplot2) require(tidyr) # for the function gather require(tibble) require(magrittr) # for operators "%<>%" and "%>%"(github.com/GuangchuangYu/ggtree/blob/master/R/operator.R) require(ggtree) colnames_position %<>% match.arg(c("bottom", "top")) variable <- value <- lab <- y <- NULL ## convert relative width of the PAM to width of each cell width <- width * (p$data$x %>% range(na.rm = TRUE) %>% diff) / ncol(data) isTip <- x <- y <- variable <- value <- from <- to <- NULL df <- p$data df <- df[df$isTip, ] start <- max(df$x, na.rm = TRUE) + offset # Column-wise clustering if (cluster_cols && ncol(data) > 2) { hc <- hclust(d = dist(t(data), method = cluster_distance), method = cluster_method) data <- data[, hc$order] # reorder columns clustered <- TRUE } else { clustered <- FALSE } # Inverse columns for improving visualisation if (rev_cols) { data <- data[, ncol(data) : 1] } # weight cells before converting the PAM (dd) into a data frame n_colours <- length(col_colours) is_multi_colour <- n_colours > 1 && !is.null(names(col_colours)) if (is_multi_colour) { colours_uniq <- sort(unique(as.character(col_colours)), decreasing = FALSE) # colours for positive values in PAM colour_codes <- 1 : length(colours_uniq) # codes for colours names(colour_codes) <- colours_uniq # e.g., c("red" = 1, "blue" = 2, ...) column_names <- colnames(data) # The matrix product loses column names, that is, the allele names. if (clustered) { col_colours <- col_colours[column_names] } else if (n_colours > length(column_names)) { col_colours <- col_colours[column_names] } data <- data %*% diag(as.integer(colour_codes[as.character(col_colours)])) # convert colour characters into integer codes colnames(data) <- column_names names(colours_uniq) <- as.character(colour_codes) colours_uniq <- append(c("0" = null_colour), colours_uniq) # Convert values in the matrix into factors so as to map colours to the levels dd <- as.data.frame(data) for (i in names(dd)) { dd[, i] <- factor(dd[, i], levels = c(0, as.integer(colour_codes)), labels = c("0", as.character(colour_codes))) } } else { dd <- as.data.frame(data) } i <- order(df$y) ## handle collapsed tree ## https://github.com/GuangchuangYu/ggtree/issues/137 i <- i[!is.na(df$y[i])] lab <- df$label[i] ## https://github.com/GuangchuangYu/ggtree/issues/182 dd <- dd[match(lab, rownames(dd)), , drop = FALSE] dd$y <- sort(df$y) dd$lab <- lab # tibble::set_tidy_names(dd) solves the problem of the error: "Can't bind data because some arguments have the same name" # see https://github.com/tidyverse/tidyr/issues/472 dd <- gather(data = tibble::set_tidy_names(dd), key = variable, value = value, -c(lab, y)) i <- which(dd$value == "") if (length(i) > 0) { dd$value[i] <- NA } if (is.null(colnames_level)) { dd$variable <- factor(dd$variable, levels = colnames(data)) } else { dd$variable <- factor(dd$variable, levels = colnames_level) } V2 <- start + as.numeric(dd$variable) * width # Create a data frame for label attributes mapping <- data.frame(from = as.character(dd$variable), to = V2, stringsAsFactors = FALSE) # from: label texts mapping <- unique(mapping) paint_labels <- set_label_colours && is_multi_colour if (paint_labels) { mapping$col <- as.character(col_colours[mapping$from]) # variable label colour mapping$col <- factor(mapping$col, levels = as.character(colours_uniq), labels = names(colours_uniq)) } mapping$from = as.factor(mapping$from) # change back to factors # Create the coloured tile matrix dd$x <- V2 dd$width <- width dd[[".panel"]] <- factor("Tree") if (is.null(border_colour)) { p2 <- p + geom_tile(data = dd, aes(x, y, fill = value), width = width, inherit.aes = FALSE) } else { p2 <- p + geom_tile(data = dd, aes(x, y, fill = value), width = width, colour = border_colour, inherit.aes = FALSE) } # Print column names if (colnames) { if (colnames_position == "bottom") { y <- 0 } else { y <- max(p$data$y) + 1 } mapping$y <- y mapping[[".panel"]] <- factor("Tree") if (paint_labels) { p2 <- p2 + geom_text(data = mapping, aes(x = to, y = y, label = from, colour = col), size = font.size, inherit.aes = FALSE, angle = colnames_angle, nudge_x = colnames_offset_x, nudge_y = colnames_offset_y, hjust = hjust) } else { # Labels are printed in black. p2 <- p2 + geom_text(data = mapping, aes(x = to, y = y, label = from), size = font.size, inherit.aes = FALSE, angle = colnames_angle, nudge_x = colnames_offset_x, nudge_y = colnames_offset_y, hjust = hjust) } } # Scale colours for both tiles and column labels if (is_multi_colour) { # This command does not scale label colours when paint_labels = FALSE. # Class: title of the legend # scale_fill_manual: for tile colours; scale_colour_manual: for label colours p2 <- p2 + scale_fill_manual(name = "Class", breaks = c("0", as.character(colour_codes)), values = colours_uniq, na.value = NA) + scale_colour_manual(name = "Class", breaks = c("0", as.character(colour_codes)), values = colours_uniq, na.value = NA) } else { p2 <- p2 + scale_fill_gradient(low = null_colour, high = col_colours, na.value = NA) } # Print legend if (show_legend) { p2 <- p2 + theme(legend.position = "right", legend.title = element_blank()) } attr(p2, "mapping") <- mapping return(list(p = p2, mapping = mapping, data = dd)) } mat.VF <- as.matrix(VF.ordered) # Apply heatMapPAM function ----------------------------------------------- #------------------------------ #Let's identify VFs differentially present/absent between B&A vs. During head(VF.ordered) meta.ordered <- meta[rownames(VF.ordered),] #10/2/2020 #ST303 vs. other comparison + compare to full genome assembly results # res = fitPA(t(VF.ordered),meta.ordered$ST303) head(res) res.sig = res[res$adjPvalues <= 0.05,] dim(res.sig)#51 #---------------------------------- #Not coded here but I tested the setdiff between using during vs. before/after as opposed to ST303 vs other: # > setdiff(rownames(BA_D_res.sig),rownames(res.sig)) # [1] "fimT" "pilC" "pilV" "pilY2" # > setdiff(rownames(res.sig),rownames(BA_D_res.sig)) # [1] "PA2372" "fpvA" "vgrG3" "xcpQ" #-------------------------------- #Instead of sample names for the ggtree, lets have ST types (coloring by B/D/A) levels(meta$Before.During.After) newplot3 <- ggtree(tree) %<+% meta + geom_tiplab(aes(color=Before.During.After) ,align=T,linetype=1, linesize=.02, size=2.5) + scale_color_manual(values = c("dodgerblue3","seagreen3","red3"),labels = c("After","Before","During")) + theme(plot.title = element_text(hjust = 0.5)) + theme(legend.position = "right") #Heatmap of significant VFs VF.sig <- VF.ordered[,rownames(res.sig)] dim(VF.sig)#95 51 # pdf(paste0(getwd(),"/srst2_ST303_heatMapPAM_VFs_rectangle_hclust_fisher_significant.pdf"), 10,10) # # heatMapPAM(newplot3, VF.sig, col_colours = "black", null_colour = "white", # border_colour = NA, # cluster_cols = TRUE, cluster_method = "single", # cluster_distance = "binary", rev_cols = FALSE, # colnames = TRUE, colnames_position = "bottom", colnames_angle = 90, # colnames_level = NULL, set_label_colours = FALSE, # colnames_offset_x = 0, colnames_offset_y = 0, # font.size = 3, hjust = 0.5, offset = 0.05, width = 2, # show_legend = TRUE) # dev.off() #----------- #Add AMR genes to significant VFs rownames(amr) rownames(VF.sig) amr <- amr[rownames(VF.sig),] amr.use <- amr amr.use <- apply(amr.use, 2, function(x) sub("\\*","",x)) amr.use <- apply(amr.use, 2, function(x) sub("\\?","",x)) amr.use <- apply(amr.use, 2, function(x) sub("-","0",x)) amr.use <- ifelse(amr.use=="0",0,1) amr.use[1:2,1:10] #Cluster AMRs by sample amr_hc <- hclust(d = dist(t(amr.use), method = "binary"), method = "single") amr_clustered<- amr.use[,amr_hc$order] identical(rownames(amr_clustered),rownames(VF.sig))#TRUE #vf.amr <- cbind(amr_clustered,VF.sig) #The problem is we only want to cluster the VFs not the AMRs, we want the AMRs static on the left of the heatmap and then clustering of VFs #From the heatMapPAM function the code that clusters is: hc <- hclust(d = dist(t(VF.sig), method = "binary"), method = "single") data <- VF.sig[,hc$order] # reorder columns head(data) colnames(VF.sig) colnames(data) # vf.amr <- cbind(data,amr_clustered) colnames(vf.amr) pdf(paste0(getwd(),"/ST303heatMapPAM_VFs_rectangle_hclust_fisher_significant_plusAMR_TEST.pdf"), 10,10) GeneMates::heatMapPAM(newplot3, vf.amr, col_colours = "black", null_colour = "white", presence_label = "Present", absence_label = "Absent", border_colour = NA, cluster_cols = FALSE, cluster_method = "single", cluster_distance = "binary", rev_cols = FALSE, colnames = TRUE, colnames_position = "bottom", colnames_angle = 90, colnames_level = NULL, set_label_colours = FALSE, colnames_offset_x = 0, colnames_offset_y = -8, font.size = 2.5, hjust = 0, offset = 0.07, width = 2.5, show_legend = TRUE) dev.off() # newplot4 <- ggtree(tree) %<+% meta + geom_tiplab(aes(label = Strain) ,offset = 0.07,align=T,linetype=0, size=2.5) + theme(plot.title = element_text(hjust = 0.5)) + theme(legend.position = "right") #Just to get a column with strain types for use in pptx editing after: pdf(paste0(getwd(),"/ST303heatMapPAM_VFs_strains_column_v2.pdf"), 10,10) newplot4 dev.off() #//////////////////////// she <- c(143,145,159,176,178) lennard <- c(198,179,174,183,177) stage <- c(1,2,3,4,5) toplot <- data.frame(stage = stage,NP = she,lennard = lennard) ggplot(toplot, aes(x = stage))+ geom_point(aes(y = NP, color = "she"), size = 2, alpha = 0.5)+ geom_point(aes(y = lennard, color = "Lennard"), size = 2, alpha = 0.5)+ geom_smooth(aes(y = sheppard), method = lm)+ geom_smooth(aes(y = lennard), method = lm) library(plotly) x <- 1:10 y <- jitter(x^2) DF <- data.frame(x, y) DF