############################################################################################################################# # GuardS_et_al_JOVE_analysis_and_plotting_script_MaxQuantIPMS_and_SAINT.R # Author: William M. Old & Steve Guard, University of Colorado Boulder # Date: 6/13/2019 # Guard, S. et al. JOVE manuscript # License: GPL-2 (GNU General Public License Version 2) # Perform differential analysis on the MaxQuant proteinGroups.txt IP-MS data that has been imputed with Perseus # Required files: # 1. DYRK1A_Interactome_MasterMaxQuantAnalysis_perseusAnnot_imputed.txt : MaxQuant proteinGroups.txt file with perseus # imputed LFQ intensities # 2. export_CRAPomeSAINT_Input_File.R : Script to generate the input file provided to CRAPome.org # tool for generating the analysis results file # # 3. targets_for_crapome.txt : Contains sample name information for subsetting analysis files # # 4. Nuclear_Subcell_CRAPome.txt : CRAPome analysis results file (obtained by running export_CRAPomeSAINT_Input_File.R) # # 5. genes2plot_JOVEPaper.txt : put gene symbols you want to annotate in plots in a # file genes2plot_JOVEPaper.txt in working directory # # 6. ggmaplot_functions.R : R file with functions for plotting MA plots and # other accessory tasks # 7. main_differential_analysis.R (main file) # # ############################################################################################################################# library(limma) library(tidyverse) library(ggplot2) library(plotly) library(listviewer) source('./ggmaplot_functions.R') mqperseusimputedfile <- "DYRK1A_Interactome_MasterMaxQuantAnalysis_perseusAnnot_imputed.txt" crapome_file <- 'Nuclear_Subcell_CRAPome.txt' genes2plotfile <- 'genes2plot_JOVEPaper.txt' # Read in the Perseus imputed iBAQ and LFQ values from MaxQuant proteinGroups.txt file # d1a <- read.delim(mqperseusimputedfile, comment.char="#", stringsAsFactors=FALSE) d1a$Gene.name <- unlist(lapply(strsplit(d1a$Gene.name, ";"), "[", 1)) d1a$uniprot.acc <- unlist(lapply(strsplit(d1a$Majority.protein.IDs, "\\|"), "[", 2)) # rename razor peptide columns razid <- grepl("Razor...unique.peptides.", names(d1a)) raznms <- grep("Razor...unique.peptides.", names(d1a), value = T) newraznms <- gsub("Razor\\.\\.\\.unique\\.peptides\\.(.*)", "\\1_RZUP", raznms) names(d1a)[razid] <- newraznms d1a$gene.uniprot.acc <- paste(d1a$Gene.name, d1a$uniprot.acc, sep='_') d1a$razuniqpeps <- d1a$Razor...unique.peptides cntnms <- c("MS.MS.count", "razuniqpeps","Unique.peptides" , "Sequence.coverage....","Unique...razor.sequence.coverage...." ,"Unique.sequence.coverage....") # Use the razor unique peptide values to select the protein isoform with the largest number of razor peptides. # This tends to favor the dominant isoform d1a <- d1a %>% dplyr::group_by( Gene.name) %>% dplyr::mutate(the_rank = rank(-razuniqpeps, ties.method = "random")) %>% dplyr::filter(the_rank == 1) #View(d1a) #d1afull <- d1a # adapted from the miRtest packages, allows for testing only those proteins enriched over control (lower = FALSE) # using moderated empirical Bayes t-test from limma package. This is required to test for only proteins # enriched over the control (and not also negatively enriched) # returns p-values from all coefficients in model limma.one.sided = function (fit.tmp,lower=FALSE) { se.coef <- sqrt(fit.tmp$s2.post) * fit.tmp$stdev.unscaled df.total <- fit.tmp$df.prior + fit.tmp$df.residual pt(fit.tmp$t, df = df.total, lower.tail = lower) } # Function for renaming columns with problematic names filt.datadf <- function(dftmp) { names(dftmp)[1:112] <- gsub("^Intensity","int", names(dftmp)[1:112]) names(dftmp)[1:112] <- gsub("^LFQ.intensity","LFQ", names(dftmp)[1:112]) names(dftmp)[1:112] names(dftmp)[names(dftmp) == 'LFQ.AbNova_NE1'] <- 'LFQ.AbNova_NE_1' names(dftmp)[names(dftmp) == 'int.AbNova_NE1'] <- 'int.AbNova_NE_1' names(dftmp)[names(dftmp) == 'iBAQ.AbNova_NE1'] <- 'iBAQ.AbNova_NE_1' dftmp <- dftmp[!grepl("REV__", dftmp$Majority.protein.IDs),] dftmp <- dftmp[!grepl("CON__", dftmp$Majority.protein.IDs),] return(dftmp) } d1a <- filt.datadf(d1a) # Most of these samples are not used. alltnms <- c("int.AbCam_NE_1", "int.AbCam_NE_2", "int.AbCam_NE_3", # "int.AbCam_NE_4", "int.AbCam_NE_5", "int.AbNova_NE_2", "int.AbNova_NE_3", "int.AbNova_NE_1", "int.Bethyl_NE_1", "int.Bethyl_NE_2", "int.Bethyl_NE_3", "int.SC_NE_1", "int.SC_NE_2", "int.SC_NE_3", "int.RNF169_NE_1", "int.RNF169_NE_2", "int.BO_NE_1","int.BO_NE_2", "int.BO_NE_3", "int.Chrom_BO_Subcell_1", "int.Chrom_BO_Subcell_2","int.Chrom_BO_Subcell_3", "int.Chrom_D1A_Subcell_1", "int.Chrom_D1A_Subcell_2","int.Chrom_D1A_Subcell_3", "int.Cyt_BO_Subcell_1", "int.Cyt_BO_Subcell_2","int.Cyt_BO_Subcell_3", "int.Cyt_D1A_Subcell_1", "int.Cyt_D1A_Subcell_2","int.Cyt_D1A_Subcell_3", "int.Nuc_BO_Subcell_1", "int.Nuc_BO_Subcell_2","int.Nuc_BO_Subcell_3", "int.Nuc_D1A_Subcell_1", "int.Nuc_D1A_Subcell_2","int.Nuc_D1A_Subcell_3", "iBAQ.AbCam_NE_1", "iBAQ.AbCam_NE_2", "iBAQ.AbCam_NE_3",# "iBAQ.AbCam_NE_4", "iBAQ.AbCam_NE_5", "iBAQ.AbNova_NE_2", "iBAQ.AbNova_NE_3", "iBAQ.AbNova_NE_1", "iBAQ.Bethyl_NE_1", "iBAQ.Bethyl_NE_2", "iBAQ.Bethyl_NE_3", "iBAQ.SC_NE_1", "iBAQ.SC_NE_2", "iBAQ.SC_NE_3", "iBAQ.RNF169_NE_1", "iBAQ.RNF169_NE_2", "iBAQ.BO_NE_1","iBAQ.BO_NE_2", "iBAQ.BO_NE_3", "iBAQ.Chrom_BO_Subcell_1", "iBAQ.Chrom_BO_Subcell_2","iBAQ.Chrom_BO_Subcell_3", "iBAQ.Chrom_D1A_Subcell_1", "iBAQ.Chrom_D1A_Subcell_2","iBAQ.Chrom_D1A_Subcell_3", "iBAQ.Cyt_BO_Subcell_1", "iBAQ.Cyt_BO_Subcell_2","iBAQ.Cyt_BO_Subcell_3", "iBAQ.Cyt_D1A_Subcell_1", "iBAQ.Cyt_D1A_Subcell_2","iBAQ.Cyt_D1A_Subcell_3", "iBAQ.Nuc_BO_Subcell_1", "iBAQ.Nuc_BO_Subcell_2","iBAQ.Nuc_BO_Subcell_3", "iBAQ.Nuc_D1A_Subcell_1", "iBAQ.Nuc_D1A_Subcell_2","iBAQ.Nuc_D1A_Subcell_3", "LFQ.AbCam_NE_1", "LFQ.AbCam_NE_2", "LFQ.AbCam_NE_3", # "LFQ.AbCam_NE_4", "LFQ.AbCam_NE_5", "LFQ.AbNova_NE_2", "LFQ.AbNova_NE_3", "LFQ.AbNova_NE_1", "LFQ.Bethyl_NE_1", "LFQ.Bethyl_NE_2", "LFQ.Bethyl_NE_3", "LFQ.SC_NE_1", "LFQ.SC_NE_2", "LFQ.SC_NE_3", "LFQ.RNF169_NE_1", "LFQ.RNF169_NE_2", "LFQ.BO_NE_1","LFQ.BO_NE_2", "LFQ.BO_NE_3", "LFQ.Chrom_BO_Subcell_1", "LFQ.Chrom_BO_Subcell_2","LFQ.Chrom_BO_Subcell_3", "LFQ.Chrom_D1A_Subcell_1", "LFQ.Chrom_D1A_Subcell_2","LFQ.Chrom_D1A_Subcell_3", "LFQ.Cyt_BO_Subcell_1", "LFQ.Cyt_BO_Subcell_2","LFQ.Cyt_BO_Subcell_3", "LFQ.Cyt_D1A_Subcell_1", "LFQ.Cyt_D1A_Subcell_2","LFQ.Cyt_D1A_Subcell_3", "LFQ.Nuc_BO_Subcell_1", "LFQ.Nuc_BO_Subcell_2","LFQ.Nuc_BO_Subcell_3", "LFQ.Nuc_D1A_Subcell_1", "LFQ.Nuc_D1A_Subcell_2","LFQ.Nuc_D1A_Subcell_3") beadnms <- c("int.BO_NE_1","int.BO_NE_2", "int.BO_NE_3", "int.Cyt_BO_Subcell_1", "int.Cyt_BO_Subcell_2","int.Cyt_BO_Subcell_3", "int.Chrom_BO_Subcell_1", "int.Chrom_BO_Subcell_2","int.Chrom_BO_Subcell_3", "int.Nuc_BO_Subcell_1", "int.Nuc_BO_Subcell_2","int.Nuc_BO_Subcell_3", "iBAQ.BO_NE_1","iBAQ.BO_NE_2", "iBAQ.BO_NE_3", "iBAQ.Cyt_BO_Subcell_1", "iBAQ.Cyt_BO_Subcell_2","iBAQ.Cyt_BO_Subcell_3", "iBAQ.Chrom_BO_Subcell_1", "iBAQ.Chrom_BO_Subcell_2","iBAQ.Chrom_BO_Subcell_3", "iBAQ.Nuc_BO_Subcell_1", "iBAQ.Nuc_BO_Subcell_2","iBAQ.Nuc_BO_Subcell_3", "LFQ.BO_NE_1","LFQ.BO_NE_2", "LFQ.BO_NE_3", "LFQ.Cyt_BO_Subcell_1", "LFQ.Cyt_BO_Subcell_2","LFQ.Cyt_BO_Subcell_3", "LFQ.Chrom_BO_Subcell_1", "LFQ.Chrom_BO_Subcell_2","LFQ.Chrom_BO_Subcell_3", "LFQ.Nuc_BO_Subcell_1", "LFQ.Nuc_BO_Subcell_2","LFQ.Nuc_BO_Subcell_3" ) alltnms <- alltnms[alltnms %in% names(d1a)] beadnms <- beadnms[beadnms %in% names(d1a)] # d1a <- d1afull # replace NaN with NA d1a[ is.na(d1a) ] <- NA tnms <- grep("^LFQ.",alltnms, value = T) beadnms <- grep("^LFQ.",beadnms, value = T) nonbeadnms <- tnms[!tnms %in% beadnms] datacols.ibaq <- grep("^iBAQ.",alltnms, value = T) datacols <- tnms # Normalize d1a[,datacols] <- normalizeBetweenArrays(d1a[,datacols], method = "cyclicloess") d1a$Amean <- apply(d1a[,datacols], 1, mean, na.rm=T) # Perform the statistical testing on normalized LFQ values (not iBAQ) targets <- data.frame(samp = datacols, bait = gsub("LFQ\\.(.*)_.*(\\d+)","\\1", datacols)) targets$bait <- gsub("AbNova","Abnova",targets$bait) factlevels <- c("BO_NE","AbCam_NE", "Abnova_NE","Bethyl_NE","SC_NE", "RNF169_NE", "Chrom_BO_Subcell", "Chrom_D1A_Subcell", "Cyt_BO_Subcell", "Cyt_D1A_Subcell", "Nuc_BO_Subcell","Nuc_D1A_Subcell") f <- factor(targets$bait, levels = factlevels) design <- model.matrix(~0+f) colnames(design) <- factlevels rownames(design) <- targets$samp #fit <- lmFit(d1a[,datacols], design) fit.norm <- lmFit(d1a[,datacols], design) contrast.matrix <- makeContrasts(abcam=AbCam_NE-BO_NE, abnova=Abnova_NE-BO_NE, bethyl=Bethyl_NE-BO_NE, scruz=SC_NE-BO_NE, rnf169=RNF169_NE-BO_NE, cytoabnova=Cyt_D1A_Subcell-Cyt_BO_Subcell, nuclabnova=Nuc_D1A_Subcell-Nuc_BO_Subcell, chromabnova=Chrom_D1A_Subcell-Chrom_BO_Subcell, levels = design) fit2 <- contrasts.fit(fit.norm, contrast.matrix) fit2 <- eBayes(fit2, trend = F) tophits.abcam <- topTable(fit2, coef = 1, genelist = d1a[,c('id','Gene.name', 'Protein.name')], p.value = 1, number = Inf, confint = T, sort.by = "B") sum(tophits.abcam$adj.P.Val < 0.01, na.rm = T) tophits <- topTable(fit2, coef = 1:length(colnames(contrast.matrix)), genelist = d1a$id, p.value = 1, number=Inf, confint = T, sort.by = "none") pvalmat <- limma.one.sided(fit2, lower = F) sum(p.adjust(pvalmat, method = "BH") < 0.01) # adjpvalmat <- matrix(p.adjust(pvalmat, method = "BH"), ncol = 8) adjpvalmat <- apply(pvalmat, 2, function(x) p.adjust(x, method = "BH")) colnames(adjpvalmat) <- colnames(pvalmat) colnames(pvalmat) <- paste(colnames(pvalmat), ".P.Value", sep = "") colnames(adjpvalmat) <- paste(colnames(adjpvalmat), ".adj.P.Val", sep = "") tophits <- cbind(tophits,pvalmat, adjpvalmat) sum(tophits$adj.P.Val < 0.01) tophits <- tophits %>% left_join(d1a, by = c("ProbeID" = "id")) tophits <- tophits %>% dplyr::arrange(abnova.P.Value, abcam.P.Value) coefs <- colnames(fit2$coefficients) tnms <- c("Gene.name", coefs, paste(coefs, "adj.P.Val", sep = "."), "Protein.name", "AveExpr", paste(coefs, "P.Value", sep = "."), datacols,"ProbeID", "F", "P.Value", "adj.P.Val") tnms[!tnms %in% names(tophits)] othernms <- names(tophits)[!names(tophits) %in% tnms] tophits <- tophits[, c(tnms, othernms)] tophits$minpval <- apply(tophits[, grep("\\.adj.P.Val",names(tophits), value = T)], 1, min, na.rm = T) qthresh <- 0.01 tophits2 <- tophits[tophits$minpval < qthresh ,] tophits2 <- tophits2[tophits2$Gene.name != '' & !is.na(tophits2$Gene.name),] write.table(tophits,file='DYRK1A_interactomeIPMS_MQeBayes_all_hits_20190613_.txt', quote = F,sep = '\t',na = "",row.names = F,col.names = T) write.table(names(tophits), file='DYRK1A_interactomeIPMS_MQeBayes_all_hits_20190613_colnames_metadata.txt', col.names = F, row.names = F, quote = F) write.table(tophits2,file='DYRK1A_interactomeIPMS_MQeBayes_q10pct_hits_20190613_.txt', quote = F,sep = '\t',na = "",row.names = F,col.names = T) write.table(names(tophits2), file='DYRK1A_interactomeIPMS_MQeBayes_q10pct_hits_20190613_colnames_metadata.txt', col.names = F, row.names = F, quote = F) ############################################################################################################################# # MA Plot visualization with SAINT/CRAPome data ############################################################################################################################# # MA plot razoutnms <- c('Gene.name', 'uniprot.acc', 'uniprot.id',newraznms, names(tophits)[2:19], 'gene.uniprot.acc') razoutnms[!razoutnms %in% names(tophits)] tophits$razuniqpeps <- tophits$Razor...unique.peptides cntnms <- c("MS.MS.count", "razuniqpeps","Unique.peptides" , "Sequence.coverage....","Unique...razor.sequence.coverage...." ,"Unique.sequence.coverage....") preylist <- split(as.character(targets.ibaq$samp), targets.ibaq$bait) meansum <- function(xnms) { apply(tophits[,xnms], 1, mean, na.rm = T) } tmp <- lapply(preylist, meansum) tmp <- as.data.frame(tmp) names(tmp) <- paste("AvgIBAQ",names(tmp), sep = '_') tophits <- cbind(tophits, tmp) # put gene symbols you want to annotate in plots in a file genes2plot_JOVEPaper.txt in working directory prots2plot <- read.table(genes2plotfile, col.names = F,stringsAsFactors = F)[,1] prots2plot <- tophits$Gene.name[tophits$Gene.name %in% prots2plot] xlims <- c(-11,11) ylims <- c(10,32) fdrcuts <- c(0,0.01,0.02,0.05,1) myggma <- tophits %>% ggmaplot(xvar = nuclabnova, yvar = AvgIBAQ_Nuc_D1A_Subcell, fdr.range.cut = fdrcuts, adjpvalvar = nuclabnova.adj.P.Val, xlimits = xlims, ylimits = ylims, xtitle = "log2(DYRK1A Abnova/NonOpt.Nucl Extract)", plottitle = "Optimal IP-MS", annotgenes = F) names(myggma) abnova_OptNE <- myggma$maplot.ggplot abnova_OptNE.plotly <- myggma$maplot.plotly abnova_OptNE.plotly <- abnova_OptNE.plotly %>% add_annotations(x = tophits %>% filter(Gene.name %in% prots2plot ) %>% select(nuclabnova) %>% pull(), y = tophits %>% filter(Gene.name %in% prots2plot) %>% select(AvgIBAQ_Nuc_D1A_Subcell) %>% pull(), text = prots2plot, font = list(family = 'Arial', size = 15, color = 'black'), showarrow = T, visible = T, arrowhead = 4, arrowwidth = 1, arrowcolor = 'black', arrowsize = .5, ax = 100, ay = -20, clicktoshow = "onoff") # Plot to view abnova_OptNE.plotly # Prepare # Prepare CRAPome data (this file is output from the CRAPome/SAINT webtool) optAbnova <- read.table(crapome_file,header =TRUE, sep = "", stringsAsFactors = F) fdr.range.cut = c(0,0.01,0.05,0.1,1) gcol <- brewer.pal(n = 5, name = "Greys")[3] rcols <- rev(brewer.pal(n = 7, name = "Reds"))[1:(length(fdr.range.cut) - 1)] optAbnova$sigintact <- FALSE optAbnova$sigintact[optAbnova$DYRK1A_FC_A > 3 & optAbnova$DYRK1A_SP > 0.7] <- TRUE ggy <- optAbnova %>% ggplot(mapping = aes( DYRK1A_FC_A, DYRK1A_SP) ) + geom_point(aes(color = sigintact)) + scale_colour_manual(values = c(gcol, rev(rcols))) + geom_vline(xintercept = 3,color = "red") + theme_classic(base_size = 15, base_line_size = 2) + geom_hline(yintercept = 0.7,color = "red") + xlab("FC-A") + ylab("SAINT probability") + theme(legend.title = element_blank()) + ylim(c(0, 1.05)) ggply <- ggplotly(ggy, tooltip = "text") tdf.thresh <- suboptAbnova %>% filter(suboptAbnova$DYRK1A_FC_A > 3 & suboptAbnova$DYRK1A_SP > 0.7) tdf.thresh.dyr <- tdf.thresh %>% filter(GENE %in% c('DYRK1A','DCAF7')) tdf.thresh <- tdf.thresh %>% filter(!GENE %in% c('FN1','TNRC6B','DYRK1A','DCAF7','TRMT61B')) ggply <- ggply %>% add_annotations(x = tdf.thresh %>% select(DYRK1A_FC_A) %>% pull(), y = tdf.thresh %>% select(DYRK1A_SP) %>% pull(), text = tdf.thresh$GENE, font = list(family = 'Arial', size = 15, color = 'black'), showarrow = T, visible = T, arrowhead = 4, arrowwidth = 0.5, arrowcolor = 'grey', arrowsize = .5, ax = 20, ay = 20, clicktoshow = "onoff") ggply <- ggply %>% add_annotations(x = tdf.thresh.dyr %>% select(DYRK1A_FC_A) %>% pull(), y = tdf.thresh.dyr %>% select(DYRK1A_SP) %>% pull(), text = tdf.thresh.dyr$GENE, font = list(family = 'Arial', size = 15, color = 'black'), showarrow = T, visible = T, arrowhead = 4, arrowwidth = 0.5, arrowcolor = 'grey', arrowsize = .5, ax = 20, ay = -20, clicktoshow = "onoff") pibaq2 <- subplot(style(ggply,showlegend = F), style(abnova_NonOptNE.plotly,showlegend = T), widths = c(0.5, 0.5), heights = NULL, which_layout = 1,margin = 0.1, shareX = F, shareY = F, titleX = T, titleY = T); pibaq2 # This will plot in a figure window, which can be exported to an html file # and shown in chrome etc. Print to pdf for a vector format plot pibaq2