#############################################################################################################################
# GuardS_et_al_JOVE_analysis_and_plotting_script_MaxQuantIPMS_and_SAINT.R
# Author: William M. Old & Steve Guard, University of Colorado Boulder
# Date: 6/13/2019
# Guard, S. et al. JOVE manuscript
# License:  GPL-2 (GNU General Public License Version 2)

# Perform differential analysis on the MaxQuant proteinGroups.txt IP-MS data that has been imputed with Perseus
# Required files:
# 1. DYRK1A_Interactome_MasterMaxQuantAnalysis_perseusAnnot_imputed.txt : MaxQuant proteinGroups.txt file with perseus 
#                                                                         imputed LFQ intensities
# 2. export_CRAPomeSAINT_Input_File.R                                   : Script to generate the input file provided to CRAPome.org
#                                                                         tool for generating the analysis results file
# 
# 3. targets_for_crapome.txt                                            : Contains sample name information for subsetting analysis files
# 
# 4. Nuclear_Subcell_CRAPome.txt                                        : CRAPome analysis results file (obtained by running export_CRAPomeSAINT_Input_File.R)
# 
# 5. genes2plot_JOVEPaper.txt                                           : put gene symbols you want to annotate in plots in a 
#                                                                         file genes2plot_JOVEPaper.txt in working directory
# 
# 6. ggmaplot_functions.R                                               : R file with functions for plotting MA plots and 
#                                                                         other accessory tasks
# 7. main_differential_analysis.R  (main file)     
# 
# 
#############################################################################################################################

library(limma)
library(tidyverse)
library(ggplot2)
library(plotly)
library(listviewer)

source('./ggmaplot_functions.R')

mqperseusimputedfile <- "DYRK1A_Interactome_MasterMaxQuantAnalysis_perseusAnnot_imputed.txt"
crapome_file <- 'Nuclear_Subcell_CRAPome.txt'
genes2plotfile <- 'genes2plot_JOVEPaper.txt'

# Read in the Perseus imputed iBAQ and LFQ values from MaxQuant proteinGroups.txt file
# 

d1a <- read.delim(mqperseusimputedfile, 
                  comment.char="#", stringsAsFactors=FALSE)

d1a$Gene.name <- unlist(lapply(strsplit(d1a$Gene.name, ";"), "[", 1))

d1a$uniprot.acc <- unlist(lapply(strsplit(d1a$Majority.protein.IDs, "\\|"), "[", 2))

# rename razor peptide columns 
razid <- grepl("Razor...unique.peptides.", names(d1a))
raznms <- grep("Razor...unique.peptides.", names(d1a), value = T)
newraznms <- gsub("Razor\\.\\.\\.unique\\.peptides\\.(.*)", "\\1_RZUP", raznms)
names(d1a)[razid] <- newraznms

d1a$gene.uniprot.acc <- paste(d1a$Gene.name, d1a$uniprot.acc, sep='_')

d1a$razuniqpeps <- d1a$Razor...unique.peptides

cntnms <- c("MS.MS.count", "razuniqpeps","Unique.peptides" ,
            "Sequence.coverage....","Unique...razor.sequence.coverage...." ,"Unique.sequence.coverage....")

# Use the razor unique peptide values to select the protein isoform with the largest number of razor peptides.
# This tends to favor the dominant isoform 

d1a <- d1a %>%  dplyr::group_by( Gene.name) %>% 
  dplyr::mutate(the_rank  = rank(-razuniqpeps, ties.method = "random")) %>% 
  dplyr::filter(the_rank == 1) 

#View(d1a)
#d1afull <- d1a
# adapted from the miRtest packages, allows for testing only those proteins enriched over control (lower = FALSE)
# using moderated empirical Bayes t-test from limma package. This is required to test for only proteins
# enriched over the control (and not also negatively enriched)
# returns p-values from all coefficients in model
limma.one.sided = function (fit.tmp,lower=FALSE) {
  
    se.coef <- sqrt(fit.tmp$s2.post) * fit.tmp$stdev.unscaled
    df.total <- fit.tmp$df.prior + fit.tmp$df.residual
    pt(fit.tmp$t, df = df.total, lower.tail = lower)
    
  
}

# Function for renaming columns with problematic names
filt.datadf <- function(dftmp) {
  names(dftmp)[1:112] <- gsub("^Intensity","int", names(dftmp)[1:112])
  names(dftmp)[1:112] <- gsub("^LFQ.intensity","LFQ", names(dftmp)[1:112])
  names(dftmp)[1:112]
  
  names(dftmp)[names(dftmp) == 'LFQ.AbNova_NE1'] <- 'LFQ.AbNova_NE_1'
  names(dftmp)[names(dftmp) == 'int.AbNova_NE1'] <- 'int.AbNova_NE_1'
  names(dftmp)[names(dftmp) == 'iBAQ.AbNova_NE1'] <- 'iBAQ.AbNova_NE_1'
  
  
  dftmp <- dftmp[!grepl("REV__", dftmp$Majority.protein.IDs),]
  dftmp <- dftmp[!grepl("CON__", dftmp$Majority.protein.IDs),]
  return(dftmp)
  
}


d1a <- filt.datadf(d1a)

# Most of these samples are not used. 

alltnms <- c("int.AbCam_NE_1", "int.AbCam_NE_2", "int.AbCam_NE_3",  # "int.AbCam_NE_4", "int.AbCam_NE_5", 
             "int.AbNova_NE_2", "int.AbNova_NE_3", "int.AbNova_NE_1", 
             "int.Bethyl_NE_1", "int.Bethyl_NE_2", "int.Bethyl_NE_3", 
             "int.SC_NE_1", "int.SC_NE_2", "int.SC_NE_3",
             "int.RNF169_NE_1", "int.RNF169_NE_2",
             "int.BO_NE_1","int.BO_NE_2", "int.BO_NE_3", 
             
             "int.Chrom_BO_Subcell_1", "int.Chrom_BO_Subcell_2","int.Chrom_BO_Subcell_3", 
             "int.Chrom_D1A_Subcell_1", "int.Chrom_D1A_Subcell_2","int.Chrom_D1A_Subcell_3", 
             "int.Cyt_BO_Subcell_1", "int.Cyt_BO_Subcell_2","int.Cyt_BO_Subcell_3", 
             "int.Cyt_D1A_Subcell_1", "int.Cyt_D1A_Subcell_2","int.Cyt_D1A_Subcell_3", 
             "int.Nuc_BO_Subcell_1", "int.Nuc_BO_Subcell_2","int.Nuc_BO_Subcell_3", 
             "int.Nuc_D1A_Subcell_1", "int.Nuc_D1A_Subcell_2","int.Nuc_D1A_Subcell_3", 
              
             
             "iBAQ.AbCam_NE_1", "iBAQ.AbCam_NE_2", "iBAQ.AbCam_NE_3",#  "iBAQ.AbCam_NE_4", "iBAQ.AbCam_NE_5", 
             "iBAQ.AbNova_NE_2", "iBAQ.AbNova_NE_3", "iBAQ.AbNova_NE_1", 
             "iBAQ.Bethyl_NE_1", "iBAQ.Bethyl_NE_2", "iBAQ.Bethyl_NE_3", 
             "iBAQ.SC_NE_1", "iBAQ.SC_NE_2", "iBAQ.SC_NE_3",
             "iBAQ.RNF169_NE_1", "iBAQ.RNF169_NE_2", 
             "iBAQ.BO_NE_1","iBAQ.BO_NE_2", "iBAQ.BO_NE_3", 
             
             "iBAQ.Chrom_BO_Subcell_1", "iBAQ.Chrom_BO_Subcell_2","iBAQ.Chrom_BO_Subcell_3", 
             "iBAQ.Chrom_D1A_Subcell_1", "iBAQ.Chrom_D1A_Subcell_2","iBAQ.Chrom_D1A_Subcell_3", 
             "iBAQ.Cyt_BO_Subcell_1", "iBAQ.Cyt_BO_Subcell_2","iBAQ.Cyt_BO_Subcell_3", 
             "iBAQ.Cyt_D1A_Subcell_1", "iBAQ.Cyt_D1A_Subcell_2","iBAQ.Cyt_D1A_Subcell_3", 
             "iBAQ.Nuc_BO_Subcell_1", "iBAQ.Nuc_BO_Subcell_2","iBAQ.Nuc_BO_Subcell_3", 
             "iBAQ.Nuc_D1A_Subcell_1", "iBAQ.Nuc_D1A_Subcell_2","iBAQ.Nuc_D1A_Subcell_3", 
             
             
             "LFQ.AbCam_NE_1", "LFQ.AbCam_NE_2", "LFQ.AbCam_NE_3", # "LFQ.AbCam_NE_4", "LFQ.AbCam_NE_5", 
             "LFQ.AbNova_NE_2", "LFQ.AbNova_NE_3", "LFQ.AbNova_NE_1", 
             "LFQ.Bethyl_NE_1", "LFQ.Bethyl_NE_2", "LFQ.Bethyl_NE_3", 
             "LFQ.SC_NE_1", "LFQ.SC_NE_2", "LFQ.SC_NE_3",
             "LFQ.RNF169_NE_1", "LFQ.RNF169_NE_2",
             "LFQ.BO_NE_1","LFQ.BO_NE_2", "LFQ.BO_NE_3", 
             
             "LFQ.Chrom_BO_Subcell_1", "LFQ.Chrom_BO_Subcell_2","LFQ.Chrom_BO_Subcell_3", 
             "LFQ.Chrom_D1A_Subcell_1", "LFQ.Chrom_D1A_Subcell_2","LFQ.Chrom_D1A_Subcell_3", 
             "LFQ.Cyt_BO_Subcell_1", "LFQ.Cyt_BO_Subcell_2","LFQ.Cyt_BO_Subcell_3", 
             "LFQ.Cyt_D1A_Subcell_1", "LFQ.Cyt_D1A_Subcell_2","LFQ.Cyt_D1A_Subcell_3", 
             "LFQ.Nuc_BO_Subcell_1", "LFQ.Nuc_BO_Subcell_2","LFQ.Nuc_BO_Subcell_3", 
             "LFQ.Nuc_D1A_Subcell_1", "LFQ.Nuc_D1A_Subcell_2","LFQ.Nuc_D1A_Subcell_3") 
 
beadnms <- c("int.BO_NE_1","int.BO_NE_2", "int.BO_NE_3", 
             "int.Cyt_BO_Subcell_1", "int.Cyt_BO_Subcell_2","int.Cyt_BO_Subcell_3", 
             "int.Chrom_BO_Subcell_1", "int.Chrom_BO_Subcell_2","int.Chrom_BO_Subcell_3",
             "int.Nuc_BO_Subcell_1", "int.Nuc_BO_Subcell_2","int.Nuc_BO_Subcell_3", 
             
             "iBAQ.BO_NE_1","iBAQ.BO_NE_2", "iBAQ.BO_NE_3", 
             "iBAQ.Cyt_BO_Subcell_1", "iBAQ.Cyt_BO_Subcell_2","iBAQ.Cyt_BO_Subcell_3", 
             "iBAQ.Chrom_BO_Subcell_1", "iBAQ.Chrom_BO_Subcell_2","iBAQ.Chrom_BO_Subcell_3",
             "iBAQ.Nuc_BO_Subcell_1", "iBAQ.Nuc_BO_Subcell_2","iBAQ.Nuc_BO_Subcell_3",
             
             "LFQ.BO_NE_1","LFQ.BO_NE_2", "LFQ.BO_NE_3", 
             "LFQ.Cyt_BO_Subcell_1", "LFQ.Cyt_BO_Subcell_2","LFQ.Cyt_BO_Subcell_3", 
             "LFQ.Chrom_BO_Subcell_1", "LFQ.Chrom_BO_Subcell_2","LFQ.Chrom_BO_Subcell_3",
             "LFQ.Nuc_BO_Subcell_1", "LFQ.Nuc_BO_Subcell_2","LFQ.Nuc_BO_Subcell_3" )     

alltnms <- alltnms[alltnms %in% names(d1a)]     
beadnms <- beadnms[beadnms %in% names(d1a)]     

# d1a <- d1afull
# replace NaN with NA
d1a[ is.na(d1a) ] <- NA

tnms <- grep("^LFQ.",alltnms, value = T)
beadnms <- grep("^LFQ.",beadnms, value = T)
nonbeadnms <- tnms[!tnms %in% beadnms]

datacols.ibaq <- grep("^iBAQ.",alltnms, value = T)

datacols <- tnms 

# Normalize
d1a[,datacols] <- normalizeBetweenArrays(d1a[,datacols], method = "cyclicloess")

d1a$Amean <- apply(d1a[,datacols], 1, mean, na.rm=T)

# Perform the statistical testing on normalized LFQ values (not iBAQ)  
targets <- data.frame(samp = datacols,
                      bait = gsub("LFQ\\.(.*)_.*(\\d+)","\\1", datacols))



targets$bait <- gsub("AbNova","Abnova",targets$bait) 

factlevels <- c("BO_NE","AbCam_NE", "Abnova_NE","Bethyl_NE","SC_NE",  "RNF169_NE",
                "Chrom_BO_Subcell", "Chrom_D1A_Subcell",
                "Cyt_BO_Subcell", "Cyt_D1A_Subcell",
                "Nuc_BO_Subcell","Nuc_D1A_Subcell")

f <- factor(targets$bait, levels = factlevels)
design <- model.matrix(~0+f)
colnames(design) <- factlevels
rownames(design) <- targets$samp
#fit <- lmFit(d1a[,datacols], design)

fit.norm <- lmFit(d1a[,datacols], design)

contrast.matrix <- makeContrasts(abcam=AbCam_NE-BO_NE,
                                 abnova=Abnova_NE-BO_NE,
                                 bethyl=Bethyl_NE-BO_NE,
                                 scruz=SC_NE-BO_NE, 
                                 rnf169=RNF169_NE-BO_NE,
                                 cytoabnova=Cyt_D1A_Subcell-Cyt_BO_Subcell,
                                 nuclabnova=Nuc_D1A_Subcell-Nuc_BO_Subcell,
                                 chromabnova=Chrom_D1A_Subcell-Chrom_BO_Subcell,
                                 levels = design)

fit2 <- contrasts.fit(fit.norm, contrast.matrix)
fit2 <- eBayes(fit2, trend =  F)

tophits.abcam <- topTable(fit2, coef = 1, genelist = d1a[,c('id','Gene.name', 'Protein.name')], p.value = 1, number = Inf, 
                          confint = T, sort.by = "B")
sum(tophits.abcam$adj.P.Val < 0.01, na.rm = T)


tophits <- topTable(fit2, coef = 1:length(colnames(contrast.matrix)), genelist = d1a$id, p.value = 1, number=Inf, confint = T, sort.by = "none")


pvalmat <- limma.one.sided(fit2, lower = F)
sum(p.adjust(pvalmat, method = "BH") < 0.01)

# adjpvalmat <- matrix(p.adjust(pvalmat, method = "BH"), ncol = 8)

adjpvalmat <- apply(pvalmat, 2, function(x) p.adjust(x, method = "BH"))

colnames(adjpvalmat) <- colnames(pvalmat)

colnames(pvalmat) <- paste(colnames(pvalmat), ".P.Value", sep = "")
colnames(adjpvalmat) <- paste(colnames(adjpvalmat), ".adj.P.Val", sep = "")

tophits <- cbind(tophits,pvalmat, adjpvalmat)



sum(tophits$adj.P.Val < 0.01)

tophits <- tophits %>%  left_join(d1a, by = c("ProbeID" =   "id"))

tophits <- tophits %>% dplyr::arrange(abnova.P.Value,
                                      abcam.P.Value)

coefs <- colnames(fit2$coefficients)
tnms <- c("Gene.name", coefs,
          paste(coefs, "adj.P.Val", sep = "."),
          "Protein.name",
          "AveExpr",
          paste(coefs, "P.Value", sep = "."),
          datacols,"ProbeID",  "F", "P.Value", "adj.P.Val") 


tnms[!tnms %in% names(tophits)]

othernms <- names(tophits)[!names(tophits) %in% tnms]
tophits <- tophits[, c(tnms, othernms)]

tophits$minpval <- apply(tophits[, grep("\\.adj.P.Val",names(tophits), value = T)], 1, min, na.rm = T)

qthresh <-  0.01
tophits2 <- tophits[tophits$minpval < qthresh ,]

tophits2 <- tophits2[tophits2$Gene.name != '' & !is.na(tophits2$Gene.name),]



write.table(tophits,file='DYRK1A_interactomeIPMS_MQeBayes_all_hits_20190613_.txt',
            quote = F,sep = '\t',na = "",row.names = F,col.names = T)
write.table(names(tophits), file='DYRK1A_interactomeIPMS_MQeBayes_all_hits_20190613_colnames_metadata.txt', col.names = F, row.names = F, quote = F)


write.table(tophits2,file='DYRK1A_interactomeIPMS_MQeBayes_q10pct_hits_20190613_.txt',
            quote = F,sep = '\t',na = "",row.names = F,col.names = T)
write.table(names(tophits2), file='DYRK1A_interactomeIPMS_MQeBayes_q10pct_hits_20190613_colnames_metadata.txt', col.names = F, row.names = F, quote = F)

 

#############################################################################################################################

# MA Plot visualization with SAINT/CRAPome data

#############################################################################################################################

# MA plot 
razoutnms <- c('Gene.name', 'uniprot.acc', 'uniprot.id',newraznms,
               names(tophits)[2:19],
               'gene.uniprot.acc')

razoutnms[!razoutnms %in% names(tophits)]

tophits$razuniqpeps <- tophits$Razor...unique.peptides
cntnms <- c("MS.MS.count", "razuniqpeps","Unique.peptides" ,
            "Sequence.coverage....","Unique...razor.sequence.coverage...." ,"Unique.sequence.coverage....")

preylist <- split(as.character(targets.ibaq$samp), targets.ibaq$bait)

meansum <- function(xnms) {
  apply(tophits[,xnms], 1, mean, na.rm = T)
}

tmp <- lapply(preylist, meansum)
tmp <- as.data.frame(tmp)
names(tmp) <- paste("AvgIBAQ",names(tmp), sep = '_')

tophits <- cbind(tophits, tmp)

# put gene symbols you want to annotate in plots in a file genes2plot_JOVEPaper.txt in working directory

prots2plot <- read.table(genes2plotfile, col.names = F,stringsAsFactors = F)[,1]
prots2plot <- tophits$Gene.name[tophits$Gene.name %in% prots2plot]

xlims <- c(-11,11)
ylims <- c(10,32)

fdrcuts <- c(0,0.01,0.02,0.05,1)

myggma <- tophits %>% ggmaplot(xvar = nuclabnova, yvar = AvgIBAQ_Nuc_D1A_Subcell, 
                               fdr.range.cut = fdrcuts,
                               adjpvalvar = nuclabnova.adj.P.Val,
                               xlimits = xlims,
                               ylimits = ylims,
                               xtitle = "log2(DYRK1A Abnova/NonOpt.Nucl Extract)",
                               plottitle = "Optimal IP-MS",
                               annotgenes = F)
names(myggma)

abnova_OptNE <- myggma$maplot.ggplot
abnova_OptNE.plotly <- myggma$maplot.plotly


abnova_OptNE.plotly <- abnova_OptNE.plotly %>% 
  add_annotations(x = tophits %>% 
                    filter(Gene.name %in% prots2plot ) %>% 
                    select(nuclabnova)  %>% pull(),
                  y  = tophits %>% filter(Gene.name %in% prots2plot) %>% 
                    select(AvgIBAQ_Nuc_D1A_Subcell)  %>% pull(),
                  text = prots2plot,
                  font = list(family = 'Arial',
                              size = 15,
                              color = 'black'),
                  showarrow = T,
                  visible = T, 
                  arrowhead = 4,
                  arrowwidth = 1,
                  arrowcolor = 'black',
                  arrowsize = .5,
                  ax = 100,
                  ay = -20,
                  clicktoshow = "onoff")

# Plot to view 
abnova_OptNE.plotly

# Prepare 


# Prepare CRAPome data (this file is output from the CRAPome/SAINT webtool)

optAbnova <- read.table(crapome_file,header =TRUE, sep = "", stringsAsFactors = F)


fdr.range.cut = c(0,0.01,0.05,0.1,1)

gcol <- brewer.pal(n = 5, name = "Greys")[3]
rcols <- rev(brewer.pal(n = 7, name = "Reds"))[1:(length(fdr.range.cut) - 1)]

optAbnova$sigintact <- FALSE
optAbnova$sigintact[optAbnova$DYRK1A_FC_A > 3 & optAbnova$DYRK1A_SP > 0.7] <- TRUE

ggy <- optAbnova %>% ggplot(mapping = aes( DYRK1A_FC_A, DYRK1A_SP) ) +
  geom_point(aes(color = sigintact)) +
  scale_colour_manual(values = c(gcol, rev(rcols))) +
  geom_vline(xintercept = 3,color = "red") +
  theme_classic(base_size = 15,
                base_line_size = 2)  +
  geom_hline(yintercept = 0.7,color = "red")  + xlab("FC-A") + ylab("SAINT probability")  +
  theme(legend.title = element_blank())  + ylim(c(0, 1.05))

ggply <- ggplotly(ggy, tooltip = "text")

tdf.thresh <- suboptAbnova %>% filter(suboptAbnova$DYRK1A_FC_A > 3 & suboptAbnova$DYRK1A_SP > 0.7)

tdf.thresh.dyr <- tdf.thresh %>% filter(GENE %in% c('DYRK1A','DCAF7'))

tdf.thresh <- tdf.thresh %>% filter(!GENE %in% c('FN1','TNRC6B','DYRK1A','DCAF7','TRMT61B'))


ggply <- ggply %>% add_annotations(x = tdf.thresh %>% select(DYRK1A_FC_A) %>% pull(),
                                   y  = tdf.thresh %>% select(DYRK1A_SP) %>% pull(),
                                   text = tdf.thresh$GENE,
                                   font = list(family = 'Arial',
                                               size = 15,
                                               color = 'black'),
                                   showarrow = T,
                                   visible = T, 
                                   arrowhead = 4,
                                   arrowwidth = 0.5,
                                   arrowcolor = 'grey',
                                   arrowsize = .5,
                                   ax = 20,
                                   ay = 20,
                                   clicktoshow = "onoff") 

ggply <- ggply %>% add_annotations(x = tdf.thresh.dyr %>% select(DYRK1A_FC_A) %>% pull(),
                                   y  = tdf.thresh.dyr %>% select(DYRK1A_SP) %>% pull(),
                                   text = tdf.thresh.dyr$GENE,
                                   font = list(family = 'Arial',
                                               size = 15,
                                               color = 'black'),
                                   showarrow = T,
                                   visible = T, 
                                   arrowhead = 4,
                                   arrowwidth = 0.5,
                                   arrowcolor = 'grey',
                                   arrowsize = .5,
                                   ax = 20,
                                   ay = -20,
                                   clicktoshow = "onoff") 

pibaq2 <- subplot(style(ggply,showlegend = F),
                  style(abnova_NonOptNE.plotly,showlegend = T),
                  widths = c(0.5, 0.5), heights = NULL,
                  which_layout = 1,margin = 0.1,
                  shareX = F, shareY = F,
                  titleX = T, titleY = T); pibaq2

# This will plot in a figure window, which can be exported to an html file 
# and shown in chrome etc. Print to pdf for a vector format plot
pibaq2