library(limma)
library(tidyverse)
library(imputeLCMD)

# The targets_for_crapome.txt file is a tab-delimited text file with one row
# per sample. Each row names a sample column of the MaxQuant proteinGroups.txt
# file that should be selected for analysis.
# 
# The file should contain the following columns:
# samp: arbitrary row id
# SampleName: sample replicate names - important, these should correspond to column names in the MaxQuant file 
# prep: the sample prep name 
# BaitName: name of the bait
# APName: name of the bait group used for differential analysis

# Load the sample annotation table (tab-delimited, with a header row).
crapometargets <- read.table(
  "targets_for_crapome.txt",
  sep = "\t",
  header = TRUE
)

# Fail early with a readable message; every later step keys on SampleName.
if (!"SampleName" %in% names(crapometargets)) {
  stop("targets_for_crapome.txt has no 'SampleName' column", call. = FALSE)
}

# Drop the "LFQ" prefix from the sample names. The pattern is a regular
# expression, so the trailing "." matches any single character after "LFQ",
# not only a literal dot.
crapometargets$SampleName <- gsub("LFQ.", "", crapometargets$SampleName)

# Path of the MaxQuant protein-group table to load.
maxquantfile <- "DYRK1A_Interactome_MasterMaxQuantAnalysis_perseusAnnot_imputed.txt"

# Return the n-th `sep`-delimited field of every element of `x`
# (NA where an element has fewer than n fields). `sep` is a regular expression.
nth_field <- function(x, sep, n) {
  unlist(lapply(strsplit(x, sep), function(parts) parts[n]))
}

# Read the table; lines starting with "#" are skipped as comments and text
# columns are kept as character vectors so they can be split below.
mqdf <- read.delim(
  maxquantfile,
  comment.char = "#",
  stringsAsFactors = FALSE
)

# Keep only the first gene name of each ";"-separated list.
mqdf$Gene.name <- nth_field(mqdf$Gene.name, ";", 1)

# uniprot.id: third "|"-separated field of the first ";"-separated protein
# entry in Majority.protein.IDs.
first_protein <- nth_field(mqdf$Majority.protein.IDs, ";", 1)
mqdf$uniprot.id <- nth_field(first_protein, "\\|", 3)

# uniprot.acc: second "|"-separated field of the whole Majority.protein.IDs
# string, with anything from the first ";" onwards removed.
mqdf$uniprot.acc <- sub(
  ";.*", "",
  nth_field(mqdf$Majority.protein.IDs, "\\|", 2)
)

# Locate the per-sample "Razor...unique.peptides.<sample>" columns. The
# pattern needs at least one character after "peptides", so the un-suffixed
# "Razor...unique.peptides" column is not selected here.
razid <- grepl("Razor...unique.peptides.", names(mqdf))
raznms <- names(mqdf)[razid]

# Rename "Razor...unique.peptides.<sample>" to "<sample>_RZUP".
newraznms <- gsub("Razor\\.\\.\\.unique\\.peptides\\.(.*)", "\\1_RZUP", raznms)
names(mqdf)[razid] <- newraznms

# Combined gene-name / accession label.
mqdf$gene.uniprot.acc <- paste(mqdf$Gene.name, mqdf$uniprot.acc, sep = "_")

# Columns carried forward: one renamed count column per sample listed in the
# targets table, plus the protein identifier.
samps2keep <- paste0(crapometargets$SampleName, "_RZUP")
cols2keep <- c(samps2keep, "uniprot.id")

# Validate AFTER the rename above; checking before it would flag every sample
# as missing because the "_RZUP" names would not exist yet.
missing_cols <- setdiff(cols2keep, names(mqdf))
if (length(missing_cols) > 0) {
  stop(
    "Columns required for the CRAPome export are missing from the MaxQuant ",
    "table: ", paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Copy the un-suffixed razor + unique peptide column under a shorter name.
# `[[` matches the name exactly; a missing column is reported here rather than
# silently skipped.
if (!"Razor...unique.peptides" %in% names(mqdf)) {
  stop(
    "Column 'Razor...unique.peptides' not found in the MaxQuant table",
    call. = FALSE
  )
}
mqdf$razuniqpeps <- mqdf[["Razor...unique.peptides"]]

# Names of count / coverage summary columns.
cntnms <- c(
  "MS.MS.count", "razuniqpeps", "Unique.peptides",
  "Sequence.coverage....", "Unique...razor.sequence.coverage....",
  "Unique.sequence.coverage...."
)


# Collapse rows that share the same uniprot.id to a single row: within each
# uniprot.id group, keep the row with the highest razor + unique peptide count
# (razuniqpeps). Ties go to the row that appears first in the table so the
# result is reproducible between runs. Rows whose razuniqpeps is NA get an NA
# rank and are dropped by the filter.
mqdf.uniqgene <- mqdf %>%
  dplyr::group_by(uniprot.id) %>%
  dplyr::mutate(the_rank = rank(-razuniqpeps, ties.method = "first")) %>%
  dplyr::filter(the_rank == 1) %>%
  # Drop the grouping so later steps work on a plain table.
  dplyr::ungroup() %>%
  # all_of() makes it explicit that cols2keep is an external character vector
  # of column names, not a column of mqdf.
  dplyr::select(dplyr::all_of(cols2keep))


# For CRAPome input file generation

# Names of all "_RZUP" columns currently present in mqdf.
newraznms <- grep("RZUP", names(mqdf), value = TRUE)
raznms <- newraznms

# Convert the data from "wide" form to the "long" form that CRAPome analysis
# requires: the per-sample count columns are collapsed into SampleName / SC
# key-value pairs, repeating uniprot.id for every sample.
samp_cols <- setdiff(cols2keep, "uniprot.id")

crapome <- mqdf.uniqgene %>%
  dplyr::select(uniprot.id, dplyr::all_of(samp_cols)) %>%
  gather(key = SampleName, value = SC, dplyr::all_of(samp_cols)) %>%
  dplyr::select(SampleName, uniprot.id, SC)

# Turn the column names back into sample names: remove the "_RZUP" marker,
# then remove every remaining underscore.
crapome$SampleName <- gsub("_RZUP", "", crapome$SampleName)
crapome$SampleName <- gsub("_", "", crapome$SampleName)

# This assumes a single bait (DYRK1A in this case). Samples whose SampleName
# contains "NucBOSubcell" are the beads-only controls; they are labelled "C"
# so that the CRAPome analysis treats them as controls. Every other sample is
# labelled with the bait name.
bead_ctrl <- grepl("NucBOSubcell", crapome$SampleName)
crapome$SampType <- replace(rep("DYRK1A", nrow(crapome)), bead_ctrl, "C")

# Put the columns in the order written to the CRAPome input file.
crapome <- select(crapome, SampType, SampleName, uniprot.id, SC)

# Write out the file to use as input to the CRAPome tool
# built by the Alexey Nesvizhskii lab:
#  http://crapome.org/?q=chooseworkflow
# Use the "Workflow 3: Analyze Your Data" workflow to upload this file.
# You should use only user controls for this analysis. See the help files at
# crapome.org.
#
# The file is tab-separated with no header row, no row names and no quoting;
# missing values are written as empty fields.
write.table(
  crapome,
  file = "DYRK1A_interactomeIPMS_CRAPomeInputData_20190614.txt",
  quote = FALSE,
  sep = "\t",
  na = "",
  row.names = FALSE,
  col.names = FALSE
)