#! python3
# ubvTrim.py - .

####################################
#    Preamble
####################################

# Usage notes:
# * This code is dependent on the style of the worksheet used as the data source. This will be entirely based upon the output from the "phageDisplayElisaAnalysis.py" code.
# * Only amino acid sequences are trimmed, nucleotide sequences are excluded from this analysis.
# * This code will assume that there is an extra amino acid residue, K, leftover from the FLAG tag when using the output from the "phageDisplaySeqAnalysis.py" code. Without this, it will trim incorrectly.
# * This code will also retain any assumptions that were made from previous code, i.e. if the data source is the output from "phageDisplaySeqAnalysis.py" then all alignments will exclude sequences that weren't full length and those that have premature stop codons.

# Compatibility notes:
# * Advised to use Biopython 1.77 and no later version. If using a more current version, change alignment codes from "alignment.format(format_spec)" to "format(alignment, format_spec".
# * If using Spyder as your IDE, use a version that isn't version 5. This version for some reason has conflicts with the xlsxwriter package and won't get past importing modules.
# * This code is confirmed to work with python 3.8.8. Later versions may work but have not been verified.
# * Confirmed to work in Windows, unconfirmed in Macs and Linux but should work in theory (may need to change lines regarding path names so that the format matches the OS, currently these are optimised for Windows' format).

####################################
#    Modules
####################################

import os, re, logging, xlsxwriter, pandas
from collections import Counter, OrderedDict

####################################
#    Functions
####################################

# Compare components (x and y) of two strings (a and b, respectively) and return a string of the differences (diff) found in string b.
def compare(a, b):
    diff = ''
    for x, y in zip(a, b):
        if x == y:
            diff += '-'
        else:
            diff += y
    return diff

####################################
#    Classes
####################################

# Ordered list of counts.
class OrderedCounter(Counter, OrderedDict):
    pass

####################################
#    Code
####################################

##################
# Colours for print functions.
##################
cyan = lambda text: '\033[0;36m' + text + '\033[0m'
green = lambda text: '\033[0;32m' + text + '\033[0m'

##################
# Setup
##################
# Change working directory.
print(green('\nScript started. This will trim UbV sequences so that only diversified regions remain and pair them with their corresponding ELISA results for comparison.') +
      cyan(''''\n\nEnter folder location/path where sequences are located:

This will also be the location for the final output.'''))
trimPath = input()
trimPath = trimPath.replace('\\', '/')
os.chdir(trimPath)

# Logging setup.
pathRegex = re.compile(r'([\w.]+)$')
locList = pathRegex.findall(trimPath)
locStr = str(locList[0])
logging.basicConfig(filename = trimPath + "/" + locStr + ".log",
                    level = logging.INFO,
                    format = '%(asctime)s - %(message)s',
                    filemode = 'w')
logging.info('Working directory changed to %s.' % (trimPath))

# Choose amino acid sequence alignment file.
print(cyan('''\nEnter amino acid alignment file name:

Must be in .fasta format. Include the file extension in the name.'''))
seqFile = input()
seqFilePath = trimPath + '/' + seqFile
logging.info('%s chosen as amino acid sequence data source.' % (seqFilePath))

# Choose analysed ELISA file.
print(cyan('''\nEnter the analysed ELISA data file name:

Must be in .xlsx format and in the same folder as the previous alignment file. Include the file extension in the name.'''))
elisaFile = input()
elisaFilePath = trimPath + '/' + elisaFile
extensionRegex = re.compile(r'([.].*)')
elisaFileShort = re.sub(r"[.].*", "", elisaFile)

##################
# Data Analysis
##################

# Data extraction.
nameRegex = re.compile(r'>(.*)')
seqRegex = re.compile(r'([A-Z]{10,})')
stopRegex = re.compile(r'([*]+[A-Z]*)')
shortNameRegex = re.compile(r'([_][M][\w]*)')
# Extract amino acid sequence names.
with open(seqFilePath, 'r') as inFile:
    lines1 = inFile.read()
    lines1 = shortNameRegex.sub('', lines1)
    nameListAa = nameRegex.findall(lines1)
logging.info('Amino acid sequence names read from %s.' % (seqFile)) 
# Extract amino acid sequences. 
with open(seqFilePath, 'r') as inFile:
    lines2 = inFile.read()
    lines2 = lines2.replace('\n', '')
    lines2 = stopRegex.sub('', lines2)
    aaList = seqRegex.findall(lines2)
logging.info('Amino acid sequences read from %s.' %  (seqFile))
# Associate names with corresponding sequences.
aaDict = dict(zip(nameListAa, aaList))
logging.info('Dictionary of amino acid sequences and names created.')

# Extract statistics and IDs from ELISA file.
allData = pandas.read_excel(elisaFilePath, sheet_name='Unique Seq - AA', skiprows = 0, usecols = range(81, 87))
logging.info('%s data read.' % (elisaFile))
maxList = allData.iloc[:, 0].tolist()
logging.info('Maximum values extracted.')
minList = allData.iloc[:, 1].tolist()
logging.info('Minimum values extracted.')
medianList = allData.iloc[:, 2].tolist()
logging.info('Median values extracted.')
meanList = allData.iloc[:, 3].tolist()
logging.info('Mean values extracted.')
devList = allData.iloc[:, 4].tolist()
logging.info('Standard deviation values extracted.')
wellList = allData.iloc[:, 5].tolist()
logging.info('Wells extracted.')

# For each amino acid sequence, replace non-diversified regions with dashes.
# Remove amino acid prior to start codon.
shortaaList = []
for key, value in aaDict.items():
    shortaaSeq = value[1:]
    shortaaList.append(shortaaSeq)
logging.info('Initial non-ubiquitin amino acid removed from all sequences.')
# Compare UbV sequences against a consensus sequence and replace conserved amino acids with dashes.
consensusSeq = 'MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGGG'
consensusLen = len(consensusSeq)
conservedList = []
for ubvSeq in shortaaList:
    conservedList.append(compare(consensusSeq,ubvSeq))
logging.info('List of conserved sequences created.')

# Create list of unique amino acid sequences ordered by frequency.
unique = OrderedCounter(conservedList)
unique = unique.most_common()
uniqueDict = dict(unique)
logging.info('Dictionary of unique conserved sequences created.')

##################
# Export as .xlsx.
##################
# Create workbook.
workbook = xlsxwriter.Workbook(trimPath + '/' + elisaFileShort + '_conservation.xlsx')
logging.info('Excel spreadsheet created as "%s_conservation.xlsx".' % (elisaFileShort))

# Cell formatting rules. 
# General.
cell_format = workbook.add_format()
cell_format.set_bg_color('white')
cell_format.set_align('center')
cell_format.set_align('vcenter')
# Names.
name_format = workbook.add_format({'bold': True, 'font_size': 12})
name_format.set_bg_color('white')
name2_format = workbook.add_format({'bold': True, 'font_size': 12})
name2_format.set_align('center')
# IDs.
id_format = workbook.add_format({'bold': True, 'font_size': 12})
id_format.set_align('center')
id_format.set_align('vcenter')
# Numbers.
number_format = workbook.add_format({'num_format': '#,##0.0'})
number_format.set_bg_color('white')
number_format.set_align('center')
number_format.set_align('vcenter')
# Wells.
well_format = workbook.add_format({'font_size': 11})
well_format.set_bg_color('white')
# Residue numbers.
residue_format = workbook.add_format({'font_size': 10})
residue_format.set_align('center')
# Sequences.
sequence_format = workbook.add_format({'font_size': 10})
sequence_format.set_bg_color('white')
sequence_format.set_align('center')
sequence_format.set_align('vcenter')
sequence_format.set_font_name('Lucida Console')
# Region 1.
region1_format = workbook.add_format()
region1_format.set_bg_color('#BD7191')
# Region 2.
region2_format = workbook.add_format()
region2_format.set_bg_color('#8FC1C0')
# Region 3.
region3_format = workbook.add_format()
region3_format.set_bg_color('#DCA16A')
logging.info('Cell formatting rules are set.')

##################
# 'Conserved AA' worksheet.
##################

# Create worksheet for unique conserved amino acid sequences.
worksheet1 = workbook.add_worksheet('Conserved AA')
worksheet1.hide_gridlines(option=2)
worksheet1.set_column(1, consensusLen, 2)
worksheet1.set_column(consensusLen + 2, consensusLen + 5, 8)
worksheet1.freeze_panes(0, 1)
logging.info('Conserved AA worksheet created.')

# Assign IDs to each unique amino acid sequence.
worksheet1.write(1, 0, 'ID', name2_format)
numberList = list(range(1,len(uniqueDict)+1))
row1 = 2
for number in numberList:
    worksheet1.write_number(row1, 0, number, cell_format)
    row1 += 1
logging.info('IDs written to Conserved AA worksheet.')
    
# Write amino acid residue numbers above sequences.
numberList = list(range(1,consensusLen+1))
col1 = 1
for number in numberList:
    worksheet1.write_number(1, col1, number, residue_format)
    col1 += 1    

# Write unique amino acid sequences to Unique Seq - AA worksheet.
worksheet1.write(0, 1, 'Amino Acid Sequence', name_format)
row2 = 2
col2 = 1
for seq in uniqueDict.keys():
    letterList = list(seq)
    for letter in letterList:
        worksheet1.write(row2, col2, letter, sequence_format)
        col2 += 1
    row2 += 1
    col2 = 1
logging.info('Unique conserved sequences written to Conserved AA worksheet.')

# Add counts for each unique amino acid sequence.
row3 = 2
col3 = consensusLen + 1
worksheet1.write(1, consensusLen + 1, 'Count', name2_format)
count = list(uniqueDict.values())
for number in count:
    worksheet1.write_number(row3, col3, number, cell_format)
    row3 += 1
logging.info('Counts written to Conserved AA worksheet.')

# Write statistics to 'Conserved AA' worksheet.
# Max.
row4 = 2
col4 = consensusLen + 2
worksheet1.write(1, consensusLen + 2, 'Max', name2_format)
for number in maxList:
    worksheet1.write(row4, col4, number, number_format)
    row4 += 1
logging.info('Maximum values written to worksheet.')
# Min.
row5 = 2
col5 = consensusLen + 3
worksheet1.write(1, consensusLen + 3, 'Min', name2_format)
for number in minList:
    worksheet1.write(row5, col5, number, number_format)
    row5 += 1
logging.info('Minimum values written to worksheet.')
# Median.
row6 = 2
col6 = consensusLen + 4
worksheet1.write(1, consensusLen + 4, 'Median', name2_format)
for number in medianList:
    worksheet1.write(row6, col6, number, number_format)
    row6 += 1
logging.info('Median values written to worksheet.')
# Mean.
row7 = 2
col7 = consensusLen + 5
worksheet1.write(1, consensusLen + 5, 'Mean', name2_format)
for number in meanList:
    worksheet1.write(row7, col7, number, number_format)
    row7 += 1
logging.info('Mean values written to worksheet.')
# St. dev.
row8 = 2
col8 = consensusLen + 6
worksheet1.write(1, consensusLen + 6, 'St. Dev.', name2_format)
for number in devList:
    worksheet1.write(row8, col8, number, number_format)
    row8 += 1
logging.info('Standard deviation values written to worksheet.')
# Wells.
row9 = 2
col9 = consensusLen + 7
worksheet1.write(1, consensusLen + 7, 'Wells', name2_format)
for well in wellList:
    worksheet1.write(row9, col9, well, well_format)
    row9 += 1
logging.info('Wells written to worksheet.')

# Conditional formatting for statistics.
worksheet1.conditional_format(1, consensusLen + 2, len(unique), consensusLen + 6, {'type': '2_color_scale', 'min_color': '#FAFAFA', 'max_color': '#008000'})
logging.info('Conditional formatting applied to statistics.')

# Region 1 formatting.
worksheet1.write(len(unique)+2, 7, 'Region 1', name_format)
worksheet1.conditional_format(2, 2, len(unique)+1, 2, {'type':'no_blanks', 'format': region1_format})
worksheet1.conditional_format(2, 4, len(unique)+1, 4, {'type':'no_blanks', 'format': region1_format})
worksheet1.conditional_format(2, 6, len(unique)+1, 6, {'type':'no_blanks', 'format': region1_format})
worksheet1.conditional_format(2, 8, len(unique)+1, 12, {'type':'no_blanks', 'format': region1_format})
worksheet1.conditional_format(2, 14, len(unique)+1, 14, {'type':'no_blanks', 'format': region1_format})
logging.info('Region 1 coloured.')

# Region 2 formatting.
worksheet1.write(len(unique)+2, 41, 'Region 2', name_format)
worksheet1.conditional_format(2, 35, len(unique)+1, 35, {'type':'no_blanks', 'format': region2_format})
worksheet1.conditional_format(2, 37, len(unique)+1, 37, {'type':'no_blanks', 'format': region2_format})
worksheet1.conditional_format(2, 39, len(unique)+1, 40, {'type':'no_blanks', 'format': region2_format})
worksheet1.conditional_format(2, 42, len(unique)+1, 42, {'type':'no_blanks', 'format': region2_format})
worksheet1.conditional_format(2, 44, len(unique)+1, 44, {'type':'no_blanks', 'format': region2_format})
worksheet1.conditional_format(2, 46, len(unique)+1, 49, {'type':'no_blanks', 'format': region2_format})
logging.info('Region 2 coloured.')

# Region 3 formatting.
worksheet1.write(len(unique)+2, 70, 'Region 3', name_format)
worksheet1.conditional_format(2, 62, len(unique)+1, 64, {'type':'no_blanks', 'format': region3_format})
worksheet1.conditional_format(2, 66, len(unique)+1, 66, {'type':'no_blanks', 'format': region3_format})
worksheet1.conditional_format(2, 68, len(unique)+1, 68, {'type':'no_blanks', 'format': region3_format})
worksheet1.conditional_format(2, 70, len(unique)+1, 78, {'type':'no_blanks', 'format': region3_format})
logging.info('Region 3 coloured.')

##################
# Final workbook formatting.
##################

# Transform data into proper Excel-formatted tables.
worksheet1.add_table(2, 0, len(unique)+1, consensusLen + 7, {'header_row': False, 'style': None})

# Close .xlsx file.
workbook.close()

# Conclusion.
print(green('\nExcel conserved sequence alignment with ELISA scores saved as %s_conservation.xlsx.' % (elisaFileShort)))
logging.info('Excel file exported as %s_conservation.xlsx.' % (elisaFileShort))
print(green('\nAnalysis finished. See log file for details.'))
logging.info('ubvTrims.py finished running.')

# Shutdown logging.
logging.shutdown()