#! python3 # phageDisplayElisaAnalysis.py - Analyse ELISA results along with corresponding sequence data. Calculates the average of duplicates for each protein and normalizes them against the average of the negative controls/blanks. ELISA results that don't have corresponding sequencing results are removed from the final results. #################################### # Preamble #################################### # Usage notes: # * Accepted plate format: # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 # ---------------------------------------------------------------------------------------- # A | 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 08 09 09 10 10 11 11 12 12 # B | <____________________________________empty___________________________________________> # C | 13 13 14 14 15 15 16 16 17 17 18 18 19 19 20 20 21 21 22 22 23 23 24 24 # D | <____________________________________empty___________________________________________> # E | 25 25 26 26 27 27 28 28 29 29 30 30 31 31 32 32 33 33 34 34 35 35 36 36 # F | <____________________________________empty___________________________________________> # G | 37 37 38 38 39 39 40 40 41 41 42 42 43 43 44 44 45 45 46 46 47 47 48 48 # H | <____________________________________empty___________________________________________> # I | 49 49 50 50 51 51 52 52 53 53 54 54 55 55 56 56 57 57 58 58 59 59 60 60 # J | <____________________________________empty___________________________________________> # K | 61 61 62 62 63 63 64 64 65 65 66 66 67 67 68 68 69 69 70 70 71 71 72 72 # L | <____________________________________empty___________________________________________> # M | 73 73 74 74 75 75 76 76 77 77 78 78 79 79 80 80 81 81 82 82 83 83 84 84 # N | <____________________________________empty___________________________________________> # O | 85 85 86 86 87 87 88 88 89 89 90 90 91 91 92 92 93 93 94 94 95 95 96 96 # P | <___________________________________BLANKS___________________________________________> # * This code will retain any assumptions that were made from previous code, i.e. if the data source is the output from "phageDisplaySeqAnalysis.py" then all alignments will exclude sequences that weren't full length and those that have premature stop codons. # Compatibility notes: # * Use with "phageDisplayELISA384well" protocol in BioTek plate reader and the "phageDisplay_raw/transformedData" export formats. # * Advised to use Biopython 1.77 and no later version. If using a more current version, change alignment codes from "alignment.format(format_spec)" to "format(alignment, format_spec". # * If using Spyder as your IDE, use a version that isn't version 5. This version for some reason has conflicts with the xlsxwriter package and won't get past importing modules. # * This code is confirmed to work with python 3.8.8. Later versions may work but have not been verified. # * Confirmed to work in Windows, unconfirmed in Macs and Linux but should work in theory (may need to change lines regarding path names so that the format matches the OS, currently these are optimised for Windows' format). #################################### # Modules #################################### import os, re, logging, xlsxwriter, pandas, statistics from Bio import AlignIO from collections import Counter, OrderedDict #################################### # Functions #################################### # Create an average of a list. def aveList(list): return sum(list) / len(list) #################################### # Classes #################################### # Ordered list of counts. class OrderedCounter(Counter, OrderedDict): pass #################################### # Code #################################### ################## # Colours for print functions. ################## cyan = lambda text: '\033[0;36m' + text + '\033[0m' green = lambda text: '\033[0;32m' + text + '\033[0m' ################## # Setup ################## # Change working directory. print(green('\nScript started. This will pair sequencing data with their corresponding ELISA results for comparison.') + cyan(''''\n\nEnter folder location/path where ELISA results are located: This will also be the location for the final output.''')) elisaPath = input() elisaPath = elisaPath.replace('\\', '/') os.chdir(elisaPath) # Choose ELISA data source. print(cyan('''\nEnter the raw ELISA data file name: Must be in .xlsx format. Include the file extension in the name.''')) elisaFile = input() elisaFilePath = elisaPath + '/' + elisaFile extensionRegex = re.compile(r'([.].*)') elisaFileShort = re.sub(r"[.].*", "", elisaFile) # Logging setup. logging.basicConfig(filename = elisaPath + "/" + elisaFileShort + ".log", level = logging.INFO, format = '%(asctime)s - %(message)s', filemode = 'w') logging.info('Working directory changed to %s.' % (elisaPath)) logging.info('%s chosen as ELISA data source.' % (elisaFile)) # Choose sequence file data sources and corresponding alignment files for ELISA data. print(cyan('\nEnter alignment file path:')) seqPath = input() print(cyan('''\nEnter amino acid alignment file name: Must be in .fasta format. Include the file extension in the name.''')) seqFile = input() seqFilePath = seqPath + '/' + seqFile logging.info('%s chosen as amino acid sequence data source.' % (seqFilePath)) print(cyan('''\nEnter nucleotide alignment file name: Must be in .fasta format. Include the file extension in the name.''')) seqFile2 = input() seqFilePath2 = seqPath + '/' + seqFile2 logging.info('%s chosen as nucleotide sequence data source. Include the file extension in the name.' % (seqFilePath2)) ################## # Data Analysis ################## nameRegex = re.compile(r'>(.*)') seqRegex = re.compile(r'([A-Z]{10,})') stopRegex = re.compile(r'([*]+[A-Z]*)') shortNameRegex = re.compile(r'([_][M][\w]*)') # Extract amino acid sequence names. with open(seqFilePath, 'r') as inFile: lines1 = inFile.read() lines1 = shortNameRegex.sub('', lines1) nameListAa = nameRegex.findall(lines1) logging.info('Amino acid sequence names read from %s.' % (seqFile)) # Extract amino acid sequences. with open(seqFilePath, 'r') as inFile: lines2 = inFile.read() lines2 = lines2.replace('\n', '') lines2 = stopRegex.sub('', lines2) aaList = seqRegex.findall(lines2) logging.info('Amino acid sequences read from %s.' % (seqFile)) # Extract nucleotide sequence names. with open(seqFilePath2, 'r') as inFile: lines1 = inFile.read() lines1 = shortNameRegex.sub('', lines1) nameListNt = nameRegex.findall(lines1) logging.info('Nucleotide sequence names read from %s.' % (seqFile2)) # Extract nucleotide sequences. with open(seqFilePath2, 'r') as inFile: lines2 = inFile.read() lines2 = lines2.replace('\n', '') lines2 = stopRegex.sub('', lines2) ntList = seqRegex.findall(lines2) logging.info('Nucleotide sequences read from %s.' % (seqFile2)) # Associate names with corresponding sequences. aaDict = dict(zip(nameListAa, aaList)) ntDict = dict(zip(nameListNt, ntList)) # Align amino acid sequences and calculate alignment length. alignment = AlignIO.read(seqFilePath, "fasta") alignLen = alignment.get_alignment_length() logging.info('Amino acid alignment length calculated.') # Align nucleotide sequences and calculate alignment length. alignment2 = AlignIO.read(seqFilePath2, "fasta") alignLen2 = alignment2.get_alignment_length() logging.info('Nucleotide alignment length calculated.') # # Read ELISA data. # Extract data and make dataframe of raw results. allCells = pandas.read_excel(elisaFilePath, skiprows = [i for i in range(1,46)], usecols = range(2, 26)) logging.info('Raw data read from ELISA file.') # Extract values for negative controls/blanks. blankCells = {'P1': allCells.iloc[15][0], 'P2': allCells.iloc[15][1], 'P3': allCells.iloc[15][2], 'P4': allCells.iloc[15][3], 'P5': allCells.iloc[15][4], 'P6': allCells.iloc[15][5], 'P7': allCells.iloc[15][6], 'P8': allCells.iloc[15][7], 'P9': allCells.iloc[15][8], 'P10': allCells.iloc[15][9], 'P11': allCells.iloc[15][10], 'P12': allCells.iloc[15][11], 'P13': allCells.iloc[15][12], 'P14': allCells.iloc[15][13], 'P15': allCells.iloc[15][14], 'P16': allCells.iloc[15][15], 'P17': allCells.iloc[15][16], 'P18': allCells.iloc[15][17], 'P19': allCells.iloc[15][18], 'P20': allCells.iloc[15][19], 'P21': allCells.iloc[15][20], 'P22': allCells.iloc[15][21], 'P23': allCells.iloc[15][22], 'P24': allCells.iloc[15][23]} print(cyan('''\nEnter wells containing ELISA blanks: Not case-sensitive; separate with commas (no spaces) if more than one.''')) blanks = input() blanks = blanks.upper() blanks = blanks.split(',') logging.info('"%s" used as negative controls/blanks.' % (blanks)) # Create list of ELISA scores for user-inputted blank IDs. blankValues = list() for i in blanks: for key, value in blankCells.items(): if i in blankCells: try: blankValues.append(blankCells.get(i)) break except: pass # Average blanks or pass if there's only one blank. try: blankAve = aveList(blankValues) logging.info('Blank values retrieved and averaged.') except: blankAve = int(blankValues) logging.info('Single blank value retrieved but not cannot be averaged.') # Create list of all ELISA scores. cellValues = list(allCells.iloc[0]) + list(allCells.iloc[2])+ list(allCells.iloc[4]) + list(allCells.iloc[6]) + list(allCells.iloc[8]) + list(allCells.iloc[10]) + list(allCells.iloc[12]) + list(allCells.iloc[14]) cellAve = list() for i in range(0, len(cellValues), 2): try: cellAve.append(statistics.mean([cellValues[i], cellValues[i + 1]])) except: pass logging.info('Result wells extracted from raw data.') # Normalise ELISA scores to the blank average. relAveList = list() for value in cellAve: relAve = value/blankAve relAveList.append(relAve) logging.info('Result averaged relative to average of blanks (or single blank if less than two blanks).') # Remove ELISA data that don't have amino acid sequencing counterparts. # Extract IDs present in sequencing data. seqPlateRegex = re.compile(r'([A-H]\d+)') plateIDs = list() for name in nameListAa: seqID = seqPlateRegex.findall(name) plateIDs.append(seqID) # Turn list of lists into a flat list. seqPlateIDs = [] for sublist in plateIDs: for item in sublist: seqPlateIDs.append(item) # Create list of all ELISA IDs and assign corresponding numbers for indexing. elisaPlateIDs = ['A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09', 'A10', 'A11', 'A12', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09', 'B10', 'B11', 'B12', 'C01', 'C02', 'C03', 'C04', 'C05', 'C06', 'C07', 'C08', 'C09', 'C10', 'C11', 'C12', 'D01', 'D02', 'D03', 'D04', 'D05', 'D06', 'D07', 'D08', 'D09', 'D10', 'D11', 'D12', 'E01', 'E02', 'E03', 'E04', 'E05', 'E06', 'E07', 'E08', 'E09', 'E10', 'E11', 'E12', 'F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07', 'F08', 'F09', 'F10', 'F11', 'F12', 'G01', 'G02', 'G03', 'G04', 'G05', 'G06', 'G07', 'G08', 'G09', 'G10', 'G11', 'G12', 'H01', 'H02', 'H03', 'H04', 'H05', 'H06', 'H07', 'H08', 'H09', 'H10', 'H11', 'H12'] wellListConversion = {k: v for k, v in zip(elisaPlateIDs, range(0,96))} # Remove ELISA scores for IDs not present in sequencing data from ELISA data. reducedrelAveListAa = [] for ID in seqPlateIDs: well = wellListConversion.get(ID) reducedrelAveListAa.append(relAveList[well]) logging.info('ELISA results without corresponding sequencing results removed.') # Remove ELISA data that don't have nucleotide sequencing counterparts. # Extract IDs present in sequencing data. plateIDs = list() for name in nameListNt: seqID = seqPlateRegex.findall(name) plateIDs.append(seqID) # Turn list of lists into a flat list. seqPlateIDs = [] for sublist in plateIDs: for item in sublist: seqPlateIDs.append(item) # Remove ELISA scores for IDs not present in sequencing data from ELISA data. reducedRelAveListNt = [] for ID in seqPlateIDs: well = wellListConversion.get(ID) reducedRelAveListNt.append(relAveList[well]) logging.info('ELISA results without corresponding sequencing results removed.') # Relate IDs to their respective ELISA scores (amino acids). newNameRegex = re.compile(r'([A-H])(\d{1}$)') newNameListAa = list() for name in nameListAa: if newNameRegex.findall(name): newName = re.sub(r'([A-H])(\d{1}$)', r'\g<1>0\g<2>', name) newNameListAa.append(newName) else: newNameListAa.append(name) newNameListAa.sort() # Relate IDs to their respective ELISA scores (nucleotides). newNameRegex = re.compile(r'([A-H])(\d{1}$)') newNameListNt = list() for name in nameListNt: if newNameRegex.findall(name): newName = re.sub(r'([A-H])(\d{1}$)', r'\g<1>0\g<2>', name) newNameListNt.append(newName) else: newNameListNt.append(name) newNameListNt.sort() # Create dictionaries with new reorganised names/sequences. newaaDict = dict(zip(newNameListAa, aaList)) newntDict = dict(zip(newNameListNt, ntList)) aveCellsDictAa = dict(zip(newNameListAa, reducedrelAveListAa)) aveCellsDictNt = dict(zip(newNameListNt, reducedRelAveListNt)) ################## # Export as .xlsx. ################## # Create workbook. workbook = xlsxwriter.Workbook(elisaPath + '/' + elisaFileShort + '_analysed.xlsx') logging.info('Excel spreadsheet created as "%s.xlsx".' % (elisaFileShort)) # Cell formatting rules. # General. cell_format = workbook.add_format() cell_format.set_bg_color('white') cell_format.set_align('center') cell_format.set_align('vcenter') # Names. name_format = workbook.add_format({'bold': True, 'font_size': 12}) name_format.set_bg_color('white') name2_format = workbook.add_format({'bold': True, 'font_size': 12}) name2_format.set_align('center') # IDs. id_format = workbook.add_format({'bold': True, 'font_size': 12}) id_format.set_align('center') id_format.set_align('vcenter') # Numbers. number_format = workbook.add_format({'num_format': '#,##0.0'}) number_format.set_bg_color('white') number_format.set_align('center') number_format.set_align('vcenter') # Wells. well_format = workbook.add_format({'font_size': 11}) well_format.set_bg_color('white') ################## # 'All Seq - AA' worksheet. ################## # Create worksheet for all amino acid sequences. worksheet1 = workbook.add_worksheet('All Seq - AA') logging.info('All Seq - AA worksheet created.') worksheet1.set_column(0, 0, 20) worksheet1.set_column(1, alignLen, 1.5) worksheet1.freeze_panes(0, 1) # Write IDs. row1 = 1 col1 = 0 worksheet1.write(0, 0, 'ID', id_format) for name in newNameListAa: worksheet1.write(row1, col1, name, name_format) row1 += 1 logging.info('"%s" written to All Seq - AA worksheet.' % (name)) # Write sequences. worksheet1.write(0, 1, 'Amino Acid Sequence', name_format) row2 = 1 col2 = 1 for aa in aaList: letterList = list(aa) for letter in letterList: worksheet1.write(row2, col2, letter, cell_format) col2 +=1 row2 += 1 col2 = 1 logging.info('Sequences (broken up) written to All Seq - AA worksheet.') # Write scores. row12 = 1 col12 = alignLen + 1 worksheet1.write(0, col12, 'Normalized Absorbance', name_format) for result in reducedrelAveListAa: worksheet1.write(row12, col12, result, number_format) row12 += 1 logging.info('"%s" written to All Seq - AA worksheet.' % (result)) worksheet1.conditional_format(1, alignLen + 1, len(newNameListAa), alignLen + 1, {'type': '2_color_scale', 'min_color': '#FAFAFA', 'max_color': '#008000'}) ################## # 'Unique Seq - AA' worksheet. ################## # Create worksheet for unique amino acid sequences. worksheet2 = workbook.add_worksheet('Unique Seq - AA') logging.info('Unique Seq - AA worksheet created.') worksheet2.set_column(1, alignLen, 1.5) worksheet2.set_column(alignLen + 2, alignLen + 5, 8) # Create list of unique amino acid sequences ordered by frequency. unique = OrderedCounter(aaList) unique = unique.most_common() uniqueDict = dict(unique) # Write unique amino acid sequences to Unique Seq - AA worksheet. worksheet2.write(0, 1, 'Amino Acid Sequence', name_format) row3 = 1 col3 = 1 for seq in uniqueDict.keys(): letterList = list(seq) for letter in letterList: worksheet2.write(row3, col3, letter, cell_format) col3 += 1 row3 += 1 col3 = 1 logging.info('Unique sequences written to Unique Seq - AA worksheet.') # Add counts for each unique amino acid sequence. row5 = 1 col5 = alignLen + 1 worksheet2.write(0, alignLen + 1, 'Count', name2_format) count = list(uniqueDict.values()) for number in count: worksheet2.write_number(row5, col5, number, cell_format) row5 += 1 logging.info('Sequence counts written to Unique Seq - AA worksheet.') ################## # Ordered values: Amino Acids ################## # Get ordered list of wells that correspond to unique sequences; necessary for subsequent statistics. orderedSeq = [] for key in uniqueDict.keys(): orderedSeq.append(key) orderedIndex = [] for seq in orderedSeq: for key, value in newaaDict.items(): if seq in value: orderedIndex.append(key) # Get ordered values corresponding to ordered list of wells for unique sequences; necessary for subsequent statistics. orderedScores = [] for index in orderedIndex: for ID, score in aveCellsDictAa.items(): if index == ID: orderedScores.append(score) # Add zero to beginning of list to make tracking "begin" and "end" easier in subsequent statistics. orderedScores[0:0] = [0] ################## # Statistics: Amino Acids ################## # Notes: (applies to both amino acid and nucleotide sequence statistics) # * Statistics will encounter an error if "overflow" cells are in the original ELISA file, this is reflected in the length of cellAve being less than the total number of sequences. Replace "OVFLW" wells with "4". # * If you encounter an error involving index values being out of range, this is because the sum of all sequence counts in 'uniqueDict' does not match the total in 'orderedScores'. Check the original sequencing file to make sure sequences aren't repeated. # Retrieve max score for ordered values. uniqueMaxList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueMax = max(orderedScores[begin:end]) uniqueMaxList.append(uniqueMax) # Above statistic won't work if only a single value so append the "end" (i.e. only) value to the list. except: uniqueMaxList.append(orderedScores[end]) begin += count # Remove zero used for tracking purposes. uniqueMaxList.pop(0) # Retrieve min score for ordered values. uniqueMinList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueMin = min(orderedScores[begin:end]) uniqueMinList.append(uniqueMin) # Above statistic won't work if only a single value so append the "end" (i.e. only) value to the list. except: uniqueMinList.append(orderedScores[end]) begin += count # Remove zero used for tracking purposes. uniqueMinList.pop(0) # Retrieve median score for ordered values. uniqueMedianList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueMedian = statistics.median(orderedScores[begin:end]) uniqueMedianList.append(uniqueMedian) # Above statistic won't work if only a single value so append the "end" (i.e. only) value to the list. except: uniqueMedianList.append(orderedScores[end]) begin += count # Remove zero used for tracking purposes. uniqueMedianList.pop(0) # Retrieve mean score for ordered values. uniqueMeanList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueMean = statistics.mean(orderedScores[begin:end]) uniqueMeanList.append(uniqueMean) # Above statistic won't work if only a single value so append the "end" (i.e. only) value to the list. except: uniqueMeanList.append(orderedScores[end]) begin += count # Remove zero used for tracking purposes. uniqueMeanList.pop(0) # Retrieve stdev score for ordered values. uniqueStdevList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueStdev = statistics.stdev(orderedScores[begin:end]) uniqueStdevList.append(uniqueStdev) # Above statistic won't work if only a single value so append "0.0" value to the list. except: uniqueStdevList.append(0) begin += count # Remove zero used for tracking purposes. uniqueStdevList.pop(0) # Write statistics to Unique Seq - AA worksheet. # Max. row34 = 1 col34 = alignLen + 2 worksheet2.write(0, alignLen + 2, 'Max', name2_format) for seq in uniqueMaxList: worksheet2.write(row34, col34, seq, number_format) row34 += 1 # Min. row34 = 1 col35 = col34 + 1 worksheet2.write(0, alignLen + 3, 'Min', name2_format) for seq in uniqueMinList: worksheet2.write(row34, col35, seq, number_format) row34 += 1 # Median. row34 = 1 col36 = col35 + 1 worksheet2.write(0, alignLen + 4, 'Median', name2_format) for seq in uniqueMedianList: worksheet2.write(row34, col36, seq, number_format) row34 += 1 # Mean. row34 = 1 col37 = col36 + 1 worksheet2.write(0, alignLen + 5, 'Mean', name2_format) for seq in uniqueMeanList: worksheet2.write(row34, col37, seq, number_format) row34 += 1 # Stdev. row34 = 1 col38 = col37 + 1 worksheet2.write(0, alignLen + 6, 'St. Dev.', name2_format) for seq in uniqueStdevList: worksheet2.write(row34, col38, seq, number_format) row34 += 1 # Format statistics columns. worksheet2.conditional_format(1, alignLen + 2, len(uniqueMaxList), alignLen + 5, {'type': '2_color_scale', 'min_color': '#FAFAFA', 'max_color': '#008000'}) # Assign IDs to each unique amino acid sequence. worksheet2.write(0, 0, 'ID', name2_format) numberList = list(range(1,len(uniqueDict)+1)) row7 = 1 for number in numberList: worksheet2.write_number(row7, 0, number, cell_format) row7 += 1 # Associate specific IDs with corresponding unique sequence (amino acids). countID = [] begin = 0 for uniqueSeq, count in uniqueDict.items(): end = int(count) + begin countID.append(orderedIndex[begin:end]) begin += count # Write IDs to worksheet. worksheet2.write(0, alignLen + 7, 'Wells', name2_format) row8 = 1 countIDregex = re.compile(r"([A-Z][0-1][0-9])") sep = ', ' for wellList in countID: wellList = countIDregex.findall(str(wellList)) wellList = sep.join(wellList) worksheet2.write(row8, alignLen + 7, wellList, well_format) row8 += 1 ################## # 'All Seq - NT' worksheet. ################## # Create worksheet for all nucleotide sequences. worksheet3 = workbook.add_worksheet('All Seq - NT') logging.info('All Seq - NT worksheet created.') worksheet3.set_column(0, 0, 20) worksheet3.set_column(1, alignLen2, 1.5) worksheet3.freeze_panes(0, 1) # Write IDs. worksheet3.write(0, 0, 'ID', id_format) row1 = 1 col1 = 0 for name in nameListNt: worksheet3.write(row1, col1, name, name_format) row1 += 1 logging.info('"%s" written to All Seq - NT worksheet.' % (name)) # Write sequences. worksheet3.write(0, 1, 'Nucleotide Sequence', name_format) row2 = 1 col2 = 1 for nt in ntList: letterList = list(nt) for letter in letterList: worksheet3.write(row2, col2, letter, cell_format) col2 +=1 row2 += 1 col2 = 1 logging.info('Nucleotide sequences written to All Seq - NT worksheet.') # Write scores. row12 = 1 col12 = alignLen2 + 1 worksheet3.write(0, col12, 'Normalized Absorbance', name_format) for result in reducedRelAveListNt: worksheet3.write(row12, col12, result, number_format) row12 += 1 logging.info('"%s" written to All Seq - NT worksheet.' % (result)) worksheet3.conditional_format(1, alignLen2 + 1, len(newNameListNt), alignLen2 + 1, {'type': '2_color_scale', 'min_color': '#FAFAFA', 'max_color': '#008000'}) ################## # 'Unique Seq - NT' worksheet. ################## # Create worksheet for unique nucleotide sequences. worksheet4 = workbook.add_worksheet('uniqueSeqNT') logging.info('uniqueSeqNT worksheet created.') worksheet4.set_column(1, alignLen2, 1.5) # Create list of unique amino acid sequences ordered by frequency. unique = OrderedCounter(ntList) unique = unique.most_common() uniqueDict = dict(unique) # Write unique nucleotide sequences to Unique Seq - AA worksheet. row3 = 1 col3 = 1 for seq in uniqueDict.keys(): letterList = list(seq) for letter in letterList: worksheet4.write(row3, col3, letter, cell_format) col3 += 1 row3 += 1 col3 = 1 logging.info('Unique sequences written to uniqueSeqNT worksheet.') # Add counts for each unique sequence. row5 = 1 col5 = alignLen2 + 1 worksheet4.write(0, alignLen2 + 1, 'Count', name2_format) count = list(uniqueDict.values()) for number in count: worksheet4.write_number(row5, col5, number, cell_format) row5 += 1 ################## # Ordered values: Nucleotides ################## # Get ordered list of wells that correspond to unique sequences; necessary for subsequent statistics. orderedSeq = [] for key in uniqueDict.keys(): orderedSeq.append(key) orderedIndex = [] for seq in orderedSeq: for key, value in newntDict.items(): if seq in value: orderedIndex.append(key) # Get ordered values corresponding to ordered list of wells for unique sequences; necessary for subsequent statistics. orderedScores = [] for index in orderedIndex: for ID, score in aveCellsDictNt.items(): if index == ID: orderedScores.append(score) # Add zero to beginning of list to make tracking "begin" and "end" easier in subsequent statistics. orderedScores[0:0] = [0] ################## # Statistics: Nucleiotides ################## # Note: Statistics will encounter an error if "overflow" cells are in the original ELISA file, this is reflected in the length of cellAve being less than the total number of sequences. Replace "OVFLW" wells with "4". # Retrieve max score for ordered values. uniqueMaxList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueMax = max(orderedScores[begin:end]) uniqueMaxList.append(uniqueMax) # Above statistic won't work if only a single value so append the "end" (i.e. only) value to the list. except: uniqueMaxList.append(orderedScores[end]) begin += count # Remove zero used for tracking purposes. uniqueMaxList.pop(0) # Retrieve min score for ordered values. uniqueMinList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueMin = min(orderedScores[begin:end]) uniqueMinList.append(uniqueMin) # Above statistic won't work if only a single value so append the "end" (i.e. only) value to the list. except: uniqueMinList.append(orderedScores[end]) begin += count # Remove zero used for tracking purposes. uniqueMinList.pop(0) # Retrieve median score for ordered values. uniqueMedianList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueMedian = statistics.median(orderedScores[begin:end]) uniqueMedianList.append(uniqueMedian) # Above statistic won't work if only a single value so append the "end" (i.e. only) value to the list. except: uniqueMedianList.append(orderedScores[end]) begin += count # Remove zero used for tracking purposes. uniqueMedianList.pop(0) # Retrieve mean score for ordered values. uniqueMeanList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueMean = statistics.mean(orderedScores[begin:end]) uniqueMeanList.append(uniqueMean) # Above statistic won't work if only a single value so append the "end" (i.e. only) value to the list. except: uniqueMeanList.append(orderedScores[end]) begin += count # Remove zero used for tracking purposes. uniqueMeanList.pop(0) # Retrieve stdev score for ordered values. uniqueStdevList = [0] begin = 1 for seq, count in uniqueDict.items(): end = int(count) + begin try: uniqueStdev = statistics.stdev(orderedScores[begin:end]) uniqueStdevList.append(uniqueStdev) # Above statistic won't work if only a single value so append "0.0" value to the list. except: uniqueStdevList.append(0) begin += count # Remove zero used for tracking purposes. uniqueStdevList.pop(0) # Write statistics to Unique Seq - NT worksheet. # Max. row34 = 1 col34 = alignLen2 + 2 worksheet4.write(0, alignLen2 + 2, 'Max', name2_format) for seq in uniqueMaxList: worksheet4.write(row34, col34, seq, number_format) row34 += 1 # Min. row34 = 1 col35 = col34 + 1 worksheet4.write(0, alignLen2 + 3, 'Min', name2_format) for seq in uniqueMinList: worksheet4.write(row34, col35, seq, number_format) row34 += 1 row34 = 1 col36 = col35 + 1 worksheet4.write(0, alignLen2 + 4, 'Median', name2_format) for seq in uniqueMedianList: worksheet4.write(row34, col36, seq, number_format) row34 += 1 # Mean. row34 = 1 col37 = col36 + 1 worksheet4.write(0, alignLen2 + 5, 'Mean', name2_format) for seq in uniqueMeanList: worksheet4.write(row34, col37, seq, number_format) row34 += 1 # Stdev. row34 = 1 col38 = col37 + 1 worksheet4.write(0, alignLen2 + 6, 'St. Dev.', name2_format) for seq in uniqueStdevList: worksheet4.write(row34, col38, seq, number_format) row34 += 1 # Format statistics columns. worksheet4.conditional_format(1, alignLen2 + 2, len(uniqueMaxList), alignLen2 + 5, {'type': '2_color_scale', 'min_color': '#FAFAFA', 'max_color': '#008000'}) # Assign IDs to each unique amino acid sequence. worksheet4.write(0, 0, 'ID', name2_format) numberList = list(range(1,len(uniqueDict)+1)) row7 = 1 for number in numberList: worksheet4.write_number(row7, 0, number, cell_format) row7 += 1 # Associate specific IDs with corresponding unique sequence (nucleotides). countID = [] begin = 0 for uniqueSeq, count in uniqueDict.items(): end = int(count) + begin countID.append(orderedIndex[begin:end]) begin += count # Write IDs to worksheet. worksheet4.write(0, alignLen2 + 7, 'Wells', name2_format) row8 = 1 countIDregex = re.compile(r"([A-Z][0-1][0-9])") sep = ', ' for wellList in countID: wellList = countIDregex.findall(str(wellList)) wellList = sep.join(wellList) worksheet4.write(row8, alignLen2 + 7, wellList, well_format) row8 += 1 ################## # Final workbook formatting. ################## # Transform data in previously made tables into proper Excel-formatted tables. worksheet1.add_table(1, 0, len(newNameListAa), alignLen + 1, {'header_row': False}) worksheet2.add_table(1, 0, len(OrderedCounter(aaList)), alignLen + 7, {'header_row': False}) worksheet3.add_table(1, 0, len(nameListNt), alignLen2 + 1, {'header_row': False}) worksheet4.add_table(1, 0, len(OrderedCounter(ntList)), alignLen2 + 7, {'header_row': False}) workbook.close() logging.shutdown() print(green('\nExcel sequence alignment with ELISA scores saved as %s_analysed.xlsx.' % (elisaFileShort))) logging.info('Excel file exported as %s_analysed.xlsx.' % (elisaFileShort)) print(green('\nAnalysis finished. See log file for details.')) logging.info('phageDisplayElisaAnalysis.py finished running.')