#! python3 # phageDisplaySeqAnalysis.py - Analyse University of Guelph's AAC Genomics Facility Sanger sequencing output files. Renames, reorganises, converts, trims, and aligns .seq/.ab1 files. Final aa and nt alignments are in fasta, clustal, and xlsx formats. Sequences that cannot be trimmed at the 5' end are excluded from the multiple sequence alignment and moved to a separate folder for manual analysis. #################################### # Preamble #################################### # Usage notes: # * All final alignments exclude sequences that weren't full length and those that have premature stop codons. # * Sequences that aren't full length and include premature stop codons are included in batch files but never the final alignment. # Compatibility notes: # * Must use Biopython 1.77 and no later version. If using a more current version, change alignment codes from "alignment.format(format_spec)" to "format(alignment, format_spec". # * If using Spyder as your IDE, use a version that isn't version 5. This version for some reason has conflicts with the xlsxwriter package and won't get past importing modules. # * This code is confirmed to work with python 3.8.8. Later versions may work but have not been verified. # * Confirmed to work in Windows, unconfirmed in Macs and Linux but should work in theory (may need to change lines regarding path names so that the format matches the OS, currently these are optimised for Windows' format). #################################### # Modules #################################### import os, glob, re, shutil, logging, xlsxwriter from shutil import copyfile from Bio.Seq import Seq from Bio.Alphabet import generic_dna from Bio import AlignIO from collections import Counter, OrderedDict #################################### # Classes #################################### # Ordered list of counts. class OrderedCounter(Counter, OrderedDict): pass #################################### # Code #################################### ################## # Colours for print functions. ################## cyan = lambda text: '\033[0;36m' + text + '\033[0m' green = lambda text: '\033[0;32m' + text + '\033[0m' ################## # Setup ################## # Choose data source and change working directory. print(cyan('\nScript started. This will process AAC Genomics Facility (University of Guelph) output by renaming, converting, trimming, and aligning .seq files.') + cyan('\n\nEnter folder location/path where .seq files are located:')) path = input() path = path.replace('\\', '/') os.chdir(path) # Logging setup. pathRegex = re.compile(r'([\w.]+)$') locList = pathRegex.findall(path) locStr = str(locList[0]) logging.basicConfig(filename = path + "/" + locStr + ".log", level = logging.INFO, format = '%(asctime)s - %(message)s', filemode = 'w') logging.info('Working directory changed to %s.' % (path)) ################## # Data Analysis ################## # Create rawSeq folder. if not os.path.exists("./01. rawSeq"): os.makedirs('01. rawSeq') renameLoc = path + '/01. rawSeq' logging.info('%s directory created.' % (renameLoc)) # Rename each raw .seq file to remove well designations used by AAC, add the number 0 to names that have single numbers in them (facilitates downstreeam organization), and move to rawSeq folder. wellRegex = re.compile(r'([_][A-H]\d\d[.])') for fileName in glob.glob('*.seq'): shortName = wellRegex.sub('.', fileName) newName = re.sub(r'(^\w{1}[_]\w{1})(\d{1}[_]\s*\w*)', r'\g<1>0\g<2>', shortName) oldName = path + '/' + fileName newName = path + '/01. rawSeq/' + newName shutil.move(oldName, newName) renameLoc = path + '/01. rawSeq' logging.info('Sequence renamed from "%s" "%s" and moved to %s.' % (fileName, shortName, renameLoc)) # Create rawAb1 folder. if not os.path.exists("./02. rawAb1"): os.makedirs('02. rawAb1') renameLoc = path + '/02. rawAb1' logging.info('%s directory created.' % (renameLoc)) # Rename each raw .ab1 file to remove well designations used by AAC, add the number 0 to names that have single numbers in them (facilitates downstreeam organization), and move to rawAb1 folder. for fileName in glob.glob('*.ab1'): shortName = wellRegex.sub('.', fileName) newName = re.sub(r'(^\w{1}[_]\w{1})(\d{1}[_]\s*\w*)', r'\g<1>0\g<2>', shortName) oldName = path + '/' + fileName newName = path + '/02. rawAb1/' + newName shutil.move(oldName, newName) renameLoc = path + '/02. rawAb1' logging.info('Chromatogram renamed from "%s" "%s" and moved to %s.' % (fileName, shortName, renameLoc)) print(green('\nRenaming finished.')) # Convert files from .seq to .fasta. os.chdir("./01. rawSeq") for fileName in glob.glob('*.seq'): with open(fileName, 'r') as file: seq = file.read().replace('\n', '') name = fileName.replace('.seq', '') seq = ">" + name + "\n" + seq + "\n" # Write output fasta file. output = name + '.fasta' with open(output, "w") as newFile: newFile.write(seq) convertLoc = path + '/01. rawSeq' logging.info('Sequence converted from "%s" to "%s" and moved to %s.' % (fileName, output, convertLoc)) print(green('\n.seq to .fasta conversion finished.')) # Create folder for .fasta files. if not os.path.exists("../03. ntFasta"): os.chdir(path) os.makedirs('03. ntFasta') fastaLoc = path + '/03. ntFasta' logging.info('%s directory created.' % (fastaLoc)) os.chdir('./01. rawSeq/') # Move fasta files to ntFasta folder. for fileName in glob.glob('*.fasta'): oldPath = path + '/01. rawSeq/' + fileName newPath = path + '/03. ntFasta/' + fileName shutil.move(oldPath, newPath) fastaLoc = path + '/03. ntFasta' logging.info('"%s" moved to %s.' % (fileName, fastaLoc)) # Append all .fasta files together into a single batch file. os.chdir('../03. ntFasta') name1 = locStr name2 = locStr + '.fasta' fileNames = glob.glob('*.fasta') with open(name2, 'w') as f: for file in fileNames: with open(file) as infile: f.write(infile.read() + '\n') logging.info('"%s" written to "%s".' % (file, name2)) print(green('\nBatch file ("%s") created.' % (name2))) logging.info('Batch file ("%s") created.' % (name2)) # Trim sequences at 5' end by sequence. print (cyan('''\nEnter sequence to begin 5' trim at (not case-sensitive): E.g. For UbVs, use AAAATG (last FLAG residue + start codon). Otherwise use AUG start codon plus a few conserved nucleotides beyond it. Note: Using conserved nucleotides is a must or else sequences will be trimmed at multiple sites, producing different size products and preventing alignment.''')) start = input() start = start.upper() fileList = glob.glob('*.fasta') for fileName in fileList: os.chdir(path + '/03. ntFasta') index = 0 with open(fileName, 'r') as file: lines = file.readlines() seq = str(lines[1]) index = seq.find(start, index) if index != -1: trimmedSeq = seq[index:] newName = fileName.replace('.fasta', '_ntTrimmed.fasta') with open(newName, "w") as newFile: newFasta = newName.replace('.fasta', '') newFile.write('>' + newFasta + '\n' + trimmedSeq) logging.info('%s found in "%s" at %d. Trimmed sequence exported as "%s".' % (start, fileName, index, newName)) index += 1 # Move untrimmed sequences to noTrim folder. elif index == -1: if not os.path.exists(path + "/noTrim"): os.chdir(path) os.makedirs('noTrim') newPath = path + '/noTrim' logging.info('%s directory created.' % (newPath)) noTrim = path + "/noTrim" os.chdir(noTrim) newName = fileName.replace('.fasta', '_5trimfail.fasta') with open(newName, "w") as newFile: newFasta = newName.replace('.fasta', '') newFile.write('>' + newFasta + '\n' + seq) logging.info('%s not found in "%s". Sequence exported as "%s" to %s.' % (start, fileName, newName, noTrim)) print(green('''\n5' trim finished.''')) # Create ntTrimmed folder. if not os.path.exists("../04. ntTrimmed"): os.chdir(path) os.makedirs('04. ntTrimmed') trimLoc = path + '/04. ntTrimmed' logging.info('%s directory created.' % (trimLoc)) # Trim sequences at 3' end by total length. print (cyan('''\nEnter the number of nucleotides downstream of the 5' trim site to trim at: E.g. For UbVs, use 237 nt.''')) end = input() end = int(end) os.chdir(path + '/03. ntFasta') fileList = glob.glob('*trimmed.fasta') for fileName in fileList: os.chdir(path + '/03. ntFasta') with open(fileName, 'r') as file: lines = file.readlines() fastaName = str(lines[0]) seq = str(lines[1]) finalTrimmedSeq = seq[0:end] file.close() # Move untrimmed sequences to noTrim folder. if len(finalTrimmedSeq) < end: if not os.path.exists(path + "/noTrim"): os.chdir(path) os.makedirs(path + '/noTrim') newPath = path + '/noTrim' logging.info('%s directory created.' % (newPath)) os.chdir(path + '/noTrim') newName = fileName.replace('_ntTrimmed.fasta', '_3trimfail.fasta') with open(newName, "w") as newFile: newFasta = newName.replace('.fasta', '') newFile.write('>' + newFasta + '\n' + seq) logging.info('Sequence trimmed %s nt downstream from %s. Trimmed sequence exported as "%s".' % (end, start, fileName)) else: with open(fileName, 'r') as file: lines = file.readlines() fastaName = str(lines[0]) seq = str(lines[1]) finalTrimmedSeq = seq[0:end] logging.info('%s trimmed %s nt downstream.' % (file, end)) with open(fileName, "w") as newFile: newFile.write(fastaName + finalTrimmedSeq) oldPath = path + '/03. ntFasta/' + fileName newPath = path + '/04. ntTrimmed/' + fileName file.close() shutil.move(oldPath, newPath) logging.info('"%s" moved to %s.' % (fileName, newPath)) print(green('''\n3' trim finished.''')) # Clean up unnecessary/faulty trimmed files. os.chdir(path + '/04. ntTrimmed') name3 = name1 + '_ntTrimmed.fasta' if os.path.exists(name3) == True: os.remove(name3) logging.info('Deleted %s (faulty batch file).' % (name3)) else: pass # Append all trimmed nucleotide fasta files together into a single batch file. if os.path.exists(path + '/03. ntFasta/' + name2): if os.path.exists(path + "/04. ntTrimmed"): fileNames = glob.glob('*trimmed.fasta') with open(name3, 'w') as f: for file in fileNames: with open(file) as infile: f.write(infile.read() + '\n\n') logging.info('"%s" written to "%s".' % (file, name3)) print(green('\nTrimmed nucleotide batch file exported as "%s".' % (name3))) logging.info('Trimmed nucleotide batch file exported as "%s".' % (name3)) # Create 'aaTrimmed' folder. os.chdir(path + "/04. ntTrimmed") transLoc = path + "/04. ntTrimmed" logging.info('Moved to %s.' % (transLoc)) os.makedirs(path + '/05. aaTrimmed') aaLoc = path + '/05. aaTrimmed' logging.info('%s created.' % (aaLoc)) # Make copies of ntTrimmed files in aaTrimmed folder with new names. ntTrimRegex = re.compile(r'(_ntTrimmed)') aaTrimRegex = re.compile(r'(_aaTrimmed)') fileList = glob.glob('*.fasta') for fileName in fileList: oldPath = transLoc + '/' + fileName fileName2 = ntTrimRegex.sub('_aaTrimmed', fileName) newPath = aaLoc + '/' + fileName2 copyfile(oldPath, newPath) logging.info('%s moved from %s to %s.' % (fileName2, oldPath, newPath)) # Translate trimmed nucleotides to amino acids. os.chdir(aaLoc) fileList = glob.glob('*.fasta') for fileName in fileList: with open(fileName, "r+") as newFile: # Extract data. lines = newFile.readlines() fastaName = lines[0] fastaName = ntTrimRegex.sub('_aaTrimmed', fastaName) ntSeq = lines[1] # Remove previous file contents newFile.seek(0) newFile.truncate() # Translate. gene = Seq(ntSeq, generic_dna) try: aaSeq = gene.translate(table = 'Standard', stop_symbol = '*') aaSeq = str(aaSeq) # Write new contents to file. newFile.write(fastaName + aaSeq) logging.info('"%s" translated.' % (fileName)) except: logging.info('"%s" could not be translated.' % (fileName)) pass print(green('''\nTranslation finished.''')) # Clean up faulty amino acid trimmed file. name6 = ntTrimRegex.sub('_aaTrimmed', name3) os.remove(name6) logging.info('Deleted %s (faulty batch file).' % (name6)) # Append all trimmed amino acid fasta files together into a single batch file. fileNames = glob.glob('*.fasta') with open(name6, 'w') as f: for file in fileNames: with open(file) as infile: f.write(infile.read() + '\n') logging.info('%s written to %s.' % (file, name6)) print(green('\nTranslated batch file (%s) created.' % (name6))) logging.info('Translated batch file (%s) created.' % (name6)) # Remove truncated amino acid sequences to prepare for alignment. os.chdir(path + '/05. aaTrimmed') with open(name6, "r") as batchFile: batchData = batchFile.readlines() dataRegex = re.compile('^\w+') batchList = [] batchList2 = [] # Separate amino acid sequences from names in batch file. for entry in batchData: if re.match(dataRegex, entry): batchList += [entry] else: batchList2 += [entry] dataList = [entry[:-1] for entry in batchList] nameList = [entry[:-1] for entry in batchList2] # Remove blank entries in list, if present. nameList = list(filter(None, nameList)) # Remove sequences that aren't full length. listLen = [len(entry) for entry in dataList] maxLen = max(listLen) aaAlignList = [] for entry in dataList: if len(entry) == maxLen: aaAlignList.append(entry) elif len(entry) < maxLen: logging.info('%s removed from pool for alignment.' % (entry)) pass aaAlignDict = dict(zip(nameList, aaAlignList)) # Remove sequences with at least one stop codon. with open(name6, "w") as newFile: for key, value in aaAlignDict.items(): if value.find('*') == -1: newFile.write(key + '\n' + value + '\n') else: pass ################## # Alignment. ################## # Create alignment folder. if not os.path.exists(path + "/06. alignment"): os.chdir(path) os.makedirs('06. alignment') alignLoc = path + '/06. alignment' logging.info('%s directory created.' % (alignLoc)) os.chdir(path + '/06. alignment') # Create amino acid multiple alignments. alignment = AlignIO.read(open(path + '/05. aaTrimmed/' + name6), "fasta") alignLen = alignment.get_alignment_length() fastaAlign = alignment.format("fasta") clustalAlign = alignment.format("clustal") name4 = name6.replace('.fasta','_aligned.fasta') with open(name4, "w") as newFile: newFile.write(fastaAlign) print(green('\nAmino acid fasta alignment created.')) logging.info('Amino acid fasta alignment (%s) created.' % (name4)) name5 = name6.replace('.fasta','_aligned.clustal') with open(name5, "w") as newFile: newFile.write(clustalAlign) print(green('\nAmino acid clustal alignment (%s) created.')) logging.info('Amino acid clustal alignment (%s) created.' % (name5)) # Create nucleotide multiple alignments. alignment2 = AlignIO.read(open(path + '/04. ntTrimmed/' + name3), "fasta") alignLen2 = alignment2.get_alignment_length() fastaAlign2 = alignment2.format("fasta") clustalAlign2 = alignment2.format("clustal") name4 = name3.replace('.fasta','_aligned.fasta') with open(name4, "w") as newFile: newFile.write(fastaAlign2) print(green('\nNucleotide fasta alignment created.')) logging.info('Nucleotide fasta alignment (%s") created.' % (name4)) name5 = name3.replace('.fasta','_aligned.clustal') with open(name5, "w") as newFile: newFile.write(clustalAlign2) print(green('\nNucleotide clustal alignment created.')) logging.info('Nucleotide clustal alignment (%s) created.' % (name5)) ################## # Export as .xlsx. ################## # Create workbook. workbook = xlsxwriter.Workbook(path + '/06. alignment/' + locStr + '_alignment.xlsx') logging.info('Excel spreadsheet created as "%s.xlsx".' % (locStr)) # Cell formatting rules. # General. cell_format = workbook.add_format() cell_format.set_bg_color('white') cell_format.set_align('center') cell_format.set_align('vcenter') # Names. name_format = workbook.add_format({'bold': True, 'font_size': 12}) name_format.set_bg_color('white') name2_format = workbook.add_format({'bold': True, 'font_size': 12}) name2_format.set_align('center') # IDs. id_format = workbook.add_format({'bold': True, 'font_size': 12}) id_format.set_align('center') id_format.set_align('vcenter') ################## # All Seq - AA worksheet. ################## # Create worksheet for all amino acid sequences. worksheet1 = workbook.add_worksheet('All Seq - AA') worksheet1.hide_gridlines(option=2) logging.info('All Seq - AA worksheet created.') worksheet1.set_column(0, 0, 20) worksheet1.set_column(1, alignLen2, 1.5) worksheet1.freeze_panes(0, 1) # Retrieve names and sequences from alignment fasta files. nameRegex = re.compile(r'>(.*)') seqRegex = re.compile(r'(?)([A-Z]{2,})(?!\d|[a-z]|_)') stopRegex = re.compile(r'([*]+[A-Z]*)') # Amino acid alignment data extraction. alignFile = locStr + ('_aaTrimmed_aligned.fasta') with open(alignFile, 'r') as inFile: lines1 = inFile.read() linesTrim1 = aaTrimRegex.sub('', lines1) nameListaa = nameRegex.findall(linesTrim1) lines2 = lines1.replace('\n', '') linesTrim2 = stopRegex.sub('', lines2) aaList = seqRegex.findall(linesTrim2) # Write IDs to 'All Seq - AA' worksheet. worksheet1.write(0, 0, 'ID', id_format) row1 = 1 col1 = 0 for name in nameListaa: worksheet1.write(row1, col1, name, name_format) row1 += 1 logging.info('%s written to All Seq - AA worksheet.' % (name)) # Write amino acid sequences to 'All Seq - AA' worksheet. worksheet1.write(0, 1, 'Amino Acid Sequence', name_format) row2 = 1 col2 = 1 for aa in aaList: letterList = list(aa) for letter in letterList: worksheet1.write(row2, col2, letter, cell_format) col2 +=1 row2 += 1 col2 = 1 logging.info('Amino acid sequences (broken up) written to All Seq - AA worksheet.') ################## # All Seq - NT worksheet. ################## # Create worksheet for all nucleotide sequences. worksheet3 = workbook.add_worksheet('All Seq - NT') worksheet3.hide_gridlines(option=2) logging.info('All Seq - NT worksheet created.') worksheet3.set_column(0, 0, 20) worksheet3.set_column(1, alignLen2, 1.5) worksheet3.freeze_panes(0, 1) # Nucleotide alignment data extraction. alignFile2 = locStr + ('_ntTrimmed_aligned.fasta') with open(alignFile2, 'r') as inFile: lines3 = inFile.read() linesTrim3 = ntTrimRegex.sub('', lines3) nameListnt = nameRegex.findall(linesTrim3) linesTrim4 = lines3.replace('\n', '') ntList = seqRegex.findall(linesTrim4) # Write IDs to 'allSeq - NT' worksheet. worksheet3.write(0, 0, 'ID', id_format) row1 = 1 col1 = 0 for name in nameListnt: worksheet3.write(row1, col1, name, name_format) row1 += 1 logging.info('"%s" written to All Seq - NT worksheet.' % (name)) # Write nucleotide sequences to All Seq - NT worksheet. worksheet3.write(0, 1, 'Nucleotide Sequence', name_format) row2 = 1 col2 = 1 for nt in ntList: letterList = list(nt) for letter in letterList: worksheet3.write(row2, col2, letter, cell_format) col2 +=1 row2 += 1 col2 = 1 logging.info('Nucleotide sequences (broken up) written to allSeq - NT worksheet.') ################## # Unique Seq - AA worksheet. ################## # Create worksheet for unique amino acid sequences. worksheet2 = workbook.add_worksheet('Unique Seq - AA') worksheet2.hide_gridlines(option=2) logging.info('Unique Seq - AA worksheet created.') worksheet2.set_column(1, alignLen, 1.5) # Create list of unique amino acid sequences ordered by frequency. uniqueAa = OrderedCounter(aaList) uniqueAa = uniqueAa.most_common() uniqueDictAa = dict(uniqueAa) # Write unique amino acid sequences to Unique Seq - AA worksheet. row3 = 1 col3 = 1 for seq in uniqueDictAa.keys(): letterList = list(seq) for letter in letterList: worksheet2.write(row3, col3, letter, cell_format) col3 += 1 row3 += 1 col3 = 1 logging.info('Unique sequences written to Unique Seq - AA worksheet.') # # Write sequence counts to 'Unique Seq - AA' worksheet. count = list(uniqueDictAa.values()) row5 = 1 col5 = alignLen + 1 worksheet2.write(0, alignLen + 1, 'Count', name2_format) for number in count: worksheet2.write_number(row5, col5, number, cell_format) row5 += 1 logging.info('Sequence counts written to Unique Seq - AA worksheet.') # Assign IDs to each unique amino acid sequence. worksheet2.write(0, 0, 'ID', name2_format) numberList = list(range(1,len(uniqueAa)+1)) row7 = 1 for number in numberList: worksheet2.write_number(row7, 0, number, cell_format) row7 += 1 ################## # Unique Seq - NT worksheet. ################## # Create worksheet for unique nucleotide sequences. worksheet4 = workbook.add_worksheet('Unique Seq - NT') worksheet4.hide_gridlines(option=2) logging.info('Unique Seq - NT worksheet created.') worksheet4.set_column(1, alignLen2, 1.5) # Create list of unique nucleotide sequences ordered by frequency. uniqueNt = OrderedCounter(ntList) uniqueNt = uniqueNt.most_common() uniqueDictNt = dict(uniqueNt) # Write unique nucleotide sequences to Unique Seq - AA worksheet. row3 = 1 col3 = 1 for seq in uniqueDictNt.keys(): letterList = list(seq) for letter in letterList: worksheet4.write(row3, col3, letter, cell_format) col3 += 1 row3 += 1 col3 = 1 logging.info('Unique sequences written to Unique Seq - NT worksheet.') # Write sequence counts to 'Unique Seq - NT' worksheet. count = list(uniqueDictNt.values()) row5 = 1 col5 = alignLen2 + 1 worksheet4.write(0, alignLen2 + 1, 'Count', name2_format) for number in count: worksheet4.write_number(row5, col5, number, cell_format) row5 += 1 logging.info('Sequence counts written to Unique Seq - NT worksheet.') # Assign IDs to each unique nucleotide sequence. worksheet4.write(0, 0, 'ID', name2_format) numberList = list(range(1,len(uniqueNt)+1)) row7 = 1 for number in numberList: worksheet4.write_number(row7, 0, number, cell_format) row7 += 1 ################## # Final workbook formatting. ################## # Transform data in previously made tables into proper Excel-formatted tables. worksheet1.add_table(1, 0, len(nameListaa), alignLen, {'header_row': False}) worksheet2.add_table(1, 0, len(uniqueAa), alignLen +1, {'header_row': False}) worksheet3.add_table(1, 0, len(nameListnt), alignLen2, {'header_row': False}) worksheet4.add_table(1, 0, len(uniqueNt), alignLen2 +1, {'header_row': False}) workbook.close() logging.shutdown() print(green('\nExcel alignment exported as %s_aaTrimmed_aligned.xlsx.' % (locStr))) logging.info('Excel alignment exported as %s_aaTrimmed_aligned.xlsx.' % (locStr)) print(green('\nAnalysis finished. See log file for details.')) print(cyan('''\nPost-analysis help: Non-trimmed files are in the "noTrim" folder and couldn't be trimmed because: A) There's no sequence. or B) The 5' trim site couldn't be found. In the latter case, look at the corresponding ab1 file and assess whether the bases were called correctly or not. When changing base calls, create a brand new file (do not overwrite the raw file) and type called bases in lowercase to distinguish them from bases called by the machine.''')) logging.info('phageDisplaySeqAnalysis.py finished running.')