#!/usr/bin/env python

import sys, os, string, glob, gzip, math, time, operator, csv

#args = sys.argv
#if len(args) != 2:
#    print 'Usage:', args[0], 'dummy'
#    sys.exit()
# inDir is searched for the 'SAMPLE_TAG*.txt' read files; outDir receives the
# per-sample files written further down.
inDir = 'all_reads'  # args[1]#'Preprocessed_Separate_Samples26'
outDir = 'all_reads/pipe4_out'

# Alternative plan documents (commented out, kept for reference):
#csvReader = csv.reader(open('Plan_Documents24/INDICATOR_Dec2012_NK_UPPER.txt', 'r'), delimiter = '\t')
#csvReader = csv.reader(open('Plan_Documents20/INDICATOR_11232011_UPPER.txt', 'r'), delimiter = '\t')
#csvReader = csv.reader(open('Plan_Documents16/INDICATOR_101511a_UPPER.txt', 'r'), delimiter = '\t')
#csvReader = csv.reader(open('Plan_Documents14/INDICATOR_073111-3_UPPER.txt', 'r'), delimiter = '\t')
#csvReader = csv.reader(open('Plan_Documents13/INDICATOR_060911_UPPER.txt', 'r'), delimiter = '\t')
#csvReader = csv.reader(open('Plan_Documents7/012411_INDICATOR_UPPER.txt', 'r'), delimiter = '\t')
#csvReader = csv.reader(open('Plan_Documents5/092910_INDICATOR-3.txt', 'r'), delimiter = '\t')
#csvReader = csv.reader(open('Plan_Documents3/454_titanium_071510_analysis_indicator_total_yue1_upper.txt', 'r'), \
#    delimiter = '\t')
#csvReader = csv.reader(open('Plan_Documents2/454analysis_indicator_042709.txt', 'r'), delimiter = '\t')
#csvReader = csv.reader(open('Plan_Documents1/454analysis_indicator_total.txt', 'r'), delimiter = '\t')

# Load the tab-separated plan document: the first row is the header
# (csvFields), every remaining row is kept as a list of cells (csvValues).
# The 'with' block closes the file as soon as all rows have been read.
with open('Demultiplexing_Trimming_blunt_GTAC.tsv', 'rU') as planFile:
    csvReader = csv.reader(planFile, delimiter='\t')
    csvFields = next(csvReader)
    csvValues = list(csvReader)
# GATHER ALL
# Column position of every field of interest, looked up by header name.
# list.index raises ValueError when the plan document lacks a column.
iSample = csvFields.index('SAMPLE')
itag = csvFields.index('MID')
iA = csvFields.index('A')
iB = csvFields.index('B')
iC = csvFields.index('C')
iD = csvFields.index('D')
iDD = csvFields.index('DD')
iE = csvFields.index('E')
iF = csvFields.index('F')
iFF = csvFields.index('FF')
iG = csvFields.index('G')
iJ = csvFields.index('J')
iH = csvFields.index('H')
iI = csvFields.index('I')
iK = csvFields.index('K')

# sampleList holds (SAMPLE, MID, A, B, E) tuples; the other lists hold the
# distinct non-empty values seen in their column.  jhList holds the J and H
# cells concatenated, jnhList the same pairs as (J, H) tuples.
sampleList = []
tagList, aList, bList, cList, dList, ddList = [], [], [], [], [], []
eList, fList, ffList, gList, iList, kList = [], [], [], [], [], []
jhList, jnhList = [], []


def _cell(row, idx):
    """Return row[idx], or '' when the row is too short to have that column."""
    if idx < len(row):
        return row[idx]
    return ''


# (column index, list collecting that column's distinct values)
_columnLists = [
    (itag, tagList), (iA, aList), (iB, bList), (iC, cList), (iD, dList),
    (iDD, ddList), (iE, eList), (iF, fList), (iFF, ffList), (iG, gList),
    (iI, iList), (iK, kList),
]

for row in csvValues:
    if not row:
        continue

    # A sample is registered only when all five of its cells are filled in.
    # E is required too: the sample lookup table built from sampleList needs
    # the position of E inside eList, and empty cells never enter eList.
    # Duplicates are detected on the whole tuple, so the same sample name
    # listed under a different MID/A/B/E combination is still registered.
    sampleEntry = tuple(_cell(row, idx) for idx in (iSample, itag, iA, iB, iE))
    if all(sampleEntry) and sampleEntry not in sampleList:
        sampleList.append(sampleEntry)

    # _cell() treats a missing trailing column like an empty cell, so short
    # rows no longer raise IndexError.
    for idx, seen in _columnLists:
        value = _cell(row, idx)
        if value and value not in seen:
            seen.append(value)

    # J and H are collected as a pair, and only when the row has both columns.
    if len(row) > iJ and len(row) > iH:
        jh = row[iJ] + row[iH]
        if jh and jh not in jhList:
            jhList.append(jh)
            jnhList.append((row[iJ], row[iH]))

# Sort every value list; positions within these sorted lists are used as the
# numeric ids of the values.
for valueList in (tagList, aList, bList, cList, dList, ddList, eList, fList,
                  ffList, gList, jhList, iList, jnhList, kList):
    valueList.sort()

# Build the lookup table from (tag, A, B, E) position tuples to sample names.
# Positions are indices into the sorted tagList / aList / bList / eList, and
# the stored name is the sample id with every '_' and '-' removed.  Each
# sample is registered under three keys: the full one, one with -1 in the A
# slot and one with -1 in the B slot (presumably -1 stands for "field not
# found in the read" -- confirm against the step that writes the read headers).
sampleIdDict = {}
for sampleName, tagValue, aValue, bValue, eValue in sampleList:
    cleanName = sampleName.replace('_', '').replace('-', '')
    tagPos = tagList.index(tagValue)
    aPos = aList.index(aValue)
    bPos = bList.index(bValue)
    ePos = eList.index(eValue)
    for lookupKey in ((tagPos, aPos, bPos, ePos),
                      (tagPos, -1, bPos, ePos),
                      (tagPos, aPos, -1, ePos)):
        sampleIdDict[lookupKey] = cleanName

#-rw-rw-r-- 1 deepreds deepreds    86948 Aug 22 20:12 SAMPLE_TAG_FULLEND_L_TRIMMED3L_850.txt
#-rw-rw-r-- 1 deepreds deepreds   165390 Aug 22 20:12 SAMPLE_TAG_FULLEND_L_TRIMMED3R_1515.txt
#-rw-rw-r-- 1 deepreds deepreds   243313 Aug 22 20:12 SAMPLE_TAG_FULLEND_L_TRIMMED6_1986.txt
#-rw-rw-r-- 1 deepreds deepreds   752821 Aug 22 20:12 SAMPLE_TAG_FULLEND_L_TRIMMED7_7431.txt
#-rw-rw-r-- 1 deepreds deepreds 46592649 Aug 22 20:12 SAMPLE_TAG_FULLEND_L_TRIMMED8_377175.txt
#-rw-rw-r-- 1 deepreds deepreds     6437 Aug 22 20:12 SAMPLE_TAG_FULLEND_NO_TRIMMED3L_53.txt
#-rw-rw-r-- 1 deepreds deepreds     5782 Aug 22 20:12 SAMPLE_TAG_FULLEND_NO_TRIMMED3LT_75.txt
#-rw-rw-r-- 1 deepreds deepreds    36892 Aug 22 20:12 SAMPLE_TAG_FULLEND_NO_TRIMMED3R_427.txt
#-rw-rw-r-- 1 deepreds deepreds    23391 Aug 22 20:12 SAMPLE_TAG_FULLEND_NO_TRIMMED3RT_306.txt
#-rw-rw-r-- 1 deepreds deepreds  4833951 Aug 22 20:12 SAMPLE_TAG_FULLEND_NO_TRIMMED6_45239.txt
#-rw-rw-r-- 1 deepreds deepreds  3165162 Aug 22 20:12 SAMPLE_TAG_FULLEND_NO_TRIMMED6T_36936.txt
#-rw-rw-r-- 1 deepreds deepreds  1912803 Aug 22 20:12 SAMPLE_TAG_NOMATCH_9564.txt
#-rw-rw-r-- 1 deepreds deepreds  6016780 Aug 22 20:12 SAMPLE_TAG_TRUNCATED_L_TRIMMED3_63456.txt
#-rw-rw-r-- 1 deepreds deepreds  2119334 Aug 22 20:12 SAMPLE_TAG_TRUNCATED_NO_TRIMMED3_31208.txt
#-rw-rw-r-- 1 deepreds deepreds   922948 Aug 22 20:12 SAMPLE_TAG_TRUNCATED_NO_TRIMMED3T_14105.txt

# SAVE FILES ACCORDING TO THEIR SAMPLE ID, IF NO ID, SAVE AS 'NONE'

# Trim-variant markers that are collapsed to plain 'TRIMMED' in the output
# file name.  They are applied in this order, longer markers before the
# shorter markers they start with.
_trimVariants = ('TRIMMED3LT', 'TRIMMED3L', 'TRIMMED3RT', 'TRIMMED3R',
                 'TRIMMED7', 'TRIMMED8', 'TRIMMED6T', 'TRIMMED6',
                 'TRIMMED3T', 'TRIMMED3')


def _storeRecord(records, head, seq):
    """Add one record to `records`, reporting a header that is already present."""
    if head in records:
        print('dulicate %s' % head)
    records[head] = seq


def _readRecords(path):
    """Read a '>'-headed sequence file into a dict {HEADER: sequence}.

    Headers are stripped of the leading '>' and upper-cased; the lines of a
    record are stripped and concatenated.  A record followed by another header
    is kept only if its sequence is non-empty.  An empty file yields an empty
    dict instead of a phantom '' -> '' record.
    """
    records, head, chunks = {}, '', []
    with open(path, 'r') as handle:
        for line in handle:
            if line[0] == '>':
                seq = ''.join(chunks)
                if seq:
                    _storeRecord(records, head, seq)
                head = line[1:].strip().upper()
                chunks = []
            else:
                chunks.append(line.strip())
    seq = ''.join(chunks)
    if head or seq:
        _storeRecord(records, head, seq)
    return records


# Output handles keyed by '<sampleHead>_<sample name>'.  The dict outlives the
# per-file loop because several input files collapse to the same sampleHead
# and therefore write to the same output file.
filedict = {}
if not os.path.isdir(outDir):
    os.makedirs(outDir)

sampleFiles = sorted(glob.glob(os.path.join(inDir, '*.txt')))
try:
    for sampleFileName in sampleFiles:
        if 'SAMPLE_TAG' not in sampleFileName:
            continue
        print('processing %s' % sampleFileName)

        # Drop the trailing '_<count>' of the file name, then normalise the
        # trim-variant marker.
        sampleHead = os.path.basename(sampleFileName).rsplit('_', 1)[0]
        for variant in _trimVariants:
            sampleHead = sampleHead.replace(variant, 'TRIMMED')

        records = _readRecords(sampleFileName)
        for seqId in sorted(records):
            # The header must contain exactly one '|'; what follows it is a
            # comma-separated list of integers, of which fields 0, 1, 2 and 6
            # form the sample lookup key.
            idStr, idStr2 = seqId[1:].split('|')
            idList = [int(ix) for ix in idStr2.split(',')]
            lookupKey = (idList[0], idList[1], idList[2], idList[6])
            suffixStr = sampleIdDict.get(lookupKey, 'NONE')

            # Open each output file once, on first use.  An explicit
            # membership test keeps a failed write from being mistaken for
            # "not opened yet", which would reopen and truncate the file.
            outKey = sampleHead + '_' + suffixStr
            if outKey not in filedict:
                filedict[outKey] = open(os.path.join(outDir, outKey + '.txt'), 'w')
            filedict[outKey].write('>' + seqId + '\n' + records[seqId] + '\n')
finally:
    # Flush and close every output file, even if a read file was malformed.
    for outHandle in filedict.values():
        outHandle.close()

# ADD SEQUENCE NUMBER AT THE END OF FILE NAME
# Each '<name>.txt' in outDir becomes '<name>_<count>.txt', where <count> is
# the number of lines starting with '>'.
# NOTE: every '*.txt' in outDir is processed, so files left over from an
# earlier run (which already carry a count) receive a second count suffix.
fileList = glob.glob(os.path.join(outDir, '*.txt'))
for filename in fileList:
    # Close the file before renaming it.
    with open(filename, 'r') as countHandle:
        nSeq = sum(1 for line in countHandle if line[0] == '>')
    os.rename(filename, filename[:-4] + '_' + str(nSeq) + '.txt')

