# -*- coding: utf-8 -*-
"""
Created on Thu Dec 17 15:19:50 2015

@author: gajendra
"""
import re, os
import operator, csv, gzip
from itertools import repeat
import sys
from Bio import SeqIO
import pandas as pd
import numpy as np
import string, itertools
from joblib import Parallel, delayed
from joblib.pool import has_shareable_memory
import multiprocessing
import sqlite3
import datetime as dt
from operator import is_not
from functools import partial
import subprocess 
from pygr import seqdb
import timeit
import h5py

# ---------------------------------------------------------------------------
# Command-line handling: exactly five positional arguments are required.
# ---------------------------------------------------------------------------
args = sys.argv
if len(args) != 6:
    # Report misuse on stderr and exit non-zero so calling pipelines can
    # detect the failure (a bare sys.exit() reports success).
    sys.stderr.write('Usage: %s Demultiplexing_Trimming_file Enzyme_file ref_genome_fasta blat_output_file demultiplexed_fasta_file\n' % args[0])
    sys.exit(1)

indicator_file=args[1]  # e.g. 'Demultiplexing_Trimming_blunt_GTAC.tsv'
restriction_cite_file=args[2]  # e.g. 'Enzyme.tsv'
genome_fasta_file=args[3]  # e.g. '/workdata/ref_genomes/human_gnome/select_hg19/bwa_index/hg19_chr1to22XY.fa'
fasta_file=args[5]  # e.g. 'all_reads/mouse_data/mouse_data.fa'
blat_out_psl=args[4]  # e.g. 'all_reads/mouse_data/mouse_data.psl'

# Output locations are derived from the first two '/'-separated components of
# the FASTA path, so the path needs at least "<dir>/<dir>/<file>".  Check this
# up front: otherwise the script either dies with an IndexError here or only
# fails when the output database is opened, after the whole PSL was loaded.
fasta_path_parts = fasta_file.split('/')
if len(fasta_path_parts) < 3:
    sys.stderr.write('demultiplexed_fasta_file must look like <dir>/<dir>/<file>, got: %s\n' % fasta_file)
    sys.exit(1)
inter_files=fasta_path_parts[0]+'/'+fasta_path_parts[1]+'/'
data_base_location=inter_files+'proceesedpsl_db'
# ---------------------------------------------------------------------------
# Load the BLAT PSL output in chunks into an in-memory SQLite table
# ('original_psl'), adding four derived per-alignment metrics on the way.
# ---------------------------------------------------------------------------
tim_start = dt.datetime.now()
blatpsl_db = sqlite3.connect(':memory:',check_same_thread=False)
# Column names applied to the PSL rows (the file is read with header=None).
column_name=["Matches","misMatches","repMatches","nCount","qNumInsert","qBaseInsert",
             "tNumInsert","tBaseInsert","strand","qName","qSize","qStart","qEnd","tName",
             "tSize","tStart","tEnd","blockCount","blockSizes","qStarts","tStarts"]
chunksize = 20000
j = 0  # number of chunks processed so far
index_start = 1  # offset added to the index of the next chunk
# Only these columns are stored in the 'original_psl' table.
select_col1=["qName","strand","qSize","qStart","tName","tStart","tEnd","myScore","myPI","myCV","mySPAN"]
print "Reading "+blat_out_psl
for blat_all_data in pd.io.parsers.read_table(blat_out_psl,sep='\t',header=None,names=column_name,engine='c',chunksize=chunksize, iterator=True,index_col=False, encoding = 'utf-8'):
    #print "I am here 1"
    # Shift this chunk's index by the running offset before it is written out.
    blat_all_data.index += index_start
    # Cast to float so the ratios below are computed in floating point.
    blat_all_data["Matches"]=blat_all_data["Matches"].astype('float64')
    blat_all_data["qSize"]=blat_all_data["qSize"].astype('float64')
    
    # myScore: matches minus mismatches.
    blat_all_data["myScore"] = blat_all_data["Matches"]-blat_all_data["misMatches"]
    # myPI: 100 * (matches + repeat matches) / (matches + mismatches +
    # repeat matches + query insert count + target insert count).
    blat_all_data["myPI"]=100.0*(blat_all_data["Matches"]+blat_all_data["repMatches"])/(blat_all_data["Matches"]+blat_all_data["misMatches"]+
    blat_all_data["repMatches"]+blat_all_data["qNumInsert"]+blat_all_data["tNumInsert"])
    # myCV: aligned query span as a percentage of the query size.
    blat_all_data["myCV"]=100.0*(blat_all_data["qEnd"]-blat_all_data["qStart"])/blat_all_data["qSize"]
    # mySPAN: target span divided by query span.
    blat_all_data["mySPAN"]=1.0*(blat_all_data["tEnd"]-blat_all_data["tStart"])/(blat_all_data["qEnd"]-blat_all_data["qStart"])
    
    j+=1
    # NOTE: the row figure is chunks * chunksize, so it over-reports when the
    # final chunk is shorter than chunksize.
    print '%d seconds: completed %d rows' %((dt.datetime.now() - tim_start).seconds, j*chunksize)

    blat_all_data.loc[:,select_col1].to_sql('original_psl', blatpsl_db, index=True, if_exists='append')
    # Remember where the next chunk's offset should start.
    index_start = blat_all_data.index[-1] + 1    

# ---------------------------------------------------------------------------
# Parse the tab-separated indicator table (first row is the header).
#
# Produces:
#   sampleDict / sampleLeftRightDict : sanitised sample id -> (JUNCTION, sanitised VECTOR)
#   tagList, aList, ... kList        : sorted distinct non-empty values per column
#   jhList / jnhList                 : distinct J+H concatenations / (J, H) pairs
#   sampleIdDict                     : (tag idx, A idx, B idx, E idx) -> sample id
#   sampleList / sampleTotalList     : sorted sanitised sample ids
# ---------------------------------------------------------------------------
seqSampleDict = {}
sampleLeftRightDict = {}
sampleList = []
sampleDict = {}
# Read every row inside a context manager so the file handle is closed.
with open(indicator_file, 'r') as indicatorHandle:
    csvReader = csv.reader(indicatorHandle, delimiter = '\t')
    csvFields = next(csvReader)
    csvValues = [row for row in csvReader]
# GATHER ALL
iJunction = csvFields.index('JUNCTION')
iAnimal = csvFields.index('VECTOR')
iSample = csvFields.index('SAMPLE')
itag = csvFields.index('MID')
iA = csvFields.index('A')
iB = csvFields.index('B')
iC = csvFields.index('C')
iD = csvFields.index('D')
iDD = csvFields.index('DD')
iE = csvFields.index('E')
iF = csvFields.index('F')
iFF = csvFields.index('FF')
iG = csvFields.index('G')
iJ = csvFields.index('J')
iH = csvFields.index('H')
iI = csvFields.index('I')
iK = csvFields.index('K')
sampleList = []
tagList, aList, bList, cList, dList, ddList, eList, fList, ffList, gList, jhList, iList, kList = \
    [], [], [], [], [], [], [], [], [], [], [], [], []
jnhList = []

# Highest column index a row must reach before it can define a sample barcode
# tuple, respectively the junction/vector/sample metadata.
iBarcodeMax = max(iSample, itag, iA, iB, iE)
iMetaMax = max(iJunction, iAnimal, iSample)
# Columns whose distinct non-empty values are collected, one list per column.
uniqueColumns = (
    (itag, tagList), (iA, aList), (iB, bList), (iC, cList), (iD, dList),
    (iDD, ddList), (iE, eList), (iF, fList), (iFF, ffList), (iG, gList),
    (iI, iList), (iK, kList),
)
for row in csvValues:
    if len(row) == 0: continue
    # Barcode tuple for this sample.  Every column that is read is length
    # checked first, and the duplicate test compares whole tuples (the list
    # holds tuples, so comparing the bare sample name could never match).
    if len(row) > iBarcodeMax:
        if len(row[iSample]) and len(row[itag]) and len(row[iA]) and len(row[iB]):
            sampleEntry = (row[iSample], row[itag], row[iA], row[iB], row[iE])
            if sampleEntry not in sampleList:
                sampleList.append(sampleEntry)
    # Rows that do not reach the junction/vector/sample columns are skipped;
    # "<=" because index iMetaMax needs a row of length iMetaMax + 1.
    if len(row) <= iMetaMax: continue
    if len(row[iSample]) and len(row[iJunction]) and len(row[iAnimal]):
        sampleId = row[iSample].replace('_', '').replace('-', '').replace('\'', '').replace('`', '')
        animalId = row[iAnimal].replace('_', '').replace('-', '').replace('\'', '').replace('`', '')
        sampleDict[sampleId] = (row[iJunction], animalId)
        sampleLeftRightDict[sampleId] = (row[iJunction], animalId)

    for colIndex, valueList in uniqueColumns:
        if len(row) > colIndex and len(row[colIndex]) and row[colIndex] not in valueList:
            valueList.append(row[colIndex])
    # J and H are tracked together: once concatenated, once as a pair.
    if len(row) > iJ and len(row) > iH:
        jhValue = row[iJ] + row[iH]
        if len(jhValue) and jhValue not in jhList:
            jhList.append(jhValue)
            jnhList.append((row[iJ], row[iH]))

for valueList in (tagList, aList, bList, cList, dList, ddList, eList, fList,
                  ffList, gList, jhList, iList, jnhList, kList):
    valueList.sort()


# Map index tuples (positions in the sorted lists) to the sample id.  -1 in
# the A or B position registers the same sample for a key without that part.
sampleIdDict = {}
for sampleId, tagId, aId, bId, eId in sampleList:
    newId = sampleId.replace('_', '').replace('-', '')
    tagIdx, aIdx, bIdx, eIdx = tagList.index(tagId), aList.index(aId), bList.index(bId), eList.index(eId)
    sampleIdDict[(tagIdx, aIdx, bIdx, eIdx)] = newId
    sampleIdDict[(tagIdx, -1, bIdx, eIdx)] = newId
    sampleIdDict[(tagIdx, aIdx, -1, eIdx)] = newId

####
# From here on sampleList holds the sorted sanitised sample ids.
sampleList = sorted(sampleDict.keys())
sampleTotalList=sampleList

# blat_all_data is the last chunk left over from the PSL loading loop; keep
# its column names and release the frame.
tem_col_list=list(blat_all_data.columns.values)
del(blat_all_data)
select_col2=["AnimalID","Junction","SampleID","qName","strand","qSize","qStart","qSeq","tName","tStart","tEnd","blatDecide","myScore","myPI","myCV","mySPAN"]
def blatdecide(record,temp_blat):
    """Classify the BLAT hits of one read and annotate them with sample info.

    Parameters
    ----------
    record : object with ``id`` and ``seq`` attributes
        ``id`` must look like ``"<name>|<i0>,<i1>,...>"`` with at least seven
        comma-separated integers; elements 0, 1, 2 and 6 form the key looked
        up in the module-level ``sampleIdDict``.
    temp_blat : pandas.DataFrame
        Rows of ``original_psl`` for this read (may be empty).  It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        ``temp_blat`` restricted to the ``select_col2`` columns, with
        ``blatDecide`` set to one of:

        * ``"NoHits"``    - no rows, best score below 15, or no best-scoring
          row with myPI >= 95 and myCV >= 95;
        * ``"NoGoodSpn"`` - such a row exists but none has 0.7 < mySPAN < 1.3;
        * ``"Single"``    - exactly one row passes all filters within 3 % of
          the best score (only that row is marked, others stay ``"Un"``);
        * ``"Multi"``     - otherwise.

    Side effects
    ------------
    When hits exist, the read's rows are deleted from ``original_psl`` through
    the module-level cursor ``dels_seqs`` and ``blatpsl_db`` is committed.
    """
    seqid = record.id
    seqid_seq = str(record.seq)
    # The split also validates that the id contains exactly one '|'.
    _, idStr = seqid.split('|')
    idList = [int(iy) for iy in idStr.split(',')]
    for numCol in ("myScore", "myPI", "myCV", "mySPAN", "qStart"):
        temp_blat[numCol] = temp_blat[numCol].astype(float)

    # Resolve sample / junction / animal once; unknown barcodes get placeholders.
    sampleKey = (idList[0], idList[1], idList[2], idList[6])
    if sampleKey in sampleIdDict:
        suffixStr = sampleIdDict[sampleKey]
        jun, anim = sampleDict[suffixStr]
    else:
        suffixStr, jun, anim = 'NONE', 'UNKNOWN', 'UNKNOWN'

    nRows = len(temp_blat.index)
    temp_blat["blatDecide"] = ["Un" if nRows else "NoHits"] * nRows
    temp_blat["SampleID"] = [suffixStr] * nRows
    temp_blat["Junction"] = [jun] * nRows
    temp_blat["AnimalID"] = [anim] * nRows
    temp_blat["qSeq"] = [seqid_seq] * nRows
    if nRows == 0:
        return temp_blat.loc[:, select_col2]

    # The read's rows have been handed to us; drop them from the source table.
    dels_seqs.execute("DELETE FROM original_psl WHERE qName = ?", (seqid,))
    blatpsl_db.commit()

    maxscore = float(max(temp_blat["myScore"]))
    # Percentage distance of every hit from the best score.  This is a Series
    # operation, so a best score of 0 yields inf/NaN instead of raising; the
    # column is only consulted after the maxscore >= 15 guard below.
    temp_blat["fDiff"] = (100.0 * (maxscore - temp_blat["myScore"]) / maxscore).astype(float)

    if maxscore < 15:
        temp_blat.loc[:, "blatDecide"] = "NoHits"
        return temp_blat.loc[:, select_col2]

    isBest = temp_blat["myScore"] == maxscore
    isIdentical = (temp_blat["myPI"] >= 95.0) & (temp_blat["myCV"] >= 95.0)
    spanOk = (temp_blat["mySPAN"] > 0.7) & (temp_blat["mySPAN"] < 1.3)

    if not (isBest & isIdentical).any():
        temp_blat.loc[:, "blatDecide"] = "NoHits"
        return temp_blat.loc[:, select_col2]

    if not (isBest & isIdentical & spanOk).any():
        temp_blat.loc[:, "blatDecide"] = "NoGoodSpn"
        return temp_blat.loc[:, select_col2]

    # Candidates: every good hit whose score is within 3 % of the best one.
    isCandidate = (temp_blat["myScore"] >= 15) & isIdentical & spanOk & (temp_blat["fDiff"] <= 3.0)
    if isCandidate.sum() == 1:
        temp_blat.loc[isCandidate, "blatDecide"] = "Single"
    else:
        temp_blat.loc[:, "blatDecide"] = "Multi"
    return temp_blat.loc[:, select_col2]
# Worker count: half of the cores on machines with more than eight, else two.
cpu_total = multiprocessing.cpu_count()
num_cores = cpu_total // 2 if cpu_total > 8 else 2

# On-disk database (proceesedpsl_db) that receives the classified hits.
proceesedpsl_db = sqlite3.connect(data_base_location, check_same_thread=False)
proc_list = []

# Cursor on the in-memory PSL database; blatdecide() deletes rows through it.
# Index qName because every per-read query and delete filters on it.
dels_seqs = blatpsl_db.cursor()
in_cr = dels_seqs.execute("CREATE INDEX qIndex ON original_psl (qName)")
in_cr.fetchone()

def batch_iterator(iterator, batch_size) :
    """Yield successive lists of up to ``batch_size`` entries from ``iterator``.

    Parameters
    ----------
    iterator : iterable or iterator
        Source of entries.  A ``None`` entry is treated as end of input, as in
        the original implementation.
    batch_size : int
        Maximum number of entries per yielded list; must be at least 1.

    Yields
    ------
    list
        Non-empty batches; only the final batch may be shorter.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (previously an endless loop).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    iterator = iter(iterator)
    # Track exhaustion explicitly instead of testing the truthiness of the
    # last entry, so falsy entries (0, '', empty containers) cannot end the
    # iteration early.
    exhausted = False
    while not exhausted:
        batch = []
        while len(batch) < batch_size:
            entry = next(iterator, None)
            if entry is None:
                # End of input
                exhausted = True
                break
            batch.append(entry)
        if batch:
            yield batch

# ---------------------------------------------------------------------------
# Classify every read's hits with blatdecide() and store the result in the
# 'processed_psl' table, then pull out the usable single hits.
# ---------------------------------------------------------------------------
# Start from an empty 'processed_psl' table with the select_col2 columns.
final_data=pd.DataFrame(columns=select_col2)
final_data.to_sql('processed_psl', proceesedpsl_db, if_exists='replace')


record_iter = SeqIO.parse(fasta_file,"fasta")
start=timeit.default_timer()

# Reads are handled 1000 at a time.  For each read, its rows are selected from
# 'original_psl' while the job list is being built, and blatdecide() is run on
# a copy of them through joblib.
for i, batch in enumerate(batch_iterator(record_iter, 1000)) :
    final_data=pd.DataFrame(columns=select_col2)
    final_data=final_data.append(Parallel(n_jobs=num_cores, max_nbytes=None)(delayed(blatdecide)(record,pd.io.sql.read_sql('SELECT * FROM original_psl WHERE qName = ?', blatpsl_db,params=[str(record.id)]).copy()) for record in batch))
    final_data.to_sql('processed_psl', proceesedpsl_db, if_exists='append')
    
stop =timeit.default_timer()
#print 'Time required for hit separation %f' %(stop -start) 
inde_curs=proceesedpsl_db.cursor()

# The in-memory PSL table is not used after this point.
blatpsl_db.close()


select_col=["AnimalID","Junction","SampleID","qName","strand","qSize","qSeq","tName","tStart","tEnd"]

# Keep rows marked "Single" whose query start is at most 5 and whose sample
# was identified (SampleID other than 'NONE').
temp_single_hits=pd.read_sql_query('SELECT * FROM processed_psl WHERE blatDecide == "Single" ', proceesedpsl_db)
temp_single_hits["qStart"]=temp_single_hits["qStart"].astype(float)
temp_single_hits["qName"]=temp_single_hits["qName"].astype(str)
temp_single_hits["SampleID"]=temp_single_hits["SampleID"].astype(str)
Single_hits=temp_single_hits.loc[((temp_single_hits["qStart"]<=5.0) & (temp_single_hits["SampleID"]!= 'NONE' ))].copy()
del(temp_single_hits)

# Placeholder values; these three columns are filled in per read further down.
Single_hits["VIS_site"]=list(itertools.repeat(0,len(Single_hits.index)))
Single_hits["tSeq"]=list(itertools.repeat('AAAAAA',len(Single_hits.index)))
Single_hits["tLen"]=list(itertools.repeat(0,len(Single_hits.index)))
print_col=["qName","tName","strand","VIS_site"]

# Add one zero-initialised count column per sample id in sampleList.
count_df=pd.DataFrame(0,index=Single_hits.index,columns=sampleList)
Single_hits=pd.concat([Single_hits,count_df],axis=1,join='outer')
# Map each sanitised sample name (second column of the enzyme table) to
# [third column stripped, first column]; the third column is used further down
# as the cut-site sequence.  Header lines starting with 'ANIMAL' and blank
# lines are skipped.
restrictionDict={}
# Iterate the file object directly (xreadlines() is deprecated) inside a
# context manager so the handle is closed.
with open(restriction_cite_file, 'r') as restrictionHandle:
    for lines in restrictionHandle:
        if lines[:6] == 'ANIMAL': continue
        if len(lines.strip()) == 0: continue
        myAnimal, mySample, myToCheck = lines.split('\t')[:3]
        myToCheck = myToCheck.strip()

        restrictionDict[mySample.replace('_', '').replace('-', '').replace('\'', '').replace('`', '')] = [myToCheck, myAnimal]

print "Single hits processing"

rheMac2 = seqdb.SequenceFileDB(genome_fasta_file)
nStep = 2000
resultDict = {}
strandDict = {}
for line1 in Single_hits[['AnimalID','SampleID','qName','VIS_site','tName','tStart','tEnd','strand']].itertuples():
    lines=[str(xx) for xx in line1]
    lines=lines[1:]
    virustag, sampleTag, siteIdStr, mysite, chrid, tstart,tend, strand = lines
    if strand == '+':
        chrsite =int(float(tstart))
    else:
        chrsite =int(float(tend))
    seqId, idStr = siteIdStr.split('|')
    strandDict[seqId] = strand
    resultDict.setdefault(chrid, {}).setdefault((chrsite, strandDict[seqId]), {}).setdefault(sampleTag, {})
    resultDict[chrid][(chrsite, strandDict[seqId])][sampleTag][seqId] = virustag, idStr, mysite
    

chrList = resultDict.keys()
chrList.sort()
for chrid in chrList:
    siteList = resultDict[chrid].keys()
    siteList.sort()
    for chrsite, strand in siteList:

        # FOR STATISTICS FILE
        countList = [0]*len(sampleList)
        for sampleTag in resultDict[chrid][(chrsite, strand)].keys():
            nSeqId = len(resultDict[chrid][(chrsite, strand)][sampleTag])
            if sampleTag not in sampleList: print sampleTag, sampleList
            iCountIndex = sampleList.index(sampleTag)
            countList[iCountIndex] = nSeqId
        #print countList
        rsDict = {}
        for sampleTag in resultDict[chrid][(chrsite, strand)].keys():
            enzyme = restrictionDict[sampleTag]
            rsDict.setdefault(enzyme[0], 0)
            rsDict[enzyme[0]] += 1
        revrsdict = {}
        for enzyme, sitecount in rsDict.items():
            revrsdict.setdefault(sitecount, []).append(enzyme)
        maxrevrs = max(revrsdict.keys())

        cutSite = revrsdict[maxrevrs][0]

        if strand == '+': # SEARCH FORWARD DIRECTION FOR cutSite
            myStr1, myStr2 = str(rheMac2[chrid][chrsite:chrsite + 5]).upper(), ''
            for iCurrent in range(chrsite + 5, rheMac2[chrid].stop, nStep):
                myStr2 += str(rheMac2[chrid][iCurrent:iCurrent + nStep]).upper()
                if cutSite in myStr2: break
            iTCGA = myStr2.index(cutSite)
            myStr = myStr1 + myStr2[:iTCGA+4]
            wlist1 = myStr, len(myStr)
            chrsite1 = chrsite + 5
            myStr1, myStr2 = str(-rheMac2[chrid][chrsite1 - 5:chrsite1]).upper(), ''
            for iCurrent in range(chrsite1 - 5, 0, -nStep):
                myStr2 += str(-rheMac2[chrid][iCurrent - nStep:iCurrent]).upper()
                if cutSite in myStr2: break
            iTCGA = myStr2.index(cutSite)
            myStr = myStr1 + myStr2[:iTCGA+4]
            wlist2 = myStr, len(myStr)
        if strand == '-': # SEARCH REVERSE DIRECTION FOR TCGA, REVERSE COMPLEMENT
            myStr1, myStr2 = str(-rheMac2[chrid][chrsite - 5:chrsite]).upper(), ''
            for iCurrent in range(chrsite - 5, 0, -nStep):
                myStr2 += str(-rheMac2[chrid][iCurrent - nStep:iCurrent]).upper()
                if cutSite in myStr2: break
            iTCGA = myStr2.index(cutSite)
            myStr = myStr1 + myStr2[:iTCGA+4]
            wlist1 = myStr, len(myStr)
            chrsite1 = chrsite - 5
            myStr1, myStr2 = str(rheMac2[chrid][chrsite1:chrsite1 + 5]).upper(), ''
            for iCurrent in range(chrsite1 + 5, rheMac2[chrid].stop, nStep):
                myStr2 += str(rheMac2[chrid][iCurrent:iCurrent + nStep]).upper()
                if cutSite in myStr2: break
            iTCGA = myStr2.index(cutSite)
            myStr = myStr1 + myStr2[:iTCGA+4]
            wlist2 = myStr, len(myStr)

        for sampleTag in resultDict[chrid][(chrsite, strand)].keys():
            if sampleTag == 'NONE': continue
            for seqi in resultDict[chrid][(chrsite, strand)][sampleTag].keys():
                zz=Single_hits.loc[Single_hits["qName"].str.contains(seqi)].index
                seqloc=int(zz[0])
                Single_hits.ix[seqloc,"tSeq"]=wlist1[0]
                Single_hits.ix[seqloc,"tLen"]=wlist1[1]
                Single_hits.ix[seqloc,sampleList]=countList
                Single_hits.ix[seqloc,"VIS_site"]=chrsite
                

def _dominant_site(rankedSites, countDict, replaceDict):
    """Pick the dominant site of one strand inside a cluster.

    ``rankedSites`` is a list of ``(read count, site, strand)`` sorted with the
    largest read count first.  A site is dominant when it is the only one or
    has strictly more reads than the runner-up.  Same-strand sites within 5 bp
    of the dominant one that carry the same direction tag are recorded in
    ``replaceDict`` (modified in place) as ``(site, strand) -> dominant``.

    Returns ``(site, strand, direct)``; ``(-1, '', '')`` when no site is
    dominant, and ``direct`` is '' unless the site is purely LEFT or RIGHT.
    """
    site, strand, direct = -1, '', ''
    if len(rankedSites) == 1 or (len(rankedSites) > 1 and rankedSites[0][0] > rankedSites[1][0]):
        site, strand = rankedSites[0][1:]
        if countDict[(site, strand)] == -1: direct = 'LEFT'
        if countDict[(site, strand)] == 1: direct = 'RIGHT'
        for seqCount2, chrsite2, strand2 in rankedSites[1:]:
            if abs(chrsite2 - site) <= 5 and strand == strand2:
                if countDict[(site, strand)] == countDict[(chrsite2, strand2)]:
                    replaceDict[(chrsite2, strand2)] = site, strand
    return site, strand, direct


# ---------------------------------------------------------------------------
# Cluster neighbouring sites per chromosome and record, in ALL_merge_site,
# whether each site stands alone (SINGLE), is absorbed by a dominant
# neighbour (REPLACE) or forms a forward/reverse pair (PAIR).
# ---------------------------------------------------------------------------
pipe11_table=proceesedpsl_db.cursor()
pipe11_table.execute('''DROP TABLE IF EXISTS ALL_merge_site''')
pipe11_table.execute('''CREATE TABLE ALL_merge_site (col1, col2, col3, col4, col5, col6, col7, col8)''')
resultDict = {}
strandDict = {}

#Single_hits.to_csv("Single_hits2.tsv",sep='\t',header=True,index=False,index_label=None)
Single_hits.to_sql('single_hit', proceesedpsl_db, chunksize=20000,if_exists='replace')
#print "Writing Single hit file"

# infoDict: read id -> (Junction, AnimalID)
# siteDict: chromosome -> (site, strand) -> [(read id, Junction), ...]
infoDict = {}
siteDict = {}
for row in Single_hits[['AnimalID','Junction','qName','tName','strand','VIS_site']].itertuples():
        row1=[str(jk) for jk in row]
        # junk is the frame index; myClass is AnimalID, myDirect is Junction.
        junk, myClass, myDirect, seqIdStr, chrid, strand, chrsite = row1
        seqId, idStr = seqIdStr.split('|')
        infoDict[str(seqId)] = (str(myDirect), str(myClass))
        chrsite = int(float(chrsite))
        siteDict.setdefault(chrid, {}).setdefault((chrsite, strand), []).append((seqId, infoDict[seqId][0]))


chrList = sorted(siteDict.keys())
for chrid in chrList:
    siteList = sorted(siteDict[chrid].keys())
    # Chain sorted sites into clusters: a site joins the current cluster when
    # it lies within 15 bp of that cluster's largest position so far.
    clusterList = []
    for siteKey in siteList:
        if clusterList and abs(siteKey[0] - tendmax) <= 15: # IF ADJACENT, THEN MERGE
            tendmax = max(tendmax, siteKey[0])
            clusterList[-1].append(siteKey)
        else:
            clusterList.append([siteKey])
            tendmax = siteKey[0]
    for subCluster in clusterList:
        replaceDict = {}
        countDict = {}
        tmpList1, tmpList2 = [], []
        for chrsite, strand in subCluster:
            seqCount = len(siteDict[chrid][(chrsite, strand)])
            countLeft, countRight = 0, 0
            for seqId, myDirect in siteDict[chrid][(chrsite, strand)]:
                if myDirect == 'LEFT': countLeft += 1
                if myDirect == 'RIGHT': countRight += 1
            # Direction tag: -1 only LEFT reads, 1 only RIGHT reads, 0 otherwise.
            countTag = 0
            if countLeft and not countRight: countTag = -1
            if not countLeft and countRight: countTag = 1
            countDict[(chrsite, strand)] = countTag
            if strand == '+': tmpList1.append((seqCount, chrsite, strand))
            if strand == '-': tmpList2.append((seqCount, chrsite, strand))
        # Largest read count first (entries are distinct tuples).
        tmpList1.sort(reverse=True)
        tmpList2.sort(reverse=True)
        fSite, fStrand, fDirect = _dominant_site(tmpList1, countDict, replaceDict)
        rSite, rStrand, rDirect = _dominant_site(tmpList2, countDict, replaceDict)
        pairDict = {} # FOR PAIR INFORMATION
        # A pair needs dominant '+' and '-' sites exactly 5 bp apart AND a known
        # direction for the '+' site.  Without the direction no PAIR row can be
        # built (previously a stale row from an earlier iteration was
        # inserted), so such sites are reported as SINGLE instead.
        if fSite >= 0 and rSite >= 0 and abs(fSite - rSite) == 5 and fDirect in ('LEFT', 'RIGHT'):
            pairDict[(fSite, fStrand)] = rSite, rStrand
            pairDict[(rSite, rStrand)] = fSite, fStrand
        # + LEFT - RIGHT: NO, CONSIDER TAG INFORMATION AND USE IT FOR LEFT/RIGHT ID
        for chrsite, strand in subCluster:
            if (chrsite, strand) in replaceDict: continue
            if (chrsite, strand) in pairDict: continue
            if countDict[(chrsite, strand)] == -1: directTag = 'LEFT'
            elif countDict[(chrsite, strand)] == 1: directTag = 'RIGHT'
            else: directTag = 'NONE'
            wlist = 'SINGLE', chrid, chrsite, strand, directTag  
            pipe11_table.execute("INSERT INTO ALL_merge_site VALUES (?,?,?,?,?,NULL,NULL,NULL)",tuple( wlist))
        for chrsite, strand in replaceDict.keys():
            chrsite1, strand1 = replaceDict[(chrsite, strand)]
            wlist = 'REPLACE', chrid, chrsite, strand, chrsite1, strand1
            pipe11_table.execute("INSERT INTO ALL_merge_site VALUES (?,?,?,?,?,?,NULL,NULL)",tuple( wlist))
        if len(pairDict):
            # The LEFT member is always written first.
            if fDirect == 'LEFT':
                wlist = 'PAIR', chrid, fSite, fStrand, 'LEFT', rSite, rStrand, 'RIGHT'
            else:
                wlist = 'PAIR', chrid, rSite, rStrand, 'LEFT', fSite, fStrand, 'RIGHT'
            pipe11_table.execute("INSERT INTO ALL_merge_site VALUES (?,?,?,?,?,?,?,?)",tuple( wlist))
proceesedpsl_db.commit()
# Kept as a DataFrame in case later code reads pipe11_out.
pipe11_out=pd.read_sql_query('SELECT * FROM ALL_merge_site', proceesedpsl_db)


# Build statusDict from ALL_merge_site:
#   'SINGLE'  : (chrom, site, strand) -> direction tag
#   'REPLACE' : (chrom, kept site, kept strand) -> [(absorbed site, absorbed strand), ...]
#   'LEFT'    : (chrom, left site, left strand) -> (right site, right strand)
# Keys are created on first use, so a kind with no rows has no entry.
#
# The rows are read straight from SQLite rather than from the DataFrame:
# pandas turns a column holding only integers and NULLs into floats, which
# yields 'nan' fields and '123.0' site strings that break the unpacking and
# int() calls below.
statusDict = {}
for row in proceesedpsl_db.execute('SELECT * FROM ALL_merge_site'):
        # Drop the NULL padding so each record keeps only its real fields.
        linelist = [str(kk) for kk in row if kk is not None]
        if linelist[0] == 'SINGLE':
            chrid, chrsite, strand, myDirect = linelist[1:]
            statusDict.setdefault('SINGLE', {})[(chrid, int(chrsite), strand)] = myDirect
        if linelist[0] == 'REPLACE': # FRONT ONE INTO LAST ONE
            chrid, delSite, delStrand, repSite, repStrand = linelist[1:]
            statusDict.setdefault('REPLACE', {}).setdefault((chrid, int(repSite), repStrand), \
                []).append((int(delSite), delStrand))
        if linelist[0] == 'PAIR':
            if linelist[4] == 'LEFT':
                chrid, leftSite, leftStrand, j1, rightSite, rightStrand, j2 = linelist[1:]
            if linelist[4] == 'RIGHT':
                chrid, rightSite, rightStrand, j1, leftSite, leftStrand, j2 = linelist[1:]
            statusDict.setdefault('LEFT', {})[(chrid, int(leftSite), leftStrand)] = int(rightSite), rightStrand

# MAPPING STATUS
# Reload the stored single hits and rebuild per-site lookups from them:
#   countDict  : (chrom, site, strand) -> {allDir: per-sample count list}
#   resultDict : chrom -> (site, strand) -> read id -> saved fields
#   directDict : read id -> Junction value
mapDict = {}
seqDict, seqHead = {}, ''
# NOTE: this rebinds sampleDict to an empty dict; the sample -> (junction,
# animal) mapping built while parsing the indicator file is gone from here on.
sampleDict = {}

pipe12_temp=pd.read_sql_query('SELECT * FROM single_hit', proceesedpsl_db)
pipe12_names=['AnimalID', 'Junction', 'qName', 'strand', 'qSize', 'qSeq', 'tLen', 'tSeq', 'tName', 'VIS_site']
# Ten named columns followed by the last len(sampleList) columns of the table
# (the per-sample count columns).
pipe12_in=pd.concat([pipe12_temp.loc[:,pipe12_names],pipe12_temp.iloc[:, (len(pipe12_temp.columns)-len(sampleList)):len(pipe12_temp.columns)]], axis=1)



resultDict = {}
countDict = {}
directDict = {}

allDir='pub_script'
sampleDir='unknown'
for lines1 in pipe12_in.itertuples():
    linelist=[str(jk) for jk in lines1]
    #linelist = lines.splitlines()[0].split('\t')
    # junk is the frame index; the next ten values follow pipe12_names, so
    # myClass is AnimalID, myDirect is Junction and seqId is the full qName.
    junk, myClass, myDirect, seqId, strand, seqLength, seqStr, gLength, gStr, chrid, chrsite = linelist[:11]
    #j1, j2, j3, tssDistance, tssName, geneSymbol, tssCancer, cpgDistance, \
    #    cpgName, gcstr, repName, repClass, repFamily, mirDistance, mirName, tupos, tusize, tugenestart, \
    #    tugeneend, tustrand, tuacc, tugenesymbol = linelist[-22:]
    # REMOVE ALL CLASS
    #myClass = 'NA'
    # Everything after the first eleven values is a per-sample count.
    countList = map(int, linelist[11:])
    chrsite = int(float(chrsite))
    # Later rows for the same site overwrite the count list stored earlier.
    countDict.setdefault((chrid, chrsite, strand), {})[allDir] = countList
    saveList = myClass, myDirect, seqLength, seqStr, gLength, gStr
        
    resultDict.setdefault(chrid, {}).setdefault((chrsite, strand), {})[seqId] = saveList
    directDict[seqId] = myDirect

outDict = {}
# MERGE REPLACE RESULTS FOR SEQUENCE READ COUNTS
# For every kept site, add the per-sample counts of each absorbed site to it
# and move the absorbed site's reads under the kept site.
# statusDict only has a 'REPLACE' entry when at least one REPLACE row was read,
# so fall back to an empty mapping instead of raising KeyError.
replaceStatus = statusDict.get('REPLACE', {})
for chrid, repSite, repStrand in replaceStatus.keys():
    for delSite, delStrand in replaceStatus[(chrid, repSite, repStrand)]:
        #for allDir, sampleDir in seqMapDirList:
        repCounts = countDict[(chrid, repSite, repStrand)]
        delCounts = countDict[(chrid, delSite, delStrand)]
        if allDir in delCounts:
            if allDir in repCounts:
                repCounts[allDir] = [ix+iy for ix, iy in \
                    zip(delCounts[allDir], repCounts[allDir])]
            else:
                # The kept site has no counts yet: reuse the absorbed site's list.
                repCounts[allDir] = delCounts[allDir]
        for seqId in resultDict[chrid][(delSite, delStrand)].keys():
            resultDict[chrid][(repSite, repStrand)][seqId] = resultDict[chrid][(delSite, delStrand)][seqId]
            outDict[seqId] = 0

# countDict2: (chrom, site, strand) -> tuple of per-sample counts.  Sites with
# no allDir entry get one zero per element of sampleDict[sampleDir].
countDict2 = {}
for chrid, chrsite, strand in countDict.keys():
    siteCounts = countDict[(chrid, chrsite, strand)]
    if allDir in siteCounts:
        totalCountList = [iCount for iCount in siteCounts[allDir]]
    else:
        totalCountList = [0 for ix in sampleDict[sampleDir]]
    countDict2[(chrid, chrsite, strand)] = tuple(totalCountList)
    
#print "Check Content"
#print countDict2
# Output records are stored as keys of saveDict (values are unused
# placeholders), so identical records collapse into a single entry.
saveDict = {}

# PRINT OUT RESULTS
# Sites listed under statusDict['SINGLE'].  Every read at the site becomes a
# 19-field record followed by the per-sample counts; the record of the read
# that sorts highest on (seqLength, seqId, record) is then cut down to 17
# fields + counts and tab-joined into a saveDict key.
for chrid, chrsite, strand in statusDict['SINGLE'].keys():
    # This value is overwritten by the per-read unpacking in the loop below.
    myDirect = statusDict['SINGLE'][(chrid, chrsite, strand)]
    countList = countDict2[(chrid, chrsite, strand)]
    totalCount = reduce(operator.add, countList)
    sortDict = {}
    for seqId in resultDict[chrid][(chrsite, strand)].keys():
        myClass, myDirect, seqLength, seqStr, gLength, gStr = resultDict[chrid][(chrsite, strand)][seqId]
        if myDirect == 'LEFT':
            # LEFT read: the first read slot is left blank, the read fills
            # the second slot, and the first coordinate is the placeholder -1.
            wlist = myClass, '', '', '', 0, '', 0, '', seqId, 'Single', \
                strand, seqLength, seqStr, gLength, gStr, totalCount, chrid, -1, chrsite
            wlist = wlist + tuple(countList) 

            sortDict.setdefault('LEFT', []).append((seqLength, seqId, wlist))

        if myDirect == 'RIGHT':
            # RIGHT read: mirror image - the read fills the first slot and
            # the second coordinate is the placeholder -1.
            wlist = myClass, seqId, 'Single', strand, seqLength, seqStr, gLength, gStr, '', '', \
                '', 0, '', 0, '', totalCount, chrid, chrsite, -1
            wlist = wlist + tuple(countList) 

            sortDict.setdefault('RIGHT', []).append((seqLength, seqId, wlist))
        outDict[seqId] = 0
    if sortDict.has_key('LEFT'):
        # Descending order, so index 0 is the entry with the largest seqLength.
        sortDict['LEFT'].sort()
        sortDict['LEFT'].reverse()
        classDict = {}
        for seqLength, seqId, wlist in sortDict['LEFT']:
            myClass = wlist[0]
            classDict.setdefault(myClass, 0)
            classDict[myClass] += 1
            outDict[seqId] = 0
        # More than one class at a single site is only reported, not resolved.
        if len(classDict) > 1: print classDict
        # NOTE(review): the loop body does not use myClass; every pass
        # rebuilds the same record from sortDict['LEFT'][0].
        for myClass in classDict.keys():
            j1, j2, j3, j4, j5, j6, j7, j8, seqId, j9, \
                strand, seqLength, seqStr, gLength, gStr, totalCount, chrid, j10, chrsite = sortDict['LEFT'][0][2][:19]

            countList = sortDict['LEFT'][0][2][19:]
            # Drop j2 and the read id: 19 fields -> 17 fields.
            wlist2 = (j1, j3, j4, j5, j6, j7, j8, 'Single', \
                strand, seqLength, seqStr, gLength, gStr, totalCount, chrid, j10, chrsite) + countList

            saveDict['\t'.join(map(str, wlist2))] = 0
    if sortDict.has_key('RIGHT'):
        # Same procedure for the RIGHT-side reads.
        sortDict['RIGHT'].sort()
        sortDict['RIGHT'].reverse()
        classDict = {}
        for seqLength, seqId, wlist in sortDict['RIGHT']:
            myClass = wlist[0]
            classDict.setdefault(myClass, 0)
            classDict[myClass] += 1
            outDict[seqId] = 0
        if len(classDict) > 1: print classDict
        # NOTE(review): as above, myClass is not used inside the loop.
        for myClass in classDict.keys():
            j1, seqId, j9, strand, seqLength, seqStr, gLength, gStr, j2, j3, \
                j4, j5, j6, j7, j8, totalCount, chrid, chrsite, j10 = sortDict['RIGHT'][0][2][:19]

            countList = sortDict['RIGHT'][0][2][19:]
            # Drop the read id and j2: 19 fields -> 17 fields.
            wlist2 = (j1, 'Single', strand, seqLength, seqStr, gLength, gStr, j3, \
                j4, j5, j6, j7, j8, totalCount, chrid, chrsite, j10) + countList

            saveDict['\t'.join(map(str, wlist2))] = 0


# Sites listed under statusDict['LEFT']: each key is a left site whose value
# names the matching right site.  The two sites' counts are summed, and one
# combined 17-field record (+ counts) is written per pair, built from the
# highest-sorting read on each side.
for chrid, leftSite, leftStrand in statusDict['LEFT'].keys():
    rightSite, rightStrand = statusDict['LEFT'][(chrid, leftSite, leftStrand)]
    # Per-sample counts of both sites added element-wise.
    countList = [ix+iy for ix, iy in zip(countDict2[(chrid, leftSite, leftStrand)], \
        countDict2[(chrid, rightSite, rightStrand)])]
    totalCount = reduce(operator.add, countList)
    leftList = resultDict[chrid][(leftSite, leftStrand)].keys()
    rightList = resultDict[chrid][(rightSite, rightStrand)].keys()
    sortDict = {}
    for seqId in rightList:
        myClass, myDirect, seqLength, seqStr, gLength, gStr = resultDict[chrid][(rightSite, rightStrand)][seqId]

        # Right-side read fills the first read slot; the second is blank.
        wlist = myClass, seqId, 'Single', rightStrand, seqLength, seqStr, gLength, gStr, '', 'Single', \
            leftStrand, 0, '', 0, '', totalCount, chrid, rightSite, leftSite
        wlist = wlist + tuple(countList) 
        sortDict.setdefault('RIGHT', []).append((seqLength, seqId, wlist))
        outDict[seqId] = 0
    for seqId in leftList:
        myClass, myDirect, seqLength, seqStr, gLength, gStr= resultDict[chrid][(leftSite, leftStrand)][seqId]

        # Left-side read fills the second read slot; the first is blank.
        wlist = myClass, '', 'Single', rightStrand, 0, '', 0, '', seqId, 'Single', \
            leftStrand, seqLength, seqStr, gLength, gStr, totalCount, chrid, rightSite, leftSite
        wlist = wlist + tuple(countList) 

        sortDict.setdefault('LEFT', []).append((seqLength, seqId, wlist))
        outDict[seqId] = 0

    # NOTE(review): sortDict['LEFT'] / ['RIGHT'] only exist if the loops
    # above ran at least once; an empty side would raise KeyError here.
    # Descending order, so index 0 is the entry with the largest seqLength.
    sortDict['LEFT'].sort()
    sortDict['LEFT'].reverse()
    sortDict['RIGHT'].sort()
    sortDict['RIGHT'].reverse()
    classDict = {}
    for seqLength, seqId, wlist in sortDict['LEFT']:
        myClass = wlist[0]
        classDict.setdefault(myClass, 0)
        classDict[myClass] += 1
        outDict[seqId] = 0
    for seqLength, seqId, wlist in sortDict['RIGHT']:
        myClass = wlist[0]
        classDict.setdefault(myClass, 0)
        classDict[myClass] += 1
        outDict[seqId] = 0
    # More than one class for a pair is only reported, not resolved.
    if len(classDict) > 1: print classDict
    # NOTE(review): the loop body does not use myClass; every pass rebuilds
    # the same record from the first entry of each side.
    for myClass in classDict.keys():
        j1, j2, j3, j4, j5, j6, j7, j8, seqId, j9, \
            strand1, seqLength1, seqStr1, gLength1, gStr1, totalCount1, chrid1, j10, chrsite1 \
            = sortDict['LEFT'][0][2][:19]

        countList = sortDict['LEFT'][0][2][19:]
        # NOTE(review): wlist2a (and wlist2b below) are not read anywhere in
        # this section.
        wlist2a = (j1, j3, j4, j5, j6, j7, j8, j9, \
            strand1, seqLength1, seqStr1, gLength1, gStr1, totalCount1, chrid1, j10, chrsite1) + countList 

        k1, seqId, k9, strand2, seqLength2, seqStr2, gLength2, gStr2, k2, k3, \
            k4, k5, k6, k7, k8, totalCount2, chrid2, chrsite2, k10 = sortDict['RIGHT'][0][2][:19]

        countList = sortDict['RIGHT'][0][2][19:]
        wlist2b = (k1, k9, strand2, seqLength2, seqStr2, gLength2, gStr2, k3, \
            k4, k5, k6, k7, k8, totalCount2, chrid2, chrsite2, k10) + countList 
        # Combined record: the RIGHT read's fields (suffix 2) go into the
        # first read slot, the LEFT read's fields (suffix 1) into the second.
        wlist2 = j1, k9, strand2, seqLength2, seqStr2, gLength2, gStr2, k3, \
            strand1, seqLength1, seqStr1, gLength1, gStr1, totalCount1, chrid1, chrsite2, chrsite1
        wlist2 = wlist2 + countList 
        saveDict['\t'.join(map(str, wlist2))] = 0

# Scratch directory for the intermediate FASTA / BLAST files.
tmpDir = 'tmp29'

# Parallel lists indexed by record number (iLine); seqDict maps each FASTA
# id written below to its sequence.
totalCountList = []
totalPreList = []
totalAftList = []
seqDict = {}

# Start from an empty scratch directory: remove any previous one, recreate it.
rm_temcmd = 'rm -rf %s' % tmpDir
rm_tem = subprocess.Popen(rm_temcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
rmstdout, rmstderr = rm_tem.communicate()
make_cmd = 'mkdir -p %s' % tmpDir
make_tmp = subprocess.Popen(make_cmd, stdout=subprocess.PIPE, shell=True)
make_tmp.communicate()

merge_file = '%s/merge.fa' % tmpDir
outfile = open(merge_file, 'w')
iLine = 0

# One pass over the tab-joined records: split them back into the 17 leading
# fields and the per-sample counts, and write the read sequence(s) to FASTA.
for line in saveDict.keys():
    linelist = line.split('\t')
    linelist[-1] = linelist[-1].strip()
    preList = tuple(linelist[:17])
    (myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1,
     myOcc2, strand2, seqLength2, seqStr2, gLength2, gStr2,
     totalCount, chrid, chrsite1, chrsite2) = preList
    countList = tuple(map(int, linelist[17:]))

    totalCountList.append(countList)
    totalPreList.append(preList)

    # Id tags: L LEFT R RIGHT S SINGLE P PAIR
    hasFirst = len(myOcc1) != 0
    hasSecond = len(myOcc2) != 0
    if hasFirst and hasSecond:
        outfile.write('>%d|RP|%s|%d\n%s\n' % (iLine, totalCount, len(seqStr1), seqStr1))
        seqDict['%d|RP|%s|%d' % (iLine, totalCount, len(seqStr1))] = seqStr1
        outfile.write('>%d|LP|%s|%d\n%s\n' % (iLine, totalCount, len(seqStr2), seqStr2))
        seqDict['%d|LP|%s|%d' % (iLine, totalCount, len(seqStr2))] = seqStr2
    elif hasFirst:
        outfile.write('>%d|RS|%s|%d\n%s\n' % (iLine, totalCount, len(seqStr1), seqStr1))
        seqDict['%d|RS|%s|%d' % (iLine, totalCount, len(seqStr1))] = seqStr1
    elif hasSecond:
        outfile.write('>%d|LS|%s|%d\n%s\n' % (iLine, totalCount, len(seqStr2), seqStr2))
        seqDict['%d|LS|%s|%d' % (iLine, totalCount, len(seqStr2))] = seqStr2

    iLine += 1

outfile.close()
pipe14_seqDict = seqDict

# Build a nucleotide BLAST database from merge.fa and align the file against
# itself; the tabular output (-m 9) is gzipped into merge.blastn.gz.
blast_dbcmd = 'formatdb -t %s/merge -i %s/merge.fa -p F -o F -n %s/merge' % (tmpDir, tmpDir, tmpDir)

subprocess.call(blast_dbcmd, shell=True)

# Any output on stderr is treated as a failed run.  Try up to three times
# with num_cores threads; `stderr` starts truthy so the first attempt runs.
stderr = 1
cnt = 0
while stderr and cnt < 3:
    run14_blastcmd = 'blastall -p blastn -d %s/merge -i %s/merge.fa -m 9 -a %d -F F | gzip > %s/merge.blastn.gz' % (tmpDir, tmpDir, num_cores, tmpDir)
    run_blast14 = subprocess.Popen(run14_blastcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    cnt = cnt + 1
    stdout, stderr = run_blast14.communicate()

# Single-threaded fallback, only when the last multi-threaded attempt still
# reported an error.  (Previously it also ran after a successful third
# attempt, repeating the whole alignment for no reason.)
if stderr:
    run14_blastcmd = 'blastall -p blastn -d %s/merge -i %s/merge.fa -m 9 -a %d -F F | gzip > %s/merge.blastn.gz' % (tmpDir, tmpDir, 1, tmpDir)
    run_blast14 = subprocess.Popen(run14_blastcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    stdout, stderr = run_blast14.communicate()


# Parse the self-BLAST table and collect, per query record, the candidate
# records it could be merged into:
#   mergeDict[query_id][(identity1, identity2)][(subject_count, subject_length)]
#       -> list of hit tuples
mergeDict = {}
zip_file='%s/merge.blastn.gz'%tmpDir

infile = gzip.GzipFile(zip_file, 'r')

while 1:
    lines = infile.readline()
    #print lines
    if lines == '': break                 # end of file
    if lines[0] == '#': continue          # comment lines of the tabular output
    query_id, subject_id, p_identity, alignment_length, mismatches, gap_openings, q_start, q_end, \
        s_start, s_end, e_value, bit_score = lines.splitlines()[0].split('\t')
    #print lines.splitlines()[0].split('\t')
    if query_id == subject_id: continue   # self hit
    # Ids have the form "<line>|<tag>|<count>|<length>".  The tests below are
    # plain substring checks on the whole id.
    if 'RP' in query_id or 'LP' in query_id: continue   # paired records are never queries
    if 'R' in query_id and 'L' in subject_id: continue  # do not mix R with L
    if 'L' in query_id and 'R' in subject_id: continue
    query_count, query_length = map(int, query_id.rsplit('|', 2)[-2:])
    subject_count, subject_length = map(int, subject_id.rsplit('|', 2)[-2:])
    query_seq = seqDict[query_id]
    if query_count > 3: continue          # only records with a count of at most 3 are merged away
    subject_seq = seqDict[subject_id]
    if subject_count < query_count: continue   # only merge into a record with at least as many reads
    # identity1: matched positions relative to the shorter sequence;
    # identity2: the same numerator relative to the longer sequence.
    identity1 = 100.*(float(alignment_length) - float(mismatches) - float(gap_openings))
    identity1 = identity1/float(min(query_length, subject_length))
    identity2 = 100.*(float(alignment_length) - float(mismatches) - float(gap_openings))
    identity2 = identity2/float(max(query_length, subject_length))
    #if int(mismatches) + int(gap_openings) > 5: continue
    if identity1 < 90.: continue
    # NOTE(review): this first branch cannot be taken - hits with
    # subject_count < query_count were already skipped above.
    if query_count > subject_count:
        query_id, subject_id, identity1, identity2, alignment_length, q_start, q_end, s_start, s_end = \
            subject_id, query_id, identity1, identity2, alignment_length, s_start, s_end, q_start, q_end
    elif query_count == subject_count: # IF SAME COUNTS, QUERY IS SHORTER/SUBJECT IS LONGER ONE
        # NOTE(review): the swap exchanges ids and coordinates only;
        # subject_length, query_seq and subject_seq keep their pre-swap
        # values and are stored as such below - confirm this is intended.
        if query_length > subject_length:
            query_id, subject_id, identity1, identity2, alignment_length, q_start, q_end, s_start, s_end = \
                subject_id, query_id, identity1, identity2, alignment_length, s_start, s_end, q_start, q_end
    mergeDict.setdefault(query_id, {}).setdefault((identity1, identity2), \
        {}).setdefault((subject_count, subject_length),[])
    mergeDict[query_id][(identity1, identity2)][(subject_count, subject_length)].append((subject_id, \
        identity1, identity2, alignment_length, q_start, q_end, s_start, s_end, \
        query_seq, subject_seq))
infile.close()
# Second reference to the same dictionary object (not a copy).
pipe14_dict=mergeDict
#print mergeDict
# For every query keep a single merge target: highest (identity1, identity2),
# then highest (subject_count, subject_length), then the first hit in sorted
# order.  replaceDict maps query line -> target line; reverseDict is the
# inverse (target line -> list of query lines).
replaceDict = {}
reverseDict = {}
for query_id in mergeDict.keys():
    hitsByIdentity = mergeDict[query_id]
    identityList = sorted(hitsByIdentity.keys(), reverse=True)
    maxIdentity = identityList[0]

    hitsBySubject = hitsByIdentity[maxIdentity]
    subjectCountList = sorted(hitsBySubject.keys(), reverse=True)
    maxSubjectCount, maxSubjectLength = subjectCountList[0]

    bestHits = hitsBySubject[(maxSubjectCount, maxSubjectLength)]
    bestHits.sort()  # in place, so the list stored in mergeDict is reordered too
    (subject_id, identity1, identity2, alignment_length,
     q_start, q_end, s_start, s_end, query_seq, subject_seq) = bestHits[0]

    # The leading field of an id is the record's line number.
    qLine = int(query_id.split('|', 1)[0])
    sLine = int(subject_id.split('|', 1)[0])

    replaceDict[qLine] = sLine
    reverseDict.setdefault(sLine, []).append(qLine)

def _drop_mutual_pairs(replaceDict, reverseDict):
    """Break two-record cycles (A -> B and B -> A, with no other incoming edge).

    For each such pair the edge leaving the higher-numbered record is removed,
    so the lower-numbered record is the one that will be merged away.
    Returns the {higher line: lower line} mapping that was applied.
    """
    deleteDict = {}
    for qLine, sLine in replaceDict.items():
        if qLine not in reverseDict or sLine not in reverseDict:
            continue
        if reverseDict[qLine] == [sLine] and reverseDict[sLine] == [qLine]:
            if qLine < sLine:
                deleteDict[sLine] = qLine
            else:
                deleteDict[qLine] = sLine
    for qLine, sLine in deleteDict.items():
        del replaceDict[qLine]
        del reverseDict[sLine]
    return deleteDict


def _absorb_query(qLine, sLine, replaceDict, reverseDict, totalPreList, totalCountList):
    """Merge record qLine into record sLine and remove the qLine -> sLine edge.

    The total count (4th field from the end of the 17-field record) and the
    per-sample counts are summed into sLine; qLine's entries become None.
    """
    preList1 = totalPreList[qLine]
    preList2 = totalPreList[sLine]
    countList1 = totalCountList[qLine]
    countList2 = totalCountList[sLine]
    totalCount = int(preList1[-4]) + int(preList2[-4])
    countList = tuple([ix + iy for ix, iy in zip(countList1, countList2)])
    preList = preList2[:-4] + (totalCount,) + preList2[-3:]
    totalCountList[qLine] = None
    totalCountList[sLine] = countList
    totalPreList[qLine] = None
    totalPreList[sLine] = preList
    del replaceDict[qLine]
    reverseDict[sLine].remove(qLine)
    if len(reverseDict[sLine]) == 0:
        del reverseDict[sLine]


deleteDict = _drop_mutual_pairs(replaceDict, reverseDict)

# DELETE HEAD: repeatedly merge away records that nothing else points to,
# until a full pass makes no change.
while True:
    nDelete = 0
    for qLine, sLine in list(replaceDict.items()):
        if qLine in reverseDict:
            continue  # something still merges into qLine; handle it later
        _absorb_query(qLine, sLine, replaceDict, reverseDict, totalPreList, totalCountList)
        nDelete += 1
    if nDelete == 0:
        break

# The passes above can leave new two-record cycles behind; break them again.
deleteDict = _drop_mutual_pairs(replaceDict, reverseDict)

# DELETE A<->B: merge the remaining edges whose target has exactly this one
# query pointing at it, again until nothing changes.
while True:
    nDelete = 0
    for qLine, sLine in list(replaceDict.items()):
        if reverseDict.get(sLine) != [qLine]:
            continue
        _absorb_query(qLine, sLine, replaceDict, reverseDict, totalPreList, totalCountList)
        nDelete += 1
    if nDelete == 0:
        break


curDir = os.path.realpath('.')
os.chdir(curDir)

if len(replaceDict):
    for ix, iy in replaceDict.items(): print ix, iy
    sys.exit('REPLACEDICT REMAINED')


# Open the reference genome FASTA through pygr.
rheMac2 = seqdb.SequenceFileDB(genome_fasta_file)
# Window size (bases) used when scanning the genome for a restriction site.
nStep = 2000

# load restriction site information
# The enzyme file is tab-separated; the 2nd column is the sample name and the
# 3rd the restriction site to look for.  A header line starting with 'ANIMAL'
# and blank lines are skipped.
restrictionDict = {}
# `with` closes the file deterministically (the old xreadlines() call never
# closed it and is a deprecated API).
with open(restriction_cite_file, 'r') as restriction_handle:
    for lines in restriction_handle:
        if lines[:6] == 'ANIMAL': continue
        if len(lines.strip()) == 0: continue
        myAnimal, mySample, myToCheck = lines.split('\t')[:3]
        myToCheck = myToCheck.strip()
        # Sample names are keyed with '_', '-', "'" and '`' removed.
        restrictionDict[mySample.replace('_', '').replace('-', '').replace('\'', '').replace('`', '')] = myToCheck

# Rebuild saveDict from the records that survived merging.  For records with
# a read on one side only (the other coordinate is -1) the missing side is
# filled in from the genome: the sequence from next to the known site up to
# the nearest occurrence of the record's restriction site.
saveDict = {}

for ix in range(len(totalCountList)):
    countList = totalCountList[ix]
    preList = totalPreList[ix]
    #aftList = totalAftList[ix]
    # Records merged into another one were set to None.
    if countList is None or preList is None : continue #or aftList is None
    myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1, myOcc2, strand2, seqLength2, seqStr2, \
        gLength2, gStr2, totalCount, chrid, chrsite1, chrsite2 = preList
    chrsite1, chrsite2 = int(chrsite1), int(chrsite2)

    # Sum the read counts per restriction site over all samples.
    # NOTE(review): this inner loop reuses the outer loop variable `ix`.
    rsDict = {}
    for ix in range(len(countList)):
        sampleCount, sampleID = countList[ix], sampleList[ix]
        sampleCount = int(sampleCount)
        enzyme = restrictionDict[sampleID]
        rsDict.setdefault(enzyme, 0)
        rsDict[enzyme] += sampleCount

    # cutSite = a site with the highest summed count (on a tie, whichever is
    # first in the list built below).
    revrsdict = {}
    for enzyme, sitecount in rsDict.items():
        revrsdict.setdefault(sitecount, []).append(enzyme)
    maxrevrs = max(revrsdict.keys())
    cutSite = revrsdict[maxrevrs][0]

    # In the four branches below, unary minus on a pygr slice presumably
    # yields the reverse-complement strand - TODO confirm.
    # NOTE(review): myStr2.index(cutSite) raises ValueError when the scan
    # ends without finding cutSite; the backward scans can also pass a
    # negative start (iCurrent - nStep) in their last window; and the
    # `iTCGA+4` cut looks like it assumes a 4-base site - confirm.
    if chrsite1 == -1:
        if strand2 == '+':
            chrsite1 = chrsite2 + 5
            # 5 bases adjacent to the site, then scan towards position 0 in
            # nStep-sized windows until cutSite shows up.
            myStr1, myStr2 = str(-rheMac2[chrid][chrsite1 - 5:chrsite1]).upper(), ''
            for iCurrent in range(chrsite1 - 5, 0, -nStep):
                myStr2 += str(-rheMac2[chrid][iCurrent - nStep:iCurrent]).upper()
                if cutSite in myStr2: break
            iTCGA = myStr2.index(cutSite)
            myStr = myStr1 + myStr2[:iTCGA+4]
            gStr1, gLength1, strand1 = myStr, len(myStr), '-'
        if strand2 == '-':
            chrsite1 = chrsite2 - 5
            # Same idea, scanning forward towards the end of the sequence.
            myStr1, myStr2 = str(rheMac2[chrid][chrsite1:chrsite1 + 5]).upper(), ''
            for iCurrent in range(chrsite1 + 5, rheMac2[chrid].stop, nStep):
                myStr2 += str(rheMac2[chrid][iCurrent:iCurrent + nStep]).upper()
                if cutSite in myStr2: break
            iTCGA = myStr2.index(cutSite)
            myStr = myStr1 + myStr2[:iTCGA+4]
            gStr1, gLength1, strand1 = myStr, len(myStr), '+'
    if chrsite2 == -1:
        if strand1 == '+':
            chrsite2 = chrsite1 + 5
            myStr1, myStr2 = str(-rheMac2[chrid][chrsite2 - 5:chrsite2]).upper(), ''
            for iCurrent in range(chrsite2 - 5, 0, -nStep):
                myStr2 += str(-rheMac2[chrid][iCurrent - nStep:iCurrent]).upper()
                if cutSite in myStr2: break
            iTCGA = myStr2.index(cutSite)
            myStr = myStr1 + myStr2[:iTCGA+4]
            gStr2, gLength2, strand2 = myStr, len(myStr), '-'
        if strand1 == '-':
            chrsite2 = chrsite1 - 5
            myStr1, myStr2 = str(rheMac2[chrid][chrsite2:chrsite2 + 5]).upper(), ''
            for iCurrent in range(chrsite2 + 5, rheMac2[chrid].stop, nStep):
                myStr2 += str(rheMac2[chrid][iCurrent:iCurrent + nStep]).upper()
                if cutSite in myStr2: break
            iTCGA = myStr2.index(cutSite)
            myStr = myStr1 + myStr2[:iTCGA+4]
            gStr2, gLength2, strand2 = myStr, len(myStr), '+'

    # Store the (possibly completed) record as a tuple key: 17 fields + counts.
    preList = myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1, myOcc2, strand2, seqLength2, seqStr2, \
        gLength2, gStr2, totalCount, chrid, chrsite1, chrsite2
    fullList = preList + countList #+ aftList
    saveDict[fullList] = 0

#just_df=pd.DataFrame.from_dict(saveDict,orient='index')
#print "Converted to DataFrame"
#just_df.head()

#pipe14_table=proceesedpsl_db.cursor()
#pipe14_table.execute('''DROP TABLE IF EXISTS pipe14_results''')
#pipe14_table.execute('''CREATE TABLE  pipe14_results (col1, col2, col3, col4, col5, col6, col7, col8)''')

# Start the per-record tables over for the completed records.
totalCountList = []
totalPreList = []
totalAftList = []
seqDict = {}
iLine = 0
# SAVE SEQUENCE MEMBER INFORMATION
# memberDict: FASTA id -> {member id: 0}; memberSeqDict: FASTA id -> sequence.
memberDict = {}
memberSeqDict = {}
merge2_file = '%s/merge2.fa' % tmpDir
outfile = open(merge2_file, 'w')


def _register_record(fastaFmt, idFmt, sequence):
    """Write one FASTA entry for the current record and index it by its id.

    Uses the module-level iLine and totalCount of the record being processed;
    idFmt is fastaFmt without the leading '>' and the sequence line.
    """
    idFields = (iLine, totalCount, len(sequence))
    outfile.write(fastaFmt % (idFields + (sequence,)))
    recordId = idFmt % idFields
    seqDict[recordId] = sequence
    memberSeqDict[recordId] = sequence
    memberDict[recordId] = {recordId: 0}


for fullList in saveDict.keys():
    linelist = list(fullList)
    preList = tuple(linelist[:17])
    (myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1,
     myOcc2, strand2, seqLength2, seqStr2, gLength2, gStr2,
     totalCount, chrid, chrsite1, chrsite2) = preList
    countList = tuple(map(int, linelist[17:]))

    totalCountList.append(countList)
    totalPreList.append(preList)

    hasFirst = len(myOcc1) != 0
    hasSecond = len(myOcc2) != 0
    if hasFirst and hasSecond:
        # Reads on both sides: write both read sequences.
        _register_record('>%d|RP|%s|%d\n%s\n', '%d|RP|%s|%d', seqStr1)
        _register_record('>%d|LP|%s|%d\n%s\n', '%d|LP|%s|%d', seqStr2)
    elif hasFirst:
        # Read in the first slot only: its sequence plus the genome-derived
        # sequence of the other side.
        _register_record('>%d|RS|%s|%d\n%s\n', '%d|RS|%s|%d', seqStr1)
        _register_record('>%d|LG|%s|%d\n%s\n', '%d|LG|%s|%d', gStr2)
    elif hasSecond:
        _register_record('>%d|LS|%s|%d\n%s\n', '%d|LS|%s|%d', seqStr2)
        _register_record('>%d|RG|%s|%d\n%s\n', '%d|RG|%s|%d', gStr1)

    iLine += 1

outfile.close()

del(saveDict)
# One row per read name for every read whose BLAT decision is not "Un".
sample_seq = pd.read_sql_query(
    'SELECT SampleID, blatDecide, qName FROM processed_psl WHERE blatDecide != "Un" ',
    proceesedpsl_db).drop_duplicates('qName')

# Lookup tables keyed by the part of the read name before the first '|'.
seqDict = {}
tagDict = {}
sampleDict = {}
sampleSeqDict = {}

# Sample and BLAT tag for the reads present in the database.
for tuprow in sample_seq.itertuples():
    junk, samp, blattag, seq1 = [str(ij) for ij in tuprow]
    readKey = seq1.split('|')[0]
    sampleDict[readKey] = samp
    tagDict[readKey] = blattag

# Sequence of every read in the FASTA file.  Reads without a sample so far
# get one from sampleIdDict, looked up by fields 0, 1, 2 and 6 of the
# comma-separated second name component, or 'NONE' if there is no match.
for record in SeqIO.parse(fasta_file, "fasta"):
    seq1 = record.id
    nameParts = seq1.split('|')
    readKey = nameParts[0]
    seqDict[readKey] = str(record.seq)
    if readKey in sampleDict:
        continue
    idStr = nameParts[1]
    idList = [int(iy) for iy in idStr.split(',')]
    idKey = (idList[0], idList[1], idList[2], idList[6])
    if idKey in sampleIdDict:
        suffixStr = sampleIdDict[idKey]
        sampleDict[readKey] = suffixStr
    else:
        sampleDict[readKey] = 'NONE'

# Reads that never received a tag are labelled 'NoHits'.
for seqId in seqDict.keys():
    tagDict.setdefault(seqId, 'NoHits')
    
        
# Remove reads whose Single_hits row has sample 'NONE' from the sequence and
# tag tables.
for tuprow in Single_hits.loc[:,['SampleID','qName']].itertuples():
    junk,mySample,seq1=[str(ij) for ij in tuprow]
    if mySample != 'NONE': continue # DELETE NONE
    readKey = seq1.split('|')[0]
    # Delete the entry that was tested.  The previous code tested readKey but
    # deleted `seqId`, a leftover variable from the preceding loop, which
    # removed an unrelated read and raised KeyError on the second match.
    seqDict.pop(readKey, None)
    tagDict.pop(readKey, None)

iTotalMulti, iTotalNoGoodSpn, iTotalNoHits = 0, 0, 0
for seqId in tagDict.keys():
    if tagDict[seqId] == 'Multi': iTotalMulti += 1
    if tagDict[seqId] == 'NoGoodSpn': iTotalNoGoodSpn += 1
    if tagDict[seqId] == 'NoHits': iTotalNoHits += 1

print 'Multi', iTotalMulti
print 'NoGoodSpn', iTotalNoGoodSpn
print 'NoHits', iTotalNoHits    

revSeqDict = {}
for seqId, mySeq in seqDict.iteritems():
    if tagDict[seqId] == 'Single': continue
    myTag = tagDict[seqId]
    if len(mySeq) == 0: mySeq = 'S'
    revSeqDict.setdefault(myTag, {}).setdefault(mySeq, {})[seqId] = 0
merge3_Others='%s/merge3_Others.fa'%tmpDir
merge2_NoHits='%s/merge2_NoHits.fa'%tmpDir
merge2_NoGoodSpn='%s/merge2_NoGoodSpn.fa'%tmpDir
merge2_Multi='%s/merge2_Multi.fa'%tmpDir
outfile1 = open(merge3_Others, 'w')
outfile2 = open(merge2_NoHits, 'w')
outfile3 = open(merge2_NoGoodSpn, 'w')
outfile4 = open(merge2_Multi, 'w')


def _write_tag_clusters(tagName, tagHandle, othersHandle=None):
    """Write one FASTA entry per (sequence, side) group of reads tagged tagName.

    Sequences are numbered in sorted order.  The reads sharing a sequence are
    split by the first element of sampleLeftRightDict[sample] ('LEFT' or
    'RIGHT'); reads whose sample is 'NONE' are ignored.  Each non-empty side
    gets the id "<L|R>|<tagName><index>_<member count>|<sequence length>",
    is written to tagHandle (and to othersHandle when given), and is recorded
    in memberDict / memberSeqDict.

    A tag with no reads at all produces no output instead of raising KeyError
    (the previous code indexed revSeqDict[tagName] directly).
    """
    seqGroups = revSeqDict.get(tagName, {})
    seqList = sorted(seqGroups.keys())
    for iSeq, mySeq in enumerate(seqList):
        # Kept from the original: an empty key is looked up as 'S', the
        # placeholder used for empty sequences.
        if len(mySeq) == 0: mySeq = 'S'
        sideMembers = {'L': [], 'R': []}
        for seqId in sorted(seqGroups[mySeq].keys()):
            mySample = sampleDict[seqId]
            if mySample == 'NONE': continue
            side = sampleLeftRightDict[mySample][0]
            if side == 'LEFT':
                sideMembers['L'].append(seqId)
            elif side == 'RIGHT':
                sideMembers['R'].append(seqId)
        # Left group first, then right, as in the original output order.
        for sideTag in ('L', 'R'):
            members = sideMembers[sideTag]
            if not members:
                continue
            myId = '%s|%s%d_%d|%d' % (sideTag, tagName, iSeq, len(members), len(mySeq))
            tagHandle.write('>%s\n%s\n' % (myId, mySeq))
            if othersHandle is not None:
                othersHandle.write('>%s\n%s\n' % (myId, mySeq))
            for seqId in members:
                memberDict.setdefault(myId, {})[seqId] = 0
            memberSeqDict[myId] = mySeq


# NoHits goes to its own file only; NoGoodSpn and Multi are additionally
# collected in merge3_Others.fa.
_write_tag_clusters('NoHits', outfile2)
_write_tag_clusters('NoGoodSpn', outfile3, outfile1)
_write_tag_clusters('Multi', outfile4, outfile1)

# outfile1 (merge3_Others.fa) is left open here, exactly as before.
outfile2.close()
outfile3.close()
outfile4.close()

# Build a nucleotide BLAST database from merge2_NoHits.fa and align the file
# against itself; the tabular output (-m 9) is gzipped into
# self_NoHits.blastn.gz.
blast_dbcmd = 'formatdb -t %s/merge2_NoHits -i %s/merge2_NoHits.fa -p F -o F -n %s/merge2_NoHits' % (tmpDir, tmpDir, tmpDir)
subprocess.call(blast_dbcmd, shell=True)

# Any output on stderr is treated as a failed run.  Try up to three times
# with num_cores threads; `stderr` starts truthy so the first attempt runs.
stderr = 1
cnt = 0
while stderr and cnt < 3:
    run14_blastcmd = 'blastall -p blastn -d %s/merge2_NoHits -i %s/merge2_NoHits.fa -m 9 -a %d -F F | gzip > %s/self_NoHits.blastn.gz' % (tmpDir, tmpDir, num_cores, tmpDir)
    run_blast14 = subprocess.Popen(run14_blastcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    cnt = cnt + 1
    # communicate() already waits for the process to exit, so no extra wait().
    stdout, stderr = run_blast14.communicate()

# Single-threaded fallback, only when the last multi-threaded attempt still
# reported an error.  (Previously it also ran after a successful third
# attempt, repeating the whole alignment for no reason.)
if stderr:
    run14_blastcmd = 'blastall -p blastn -d %s/merge2_NoHits -i %s/merge2_NoHits.fa -m 9 -a %d -F F | gzip > %s/self_NoHits.blastn.gz' % (tmpDir, tmpDir, 1, tmpDir)
    run_blast14 = subprocess.Popen(run14_blastcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    stdout, stderr = run_blast14.communicate()
#os.system('blastall -p blastn -d merge -i merge.fa -m 9 -a 8 -F F | gzip > merge.blastn.gz')

# MERGE NOHITS, ALWAYS QUERY INTO SUBJECT SEQUENCES
# Start a fresh mergeDict and open the gzipped self-BLAST table of the NoHits
# sequences for the line-by-line parse that follows.
mergeDict = {}
self_NoHits_zip='%s/self_NoHits.blastn.gz'%tmpDir
infile = gzip.GzipFile(self_NoHits_zip, 'r')
#for lines in open('self_NoHits.blastn', 'r').xreadlines():
while 1:
    lines = infile.readline()
    if lines == '': break
    if lines[0] == '#': continue
    query_id, subject_id, p_identity, alignment_length, mismatches, gap_openings, q_start, q_end, \
        s_start, s_end, e_value, bit_score = lines.splitlines()[0].split('\t')
    if query_id == subject_id: continue
    if 'L' in query_id and 'R' in subject_id: continue
    if 'R' in query_id and 'L' in subject_id: continue
    alignment_length, mismatches, gap_openings, q_start, q_end, s_start, s_end = \
        int(alignment_length), int(mismatches), int(gap_openings), int(q_start), int(q_end), int(s_start), int(s_end)
    if query_id[:2] not in ('L|', 'R|'):
        query_count = int(query_id.rsplit('|', 2)[-2])
    else:
        query_count = int(query_id.rsplit('|', 2)[-2].rsplit('_')[-1])
    if subject_id[:2] not in ('L|', 'R|'):
        subject_count = int(subject_id.rsplit('|', 2)[-2])
    else:
        subject_count = int(subject_id.rsplit('|', 2)[-2].rsplit('_')[-1])
    query_length = int(query_id.rsplit('|', 1)[-1])
    subject_length = int(subject_id.rsplit('|', 1)[-1])
    query_seq = memberSeqDict[query_id]
    subject_seq = memberSeqDict[subject_id]
    if query_seq[-3:] == 'TCG' or query_seq[-4:] in ('TCGA', 'GTAC', 'CATG'): query_TCGA = 1
    else: query_TCGA = 0
    if subject_seq[-3:] == 'TCG' or subject_seq[-4:] in ('TCGA', 'GTAC', 'CATG'): subject_TCGA = 1
    else: subject_TCGA = 0
    identity1 = 100.*(float(alignment_length) - float(mismatches) - float(gap_openings))
    identity1 = identity1/float(min(query_length, subject_length))
    identity2 = 100.*(float(alignment_length) - float(mismatches) - float(gap_openings))
    identity2 = identity2/float(max(query_length, subject_length))
    #if int(mismatches) + int(gap_openings) > 5: continue
    if identity1 < 90.: continue
    iSwap = 0
    if query_length > subject_length: # QUERY LENGTH SHOULD BE LONGER THAN SUBJECT LENGTH
        iSwap = 1
    elif query_length == subject_length: # DEBUGGING. QUERY_ID SHOULD BE COME FIRST
        if query_id < subject_id:
            iSwap = 1
    if iSwap:
        query_id, subject_id, identity1, identity2, alignment_length, q_start, q_end, s_start, s_end, \
            query_TCGA, subject_TCGA = \
            subject_id, query_id, identity1, identity2, alignment_length, s_start, s_end, q_start, q_end, \
                subject_TCGA, query_TCGA
        query_seq, query_length, query_count, subject_seq, subject_length, subject_count = \
            subject_seq, subject_length, subject_count, query_seq, query_length, query_count
    if query_TCGA:
        if not subject_TCGA: continue
        if abs(q_end - query_length) > 1 or abs(s_end - subject_length) > 1: continue
    mergeDict.setdefault(query_id, {}).setdefault((identity1, identity2), \
        {}).setdefault((subject_count, subject_length), [])
    mergeDict[query_id][(identity1, identity2)][(subject_count, subject_length)].append((subject_id, \
        identity1, identity2, alignment_length, q_start, q_end, s_start, s_end, query_seq, subject_seq))
infile.close()

# Choose one merge target per query and then collapse the resulting
# query -> subject links into memberDict.
#   replaceDict[q] = s      : q is to be merged into s
#   reverseDict[s] = [q...] : every q that is to be merged into s
replaceDict = {}
reverseDict = {}
for query_id in mergeDict.keys():
    # Best identity pair first (highest identity1, then identity2) ...
    identityList = mergeDict[query_id].keys()
    identityList.sort()
    identityList.reverse()
    maxIdentity = identityList[0]
    # ... then the subject with the highest (count, length) ...
    subjectCountList = mergeDict[query_id][maxIdentity].keys()
    subjectCountList.sort()
    subjectCountList.reverse()


    maxSubjectCount, maxSubjectLength = subjectCountList[0]
    # ... and among equals the smallest tuple, i.e. the smallest subject_id.
    mergeDict[query_id][maxIdentity][(maxSubjectCount, maxSubjectLength)].sort()
    subject_id, identity1, identity2, alignment_length, q_start, q_end, s_start, s_end, query_seq, subject_seq \
        = mergeDict[query_id][maxIdentity][(maxSubjectCount, maxSubjectLength)][0]
    qLine, sLine = query_id, subject_id
    replaceDict[qLine] = sLine
    reverseDict.setdefault(sLine, []).append(qLine)
# Break two-element cycles (A -> B and B -> A, each being the other's only
# source): drop the link that starts at the greater id, so only
# "smaller id -> greater id" remains.
deleteDict = {}
for qLine, sLine in replaceDict.items():
    if reverseDict.has_key(qLine) and reverseDict.has_key(sLine):
        if reverseDict[qLine] == [sLine] and reverseDict[sLine] == [qLine]:
            if qLine < sLine:
                deleteDict[sLine] = qLine
            else:
                deleteDict[qLine] = sLine
for qLine, sLine in deleteDict.items():
    del replaceDict[qLine]
    del reverseDict[sLine]
# Repeatedly fold "heads" -- queries that nothing else merges into -- into
# their subject, until a full pass makes no change.  Deleting from replaceDict
# inside the loop is safe only because Python 2's .items() returns a list copy.
while 1: # DELETE HEAD
    nDelete = 0
    for qLine, sLine in replaceDict.items(): # DELETE Q AND ADD INTO S
        if reverseDict.has_key(qLine): continue   # something still merges into q
        for seqId in memberDict[qLine]:
            memberDict[sLine][seqId] = 0
        del memberDict[qLine]
        del replaceDict[qLine]
        reverseDict[sLine].remove(qLine)
        if len(reverseDict[sLine]) == 0: del reverseDict[sLine]
        nDelete += 1
    if nDelete == 0: break
# Folding heads can expose new two-element cycles; break them the same way.
deleteDict = {}
for qLine, sLine in replaceDict.items():
    if reverseDict.has_key(qLine) and reverseDict.has_key(sLine):
        if reverseDict[qLine] == [sLine] and reverseDict[sLine] == [qLine]:
            if qLine < sLine:
                deleteDict[sLine] = qLine
            else:
                deleteDict[qLine] = sLine
for qLine, sLine in deleteDict.items():
    del replaceDict[qLine]
    del reverseDict[sLine]
# Fold every remaining q that is the sole source of its subject.
# Links that survive this loop (e.g. longer cycles) are reported and abort
# the run at the 'REPLACEDICT REMAINED' check further down.
while 1: # DELETE A<->B
    nDelete = 0
    for qLine, sLine in replaceDict.items(): # DELETE Q AND ADD INTO S
        if not reverseDict.has_key(sLine): continue
        if qLine not in reverseDict[sLine]: continue
        if len(reverseDict[sLine]) != 1: continue
        for seqId in memberDict[qLine]:
            memberDict[sLine][seqId] = 0 # no key error
        del memberDict[qLine]
        reverseDict[sLine].remove(qLine)
        del replaceDict[qLine]
        if len(reverseDict[sLine]) == 0: del reverseDict[sLine]
        nDelete += 1
    if nDelete == 0: break

print len(mergeDict)
print len(memberDict)
print len(memberSeqDict)

iMemberTotal = 0
for myId in memberDict.keys():
    if 'NoHits' not in myId: continue
    iMemberTotal += len(memberDict[myId])
    #print memberDict[myId].keys()
    #raw_input('Press Enter Key')
    outfile1.write('>%s\n%s\n' % (myId, memberSeqDict[myId]))
print 'iMemberTotal', iMemberTotal

outfile1.close()
revMemberDict = {}
for myId in memberDict.keys():
    for seqId in memberDict[myId].keys():
        revMemberDict.setdefault(seqId, []).append(myId)
for seqId in revMemberDict.keys():
    if len(revMemberDict[seqId]) == 1: continue
    for myId in revMemberDict[seqId]:
        print seqId, myId, memberDict[myId]

# SAVE NOHIT MERGE AND DO THE PROCESS FOR MULTI AND NOGOODSPN

if len(replaceDict):
    for ix, iy in replaceDict.items(): print ix, iy
    sys.exit('REPLACEDICT REMAINED')
# Build merge3.fa by concatenating merge2_file and merge3_Others, in that
# order.  Context managers ensure every handle (inputs included) is closed
# even if a read or write fails.
merge3_file='%s/merge3.fa'%tmpDir
with open(merge3_file, 'w') as outfile:
    for _part_path in (merge2_file, merge3_Others):
        with open(_part_path, 'r') as _part:
            outfile.write(_part.read())

# Index merge3.fa as a nucleotide (-p F) legacy-BLAST database.
blast_dbcmd='formatdb -t %s/merge3 -i %s/merge3.fa -p F -o F -n %s/merge3'%(tmpDir,tmpDir,tmpDir)
subprocess.call(blast_dbcmd, shell =True)

# blastn of merge3_Others.fa against the merge3 database: tabular output with
# comment lines (-m 9), filtering off (-F F), gzip-compressed into
# merge3_Others.blastn.gz.  The %d placeholder is the blastall thread count (-a).
run14_blast_template='blastall -p blastn -d %s/merge3 -i %s/merge3_Others.fa -m 9 -a %d -F F | gzip > %s/merge3_Others.blastn.gz'

# Anything written to stderr by the pipeline is treated as a failure.
# Make up to three attempts with num_cores threads.
stderr=1
cnt=0
while stderr and cnt<3:
    run14_blastcmd=run14_blast_template%(tmpDir,tmpDir,num_cores,tmpDir)
    run_blast14=subprocess.Popen(run14_blastcmd, stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell =True)
    cnt=cnt+1
    # communicate() drains both pipes and waits for the pipeline to exit,
    # so no separate wait() is needed.
    stdout,stderr=run_blast14.communicate()

# Last resort: one more run with a single thread, but only if the final
# multi-threaded attempt still reported errors.  (Previously this reran
# whenever three attempts had been made, even if the third had succeeded.)
if stderr:
    run14_blastcmd=run14_blast_template%(tmpDir,tmpDir,1,tmpDir)
    run_blast14=subprocess.Popen(run14_blastcmd, stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell =True)
    stdout,stderr=run_blast14.communicate()
# Read the gzip'd tabular BLAST output of merge3_Others.fa against merge3 and
# collect, for every sequence that may be merged away (the "query" after the
# swap below), its candidate merge targets:
#   mergeDict[query_id][(identity1, identity2)][(subject_count, subject_length)]
#       -> list of (iOrder, subject_id, identity1, identity2, alignment_length,
#                   q_start, q_end, s_start, s_end, query_seq, subject_seq)
mergeDict = {}
merge3_Others_zip='%s/merge3_Others.blastn.gz'%tmpDir
infile = gzip.GzipFile(merge3_Others_zip, 'r')
while 1:
    lines = infile.readline()
    if lines == '': break              # end of file
    if lines[0] == '#': continue       # comment/header line of the tabular output
    # Twelve tab-separated fields per hit.
    query_id, subject_id, p_identity, alignment_length, mismatches, gap_openings, q_start, q_end, \
        s_start, s_end, e_value, bit_score = lines.splitlines()[0].split('\t')
    if query_id == subject_id: continue   # self hit
    # Skip pairs where one id contains 'L' and the other contains 'R'.
    # NOTE(review): this is a plain substring test on the whole id --
    # presumably meant to keep left- and right-junction sequences apart; confirm.
    if 'L' in query_id and 'R' in subject_id: continue
    if 'R' in query_id and 'L' in subject_id: continue
    alignment_length, mismatches, gap_openings, q_start, q_end, s_start, s_end = \
        int(alignment_length), int(mismatches), int(gap_openings), int(q_start), int(q_end), int(s_start), int(s_end)
    query_seq = memberSeqDict[query_id]
    subject_seq = memberSeqDict[subject_id]
    # Ids end in '...|<count>|<length>'.  For ids starting with 'L|' or 'R|'
    # the count is the part after the last '_' of the second-to-last field.
    if query_id[:2] not in ('L|', 'R|'):
        query_count = int(query_id.rsplit('|', 2)[-2])
    else:
        query_count = int(query_id.rsplit('|', 2)[-2].rsplit('_')[-1])
    if subject_id[:2] not in ('L|', 'R|'):
        subject_count = int(subject_id.rsplit('|', 2)[-2])
    else:
        subject_count = int(subject_id.rsplit('|', 2)[-2].rsplit('_')[-1])
    query_length = int(query_id.rsplit('|', 1)[-1])
    subject_length = int(subject_id.rsplit('|', 1)[-1])
    # Both alignments must start within the first five bases.
    if s_start > 5 or q_start > 5: continue
    # Flag sequences whose 3' end is TCG / TCGA / GTAC / CATG
    # (presumably a restriction-site end -- confirm against the enzyme file).
    if query_seq[-3:] == 'TCG' or query_seq[-4:] in ('TCGA', 'GTAC', 'CATG'): query_TCGA = 1
    else: query_TCGA = 0
    if subject_seq[-3:] == 'TCG' or subject_seq[-4:] in ('TCGA', 'GTAC', 'CATG'): subject_TCGA = 1
    else: subject_TCGA = 0
    # identity1: matched positions as a percentage of the SHORTER sequence;
    # identity2: the same numerator as a percentage of the LONGER sequence.
    identity1 = 100.*(float(alignment_length) - float(mismatches) - float(gap_openings))
    identity1 = identity1/float(min(query_length, subject_length))
    identity2 = 100.*(float(alignment_length) - float(mismatches) - float(gap_openings))
    identity2 = identity2/float(max(query_length, subject_length))
    if identity1 < 90.: continue
    # Subjects whose id contains 'G|' get stricter limits: the alignment must
    # start within the first two bases and reach 95% identity.
    if 'G|' in subject_id:
        if s_start > 2 or q_start > 2: continue
        if identity1 < 95.: continue
    # Orient the pair so that the "query" is the sequence to be merged away:
    # swap when the query is the longer one, or, on equal length, when its id
    # sorts first.  After the swap the query is the shorter sequence (or, on a
    # length tie, the one with the greater id) and the subject is its target.
    iSwap = 0
    if query_length > subject_length:
        iSwap = 1
    elif query_length == subject_length:
        if query_id < subject_id:
            iSwap = 1
    if iSwap:
        # identity1/identity2/alignment_length are symmetric and stay in place;
        # ids, coordinates and end flags trade sides.
        query_id, subject_id, identity1, identity2, alignment_length, q_start, q_end, s_start, s_end, \
            query_TCGA, subject_TCGA = \
            subject_id, query_id, identity1, identity2, alignment_length, s_start, s_end, q_start, q_end, \
                subject_TCGA, query_TCGA
        query_seq, query_length, query_count, subject_seq, subject_length, subject_count = \
            subject_seq, subject_length, subject_count, query_seq, query_length, query_count
    # A query with a flagged 3' end may only merge into a subject that is also
    # flagged, and the alignment must reach within one base of both 3' ends.
    if query_TCGA:
        if not subject_TCGA: continue
        if abs(q_end - query_length) > 1 or abs(s_end - subject_length) > 1: continue
    mergeDict.setdefault(query_id, {}).setdefault((identity1, identity2), \
        {}).setdefault((subject_count, subject_length), [])
    # Rank the subject by the kind of id it carries; the lowest rank wins when
    # candidates tie on identity and (count, length), because iOrder is the
    # first element of the stored tuple and the list is sorted later.
    if 'P|' in subject_id: iOrder = 0
    elif '|LS|' in subject_id or '|RS|' in subject_id: iOrder = 1
    elif 'G|' in subject_id: iOrder = 2
    elif 'Multi' in subject_id: iOrder = 3
    elif 'NoGoodSpn' in subject_id: iOrder = 4
    elif 'NoHits' in subject_id: iOrder = 5
    else:
        # Unrecognised id pattern: report it and rank it last.
        print 'NO ORDER', lines[:-1]
        iOrder = 6
    mergeDict[query_id][(identity1, identity2)][(subject_count, subject_length)].append((iOrder, subject_id, \
        identity1, identity2, alignment_length, q_start, q_end, s_start, s_end, query_seq, subject_seq))
infile.close()

# Choose one merge target per query and then collapse the resulting
# query -> subject links into memberDict.
#   replaceDict[q] = s      : q is to be merged into s
#   reverseDict[s] = [q...] : every q that is to be merged into s
replaceDict = {}
reverseDict = {}
for query_id in mergeDict.keys():
    # Best identity pair first (highest identity1, then identity2) ...
    identityList = mergeDict[query_id].keys()
    identityList.sort()
    identityList.reverse()
    maxIdentity = identityList[0]
    # ... then the subject with the highest (count, length) ...
    subjectCountList = mergeDict[query_id][maxIdentity].keys()
    subjectCountList.sort()
    subjectCountList.reverse()

    maxSubjectCount, maxSubjectLength = subjectCountList[0]
    # ... and among equals the smallest tuple: lowest iOrder, then smallest
    # subject_id.
    mergeDict[query_id][maxIdentity][(maxSubjectCount, maxSubjectLength)].sort()
    iOrder, subject_id, identity1, identity2, alignment_length, q_start, q_end, s_start, s_end, query_seq, subject_seq \
        = mergeDict[query_id][maxIdentity][(maxSubjectCount, maxSubjectLength)][0]
    qLine, sLine = query_id, subject_id
    # MERGING ORDERS: SINGLES WITH BOTH JUNCTION, HIGHEST FREQ, SINGLES, MULTI, NOGOODSPN, LONGEST, FIRST
    replaceDict[qLine] = sLine
    reverseDict.setdefault(sLine, []).append(qLine)
# Break two-element cycles (A -> B and B -> A, each being the other's only
# source): drop the link that starts at the greater id, so only
# "smaller id -> greater id" remains.
deleteDict = {}
for qLine, sLine in replaceDict.items():
    if reverseDict.has_key(qLine) and reverseDict.has_key(sLine):
        if reverseDict[qLine] == [sLine] and reverseDict[sLine] == [qLine]:
            if qLine < sLine:
                deleteDict[sLine] = qLine
            else:
                deleteDict[qLine] = sLine
for qLine, sLine in deleteDict.items():
    del replaceDict[qLine]
    del reverseDict[sLine]
# Repeatedly fold "heads" -- queries that nothing else merges into -- into
# their subject, until a full pass makes no change.  Deleting from replaceDict
# inside the loop is safe only because Python 2's .items() returns a list copy.
while 1: # DELETE HEAD
    nDelete = 0
    for qLine, sLine in replaceDict.items(): # DELETE Q AND ADD INTO S
        if reverseDict.has_key(qLine): continue   # something still merges into q
        for seqId in memberDict[qLine]:
            # History: a KeyError was seen on the next line on August 3, 2010;
            # a has_key(seqId) guard tried on August 4, 2010 is not in use.
            memberDict[sLine][seqId] = 0
        del memberDict[qLine]
        del replaceDict[qLine]
        reverseDict[sLine].remove(qLine)
        if len(reverseDict[sLine]) == 0: del reverseDict[sLine]
        nDelete += 1
    if nDelete == 0: break
# Folding heads can expose new two-element cycles; break them the same way.
deleteDict = {}
for qLine, sLine in replaceDict.items():
    if reverseDict.has_key(qLine) and reverseDict.has_key(sLine):
        if reverseDict[qLine] == [sLine] and reverseDict[sLine] == [qLine]:
            if qLine < sLine:
                deleteDict[sLine] = qLine
            else:
                deleteDict[qLine] = sLine
for qLine, sLine in deleteDict.items():
    del replaceDict[qLine]
    del reverseDict[sLine]
# Fold every remaining q that is the sole source of its subject.
# Links that survive this loop (e.g. longer cycles) are reported and abort
# the run at the 'REPLACEDICT REMAINED' check further down.
while 1: # DELETE A<->B
    nDelete = 0
    for qLine, sLine in replaceDict.items(): # DELETE Q AND ADD INTO S
        if not reverseDict.has_key(sLine): continue
        if qLine not in reverseDict[sLine]: continue
        if len(reverseDict[sLine]) != 1: continue
        for seqId in memberDict[qLine]:
            # History: a KeyError was seen on the next line on August 3, 2010;
            # a has_key(seqId) guard tried on August 4, 2010 is not in use.
            memberDict[sLine][seqId] = 0
        del memberDict[qLine]
        reverseDict[sLine].remove(qLine)
        del replaceDict[qLine]
        if len(reverseDict[sLine]) == 0: del reverseDict[sLine]
        nDelete += 1
    if nDelete == 0: break

print len(mergeDict)
print len(memberDict)
print len(memberSeqDict)

revMemberDict = {}
for myId in memberDict.keys():
    for seqId in memberDict[myId].keys():
        revMemberDict.setdefault(seqId, []).append(myId)
for seqId in revMemberDict.keys():
    if len(revMemberDict[seqId]) == 1: continue
    for myId in revMemberDict[seqId]:
        print seqId, myId, memberDict[myId]

# SAVE NOHIT MERGE AND DO THE PROCESS FOR MULTI AND NOGOODSPN, sampleTotalList

if len(replaceDict):
    for ix, iy in replaceDict.items(): print ix, iy
    sys.exit('REPLACEDICT REMAINED')

# For every entry whose id does not start with 'R|' / 'L|', add the reads it
# absorbed during merging to the per-sample counts (totalCountList) and to the
# TOTAL count stored in its 17-field record (totalPreList).  The first
# '|'-separated field of the id is the row index into both lists.
for myId in memberDict.keys():
    # memberDict.keys() is a snapshot (Python 2 list) and entries are deleted
    # from memberDict further down, so an id reached later may already be
    # gone.  Skip it instead of failing with a KeyError on memberDict[myId].
    if myId not in memberDict: continue
    if myId[:2] in ('R|', 'L|'): continue
    iLine, iTag, iTotalCount = myId.split('|')[:3]
    iLine, iTotalCount = int(iLine), int(iTotalCount)
    myCountList = [0]*len(sampleTotalList)
    seqIdList = memberDict[myId].keys()
    seqIdList.sort()
    if len(seqIdList) == 1: continue
    # Members containing '|' are ids of other table rows rather than reads;
    # barId ends up as the last such id in sorted order.
    iBarCount, barId = 0, ''
    for seqId in seqIdList:
        if '|' in seqId:
            iBarCount += 1
            barId = seqId
    if iBarCount != 1:
        # Several row ids ended up in one entry: add each of them (except
        # barId) to row iLine and blank out its own row.
        # NOTE(review): rows are always folded into iLine (taken from myId),
        # while the id that is spared is barId; if barId != myId the entry's
        # own row is added to itself.  Confirm the intended target.
        # NOTE(review): totalPreList[dLine] may already be None if that row
        # was merged earlier; that case is not handled here.
        deletedIDList = []
        for seqId in seqIdList:
            if '|' in seqId and barId != seqId: # merge it
                dLine, dTag, dTotalCount = seqId.split('|')[:3]
                dLine, dTotalCount = int(dLine), int(dTotalCount)

                preList1 = totalPreList[dLine]
                preList2 = totalPreList[iLine]
                countList1 = totalCountList[dLine]
                countList2 = totalCountList[iLine]
                # Field -4 of a record is its total count.
                totalCount = int(preList1[-4]) + int(preList2[-4])
                countList = tuple([ix+iy for ix, iy in zip(countList1, countList2)])
                preList = preList2[:-4] + (totalCount,) + preList2[-3:]
                # None marks row dLine as merged away; the table writer
                # skips such rows.
                totalCountList[dLine] = None
                totalCountList[iLine] = countList
                totalPreList[dLine] = None
                totalPreList[iLine] = preList
                if memberDict.has_key(seqId): del memberDict[seqId]
                deletedIDList.append(seqId)
        for seqId in deletedIDList:
            seqIdList.remove(seqId)


    countList = totalCountList[iLine]

    preList = totalPreList[iLine]
    myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1, myOcc2, strand2, seqLength2, seqStr2, \
        gLength2, gStr2, totalCount, chrid, chrsite1, chrsite2 = preList
    # Count the remaining members (reads) per sample.
    for seqId in seqIdList:
        if seqId == barId: continue
        mySample = sampleDict[seqId]
        myCountList[sampleTotalList.index(mySample)] += 1
    if reduce(operator.add, myCountList) == 0: continue
    # Add the new reads to the row's per-sample counts and to its total.
    countList = map(int, countList)
    countList = tuple([ix + iy for ix, iy in zip(countList, myCountList)])
    totalCount = int(totalCount) + reduce(operator.add, myCountList)
    preList = myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1, myOcc2, strand2, seqLength2, seqStr2, \
        gLength2, gStr2, totalCount, chrid, chrsite1, chrsite2
    totalCountList[iLine] = countList
    totalPreList[iLine] = preList
    
# (Re)create Table_pip15 in the processed-PSL database and insert one row per
# surviving clone: CLONE_ID, the 17 record fields, then one count column per
# sample in sampleTotalList.
pipe15_table=proceesedpsl_db.cursor()
pipe15_table.execute('''DROP TABLE IF EXISTS Table_pip15''')
pipe15_header_list=['CLONE_ID','VECTOR_ID','R_TYPE','R_STRAND','R_QLEN','R_QSEQ','R_GLEN','R_GSEQ','L_TYPE','L_STRAND','L_QLEN','L_QSEQ','L_GLEN','L_GSEQ','TOTAL_COUNT','CHR_NO','CHR_SITE1','CHR_SITE2']
pipe15_columns = pipe15_header_list+sampleTotalList
var_string = ', '.join('?' * len(pipe15_columns))
# Quote every column name as an SQL identifier (double quotes, embedded quotes
# doubled) instead of pasting a Python tuple repr into the statement, so that
# sample names cannot produce malformed SQL.  Column names are unchanged.
column_sql = ', '.join(['"%s"' % col.replace('"', '""') for col in pipe15_columns])
pipe15_table.execute('CREATE TABLE Table_pip15 (%s)' % column_sql)
query_string = 'INSERT INTO Table_pip15 VALUES (%s);' % var_string
clone_num=1
for iLine in range(len(totalCountList)):
    countList = totalCountList[iLine]
    preList = totalPreList[iLine]
    # Rows set to None were merged into another row; skip them without
    # consuming a clone number.
    if preList is None or countList is None : continue
    Clone_id='CLONE_'+str(clone_num)
    fullList = preList + countList
    table_list=[Clone_id] + list(fullList)
    pipe15_table.execute(query_string, table_list)
    clone_num=clone_num+1
for myId in memberDict.keys():
    if myId[:2] not in ('R|', 'L|'): continue
    myLCountList = [0]*len(sampleTotalList)
    myRCountList = [0]*len(sampleTotalList)
    seqIdList = memberDict[myId].keys()
    seqIdList.sort()
    Clone_id='CLONE_'+str(clone_num)
    for seqId in seqIdList:
        if not sampleDict.has_key(seqId): continue # debug. August 25, 2010. NOT SURE WHY IT HAPPENS
        mySample = sampleDict[seqId] # KeyError: '186|RS|23|80'
        if mySample == 'NONE': continue
        if sampleLeftRightDict[mySample][0] == 'LEFT': myLCountList[sampleTotalList.index(mySample)] += 1
        elif sampleLeftRightDict[mySample][0] == 'RIGHT': myRCountList[sampleTotalList.index(mySample)] += 1
        else: sys.exit(`sampleLeftRightDict`)
    if memberSeqDict[myId][-3:] == 'TCG' or memberSeqDict[myId][-4:] in ('TCGA', 'GTAC', 'CATG'): isTCGA = 1
    else: isTCGA = 0
    if len(memberSeqDict[myId]) <= 15 and not isTCGA: continue
    if reduce(operator.add, myLCountList) + reduce(operator.add, myRCountList) == 0: continue
    if reduce(operator.add, myLCountList) and reduce(operator.add, myRCountList):
        print 'STILL BOTH HAS IT', myId

    # sampleLeftRightDict[sampleId] = (row[iJunction], animalId)
    if reduce(operator.add, myRCountList):
        myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1, myOcc2, strand2, seqLength2, seqStr2, \
            gLength2, gStr2, totalCount, chrid, chrsite1, chrsite2 = \
            'NA', myId.split('_')[0], '', len(memberSeqDict[myId]), memberSeqDict[myId], 0, '', '', '', 0, '', \
                0, '', reduce(operator.add, myRCountList), '', -1, -1
        tssDistance, tssName, geneSymbol, tssCancer, cpgDistance, \
            cpgName, gcstr, repName, repClass, repFamily, mirDistance, mirName, tupos, tusize, tugenestart, \
            tugeneend, tustrand, tuacc, tugenesymbol = ['']*19
        preList = myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1, myOcc2, strand2, seqLength2, seqStr2,\
            gLength2, gStr2, totalCount, chrid, chrsite1, chrsite2
       # aftList = tssDistance, tssName, geneSymbol, tssCancer, cpgDistance, \
       #     cpgName, gcstr, repName, repClass, repFamily, mirDistance, mirName, tupos, tusize, tugenestart, \
       #     tugeneend, tustrand, tuacc, tugenesymbol
        countList = tuple(myRCountList)
        fullList = preList + countList #+ aftList
        table_list=[x for x in fullList]
        table_list.insert(0, Clone_id)
        pipe15_table.execute(query_string, table_list)
        clone_num=clone_num+1
        #outfile.write('\t'.join(map(str, fullList)) + '\n')
    if reduce(operator.add, myLCountList):
        myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1, myOcc2, strand2, seqLength2, seqStr2, \
            gLength2, gStr2, totalCount, chrid, chrsite1, chrsite2 = \
            'NA', '', '', 0, '', 0, '', myId.split('_')[0], '', len(memberSeqDict[myId]), memberSeqDict[myId], 0, '',\
                reduce(operator.add, myLCountList), '', -1, -1
        tssDistance, tssName, geneSymbol, tssCancer, cpgDistance, \
            cpgName, gcstr, repName, repClass, repFamily, mirDistance, mirName, tupos, tusize, tugenestart, \
            tugeneend, tustrand, tuacc, tugenesymbol = ['']*19
        preList = myClass, myOcc1, strand1, seqLength1, seqStr1, gLength1, gStr1, myOcc2, strand2, seqLength2, seqStr2,\
            gLength2, gStr2, totalCount, chrid, chrsite1, chrsite2
       # aftList = tssDistance, tssName, geneSymbol, tssCancer, cpgDistance, \
       #     cpgName, gcstr, repName, repClass, repFamily, mirDistance, mirName, tupos, tusize, tugenestart, \
       #     tugeneend, tustrand, tuacc, tugenesymbol
        countList = tuple(myLCountList)
        fullList = preList + countList #+ aftList
        #pipe15_table.execute("INSERT INTO Table_pip15 VALUES {rn}".format(rn=tuple(fullList)))
        table_list=[x for x in fullList]
        table_list.insert(0, Clone_id)
        pipe15_table.execute(query_string, table_list)
        clone_num=clone_num+1
        #outfile.write('\t'.join(map(str, fullList)) + '\n')
#outfile.close()
# Timestamp for the end of the counting stage (the elapsed-time report that
# used it is disabled).
stop =timeit.default_timer()
# Dump Table_pip15 to a tab-separated file, columns in header order followed
# by the per-sample count columns; the DataFrame index is written as well.
pipe15_data=pd.read_sql_query('SELECT * FROM Table_pip15 ', proceesedpsl_db)
pipe15_data[pipe15_header_list+sampleTotalList].to_csv(inter_files+'initial_count_without_homopolymer_correction.txt',sep='\t')
#print 'Time required to complete the analysis %f' %((stop -start)/60.0)
# Commit the inserted rows on the database connection.
proceesedpsl_db.commit()

############################################# Homopolymer error correction script ###################################
#####################################################################################################################
## Two-sided
## Version: 20160912

# NOTE(review): NUM_TS is not referenced in the code visible here --
# presumably the number of sample (count) columns; confirm before relying on it.
NUM_TS = 52
# Cost charged by calc_dist for an insertion/deletion that extends a run of
# identical bases (any other edit costs 1).
HOMO_PENALTY = 0.33
# Similarity threshold handed to calc_dist: two sequences count as "not
# similar" once their normalised distance exceeds 1 - SIM_THRESHOLD.
SIM_THRESHOLD = 0.95
# Rows whose sequence-length column is below this value are set aside as
# "shorter seqs" by handle_onesided.
MIN_SEQLEN = 25
# Sequence that clean_seq looks for near the end of a read and cuts off.
LINKER_SEQ = 'CGGATC'

# Column names of the count table used by the correction code.
CNT_TAG = 'TOTAL_COUNT'
LSEQ_TAG = 'L_QSEQ'
LLEN_TAG = 'L_QLEN'
LTYP_TAG = 'L_TYPE'
RSEQ_TAG = 'R_QSEQ'
RLEN_TAG = 'R_QLEN'
RTYP_TAG = 'R_TYPE'

# NOTE(review): tot_prob_cnt is only initialised here in the visible code;
# its use is presumably further down -- confirm.
tot_prob_cnt = 0

## Clean data
def clean_seq(seq):
    """Return *seq* without 'N' characters and without a trailing linker.

    Every 'N' is removed first.  If LINKER_SEQ then occurs and its first
    occurrence starts within the last 40 characters of the remaining
    sequence, everything from that occurrence onwards is cut off.
    """
    stripped = seq.replace('N', '')
    linker_at = stripped.find(LINKER_SEQ)
    # No linker, or the linker sits too far from the 3' end: keep everything.
    if linker_at == -1 or linker_at < len(stripped) - 40:
        return stripped
    return stripped[:linker_at]

def calc_dist(seq1, seq2, SIM_THRESHOLD):
    """Return a normalised, homopolymer-aware edit distance of two sequences.

    Both sequences are first cut to the length of the shorter one.  A
    row-by-row edit-distance table is then filled in which a substitution or
    an ordinary insertion/deletion costs 1, while an insertion/deletion that
    extends a run of identical, matching bases costs HOMO_PENALTY.

    After each row the running distance is the smaller of the minimum of the
    current row and the minimum last-column value of all earlier rows,
    divided by the common length.  As soon as it exceeds
    ``1 - SIM_THRESHOLD`` the function returns 1 ("not similar"); otherwise
    the running distance after the final row is returned (0 for empty input).
    """
    common_len = min(len(seq1), len(seq2))
    seq1, seq2 = seq1[:common_len], seq2[:common_len]

    max_dist = 1 - SIM_THRESHOLD
    prev_row = list(range(common_len + 1))   # row for the empty prefix of seq1
    best_last_col = None                     # min last-column value of earlier rows
    curr_dist = 0

    for i, base1 in enumerate(seq1):
        extends_run1 = i > 0 and base1 == seq1[i - 1]
        row = [i + 1]
        for j, base2 in enumerate(seq2):
            same = base1 == base2
            extends_run2 = j > 0 and base2 == seq2[j - 1]

            # Extra base in seq1: cheap when it prolongs a homopolymer run.
            if extends_run1 and same:
                del_cost = prev_row[j + 1] + HOMO_PENALTY
            else:
                del_cost = prev_row[j + 1] + 1
            # Extra base in seq2: same rule, seen from the other sequence.
            if extends_run2 and same:
                ins_cost = row[j] + HOMO_PENALTY
            else:
                ins_cost = row[j] + 1
            # Match is free, substitution costs 1.
            diag_cost = prev_row[j] if same else prev_row[j] + 1

            row.append(min(diag_cost, del_cost, ins_cost))

        # The last column of the row just left behind joins the candidates.
        if best_last_col is None:
            best_last_col = prev_row[-1]
        else:
            best_last_col = min(best_last_col, prev_row[-1])
        prev_row = row

        curr_dist = min(min(prev_row), best_last_col) / float(common_len)
        if curr_dist > max_dist:
            return 1  # Not similar
    return curr_dist  # similar

def _load_or_compute_dist_matrix(h5_path, dataset_name, row_seqs, col_seqs):
    """Return the calc_dist matrix of row_seqs x col_seqs, cached in HDF5.

    The matrix is read from dataset ``dataset_name`` of ``h5_path`` when that
    file exists, contains the dataset and the stored shape matches; otherwise
    it is computed with calc_dist(..., SIM_THRESHOLD) and written to the file.
    """
    row_seqs = list(row_seqs)
    col_seqs = list(col_seqs)
    shape = (len(row_seqs), len(col_seqs))

    if os.path.isfile(h5_path):
        try:
            with h5py.File(h5_path, 'r') as h5_file:
                if dataset_name in h5_file:
                    cached = h5_file[dataset_name][:]
                    if cached.shape == shape:
                        return cached
        except (IOError, OSError):
            # Unreadable cache file (e.g. left by an interrupted run):
            # fall through and rebuild it.
            pass

    dist_matrix = np.asarray(
        [calc_dist(seq1, seq2, SIM_THRESHOLD)
         for seq1 in row_seqs
         for seq2 in col_seqs]).reshape(shape)
    # Write only after the computation succeeded, and under the same dataset
    # name that the read path above looks up.
    with h5py.File(h5_path, 'w') as h5_file:
        h5_file.create_dataset(dataset_name, data=dist_matrix)
    return dist_matrix


def handle_onesided(filename, side):
    """Merge near-identical one-sided rows of a tab-separated count table.

    "Masters" are rows whose hit type contains 'Multi' or equals 'Single';
    "slaves" are rows whose hit type contains 'NoGoodSpn' or 'NoHits'.

    1. Unsupervised step: masters connected by a chain of pairwise distances
       below 1 - SIM_THRESHOLD form a group; the counts of a group are added
       to its best-ranked member and the others are marked with Indl = -1
       and dropped.
    2. Supervised step: each slave's counts are added to the masters it is
       similar to.  A slave similar to more than one master is ambiguous:
       its counts are taken back from all of them, it is marked Indl = -2
       and reported in the "problemones" file.  A slave similar to exactly
       one master is marked Indl = -3.  Only unmatched slaves keep
       Indl >= 0 and stay in the merged output.

    Parameters:
        filename: path of the tab-separated input table.
        side: 'R' to work on the R_* columns; any other value selects the
            L_* columns.  Also used as suffix of the files written.

    Writes, below ``inter_files``: intermediate_input_shorterseqs_<side>.txt,
    intermediate_input_valid_<side>.txt,
    intermediate_dist_matrix_unsup_<side>.hdf5,
    intermediate_output_454_masts_merged_<side>.txt,
    intermediate_dist_matrix_super_<side>.hdf5,
    intermediate_output_454_merged_<side>.txt and
    intermediate_output_454_problemones_<side>.txt.
    """
    if side == 'R':
        seq_lab, seqlen_lab, match_lab = RSEQ_TAG, RLEN_TAG, RTYP_TAG
    else:
        seq_lab, seqlen_lab, match_lab = LSEQ_TAG, LLEN_TAG, LTYP_TAG
    max_dist = 1 - SIM_THRESHOLD

    def master_mask(frame):
        return (frame[match_lab].str.contains('Multi')) | (frame[match_lab] == 'Single')

    def slave_mask(frame):
        return (frame[match_lab].str.contains('NoGoodSpn')) \
            | (frame[match_lab].str.contains('NoHits'))

    def transfer_counts(mast_indg, slav_indg, sign):
        # sign = +1 adds the slave's counts to the master, -1 takes them back.
        input_data.loc[mast_indg, CNT_TAG] += sign * input_data.loc[slav_indg, CNT_TAG]
        input_data.loc[mast_indg, col_names_ts] += sign * input_data.loc[slav_indg, col_names_ts]

    def rank_score(indg):
        # Multi-hit rows win, then higher counts, then longer sequences.
        row = input_data.loc[indg]
        return ('Multi' in row[match_lab]) * 100 + row[CNT_TAG] * 10 + row[seqlen_lab] * 1

    ## Read input
    input_data = pd.read_csv(filename, sep='\t')
    # 'Indl' holds the local (matrix) index of a row, or a negative status
    # code.  Start from NaN ("unassigned") rather than random numbers so that
    # rows which are neither master nor slave are handled reproducibly.
    input_data['Indl'] = np.nan

    ## Clean data
    input_data.loc[:, seq_lab] = [clean_seq(seq) for seq in input_data[seq_lab]]
    input_data_invalid = input_data[input_data[seqlen_lab] < MIN_SEQLEN]
    input_data_invalid.to_csv(inter_files+'intermediate_input_shorterseqs_'+side+'.txt', index=False, sep='\t')
    input_data = input_data[input_data[seqlen_lab] >= MIN_SEQLEN].copy()
    input_data.to_csv(inter_files+'intermediate_input_valid_'+side+'.txt', index=False, sep='\t')

    USE_UNSUPERVISE = True
    if USE_UNSUPERVISE:
        # "global" index = label in the DataFrame, "local" index = position
        # in the distance matrix; indgs_masts[local] gives the global one.
        indgs_masts = input_data[master_mask(input_data)].index.tolist()
        num_masts = len(indgs_masts)
        if indgs_masts:
            input_data.loc[indgs_masts, 'Indl'] = list(range(num_masts))

        mast_seqs = input_data.loc[indgs_masts, seq_lab]
        dist_matrix_unsup = _load_or_compute_dist_matrix(
            inter_files+'intermediate_dist_matrix_unsup_'+side+'.hdf5',
            'dist_matrix_unsup', mast_seqs, mast_seqs)

        if num_masts > 1:
            # Only pairs (i, j) with i < j are consulted; mirror them so that
            # every row lists all neighbours of that master.
            similar = np.triu(dist_matrix_unsup < max_dist, k=1)
            similar = np.logical_or(similar, similar.T)

            grouped = [False] * num_masts
            for seed in range(num_masts):
                if grouped[seed] or not similar[seed].any():
                    continue
                # Collect the whole connected component containing `seed`.
                grouped[seed] = True
                group = [seed]
                pending = [seed]
                while pending:
                    node = pending.pop()
                    for other in np.nonzero(similar[node])[0]:
                        other = int(other)
                        if not grouped[other]:
                            grouped[other] = True
                            group.append(other)
                            pending.append(other)
                # Sorted so that ties in the ranking are broken reproducibly
                # (lowest local index wins).
                group.sort()

                # Ranking (done before any counts of this group are moved)
                scores = [rank_score(indgs_masts[k]) for k in group]
                best_mast_indg = indgs_masts[group[scores.index(max(scores))]]

                # Add all counts to the best one and mark the rest as merged
                for k in group:
                    curr_mast_indg = indgs_masts[k]
                    if curr_mast_indg == best_mast_indg:
                        continue
                    transfer_counts(best_mast_indg, curr_mast_indg, 1)
                    input_data.loc[curr_mast_indg, 'Indl'] = -1

        # Get rid of merged stuff
        input_data = input_data[slave_mask(input_data) | (input_data['Indl'] >= 0)].copy()
        input_data.to_csv(inter_files+'intermediate_output_454_masts_merged_'+side+'.txt', index=False,  sep='\t')

    ###### Supervised now ######

    indgs_masts = input_data[master_mask(input_data)].index.tolist()
    indgs_slavs = input_data[slave_mask(input_data)].index.tolist()
    num_masts = len(indgs_masts)
    num_slavs = len(indgs_slavs)

    ## Mapping global inds to local inds
    if indgs_masts:
        input_data.loc[indgs_masts, 'Indl'] = list(range(num_masts))
    if indgs_slavs:
        input_data.loc[indgs_slavs, 'Indl'] = list(range(num_slavs))

    ## Distance matrix: rows are masters, columns are slaves
    dist_matrix_super = _load_or_compute_dist_matrix(
        inter_files+'intermediate_dist_matrix_super_'+side+'.hdf5',
        'dist_matrix_super',
        input_data.loc[indgs_masts, seq_lab],
        input_data.loc[indgs_slavs, seq_lab])

    ## Merge slavs to masts
    # Each matrix row is indexed with slave positions only; indexing it with
    # the 'Indl' of every row (masters included) overruns the row as soon as
    # there are more masters than slaves.
    for i in range(num_masts):
        for j in np.nonzero(dist_matrix_super[i] < max_dist)[0]:
            transfer_counts(indgs_masts[i], indgs_slavs[int(j)], 1)

    err_slavs = []
    err_slav_masts = []
    ## Get rid of error slavs
    for j in range(num_slavs):
        masts_indgs = [indgs_masts[int(i)]
                       for i in np.nonzero(dist_matrix_super[:, j] < max_dist)[0]]
        slav_indg = indgs_slavs[j]

        if len(masts_indgs) > 1:
            err_slavs.append(slav_indg)
            err_slav_masts.append(masts_indgs)

            # Ambiguous slave: undo the additions made above.
            for mast_indg in masts_indgs:
                transfer_counts(mast_indg, slav_indg, -1)
            input_data.loc[slav_indg, 'Indl'] = -2
        elif len(masts_indgs) == 1:
            input_data.loc[slav_indg, 'Indl'] = -3

    ## Write to file
    curr_merged = input_data[input_data.index.isin(indgs_masts)
        | (input_data.index.isin(indgs_slavs) & (input_data['Indl'] >= 0))]
    curr_merged.to_csv(inter_files+'intermediate_output_454_merged_'+side+'.txt', index=False, sep='\t')

    # One block per ambiguous slave: the slave, the masters it matched, then
    # a separator row.
    err_slavs_df = pd.DataFrame(columns=input_data.columns.tolist())
    for i in range(len(err_slavs)):
        err_slavs_df = err_slavs_df.append(pd.DataFrame(input_data.loc[[err_slavs[i]]]))
        for one_mast in err_slav_masts[i]:
            err_slavs_df = err_slavs_df.append(pd.DataFrame(input_data.loc[[one_mast]]))

        err_slavs_df = err_slavs_df.append({'CLONE_ID':'-'}, ignore_index=True)
    err_slavs_df.to_csv(inter_files+'intermediate_output_454_problemones_'+side+'.txt', index=False, sep='\t')


# Largest difference between the SITE_TAG values of two neighbouring rows for
# which link_RLseqs still tries to join them.
DISTTHRESHOLD = 20
def linkable_RL(seq1, seq2):
    """Tell whether the starts of two reads can be joined.

    The first five bases of ``seq1`` are complemented, the first five bases
    of ``seq2`` are reversed, and the two 5-mers are compared with calc_dist
    using a similarity threshold of 0.6.

    Returns True when calc_dist reports a distance below 0.4.
    Raises KeyError if ``seq1`` starts with a base other than A, C, G or T.
    """
    LINK_THRESHOLD = 0.6
    complement = {'G':'C', 'C':'G', 'A':'T', 'T':'A'}
    head1 = ''.join(complement[base] for base in seq1[:5])
    head2 = ''.join(reversed(seq2[:5]))
    dist = calc_dist(head1, head2, LINK_THRESHOLD)
    return dist < 1 - LINK_THRESHOLD

SITE_TAG = 'CHR_SITE1'
ORIE_TAG = 'R_STRAND'
def link_RLseqs(input_data):
    """Join neighbouring one-sided rows into a single two-sided row.

    The table is sorted by SITE_TAG in descending order and scanned pairwise.
    When two adjacent rows are at most DISTTHRESHOLD apart, have the same
    ORIE_TAG value, one of them has only a left sequence and the other only
    a right sequence, and linkable_RL accepts the pair, the second row is
    folded into the first: the first row receives the missing sequence and
    its length, CNT_TAG and the col_names_ts columns are summed, and the
    second row is removed.  Scanning stops at the first row whose SITE_TAG
    is -1.

    Parameters:
        input_data: DataFrame with the SITE_TAG, ORIE_TAG, sequence, length
            and count columns.  Its index labels are used to address rows
            (NOTE(review): assumed to be unique -- confirm against caller).

    Returns:
        The sorted DataFrame with linked rows merged; the caller's frame is
        not modified.
    """
    def has_seq(seq):
        # A missing sequence is read back from the table as a float NaN;
        # an empty string carries no sequence either.
        return seq is not None and not isinstance(seq, float) and len(seq) > 0

    input_data = input_data.sort_values([SITE_TAG], ascending=False)

    # Surviving row labels in scan order.  Absorbed rows are removed from
    # this list immediately and from the frame once at the end, so the scan
    # position can never run past the end of the table.
    labels = input_data.index.tolist()
    absorbed = []
    pos = -1
    while pos + 2 < len(labels):
        pos += 1
        curr_index = labels[pos]
        next_index = labels[pos + 1]
        curr_row = input_data.loc[curr_index]
        next_row = input_data.loc[next_index]

        if next_row[SITE_TAG] == -1:
            break
        if (curr_row[SITE_TAG] - next_row[SITE_TAG] > DISTTHRESHOLD)\
                or (curr_row[ORIE_TAG] != next_row[ORIE_TAG]):
            continue

        curr_has_R = has_seq(curr_row[RSEQ_TAG])
        curr_has_L = has_seq(curr_row[LSEQ_TAG])
        next_has_R = has_seq(next_row[RSEQ_TAG])
        next_has_L = has_seq(next_row[LSEQ_TAG])

        if curr_has_L and not curr_has_R and next_has_R and not next_has_L:
            # current row is left-only, next row supplies the right side
            anchor_seq = curr_row[LSEQ_TAG]
            seq_tag, len_tag = RSEQ_TAG, RLEN_TAG
        elif curr_has_R and not curr_has_L and next_has_L and not next_has_R:
            # current row is right-only, next row supplies the left side
            anchor_seq = curr_row[RSEQ_TAG]
            seq_tag, len_tag = LSEQ_TAG, LLEN_TAG
        else:
            continue

        donor_seq = next_row[seq_tag]
        if not linkable_RL(anchor_seq, donor_seq):
            continue

        input_data.loc[curr_index, seq_tag] = donor_seq
        input_data.loc[curr_index, len_tag] = len(donor_seq)
        input_data.loc[curr_index, CNT_TAG] += next_row[CNT_TAG]
        input_data.loc[curr_index, col_names_ts] += next_row[col_names_ts]

        absorbed.append(next_index)
        del labels[pos + 1]

    return input_data.drop(absorbed)


# Load the raw count table and normalise its sequences before merging.
input_data = pd.read_csv(inter_files+'initial_count_without_homopolymer_correction.txt', sep='\t')

all_colnames = input_data.columns.tolist()
# Per-sample count columns that are summed whenever two rows are merged.
col_names_ts = sampleTotalList

# Clean every sequence that is present (length > 0) and refresh its length,
# right side first, then left side.
for seq_tag, len_tag in ((RSEQ_TAG, RLEN_TAG), (LSEQ_TAG, LLEN_TAG)):
    has_seq = input_data[len_tag] > 0
    cleaned_seqs = [clean_seq(seq) for seq in input_data.loc[has_seq, seq_tag]]
    input_data.loc[has_seq, seq_tag] = cleaned_seqs
    input_data.loc[has_seq, len_tag] = [len(seq) for seq in cleaned_seqs]

# Rows are kept when the two sides together reach MIN_SEQLEN.
# TODO: what if both sides < 25?
total_len = input_data[RLEN_TAG] + input_data[LLEN_TAG]
input_data_invalid = input_data[total_len < MIN_SEQLEN]
input_data_invalid.to_csv(inter_files+'intermediate_input_shorterseqs.txt', index=False, sep='\t')
# Explicit copy: the frame is modified in place further down.
input_data = input_data[total_len >= MIN_SEQLEN].copy()
input_data.to_csv(inter_files+'intermediate_input_valid.txt', index=False, sep='\t')

sLength = input_data.shape[0]
# 'Indl' will hold each row's local index within its group (or a negative
# status code).  It starts as NaN ("unassigned") instead of random numbers,
# so an unassigned row can never pass an `Indl >= 0` test by chance.
input_data['Indl'] = pd.Series(np.nan, index=input_data.index)

def _merge_onesided_into_Tsided(data, indgs_Tsided, indgs_sided, seq_tag, pair_tag):
    """Add the counts of one-sided rows to the two-sided rows they match.

    A distance matrix (two-sided rows x one-sided rows, compared on column
    ``seq_tag``) is loaded from
    ``inter_files + 'intermediate_dist_matrix_<pair_tag>.hdf5'`` or computed
    and stored there.  Every one-sided row closer than 1 - SIM_THRESHOLD to a
    two-sided row has its CNT_TAG and col_names_ts values added to it.  A
    one-sided row matching more than one two-sided row is ambiguous: the
    additions are undone and its 'Indl' is set to -2.  A row matching exactly
    one gets 'Indl' = -3.  ``data`` is modified in place.

    Returns:
        (ambiguous one-sided labels, list of the two-sided labels each of
        them matched).
    """
    max_dist = 1 - SIM_THRESHOLD
    shape = (len(indgs_Tsided), len(indgs_sided))
    h5_path = inter_files + 'intermediate_dist_matrix_' + pair_tag + '.hdf5'
    dataset_name = 'dist_matrix_' + pair_tag

    dist_matrix = None
    if os.path.isfile(h5_path):
        with h5py.File(h5_path, 'r') as h5_file:
            # Reuse the cache only if it really holds a matrix of the
            # expected shape.
            if dataset_name in h5_file:
                cached = h5_file[dataset_name][:]
                if cached.shape == shape:
                    dist_matrix = cached
    if dist_matrix is None:
        Tsided_seqs = list(data.loc[indgs_Tsided, seq_tag])
        sided_seqs = list(data.loc[indgs_sided, seq_tag])
        dist_matrix = np.asarray(
            [calc_dist(seq1, seq2, SIM_THRESHOLD)
             for seq1 in Tsided_seqs
             for seq2 in sided_seqs]).reshape(shape)
        # Stored under the same dataset name that the read path looks up.
        with h5py.File(h5_path, 'w') as h5_file:
            h5_file.create_dataset(dataset_name, data=dist_matrix)

    ## Merge slavs to masts
    for i, mast_indg in enumerate(indgs_Tsided):
        for j in np.nonzero(dist_matrix[i] < max_dist)[0]:
            slav_indg = indgs_sided[int(j)]
            data.loc[mast_indg, CNT_TAG] += data.loc[slav_indg, CNT_TAG]
            data.loc[mast_indg, col_names_ts] += data.loc[slav_indg, col_names_ts]

    ## Get rid of error slavs
    err_sided = []
    err_sided_Ts = []
    for j, slav_indg in enumerate(indgs_sided):
        masts_indgs = [indgs_Tsided[int(i)]
                       for i in np.nonzero(dist_matrix[:, j] < max_dist)[0]]

        if len(masts_indgs) > 1:
            err_sided.append(slav_indg)
            err_sided_Ts.append(masts_indgs)
            for mast_indg in masts_indgs:
                data.loc[mast_indg, CNT_TAG] -= data.loc[slav_indg, CNT_TAG]
                data.loc[mast_indg, col_names_ts] -= data.loc[slav_indg, col_names_ts]
            data.loc[slav_indg, 'Indl'] = -2
        elif len(masts_indgs) == 1:
            data.loc[slav_indg, 'Indl'] = -3

    return err_sided, err_sided_Ts


DO_GENERATE_FILES = True
if DO_GENERATE_FILES:
    # Two-sided rows have both sequences, R-/L-sided rows only one of them.
    # Each group gets its own local numbering in 'Indl'.
    indgs_Tsided = input_data[(input_data[RLEN_TAG] > 0)
            & (input_data[LLEN_TAG] > 0)].index.tolist()
    if indgs_Tsided:
        input_data.loc[indgs_Tsided, 'Indl'] = list(range(len(indgs_Tsided)))

    ################
    ## Merge R- to T-
    indgs_Rsided = input_data[(input_data[RLEN_TAG] > 0)
            & (input_data[LLEN_TAG] == 0)].index.tolist()
    if indgs_Rsided:
        input_data.loc[indgs_Rsided, 'Indl'] = list(range(len(indgs_Rsided)))
    err_Rsided, err_Rsided_Ts = _merge_onesided_into_Tsided(
        input_data, indgs_Tsided, indgs_Rsided, RSEQ_TAG, 'TRsided')

    ################
    ## Merge L- to T-
    indgs_Lsided = input_data[(input_data[RLEN_TAG] == 0)
            & (input_data[LLEN_TAG] > 0)].index.tolist()
    if indgs_Lsided:
        input_data.loc[indgs_Lsided, 'Indl'] = list(range(len(indgs_Lsided)))
    err_Lsided, err_Lsided_Ts = _merge_onesided_into_Tsided(
        input_data, indgs_Tsided, indgs_Lsided, LSEQ_TAG, 'TLsided')

    ################

    DO_LINK_RLSEQS = True
    if DO_LINK_RLSEQS:
        input_data = link_RLseqs(input_data)
    else:
        input_data = input_data.sort_values([SITE_TAG], ascending=False)
    ################

    ## Write to file, T
    pd_Tsided = input_data[(input_data[RLEN_TAG] > 0) & (input_data[LLEN_TAG] > 0)]
    pd_Tsided.to_csv(inter_files+'intermediate_merged_Tsided.txt', index=False, sep='\t')

    # Report every ambiguous one-sided row followed by the two-sided rows it
    # matched and a separator row.
    err_slavs_df = pd.DataFrame(columns=input_data.columns.tolist())
    err_slavs = err_Rsided + err_Lsided
    err_slav_masts = err_Rsided_Ts + err_Lsided_Ts
    for i in range(len(err_slavs)):
        err_slavs_df = err_slavs_df.append(pd.DataFrame(input_data.loc[[err_slavs[i]]]))
        for one_mast in err_slav_masts[i]:
            err_slavs_df = err_slavs_df.append(pd.DataFrame(input_data.loc[[one_mast]]))
        err_slavs_df = err_slavs_df.append({'CLONE_ID':'separator'}, ignore_index=True)
    err_slavs_df.to_csv(inter_files+'intermediate_problemones_Tsided.txt', index=False, sep='\t')

    ## Write to file, R (only rows that were not merged into a two-sided row)
    pd_Rsided = input_data[(input_data[RLEN_TAG] > 0) & (input_data[LLEN_TAG] == 0)\
                            & (input_data['Indl'] >= 0)]
    pd_Rsided.to_csv(inter_files+'intermediate_valid_Rsided.txt', index=False, sep='\t')

    ## Call function for R-side
    if not os.path.isfile(inter_files+'intermediate_output_454_merged_R.txt'):
        handle_onesided(inter_files+'intermediate_valid_Rsided.txt', 'R')

    ## Write to file, L (only rows that were not merged into a two-sided row)
    pd_Lsided = input_data[(input_data[RLEN_TAG] == 0) & (input_data[LLEN_TAG] > 0)\
                            & (input_data['Indl'] >= 0)]
    pd_Lsided.to_csv(inter_files+'intermediate_valid_Lsided.txt', index=False, sep='\t')

    ## Call function for L-side
    if not os.path.isfile(inter_files+'intermediate_output_454_merged_L.txt'):
        handle_onesided(inter_files+'intermediate_valid_Lsided.txt', 'L')

## Output
# Collect the ambiguous rows reported for the two-sided, R-sided and L-sided
# passes.  The combined table is currently not written anywhere.
problemones = pd.DataFrame(columns=input_data.columns.tolist())
for problem_file in ('intermediate_problemones_Tsided.txt',
                     'intermediate_output_454_problemones_R.txt',
                     'intermediate_output_454_problemones_L.txt'):
    problemones = problemones.append(pd.read_csv(inter_files+problem_file, sep='\t'))
problemones = problemones.drop('Indl', axis=1)

##TODO: grouped Tmasts? and naming

# Stack the merged two-sided, R-sided and L-sided tables, drop the helper
# column 'Indl' and write the final count table.
merged_all = pd.DataFrame(columns=input_data.columns.tolist())
for merged_file in ('intermediate_merged_Tsided.txt',
                    'intermediate_output_454_merged_R.txt',
                    'intermediate_output_454_merged_L.txt'):
    merged_all = merged_all.append(pd.read_csv(inter_files+merged_file, sep='\t'))
merged_all = merged_all.drop('Indl', axis=1)
merged_all.to_csv(inter_files+'Final_count.txt', index=False, sep='\t')

# clean up the intermediate files
DO_DELETE_INTERMEDIATE = True
if DO_DELETE_INTERMEDIATE:
    import glob
    import shutil

    # Remove everything matching <inter_files>intermediate_* directly from
    # Python.  The path comes from the command line, so it is not pasted into
    # a shell command: spaces or shell metacharacters in it cannot change
    # what gets deleted.
    for leftover in glob.glob(inter_files + 'intermediate_*'):
        try:
            if os.path.isdir(leftover) and not os.path.islink(leftover):
                shutil.rmtree(leftover)
            else:
                os.remove(leftover)
        except OSError as err:
            # Cleanup is best effort: report the problem and carry on.
            sys.stderr.write('Could not remove %s: %s\n' % (leftover, err))


