#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#

import sys
# import re
import argparse
from itertools import groupby


# Codon -> one-letter amino acid lookup ('*' marks a stop codon); the
# assignments match the standard genetic code.
# Codons are generated in alphabetical order (AAA, AAC, AAG, AAT, ACA, ...)
# and paired with the residue string below, which is laid out as one row of
# sixteen residues per first base (A.., C.., G.., T..).
TRANSLATE = dict(zip(
    (first + second + third
     for first in "ACGT" for second in "ACGT" for third in "ACGT"),
    "KNKNTTTTRSRSIIMI"
    "QHQHPPPPRRRRLLLL"
    "EDEDAAAAGGGGVVVV"
    "*Y*YSSSS*CWCLFLF",
))


# Pairs of symbols that are complements of each other (upper case).
COMPBASES = [
    ('A', 'T'), ('C', 'G'), ('K', 'M'), ('R', 'Y'), ('S', 'W'),
    ('B', 'V'), ('D', 'H'), ('N', 'X'), ('-', '-'), ('X', 'X'),
]

# Symbol -> complement lookup covering both directions of every pair, in
# upper and lower case.  Later pairs override earlier ones, so 'X' ends up
# mapping to 'X' (the trailing ('X', 'X') pair wins over the reversed
# ('X', 'N')), while 'N' maps to 'X'.
# NOTE(review): IUPAC convention would complement N to N - confirm that
# the N -> X mapping is intended.
COMPCODE = {
    symbol: partner
    for pairs in (
        COMPBASES,
        [(right, left) for (left, right) in COMPBASES],
        [(left.lower(), right.lower()) for (left, right) in COMPBASES],
        [(right.lower(), left.lower()) for (left, right) in COMPBASES],
    )
    for (symbol, partner) in pairs
}


def complement(sequence):
    """Return the reverse complement of `sequence`.

    The input is walked from its last symbol to its first and each symbol
    is replaced by its partner from COMPCODE.  A symbol that is absent from
    COMPCODE raises KeyError.
    """
    return ''.join(COMPCODE[base] for base in reversed(sequence))


def fasta_iter(fasta_name):
    """Iterate over the records of a FASTA file.

    Arguments:
        fasta_name: path to the FASTA file

    Yields:
        (header, sequence) tuples.  `header` is the text of the '>' line
        without the leading '>' and surrounding whitespace; `sequence` is
        the record's lines, stripped and concatenated ('' when a header is
        not followed by any sequence line).

    Lines that appear before the first header belong to no record and are
    skipped.
    """
    header = None
    chunks = []
    # The context manager closes the file once iteration finishes or the
    # generator is closed.
    with open(fasta_name, 'r') as filehandler:
        for line in filehandler:
            if line.startswith('>'):
                # A new header ends the previous record, if there was one.
                if header is not None:
                    yield header, ''.join(chunks)
                header = line[1:].strip()
                chunks = []
            elif header is not None:
                chunks.append(line.strip())
    # Emit the final record; a trailing header with no sequence lines
    # yields an empty sequence instead of raising.
    if header is not None:
        yield header, ''.join(chunks)


def parse_gff_exons(gff_file, window_size=10000):
    """Index selected GFF3 features by contig and coordinate window.

    Arguments:
        gff_file: path to GFF3 file
        window_size: width of the coordinate bins used for the index

    Output:
        dict of contig -> window index -> list of feature dicts.  Each
        feature dict has the keys 'type', 'feature' (the ID= attribute),
        'gene' (the Parent= attribute), 'startcoord', 'endcoord', 'phase'
        and 'strand'.  Missing attributes are reported as '-'.

    Recorded feature types: CDS (stored as type 'exon', with the integer
    phase from column 8), five_prime_UTR, three_prime_UTR, ncRNA_gene
    (stored as 'ncRNA') and biological_region rows whose attributes contain
    'logic_name=intron' (stored as 'intron').  All other rows are ignored.
    A feature that spans several windows is listed under each of them.
    """
    attr_prefixes = {'id': 'ID=',
                     'name': 'Name=',
                     'gene': 'Parent=',
                     'extname': 'external_name='}

    def proc_note(notes):
        """Pick the known attributes out of a GFF3 attribute column."""
        dictnote = {}
        for item in notes.split(';'):
            item = item.strip()
            for key, prefix in attr_prefixes.items():
                if item.startswith(prefix):
                    # Drop only the leading prefix; the value itself may
                    # legitimately contain the same text again.
                    dictnote[key] = item[len(prefix):]
        return dictnote

    features = {}
    with open(gff_file) as gff:
        for line in gff:
            # Everything after the ##FASTA directive is sequence data.
            if line.startswith('##FASTA'):
                break
            if line.startswith('#'):
                continue
            # GFF3 columns are tab-separated and the attribute column may
            # contain spaces, so split on tabs first.
            arr = line.rstrip('\r\n').split('\t')
            if len(arr) < 9:
                # Fall back to whitespace for non-standard files, keeping
                # the rest of the line together as the attribute column.
                arr = line.strip().split(None, 8)
                if len(arr) < 9:
                    continue  # blank or truncated line
            kind = arr[2]
            phase = '-'
            if kind == 'CDS':
                phase = int(arr[7])
                featuretype = "exon"
            elif kind in ('three_prime_UTR', 'five_prime_UTR'):
                featuretype = kind
            elif kind == "ncRNA_gene":
                featuretype = "ncRNA"
            elif (kind == "biological_region" and
                    "logic_name=intron" in arr[8]):
                featuretype = "intron"
            else:
                continue
            dnote = proc_note(arr[8])
            genename = dnote.get('gene', '-')
            featurename = dnote.get('id', '-')
            contig = arr[0]
            strand = arr[6]
            startcoord = int(arr[3])
            endcoord = int(arr[4])
            contig_windows = features.setdefault(contig, {})
            for win in range(startcoord // window_size,
                             endcoord // window_size + 1):
                contig_windows.setdefault(win, []).append({
                    'type': featuretype,
                    'feature': featurename,
                    'gene': genename,
                    'startcoord': startcoord,
                    'endcoord': endcoord,
                    'phase': phase,
                    'strand': strand
                    })
    return features


def _translate_codon(codon):
    """Translate one codon case-insensitively; 'X' if it is not in TRANSLATE."""
    return TRANSLATE.get(codon.upper(), 'X')


def _codon_change(contig_seq, coord, altallele, entry):
    """Work out the codon change caused by a SNP inside a CDS feature.

    Arguments:
        contig_seq: sequence of the contig the feature lies on
        coord: 1-based position of the SNP on the contig
        altallele: alternate base, given on the forward strand
        entry: feature dict with 'startcoord', 'endcoord', 'phase', 'strand'

    Returns:
        (codon_pos, refcodon, altcodon, refpep, altpep) where codon_pos is
        the 0-based position of the SNP within its codon, read in coding
        direction, and the codons are reverse-complemented for features on
        the '-' strand.

    The codon is read as three consecutive genomic bases, so a codon that is
    split across two CDS features is not reconstructed correctly.
    """
    # GFF3 phase counts the bases to skip from the 5' end of the feature to
    # reach the first base of the next codon; the 5' end is endcoord for
    # features on the '-' strand.
    if entry['strand'] == '-':
        codon_pos = (entry['endcoord'] - coord - entry['phase']) % 3
        offset = 2 - codon_pos  # index within the forward-strand triplet
    else:
        codon_pos = (coord - entry['startcoord'] - entry['phase']) % 3
        offset = codon_pos
    first = coord - 1 - offset
    # A negative start would wrap around to the end of the contig.
    refseq = contig_seq[first:first + 3] if first >= 0 else ''
    altseq = ''.join([refseq[:offset], altallele, refseq[offset + 1:]])
    if entry['strand'] == '-':
        refseq = complement(refseq)
        altseq = complement(altseq)
    return (codon_pos, refseq, altseq,
            _translate_codon(refseq), _translate_codon(altseq))


def main(arguments=None):
    """Classify listed variants by the GFF3 features they fall into.

    Arguments:
        arguments: command-line arguments; defaults to sys.argv[1:]

    Each line of the --list files is whitespace-separated with 'rep:contig'
    in column 1, the coordinate in column 2, the reference and alternate
    alleles in columns 3 and 4, and a label in column 5 that must be one of
    the --pattern values for the line to be used.

    For every feature overlapping a variant one tab-separated row is written
    to --out: contig, zero-padded coordinate, label, gene, feature, codon
    position, reference codon/allele, alternate codon/allele, reference
    peptide, alternate peptide, effect, rep.  The effect is 'INDEL' for
    multi-base alleles, 'SYN'/'NONSYN' for SNPs in CDS features ('UNKNOWN'
    when a codon cannot be translated) and the feature type otherwise.

    Returns an empty string.
    """
    arguments = arguments or sys.argv[1:]
    parser = argparse.ArgumentParser(
        description='')
    # Every option is needed below; requiring them turns a later TypeError
    # on None into a usage message.
    parser.add_argument('--list', nargs='*', required=True)
    parser.add_argument('--out', required=True)
    parser.add_argument('--gff', required=True)
    parser.add_argument('--fasta', required=True)
    parser.add_argument('--pattern', nargs='*', required=True)
    parser.add_argument('--window', type=int, default=10000)
    args = parser.parse_args(args=arguments)
    exons = parse_gff_exons(args.gff, window_size=args.window)
    seqs = {}
    for header, seq in fasta_iter(args.fasta):
        seqs[header] = seq
        # Also index by the first word so that headers carrying a
        # description still match the contig names of the other inputs.
        words = header.split()
        if words:
            seqs.setdefault(words[0], seq)
    patterns = set(args.pattern)
    entries = []
    for fname in args.list:
        with open(fname, 'rt') as infile:
            for line in infile:
                arr = line.rstrip().split()
                if len(arr) < 5 or arr[4] not in patterns:
                    continue
                locus = arr[0].split(':')
                rep = locus[0]
                chrom = locus[1]
                coord = int(arr[1])
                refallele = arr[2]
                altallele = arr[3]
                # Contigs or windows without any feature yield no rows.
                candidates = exons.get(chrom, {}).get(
                    coord // args.window, ())
                for entry in candidates:
                    if not entry['startcoord'] <= coord <= entry['endcoord']:
                        continue
                    row = [chrom, str(coord).zfill(7), arr[4],
                           entry['gene'], entry['feature']]
                    if len(refallele) > 1 or len(altallele) > 1:
                        row += ['-', refallele, altallele,
                                '-', '-', 'INDEL', rep]
                    elif entry['type'] == 'exon':
                        (codon_pos, refseq, altseq,
                         refpep, altpep) = _codon_change(
                             seqs[chrom], coord, altallele, entry)
                        if 'X' in (refpep, altpep):
                            effect = 'UNKNOWN'
                        else:
                            effect = 'SYN' if refpep == altpep else 'NONSYN'
                        row += [str(codon_pos), refseq, altseq,
                                refpep, altpep, effect, rep]
                    else:
                        row += ['-', refallele, altallele,
                                "-", "-", entry['type'], rep]
                    entries.append(row)

    # Sort on the numeric coordinate: the zero-padded text only orders
    # correctly up to seven digits.
    entries.sort(key=lambda row: (row[0], int(row[1]), row[2:]))
    with open(args.out, 'wt') as outfile:
        for entry in entries:
            outfile.write("{}\n".format("\t".join(entry)))
    return ''

if __name__ == '__main__':
    main()
