% NNS data analyzer
%
% INPUTS
% 1. The fastq sequence files that need to be analyzed (typically
%    paired-end reads joined by FLASh; T. Magoc and S. Salzberg,
%    Bioinformatics 27:21 (2011), 2957-63)
% 2. the fwd and rev anchor pairs that flank the sequence mutation window
%    (NNS sub-library group)
% 3. the wildtype DNA sequence of the full length gene flanked with first 
%    fwd and last rev anchor
%
% OUTPUTS
% For each *.fastq file, a corresponding matlab structure variable ANCHORS
% is generated and saved as
% *.fastq.anchor_counts.date_<yyyy.mm.dd>_t_<HH.MM.SS>.mat. The only fields
% in ANCHORS are
% 1) COUNTS that has 21 rows (one for each amino acid + stop codon) and a
%    column for each position in the partition window that gets analyzed.
%    Only reads with exactly one amino acid change relative to WT are
%    counted.
% 2) WT_COUNTS : number of reads whose translated window equals the WT
%    protein fragment while the nucleotide sequence differs from the WT
%    nucleotide sequence.
% ANCHORS is a vector with as many elements as there are fwd - rev anchor
% pairs. Essentially, this script gives the counts of each of the NNS
% mutations.
%
% Subramanian Subramanian  subramaniansk@gmail.com
% Michael A. Stiffler      mstiffler@post.harvard.edu
% 09Dec2015

%% initialization. Enter sequence information specific to experiment.
% Illustrative default values are for TEM-1 beta-lactamase.
%
% wt_dna      : wild-type nucleotide sequence that is searched for every
%               anchor below (char row vector).
% anchors_fwd : forward anchors; anchors_fwd{i} is paired with
%               anchors_rev{i}, so both cell arrays must have the same
%               number of elements.
% anchors_rev : reverse anchors. The analyzed window for pair i is the
%               sequence between the end of anchors_fwd{i} and the start
%               of anchors_rev{i}.
% aa          : 21 one-letter symbols (20 amino acids plus '*' for stop);
%               the row order of the per-position count matrix follows
%               the order of this string.
wt_dna='GTTTTTGCTCACCCAGAAACGCTGGTGAAAGTAAAAGATGCTGAAGATCAGTTGGGTGCACGAGTGGGTTACATCGAACTGGATCTCAACAGCGGTAAGATCCTTGAGAGTTTTCGCCCCGAAGAACGTTTTCCAATGATGAGCACTTTTAAAGTTCTGCTATGTGGCGCGGTATTATCCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGCTTTTTTGCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGAATGAAGCCATACCAAACGACGAGCGTGACACCACGATGCCTGCAGCAATGGCAACAACGTTGCGCAAACTATTAACTGGCGAACTACTTACTCTAGCTTCCCGGCAACAATTAATAGACTGGATGGAGGCGGATAAAGTTGCAGGACCACTTCTGCGCTCGGCCCTTCCGGCTGGCTGGTTTATTGCTGATAAATCTGGAGCCGGTGAGCGTGGGTCTCGCGGTATCATTGCAGCACTGGGGCCAGATGGTAAGCCCTCCCGTATCGTAGTTATCTACACGACGGGGAGTCAGGCAACTATGGATGAACGAAATAGACAGATCGCTGAGATAGGTGCCTCACTGATTAAGCATTGGTAACTGTCAGACCAAGTTTA';
anchors_fwd = {'GTTTTTGCT','TTTTAAAGTTCTGCTATGTGGC','ATAACCATGAGTGATAAC','CACGATGCCT','TGATAAATCTGGA'};
anchors_rev = {'GCGGTATTATCCCGT','ACTGCGGCCAACTTA','GCAATGGCAACAACGTT','GCCGGTGAGCGT','CTGTCAGACCAAGTTTA'};
aa='ACDEFGHIKLMNPQRSTVWY*'; % amino acid order

%% get number of anchor pairs, insert size and WT insert sequence
% Produces, for use by the read-processing section:
%   n_anchors             - number of fwd/rev anchor pairs
%   sequencing_file_names - {'1.fastq','2.fastq',...}, one per anchor pair
%   anchors_wt_nuc{i}     - WT nucleotide sequence between anchor pair i
%   anchors_len(i)        - length (nt) of anchors_wt_nuc{i}
%   anchors_wt{i}         - translation of anchors_wt_nuc{i}

% Abort with an error instead of exit(): exit() would close the whole
% MATLAB session (discarding the workspace) and report success to a
% calling shell.
if numel(anchors_fwd) ~= numel(anchors_rev)
    error('NNS:anchorCountMismatch', ...
        'Unequal number of fwd and rev anchors (%d fwd, %d rev).', ...
        numel(anchors_fwd), numel(anchors_rev));
end
n_anchors=numel(anchors_fwd);

% Preallocate so that leftovers from an earlier run in the same workspace
% (this is a script, not a function) cannot survive in these variables.
sequencing_file_names=cell(1,n_anchors);
for i=1:n_anchors
    sequencing_file_names{i}=[num2str(i) '.fastq'];
end

anchors_wt_nuc=cell(1,n_anchors);
anchors_len=zeros(1,n_anchors);
anchors_wt=cell(1,n_anchors);
for i=1:n_anchors
    % first occurrence of the forward anchor in the WT gene
    fwd_hits=strfind(wt_dna,anchors_fwd{i});
    if isempty(fwd_hits)
        error('NNS:anchorNotFound', ...
            'Forward anchor %d (%s) not found in wt_dna.', i, anchors_fwd{i});
    end
    win_start=fwd_hits(1)+numel(anchors_fwd{i}); % first nt after the fwd anchor

    % first occurrence of the reverse anchor at or after the window start
    rev_hits=strfind(wt_dna,anchors_rev{i});
    rev_hits=rev_hits(rev_hits>=win_start);
    if isempty(rev_hits)
        error('NNS:anchorNotFound', ...
            'Reverse anchor %d (%s) not found downstream of its forward anchor in wt_dna.', ...
            i, anchors_rev{i});
    end
    win_end=rev_hits(1)-1; % last nt before the rev anchor

    anchors_wt_nuc{i}=wt_dna(win_start:win_end); % get wt sequence between anchors
    anchors_len(i)=numel(anchors_wt_nuc{i});
    if mod(anchors_len(i),3)~=0
        % the downstream comparison is done per amino acid, so a partial
        % trailing codon makes the window ill-defined
        warning('NNS:windowNotInFrame', ...
            'Window %d is %d nt long, which is not a multiple of 3.', ...
            i, anchors_len(i));
    end
    anchors_wt{i}=nt2aa(anchors_wt_nuc{i},'AlternativeStartCodons','False'); % get wt protein sequence between anchors
end

%% process each anchor pair
% Logic:in each sequencing file identify all reads flanked by each of the
%       anchor pairs (fwd and rev anchor), and assign the region between
%       anchors as WT, or any of the single mutants, or discard. finally,
%       get the counts of WT, and single mutants (amino acid level).
%
% Uses n_anchors, sequencing_file_names, anchors_fwd, anchors_rev,
% anchors_wt, anchors_wt_nuc, anchors_len and aa from the sections above.
% For every file a struct array 'anchors' (fields counts, wt_counts; one
% element per anchor pair) is saved to a time-stamped .mat file and a
% short summary is printed.
for h=1:numel(sequencing_file_names)
    [~,sequences]=fastqread(sequencing_file_names{h}); % read in sequencing files
    if ~iscell(sequences)
        sequences={sequences}; % a single read comes back as char; wrap it
    end

    % Start every file from a clean result array so that a pre-existing
    % workspace variable named 'anchors' cannot leak into the saved output.
    anchors=struct('counts',cell(1,n_anchors),'wt_counts',[]);

    % process for each anchor sequence
    for i=1:n_anchors
        wt_len=numel(anchors_wt{i});
        % one row per read; rows left as '-' were not single mutants
        seqs=repmat('-',numel(sequences), wt_len);
        wt_counts=0;
        fwd=anchors_fwd{i};
        nfwd=numel(fwd);
        rev=anchors_rev{i};
        len=anchors_len(i);
        wt_anch=anchors_wt{i};
        wt_anch_nuc=anchors_wt_nuc{i};

        % process for each sequencing read
        parfor j=1:numel(sequences)
            rd_seq=sequences{j};
            f1=strfind(rd_seq,fwd); % find forward anchor
            if ~isempty(f1)
                f1=f1(1); % first forward-anchor hit
                r1=strfind(rd_seq,rev); % find reverse anchor
                if ~isempty(r1)
                    r1=r1(end); % last reverse-anchor hit
                    if r1-f1==nfwd+len % check if sequence between anchors is of correct length
                        ins_nuc=rd_seq(f1+nfwd:r1-1);
                        pro=nt2aa(ins_nuc,'AlternativeStartCodons','False'); % translate region between anchors
                        tot=sum(eq(wt_anch,pro)); % find total number of non-mutated amino acids
                        % Counted as WT: protein identical to WT AND
                        % nucleotide sequence NOT identical to WT. Reads
                        % that match WT exactly at the nucleotide level
                        % fall through both branches and are discarded.
                        % NOTE(review): the original comment here read
                        % "check if wild-type nucleotide sequence", which
                        % is the opposite of the '~=' test - confirm that
                        % excluding exact-WT reads is intended.
                        if tot==wt_len&&sum(eq(ins_nuc,wt_anch_nuc))~=len
                            wt_counts=wt_counts+1; % add to wild-type counts
                        elseif wt_len-tot==1 % check if contains only one amino acid mutation
                            seqs(j,:)=pro; % replace amino acid sequence in seqs with translated sequence
                        end
                    end
                end
            end
        end
        seqs(seqs(:,1)=='-',:)=[]; % keep only the rows that were filled in

        % get the counts for each amino acid at each position
        aa_counts=zeros(numel(aa),wt_len);
        for j=1:numel(aa)
            % Sum explicitly down the rows (dim 1). Without the dimension
            % argument a single retained read (1-by-wt_len) would be
            % summed across positions into a scalar and broadcast over
            % the whole row of aa_counts.
            aa_counts(j,:)=sum(seqs==aa(j),1);
        end
        anchors(i).counts=aa_counts;
        anchors(i).wt_counts=wt_counts;
    end

    % take one timestamp so the date and time parts of the name agree
    t_stamp=now;
    save([sequencing_file_names{h} '.anchor_counts.' 'date_' datestr(t_stamp,'yyyy.mm.dd') '_t_' datestr(t_stamp,'HH.MM.SS') '.mat'],'anchors');
    disp(['Sequencing file ' sequencing_file_names{h} ' processed.']);
    disp('Anchor_Group#   #WT_sequences    #All single mutants');
    for i=1:n_anchors
        % every retained read contributes exactly one count in column 1,
        % so the column-1 total is the number of single-mutant reads
        disp([num2str(i) '      ' num2str(anchors(i).wt_counts) '    ' num2str(sum(anchors(i).counts(:,1)))]);
    end
end