#!/usr/bin/perl -w                                                                                                          
use strict;

#This perl script serves as an example script to remove reads from fastq files that have matches in a contaminant database, based on filtered ssaha2 output.  
#USAGE: perl ssaha_match_removal_script.pl my_fastq.fq my_ssaha2_contaminant_screen_results.txt my_filtered_fastq_output.fq

my $fastq = shift @ARGV or die;
my $ssaha2_result = shift @ARGV or die;
my $out = shift @ARGV or die;

#create a hash with the sequence IDs to be removed (significant matches from ssaha2 search against contaminant database)
my %remove = ();
open ( my $RM, $ssaha2_result ) or die "can't open $ssaha2_result\n";
while ( my $rline = <$RM> ){
    chomp $rline;
    my @rarray = split /\s+/, $rline;
    $remove{$rarray[2]} = 1;
}

#remove sequences with contaminant matches from fastq file
open ( my $FQ, $fastq ) || die "can't open $fastq\n";
open ( my $OUT, ">$out" ) || die "can't open $out\n";
my $seq_counter = 0;
my $counter = 0;
my $read;
my $end;
my $nuc_seq;
my $qual_seq;
while ( my $line = <$FQ> ){
    chomp $line;
    $counter++;
#this regular expression is too specific to our libraries to be made generally available
    if ( $line =~ m/^@(SN\S+)\s(\d\S+)/ ){
        $read = $1;
        $end = " ".$2;
#example sequence ID from read pair #1: @SN1163:57:D0RB4ACXX:5:1101:1320:2072 1:N:0:ATGTCAG
#example sequence ID from read pair #2: @SN1163:57:D0RB4ACXX:5:1101:1320:2072 2:N:0:ATGTCAG
        $seq_counter++;
    }
    elsif ( ($counter - 2)/4 == $seq_counter -1 ){
        $nuc_seq = $line;
    }
    elsif ( ($counter - 3)/4 == $seq_counter -1 ){
        next;
    }
    elsif ( ($counter - 4)/4 == $seq_counter -1) {
        $qual_seq = $line;
    }
    else {
        warn "line $line doesn't conform to hash specifications (counter is at $counter, seq counter at $seq_counter)\n";
    }
#start processing reads individually
    my $def = "@";
    if ( ($counter - 4)/4 == $seq_counter -1){
        unless ( defined $remove{$read} ){
            print $OUT "$def$read$end\n$nuc_seq\n+\n$qual_seq\n";
        }
    }
}
   
my $check = $counter/$seq_counter;
print "number of sequences read: $seq_counter; if this number isn't 4, something weird is going on with fastq file: $check\n";
