#!/usr/bin/perl
# Rene Warren 04/2009
# rwarren@bcgsc.ca

use strict;

if($#ARGV<2){
   die "Usage: $0 <name of run   e.g. Herpesvirus_3.60kb.reads.fa.ssake_m16_o2_r0.6_t0_pid21157> <insert size used during SSAKE run (-d option in SSAKE)> <minimum overlap between contigs (-x option in SSAKE e.g. 15 is probably a good start)>\n";
}

my $core = $ARGV[0];
my $chunk = $ARGV[1];
my $overlap = $ARGV[2];

my $verbose=0;

&mergeContigs($core,$chunk,$verbose,$overlap);

exit;




#-----------------------
sub reverseComplement{
   $_ = shift;
   $_ = uc();
   tr/ATGC/TACG/;
   return (reverse());
}

#------------------
sub mergeContigs{

   my ($core, $chunk, $verbose,$min_word_length) = @_;

   my $max_count_trim = 10;
   my $contigs = $core . ".contigs";
   my $scaffold = $ARGV[0] . ".scaffolds";

   my $scaffold_fasta = $core . ".mergedcontigs";

   open(IN,$scaffold) || die "can't open $scaffold";
   open(OUT,">$scaffold_fasta") || die "can't write to $scaffold_fasta -- fatal\n";

   my ($tot,$sct,$ct_merge) = (0,0,0);

   while(<IN>){
      chomp;
      my $sc="";;
      my @a = split(/\,/);
      my @tig;

      if($a[2]=~/\_/){
         @tig = split(/\_/,$a[2]);
      }else{
         push @tig, $a[2];
      }

      $sct++;
      my ($ct,$tigsum,$mct) = (0,0,0);
      my ($prev,$word,$template) = ("NA","NA","NA");
      my ($seq,$prevseq,$headconcat) = ("","","");


      print "$_\n" if($verbose);

      foreach my $t (@tig){
         $ct++;

         if($t=~/([fr])(\d+)z(\d+)(\S+)?/i){

            my $orient = $1;
            my $tnum=$2;
            my $head = $orient . $tnum;
            my $search = "tig" . $tnum;
            my $other = $4;
            $tot+= $3;
            $tigsum +=$3;

            my $gap = $1 if($other=~/m(\-?\d+)/);

            print "\tSC $a[0] - TIG $ct.  pattern: $t search: $search totalTigSize: $tot Orientation: $orient Gap/Overlap: $gap\n" if($verbose);

            my $count_trim = 0;

            open(FA,$contigs);
            READ:
            while(<FA>){
               chomp;
               if (/\>(\S+)/){
                  my $head=$1;
                  $seq =~ s/[BDEFHIJKLMOPQRSUVWXYZ]/N/g;
                  if ($prev=~/$search\|/i && $prev ne $head && $prev ne "NA"){
                     last READ;
                  }
                  $prev = $head;
                  $seq='';
               }elsif(/^(\S+)$/){
                  $seq.=uc($1);
               }
            }
            close FA;
            $seq = reverseComplement($seq) if($orient eq "r");

            print "\t$prev\n" if($verbose);

            #### CONTIG MERGE CODE ####
            if($word ne "NA"){
               #####
               if(length($seq)<=$chunk){
                  $template = $seq;
               }else{
                  $template = substr($seq,0,$chunk);
               }

               ##### word search
               my $dynamic_word = $word;

               SCAN:
               until($template =~ /$dynamic_word/){
                  $dynamic_word = substr($dynamic_word,1,length($dynamic_word));
                  if(length($dynamic_word) < $min_word_length){
                     $count_trim++;
                     last SCAN if($count_trim >= $max_count_trim);
                     $dynamic_word = substr($word,0,length($word)-$count_trim);
                  }
               }
           
               if($seq =~ /^\S{0,$max_count_trim}$dynamic_word(.*)/){### will grab the left-most match which is ok
                  my $tail = $1;
                  my $all = "ERROR_";
                  #my ($all) = ($1) if($prevseq =~ /^(.*)$dynamic_word/);  #($prevseq =~ /(.*)$dynamic_word[^$dynamic_word]*$/);
                  while($prevseq =~ /^(.*)$dynamic_word/ig){
                     $all = $1;
                  }
                  print "$prevseq **** $all **** WORD:$word *** DWord:$dynamic_word *** COUNTTRIM:$count_trim\n" if ($verbose && $all=~/ERROR/);

                  $prevseq = $all . lc($dynamic_word) . $tail;
                  my $overlap = length($dynamic_word);
                  $ct_merge++;
                  print "$ct_merge. GROUNDS FOR MERGING ($overlap nt overlap) !!!\n" if($verbose);
                  $headconcat .= "+" . $prev;
               }else{
                  print "No MERGE, will print previous sequence and memorize current.\n" if($verbose);
                  my $scsz = length($prevseq);
                  print OUT ">$a[0].$mct|size$scsz $headconcat\n$prevseq\n";
                  $prevseq = $seq;
                  $headconcat = $prev;
                  $mct++;
               }
            }else{
               $prevseq = $seq;
               $headconcat = $prev;
               $mct++;
            }

            ##### For the next search
            if(length($seq)<=$chunk){
               $word = $seq;
            }else{
               $word = substr($seq,length($seq)-$chunk,$chunk); ### this will be the next word to search with
            }
            ###########################
            #print OUT ">$a[0]_$prev\n$seq\n";

         }#tig regex
         
      }#each tig
      my $scsz = length($prevseq);
      print OUT ">$a[0].$mct|size$scsz $headconcat\n$prevseq\n";
      $prevseq = '';
   }
   close IN;
   close OUT;
}

