#!/usr/bin/perl
#----------------------------------------------------------#
#        Author: Douglas Senalik dsenalik@wisc.edu         #
#    http://www.vcru.wisc.edu/simonlab/sdata/software/     #
#----------------------------------------------------------#
# "Black Box" program series
=bb
Rearrange the order of sequences in a FASTA file
=cut bb
use strict;
use warnings;
use Getopt::Long;      # for getting command line parameters
# version 1.0 Spetember 3, 2011



############################################################
# configuration variables
############################################################
my $version = "1.0";
my $defaultprefix = "concatenated";
my $spacerlen = 20;



############################################################
# global variables
############################################################
my $ansiup    = "\033[1A";  # terminal control
(my $prognopath = $0) =~ s/^.*[\/\\]//;
my @inhdr = ();   # FASTA headers without ">"
my @inseq = ();   # sequence
my @inqhdr = ();  # quality headers without ">"
my @inqseq = ();  # quality "sequence"
my $doqual = 0;   # 1 if found and read in quality file
my $outqualname = "";  # output quality file name ( if used )
my $currbp = 0;   # current number of base pairs written, for $startstop
my %excludehash = ();  # contigs to exclude, converted to hash for easy lookup



############################################################
# command line parameters
############################################################
my $infilename  = "";   # input file name
my $outfilename = "";   # output file name
my $append      = 0;    # append to existing output file
my $coordinates = "";   # output file for coordinates, where each contig starts and ends
my @seq         = ();   # sequences to keep
my @exclude     = ();   # sequences to exclude
my $help        = 0;    # print help and exit
my $oneseq      = 0;    # to concatenate all sequences
my $prefix      = $defaultprefix;   # if using --oneseq, prefix file with this header
my $startstop   = 0;    # put start and stop coordinates in file
my $noqual      = 0;    # do not attempt to include qual file
my $blankline   = 0;    # blank line between sequences in --onesequence mode
my $random      = 0;    # return this many random contigs
my $quiet       = 0;    # only show errors
my $debug       = 0;    # print extra debugging information
GetOptions (
            "infile=s"       => \$infilename,        # string
            "outfile=s"      => \$outfilename,       # string
            "coordinates=s"  => \$coordinates,       # string
            "sequence=s"     => \@seq,               # string array
            "exclude=s"      => \@exclude,           # string array
            "onesequence"    => \$oneseq,            # flag
            "prefix=s"       => \$prefix,            # string
            "noqual"         => \$noqual,            # flag
            "blankline"      => \$blankline,         # flag
            "append"         => \$append,            # flag
            "startstop"      => \$startstop,         # flag
            "random=i"       => \$random,            # integer
            "help"           => \$help,              # flag
            "quiet"          => \$quiet,             # flag
            "debug"          => \$debug);            # flag
unless ( $infilename ) { $help = 1; }
unless ( $outfilename ) { $help = 1; }
if ( ( ( scalar @seq ) + ( scalar @exclude ) + $random ) < 1 ) { $help = 1; }
# debug implies not quiet
if ( $debug ) { $quiet = 0; }



############################################################
# print help screen
############################################################
if ( $help )
  {
    print "$prognopath  Version $version

Rearrange the order of sequences in a FASTA file based on
your specified contigs and orientations

Required parameters:
  --infile=xxx      input FASTA file name with multiple sequences
  --outfile=xxx     output file name, or \"-\" for stdout
  --seq=xxx         sequences to keep, multiple allowed, a plus
                    \"+\" for forward orientation is optional, 
                    or use \"-\" anywhere to indicate reverse
                    complement. Use \"..\" to indicate a range.
                    Use \",\" to separate entries. Examples:
                    --seq=contig45 --seq=46,49,-21..23
                    --seq=+32 --seq=-65 --seq=76-
                    --seq=00021..45- -seq=45+..47
                    or use \"s\" for a spacer of $spacerlen Ns
                    e.g. --seq=00021+,S,45-
                    The --seq parameter may be omitted if --exclude
                    or --random is used instead
Optional parameters:
  --exclude=xxx     use this in place of the --seq parameter to
                    output all contigs except these. Order will be
                    unchanged from the original file in this case.
  --random=xxx      return this many sequences selected at random
                    and placed in random order
  --coordinates=xxx create this additional output file, which has
                    the starting and ending position of each contig
  --onesequence     concatenate all sequences into one
  --blankline       for --onesequence mode, put a blank line
                    between each sequence
  --prefix=xxx      if using --onesequence, use this prefix,
                    default = \"$prefix\"
  --append          append to existing --outfile
  --startstop       add starting and ending base position
                    to FASTA headers
  --noqual          if a .qual file is present, a corresponding
                    output .qual file is created. This flag
                    turns off this quality file processing
  --help            print this screen
  --quiet           only print error messages
  --debug           print extra debugging information
";
    exit 1;
  } # if ( $help )



############################################################
# expand ranges in specified sequences
############################################################
@seq = expandseq ( "--seq", @seq );
@exclude = expandseq ( "--exclude", @exclude );

# convert excludes to a hash for easy lookup
foreach my $anexclude ( @exclude )
  {
    $anexclude =~ s/\-//g;  # remove directional information for excludes
    $excludehash{$anexclude} = 1;
  } # foreach my $anexclude ( @exclude )



############################################################
# read input file into array
############################################################
{
debugmsg ( "Reading input file \"$infilename\"" );
my $currcontig = 0;
my $nlines = 0;
my $nheaders = 0;
my $contigindex = 0;
my $INF = stdopen ( "<", $infilename );
while ( my $aline = <$INF> )
  {
    $nlines++;
    $aline =~ s/[\r]//g;  # remove DOS returns
    if ( $aline =~ m/^>(.*)$/ )
      {
        my $hdr = $1;
        $contigindex++;
        $nheaders++;
        $hdr =~ s/[\s\n]+$//g;  # header remove newlines and trailing white space
        if ( $hdr =~ m/contig(\d+)/i )
          {
            $currcontig = $1;
            $inhdr[$currcontig] = $hdr;  # ">" has been removed
          }
        else
          { die "Could not find contig number in header \"$hdr\" line $nlines\n"; }
      }
    else
      {
        $inseq[$currcontig] .= $aline;  # retains newlines
      }
  } # while <$INF>
stdclose ( $INF );
unless ( $quiet )
  { print "Input file read, " . commify($nlines) . " lines, " . commify($nheaders) . " sequences\n"; }
}



############################################################
# read quality file into array
############################################################
unless ( $noqual )
  {
my $inqualname = qualname( $infilename );
if ( -e $inqualname )
  {
debugmsg ( "Reading input quality file \"$inqualname\", output name will be \"$outqualname\"" );
my $currcontig = 0;
my $nlines = 0;
my $nheaders = 0;
my $contigindex = 0;
my $INF = stdopen ( "<", $inqualname, "quality" );
while ( my $aline = <$INF> )
  {
    $nlines++;
    $aline =~ s/[\r]//g;  # remove DOS returns
    if ( $aline =~ m/^>(.*)$/ )
      {
        my $hdr = $1;
        $contigindex++;
        $nheaders++;
        $hdr =~ s/[\s\n]+$//g;  # header remove newlines and trailing white space
        if ( $hdr =~ m/contig(\d+)/i )
          {
            $currcontig = $1;
            $inqhdr[$currcontig] = $hdr;  # ">" has been removed
          }
        else
          { die "Could not find contig number in header \"$hdr\" line $nlines\n"; }
      }
    else
      {
        $inqseq[$currcontig] .= $aline;  # retains newlines
      }
  } # while <$INF>
stdclose ( $INF );
unless ( $quiet )
  { print "Quality file read, " . commify($nlines) . " lines, " . commify($nheaders) . " sequences\n"; }
$doqual = 1;
  } # if quality file found
else  # no quality file found
  {
    unless ( $quiet ) { print "No corresponding quality file found\n"; }
  } # else no quality file found
  } # unless ( $noqual )



############################################################
# random mode, create list of contigs
############################################################
if ( $random )
  {
    # create a new list of just defined contig indices ( some contig numbers may be blank )
    my @definedcontigs = ();
    for ( my $i=0; $i<=$#inhdr; $i++ )
      {
        if ( defined($inhdr[$i]) )
          { push ( @definedcontigs, $i ) }
      } # for $i

    # get list of random contigs
    for ( my $i=1; $i<=$random; $i++ )
      {
        # test to prevent getting more random contigs than are available
        if ( scalar @definedcontigs < 1 )
          { die "Error, requested more random contigs than are in the input file\n"; }

        # select a random contig and save it to the @seq list
        my $randomindex = int(rand(@definedcontigs));
        my $contigindex = $definedcontigs[$randomindex];
        debugmsg ( "Selecting random contig \"$contigindex\"" );
        push ( @seq, $contigindex );

        # remove selected contig from array to prevent re-use
        splice ( @definedcontigs, $randomindex, 1 );

      } # for $i

  } # if ( $random )



############################################################
# create output file
############################################################
{
debugmsg ( "Creating output FASTA file \"$outfilename\"" );
my @coordinatesdata = ();
my $currposition = 1;
my $nseq = 0;
my $nlines = 0;
my $OUTF = stdopen ( $append?">>":">", $outfilename );
my $OUTQ;
if ( $doqual )
  {
    $outqualname = qualname( $outfilename );
    debugmsg ( "Creating output quality file \"$outqualname\"" );
    $OUTQ = stdopen ( $append?">>":">", $outqualname, "quality" );
  }
if ( $oneseq ) 
  {
    print $OUTF ">".$prefix." ".scalar(@seq)." sequences: ", join (",", @seq), "\n";
    if ( $doqual ) { print $OUTQ ">".$prefix." ".scalar(@seq)." sequences: ", join (",", @seq), "\n"; }
  }



############################################################
# --seq mode
############################################################
foreach my $aseqnum ( @seq )
  {
    # strip off reverse complement flag if present
    my $rc = 0;
    if ( $aseqnum =~ m/-/ ) { $rc = 1; }
    $aseqnum =~ s/-//g;

    # check for special spacer
    unless ( $aseqnum =~ m/^S$/i )
      {
        # check validity of number
        unless ( defined $inseq[$aseqnum] )
          { die "Error, no such sequence number \"$aseqnum\"\n"; }

        # skip this one if it is on the exclude list
        if ( $excludehash{$aseqnum} )
          {
            debugmsg ( "Specified sequence \"$aseqnum\" is on the exclude list, skipping it" );
            next;
          }
      }

    # get relevant sequence
    my $aseq;
    my $aqual = "";
    if ( $aseqnum =~ m/^S$/i )
      {
        $aseq = "N"x$spacerlen; 
        if ( $doqual ) { $aqual = "00 "x$spacerlen; }  # not tested this may be wrong @@@
      }
    else
      {
        $aseq = $inseq[$aseqnum];
        if ( $doqual ) { $aqual = $inqseq[$aseqnum]; }
      }

    debugmsg ( "Adding sequence $aseqnum " . ($rc?"=ReverseComplement":"") . ", " . commify(length($aseq)) . " b.p." );

    # reverse complement it if appropriate
    if ( $rc )
      {
        $aseq = revcomp ( $aseq );
        if ( $doqual ) { $aqual = qualrev ( $aqual ); }
      }

    # strip leading and trailing white space from sequence
    $aseq =~ s/^\s+//;
    $aseq =~ s/\s+$//;
    if ( $doqual )
      {
        $aqual =~ s/^\s+//;
        $aqual =~ s/\s+$//;
      }

    # calculate sequence length for --startstop and for statistics line at end
    my $seqlen = 0;
    my $bseq = $aseq;
    $bseq =~ s/[^A-Za-z]//g;
    $seqlen = length ( $bseq );
    if ( $coordinates )
      {
        push ( @coordinatesdata, join ( "\t", $aseqnum, $rc?"-":"+", $currposition, ( $currposition+$seqlen-1 ) ) );
      }
    $currposition += $seqlen;

    # print header to ouput ( unless concatenating )
    $nseq++;
    unless ( $oneseq )
      {
        print $OUTF ">", $inhdr[$aseqnum];
        if ( $startstop )
          { print $OUTF " ",($currbp+1),"..",($currbp+$seqlen); }
        # append " RC" to end of FASTA header if we reverse complemented it
        if ( $rc )
          { print $OUTF " RC"; }
        print $OUTF "\n";
        $nlines++;
        if ( $doqual ) { print $OUTQ ">", $inqhdr[$aseqnum], "\n"; }
      }
    if ( $startstop ) { $currbp += $seqlen; }

    # print sequence to output, and follow with a return
    print $OUTF $aseq, "\n";
    if ( $blankline ) { print $OUTF "\n"; }
    $nlines += ( $aseq =~ tr/\n// + 1 );
    if ( $doqual ) { print $OUTQ $aqual, "\n"; }

  } # foreach my $aseq ( @seq )



############################################################
# --exclude mode
############################################################
if ( ( scalar @exclude ) and ( ! scalar @seq ) )  # if both --seq and --exclude were specified, we are done already
  {
    # loop through all input sequences in their original order
    for ( my $aseqnum=0; $aseqnum<=$#inhdr; $aseqnum++ )
      {
        # skip undefined sequences
        next unless ( defined $inhdr[$aseqnum] );

        # skip if on the exclude list
        if ( $excludehash{$aseqnum} )
          {
            debugmsg ( "Sequence \"$aseqnum\" is on the exclude list, skipping it" );
            next;
          }

        # get relevant sequence
        my $aseq = $inseq[$aseqnum];
        my $aqual = "";
        if ( $doqual ) { $aqual = $inqseq[$aseqnum]; }

        debugmsg ( "Adding sequence $aseqnum, " . commify(length($aseq)) . " b.p." );

        # strip leading and trailing white space from sequence
        $aseq =~ s/^\s+//;
        $aseq =~ s/\s+$//;
        if ( $doqual )
          {
            $aqual =~ s/^\s+//;
            $aqual =~ s/\s+$//;
          }

        # calculate sequence length for --startstop and for statistics line at end
        my $seqlen = 0;
        my $bseq = $aseq;
        $bseq =~ s/[^A-Za-z]//g;
        $seqlen = length ( $bseq );
        if ( $coordinates )
          {
            push ( @coordinatesdata, join ( "\t", $aseqnum, "+", $currposition, ( $currposition+$seqlen-1 ) ) );
          }
        $currposition += $seqlen;

        # print header to ouput ( unless concatenating )
        $nseq++;
        unless ( $oneseq )
          {
            print $OUTF ">", $inhdr[$aseqnum];
            if ( $startstop )
              { print $OUTF " ",($currbp+1),"..",($currbp+$seqlen); }
            print $OUTF "\n";
            $nlines++;
            if ( $doqual ) { print $OUTQ ">", $inqhdr[$aseqnum], "\n"; }
          }
        if ( $startstop ) { $currbp += $seqlen; }

        # print sequence to output, and follow with a return
        print $OUTF $aseq, "\n";
        if ( $blankline ) { print $OUTF "\n"; }
        $nlines += ( $aseq =~ tr/\n// + 1 );
        if ( $doqual )
          {
            print $OUTQ $aqual, "\n"; 
            if ( $blankline ) { print $OUTQ "\n"; }
          }
      } # for $aseqnum
  } # if ( ( scalar @exclude ) and ( ! scalar @seq ) )



stdclose ( $OUTF );
unless ( $quiet )
  { print "Output file created, " . commify($nlines) . " lines, " . commify($nseq) . " sequences, ".commify($currposition-1)." b.p.\n"; }
if ( $doqual )
  { stdclose ( $OUTQ ); }

if ( $coordinates )
  {
    debugmsg ( "Storing coordinates data to file \"$coordinates\"" );
    my $OUTC = stdopen ( ">", $coordinates );
    print $OUTC join ( "\n", @coordinatesdata ), "\n";
    stdclose ( $OUTC );
  } # if ( $coordinates )
}



exit 0;



############################################################
sub expandseq { my ( $debugtxt, @unexpanded ) = @_;
############################################################
# expand --seq or --exclude parameter ranges
# After parsing, all entries will be an optional "-" followed
# by contig number with leading zeroes stripped off
# e.g. 456 or -678
my @expanded = ();
foreach my $aseq ( @unexpanded )
  {
    # split at commas or white space
    foreach my $asubseq ( split ( /[,\s]+/, $aseq ) )
      {
        # skip null items
        next unless ( $asubseq );

        # save reverse complement flag
        my $revcomp = "";
        if ( $asubseq =~ m/-/ ) { $revcomp = "-"; }

        # remove all non [ digits or periods or $ or S ], removes "contig", "-", and "+"
        $asubseq =~ s/[^\d\.\$Ss]//g;

        # expand ranges
        if ( $asubseq =~ m/^(.*)\.\.(.*)$/ )
          {
            ( my $start = $1 ) =~ s/^0+//;
            ( my $end = $2 ) =~ s/^0+//;
            for ( my $i=$start; $i<=$end; $i++ )
              { push ( @expanded, $revcomp.$i ); }
          }
        else  # single contig
          {
            $asubseq =~ s/^0+//;  # trim leading zeroes
            push ( @expanded, $revcomp.$asubseq );
          }
      } # foreach my $asubseq
  } # foreach my $aseq ( @unexpanded )
debugmsg ( "Original $debugtxt array contained ".commify(scalar(@unexpanded))." items, after expansion it contains ".commify(scalar(@expanded))." items" );
return @expanded;
} # sub expandseq



############################################################
sub debugmsg { my ( $text, $noreturn, $nolinenum ) = @_;
############################################################
  if ( $debug )
    {
      my ($package, $filename, $line, $sub) = caller(0);
      unless ( $nolinenum ) { $text = "Line $line: " . $text; }
      if ( ! ( $noreturn ) ) { $text .= "\n"; }
      print $text;
    } # if ( $debug )
} # sub debugmsg



###############################################################
sub timestr {
###############################################################
  @_ = localtime(shift || time);
  return(sprintf("%04d/%02d/%02d %02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1]));
} # sub timestr



###############################################################
sub commify {
###############################################################
# http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas
  local $_ = shift;
  1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
  return $_;
} # commify



############################################################
sub qualname { my ( $fastaname ) = @_;
############################################################
# convert a FASTA file name to the corresponding quality file name
  my $qualname = $fastaname;
  $qualname =~ s/\.[^\.]+$//;  # remove existing extension whatever it is
  $qualname .= ".qual";        # replace with ".qual";
} # sub qualname



############################################################
sub revcomp { my ( $dna ) = @_;
############################################################
# standard DNA reverse complement, including degenerate bases
  my $revcomp = reverse ( $dna );
  $revcomp =~ tr/AaCcTtGgMmRrYyKkVvHhDdBb/TtGgAaCcKkYyRrMmBbDdHhVv/;
  return $revcomp;
} # sub revcomp



############################################################
sub qualrev { my ( $qual ) = @_;
############################################################
# reverse quality score order, retain returns in string
  $qual =~ s/\n/ \n /g;  # space on both sides of return so we can retain them
  # convert to array, some elements will be returns, and reverse the order
  my @qualarr = reverse( split ( /[ \t]+/, $qual ) );
  # convert back to string, space between values
  $qual = join ( " ", @qualarr );
  # remove extra spaces around returns that we introduced earlier
  $qual =~ s/ *\n *//g;
  return $qual
} # sub revcomp



###############################################################
sub stdopen { my ( $mode, $filename, $extratext ) = @_;
###############################################################
# a replacement for the three-parameter open which also allows
# the use of "-" as the file name to mean STDIN or STDOUT
  my $fh;  # the file handle
  if ( $filename eq "-" )  # only exact match to "-" has special meaning
    {
      if ( $mode =~ m/>/ )
        { $fh = *STDOUT }
      else
        { $fh = *STDIN }
    }
  else
    {
      # supplemental passed text for error messages, need one more space
      if ( defined $extratext )
        { $extratext .= " " }
      else
        { $extratext = "" }

      my $text;  # this is only used for error message
      if ( $mode =~ m/^\+?>>/ )  # ">>" or "+>>"
        { $text = "append" }
      elsif ( $mode =~ m/^\+?>/ )  # ">" or "+>"
        { $text = "output" }
      elsif ( $mode =~ m/^\+?</ )  # "<" or "+<"
        { $text = "input" }
      else
        { die "Error, unsupported file mode \"$mode\" specified to stdopen( $mode, $filename, $extratext )\n"; }
      open ( $fh, $mode, $filename ) or die ( "Error opening ${extratext}file \"$filename\" for $text: $!\n" );
    }
  # return the opened file handle to the caller
  return $fh;
} # stdopen



###############################################################
sub stdclose { my ( $fh ) = @_;
###############################################################
# same as built-in close, except in case of STDIN or STDOUT,
# and in this case the file handle is not closed

  unless ( fileno($fh) <= 2 )  # if file number is this low, is stdin or stdout or stderr
    { close ( $fh ) or die ( "Error closing file handle: $!\n" ); }

} # sub stdclose



# eof