#!/usr/bin/perl
#----------------------------------------------------------#
#        Author: Douglas Senalik dsenalik@wisc.edu         #
#----------------------------------------------------------#
# "Black Box" program series
=bb
Find open reading frames in a DNA or RNA sequence
=cut bb
use strict;
use warnings;
use Getopt::Long;      # for getting command line parameters
use Time::HiRes qw ( time ); #@@@
# 1.0.0 - Mar 9, 2011
# 1.1.0 - June 8, 2011 = Add --trimheader, fix major bug in sorting alpha instead of numeric
# 1.2.0 - Aug 28, 2012 = Bug fix in returned nucleotide sequence in table (length too long), added --origorder=s
# 1.3.0 - Apr 1, 2013 = Add gff output format, increase efficiency



############################################################
# configuration variables
############################################################
my @strictstartcodons = ( "ATG" );
my @fullstartcodons = ( "ATG", "GTG", "CTG", "TTG" );
my @stopcodons = ( "[TU]AA", "[TU]AG", "[TU]GA" );
# thresholds for guessing orientation
my $guessthreshold = 0.10;  # "ambiguous" if both directions
my $guessminbp = 100;  # "tooshort" if below this
my %guessstats = ();
my %proteins = qw/
    UUU F UUC F UUA L UUG L UCU S UCC S UCA S UCG S UAU Y UAC Y UAA * UAG * UGU C UGC C UGA * UGG W
    CUU L CUC L CUA L CUG L CCU P CCC P CCA P CCG P CAU H CAC H CAA Q CAG Q CGU R CGC R CGA R CGG R
    AUU I AUC I AUA I AUG M ACU T ACC T ACA T ACG T AAU N AAC N AAA K AAG K AGU S AGC S AGA R AGG R
    GUU V GUC V GUA V GUG V GCU A GCC A GCA A GCG A GAU D GAC D GAA E GAG E GGU G GGC G GGA G GGG G
    /;
my $defaultfeatureid = "CDS";



############################################################
# global variables
############################################################
my $ansiup    = "\033[1A";  # terminal control
(my $prognopath = $0) =~ s/^.*[\/\\]//;
my $currhdr = "";
my $currseq = "";
my $guessdata = "";
my $numseq = 0;
my %orfperid = ();
my $outfilename2 = "";  # --nonorffasta file name
my @startcodons;
my $past = time(); #@@@



############################################################
# command line parameters
############################################################
my $infilename  = "";   # input file name
my $outfilename = "";   # output file name
my $anystart    = 0;    # start also at beginning of sequence
my $fullstart   = 0;    # use all 4 start codons
my $minlen      = 100;  # minimum orf length in b.p.
my $guessorient = 0;    # different output format: guess orientation
my $fasta       = 0;    # different output format: FASTA
my $nonorffasta = 0;    # second file with all sequence not in the first one
my $fastacollapse = 0;  # no duplication in --fasta output
my $fastalargest = 0;   # only larger of any overlapping orfs is kept
my $trimheader  = 0;    # keep header only up to first white space
my $featureid   = $defaultfeatureid;
my $non;
my $origorder;          # return in sequence order instead of sorted by size
my $nsequence   = 0;    # include column with nucleotide sequence
my $psequence   = 0;    # include column with protein sequence
my $gffformat   = 0;    # generate output in gff3 format
my $help        = 0;    # print help and exit
my $quiet       = 0;    # only show errors
my $debug       = 0;    # print extra debugging information
GetOptions (
            "infile=s"         => \$infilename,        # string
            "outfile=s"        => \$outfilename,       # string
            "fullstart"        => \$fullstart,         # flag
            "anystart"         => \$anystart,          # flag
            "minlen=i"         => \$minlen,            # integer
            "guessorientation" => \$guessorient,       # flag
            "fasta"            => \$fasta,             # flag
            "nonorffasta"      => \$nonorffasta,       # flag
            "fastacollapse"    => \$fastacollapse,     # flag
            "fastalargest"     => \$fastalargest,      # flag
            "trimheader"       => \$trimheader,        # flag
            "non:i"            => \$non,               # flag/integer
            "origorder:s"      => \$origorder,         # flag
            "nsequence"        => \$nsequence,         # flag
            "psequence"        => \$psequence,         # flag
            "gffformat"        => \$gffformat,         # flag
            "featureid=s"      => \$featureid,         # string
            "help"             => \$help,              # flag
            "quiet"            => \$quiet,             # flag
            "debug"            => \$debug);            # flag
# required parameters
unless ( $infilename ) { $help = 1; }
unless ( $outfilename ) { $help = 1; }
if ( ( defined $non ) and ( ! $non ) ) { $non = 1; }
if ( $gffformat ) { $trimheader = 1; }
# debug implies not quiet
if ( $debug ) { $quiet = 0; }



############################################################
# print help screen
############################################################
if ( $help )
  {
    print "$prognopath

This program will detect open reading frames in FASTA
DNA or RNA sequences. This is similar to the NCBI program at 
http://www.ncbi.nlm.nih.gov/gorf/orfig.cgi

Required parameters:
  --infile=xxx       input file name
  --outfile=xxx      output file name, use \"-\" for stdout
Optional parameters:
  --fullstart        use full set of start codons: ATG GTG CTG TTG
                     the default is to only use ATG
  --anystart         start of sequence is also a valid orf start
  --minlen=xxx       minimum orf length in b.p., default=$minlen
  --guessorientation guess orientation based on strand with the
                     most total orfs, this data will be be saved
                     instead of the list of orfs
  --fasta            output file is in FASTA format, each orf is
                     a separate sequence, information is in header
  --nonorffasta      second FASTA file with all sequence not in
                     the first one. File name is --outfile name
                     with \"nonorf\" inserted
  --fastacollapse    combine overlapping sequence in the FASTA file
  --fastalargest     if two orfs overlap, keep only the larger one
  --trimheader       remove any text in the FASTA header after 
                     the first occurrence of white space
  --origorder        return list in sequence order instead of
                     the default which is sorted by size
  --origorder=s      same, but do + and - strands separately
  --nsequence        include a column with nucleotide sequence
  --psequence        include a column with protein sequence
  --gffformat        generate output in gff3 format. This also
                     enables --trimheader
  --featureid=xxx    column 3 of gff file, default is \"$defaultfeatureid\"
  --non              do not allow any \"N\"s in an orf
  --help             print this screen
  --quiet            only print error messages
  --debug            print extra debugging information
";
    exit 1;
  } # if ( $help )



############################################################
# perform orf detection
############################################################
if ( $fullstart )
  { @startcodons = @fullstartcodons }
else
  { @startcodons = @strictstartcodons }
my $OUTF = stdopen ( ">", $outfilename );
my $OUTF2;
if ( $nonorffasta )
  {
    ( $outfilename2 = $outfilename ) =~ s/(\.[^\.]*)$/.nonorf$1/;
    $OUTF2 = stdopen ( ">", $outfilename2 );
  }
unless ( ( $guessorient ) or ( $fasta ) or ( $gffformat ) )
  {
    print $OUTF join ( "\t", "#ID", "orf", "Frame", "Start", "Codon", "Stop", "Codon", "Length" );
    if ( $nsequence ) { print $OUTF "\tDNA Sequence"; }
    if ( $psequence ) { print $OUTF "\tProtein Sequence"; }
    print $OUTF "\n";
  }
if ( $gffformat )
  { print $OUTF "##gff-version 3\n##Index-subfeatures 1\n\n"; }

my $INF = stdopen ( "<", $infilename );
while ( my $aline = <$INF> )
  {
    $aline =~ s/[\r\n]//g;
    if ( $aline =~ m/^>(.*)$/ )
      {
        my $seqhdr = $1;
        if ( $trimheader ) { $seqhdr =~ s/\s.*$//; }  # Trim anything after first white space
        unless ( $seqhdr ) { die "Error, FASTA header is missing, or starts with white space: \"$aline\"\n"; }
        process();
        $currhdr = $seqhdr;
        $currseq = "";
      }
    else
      {
        $currseq .= $aline;
      }
  } # while <$INF>
stdclose ( $INF );
process();

if ( $guessorient )
  {
    print $OUTF "#ID\tSeq Len\tbp Fwd\tbp Rev\tOrientation\n", $guessdata;
    print "Guess Orientation Summary:\n";
    while ( my ( $key, $value ) = each ( %guessstats ) )
      { print commify($value), "\t", $key, "\n"; }
  }

unless ( $quiet )
  { print commify($numseq), " sequences processed\n"; }

stdclose ( $OUTF );
if ( $nonorffasta )
  { stdclose ( $OUTF2 ); }



############################################################
sub process {
############################################################
  if ( $currseq )  # first call will have no sequence
    {
      $numseq++;
      # display progress for large input files
      if ( ( ( $numseq % 10 ) == 0 ) and ( ! $quiet ) )
        { print commify($numseq), "\n", $ansiup; }

      my @outdata = ();
      my $currseqrc = revcomp($currseq);
      my @start = ();
      my @startrc = ();
      my @stop = ();
      my @stoprc = ();
      my $currlen = length($currseq);
      # variables used for guessing the orientation
      my $bpplus = 0;
      my $bpminus = 0;

      # only use stop codon once, later hits will be for shorter orf
      my %stopused = ();
      my %stopusedrc = ();

      # to avoid column alignment problems, make sure header has no tabs in it
      $currhdr =~ s/\t/ /g;
      debugmsg ( "Processing \"$currhdr\", " . commify($currlen) . " b.p." );

      # if --anystart, add start of sequence positions in all reading frames
      if ( $anystart )
        {
          push ( @start, 0, 1, 2 );
          push ( @startrc, 0, 1, 2 );
        }
      foreach my $acodon ( @startcodons )
        {
          while ( $currseq =~ m/($acodon)/ig )
            {
              debugmsg ( "Fwd: Start codon $1 at ".(pos($currseq)-2) );
              push ( @start, pos($currseq)-2 );
            }
          while ( $currseqrc =~ m/($acodon)/ig )
            {
              debugmsg ( "Rev: Start codon $1 at ".(pos($currseqrc)-2) );
              push ( @startrc, pos($currseqrc)-2 );
            }
        }
#print "Start codons ".et()." \"$currhdr\" \n"; #@@@
      foreach my $acodon ( @stopcodons )
        {
          while ( $currseq =~ m/($acodon)/ig )
            {
              debugmsg ( "Fwd: Stop codon $1 at ".(pos($currseq)-2) );
              push ( @stop, pos($currseq)-2 );
            }
          while ( $currseqrc =~ m/($acodon)/ig )
            {
              debugmsg ( "Rev: Stop codon $1 at ".(pos($currseqrc)-2) );
              push ( @stoprc, pos($currseqrc)-2 );
            }
        }

#print "Stop codons ".et()." \"$currhdr\" \n";
      # sort arrays to put positions in increasing order
      @start = sort ( { $a <=> $b } @start );
      @stop = sort ( { $a <=> $b } @stop );
      @startrc = sort ( { $a <=> $b } @startrc );
      @stoprc = sort ( { $a <=> $b } @stoprc );

#print "Sort ".et()." \"$currhdr\" \n";
      # find start-stop pairs
      my $laststop = 0;
      foreach my $astart ( @start )
        {
          my $frame = ( $astart % 3 );
          for my $s ( $laststop..$#stop )
            {
              my $astop = $stop[$s];

              # stop must be > $start
              if ( $astop <= $astart )
                {
                  $laststop = $s;
                  next;
                }

              # frame must match
              next if ( ( $astop % 3 ) != $frame );

              # ignore this start codon if we hit a used stop codons ( meaning a start codon existed earlier upstream )
              last if ( $stopused{$astop} );

              # prohibit N's in orf if requested
              if ( $non )
                {
                  my $orfseq = substr ( $currseq, $astart-1, ( $astop - $astart + 1 ) );
                  last if ( $orfseq =~ m/[Nn]{$non,}/ );
                }

              # length must be >= --minlen parameter
              my $orflen = ( $astop - $astart + 3 );
              if ( $orflen >= $minlen )
                {
                  # [0]ID  [1]orf  [2]Frame  [3]Start  [4]Codon  [5]Stop  [6]Codon  [7]Length
                  $orfperid{$currhdr}++;
                  my @outline = ( $currhdr,
                                  $orfperid{$currhdr},
                                  "+".($frame?$frame:3),
                                  $astart,
                                  substr($currseq,$astart-1,3),
                                  ($astop+2),
                                  substr($currseq,$astop-1,3),
                                  $orflen );
                  if ( $nsequence ) { push ( @outline, substr ($currseq,$astart-1,$orflen) ); }
                  if ( $psequence ) { push ( @outline, dnatoprotein(substr ($currseq,$astart-1,$orflen)) ); }
                  push ( @outdata, \@outline );
                  $bpplus += $orflen;
                  debugmsg ( "Fwd, Valid orf at $astart to $astop, length=$orflen" );
                }
              else
                { debugmsg ( "Fwd, Discarding too-short orf ( $orflen b.p. ) at $astart to $astop" ); }

              # if a stop codon is used, don't use it again as that will be for
              # a shorter orf inside a larger one. So, set flag to ignore it
              $stopused{$astop} = 1;

              # exit inner loop when first orf found, even if too short
              last;
            }
        } # foreach my $astart ( @start )

#print "Pairing Forward ".et()." \"$currhdr\" \n";
      $laststop = 0;
      foreach my $astart ( @startrc )
        {
          my $frame = ( $astart % 3 );
          for my $s ( $laststop..$#stoprc )
            {
              my $astop = $stoprc[$s];

              # stop must be > $start
              if ( $astop <= $astart )
                {
                  $laststop = $s;
                  next;
                }

              # frame must match
              next if ( ( $astop % 3 ) != $frame );

              # ignore this start codon if we hit a used stop codon ( meaning a start codon existed earlier upstream )
              last if ( $stopusedrc{$astop} );

              # prohibit N's in orf if requested
              if ( $non )
                {
                  my $orfseq = substr ( $currseqrc, $astart-1, ( $astop - $astart + 1 ) );
                  next if ( $orfseq =~ m/[Nn]/ );
                }

              # length must be >= --minlen parameter
              my $orflen = ( $astop - $astart + 3 );
              if ( $orflen >= $minlen )
                {
                  # [0]ID  [1]orf  [2]Frame  [3]Start  [4]Codon  [5]Stop  [6]Codon  [7]Length
                  $orfperid{$currhdr}++;
                  my @outline = ( $currhdr,
                                  $orfperid{$currhdr},
                                  "-".($frame?$frame:3),
                                  ($currlen-$astart+1),
                                  substr($currseqrc,$astart-1,3),
                                  ($currlen-($astop+2)+1),
                                  substr($currseqrc,$astop-1,3),
                                  $orflen );
                  if ( $nsequence ) { push ( @outline, substr ($currseqrc,$astart-1,$orflen) ); }
                  if ( $psequence ) { push ( @outline, dnatoprotein(substr ($currseqrc,$astart-1,$orflen)) ); }
                  push ( @outdata, \@outline );
                  $bpminus += $orflen;
                  debugmsg ( "Rev, Valid orf at $astart to $astop, length=$orflen" );
                }
              else
                { debugmsg ( "Rev, Discarding too-short orf ( $orflen b.p. ) at $astart to $astop" ); }

              # if a stop codon is used, don't use it again as that will be for
              # a shorter orf inside a larger one. So, set flag to ignore it
              $stopusedrc{$astop} = 1;

              # exit inner loop when first orf found, even if too short
              last;
            }
        } # foreach my $astart ( @startrc )

#print "Pairing Reverse ".et()." \"$currhdr\" \n";
      # save orf data for later output in appropriate format
      if ( defined $origorder )  # sort output data by original sequence order
        {
          @outdata = sort { $a->[3] <=> $b->[3] } @outdata;
          if ( $origorder =~ m/s/i )
            { @outdata = sort { substr($a->[2],0,1) cmp substr($b->[2],0,1) } @outdata; }
        }
      else  # sort output data by orf length, largest first
        { @outdata = sort { $b->[7] <=> $a->[7] } @outdata; }

      # save guess orientation data
      if ( $guessorient )
        {
          my $txt;
          if ( ( $bpplus + $bpminus ) < 1 )
            { $txt = "no orfs"; }
          elsif ( ( $bpplus + $bpminus ) < $guessminbp )
            { $txt = "orfs too short"; }
          elsif ( ( abs( $bpplus - $bpminus ) / ( $bpplus + $bpminus ) ) < $guessthreshold )
            { $txt = "ambiguous"; }
          elsif ( $bpplus > $bpminus )
            { $txt = "forward"; }
          else
            { $txt = "reverse"; }

          $guessdata .= join ( "\t", $currhdr, $currlen, $bpplus, $bpminus, $txt ) . "\n";
          $guessstats{$txt}++;
        } # if ( $guessorient )

      elsif ( $fasta )
        {
          my %used = ();  # for --fastacollapse or --fastalargest only, key is start, value is end
          foreach my $rowref ( @outdata )
            {
              # [0]ID  [1]orf  [2]Frame  [3]Start  [4]Codon  [5]Stop  [6]Codon  [7]Length
              my $header = ">" . $rowref->[0].".".$rowref->[1] . " " .  join ( "; ", @{$rowref}[2..7] );
              my $len = $rowref->[7];
              my $seq = "error";
              if ( ( $fastacollapse ) or ( $fastalargest ) )
                {
                  # get range, start always the lower value
                  my $start = $rowref->[3];
                  my $end = $rowref->[5];
                  if ( $end < $start )
                    {
                      $start = $rowref->[5];
                      $end = $rowref->[3];
                    }

                  # see if this range overlaps an existing one
                  my $inserted = 0;
                  foreach my $key ( keys %used )
                    {
                      if ( ( $start >= $key ) and ( $end <= $used{$key} ) )
                        {
                          # full subset, can ignore it completely for both --fastacollapse and --fastalargest
                          debugmsg ( "Sequence \"$currhdr\": orf $start..$end is subset of orf $key..$used{$key}" );
                          $inserted = 1;
                          last;
                        } # if ( ( $start >= $key ) and ( $end <= $used{$key} ) )
                      elsif ( ( $end > $used{$key} ) and ( $start <= $used{$key} ) )
                        {
                          if ( $fastacollapse )
                            {
                              debugmsg ( "Sequence \"$currhdr\": orf $start..$end extends end of orf $key..$used{$key}" );
                              # extend end side
                              $used{$key} = $end;
                              # modify start to detect join events
                              if ( $start > $key ) { $start = $key }
                            } # if ( $fastacollapse )
                          if ( $fastalargest )
                            {
                              if ( ( $end - $start ) > ( $used{$key} - $key ) )  # if new orf larger
                                {
                                  debugmsg ( "Sequence \"$currhdr\": orf $start..$end larger than orf $key..$used{$key}" );
                                  $used{$start} = $end;
                                  delete ( $used{$key} );
                                }
                            } # if ( $fastalargest )
                          $inserted = 1;
                        } # elsif ( ( $end > $used{$key} ) and ( $start <= $used{$key} ) )
                      elsif ( ( $start < $key ) and ( $end >= $key ) )
                        {
                          if ( $fastacollapse )
                            {
                              debugmsg ( "Sequence \"$currhdr\": orf $start..$end extends start of orf $key..$used{$key}" );
                              # extend start side
                              $used{$start} = $used{$key};
                              delete ( $used{$key} );
                              # modify end to detect join events
                              if ( $end < $used{$start} ) { $end = $used{$start} }
                            } # if ( $fastacollapse )
                          if ( $fastalargest )
                            {
                              if ( ( $end - $start ) > ( $used{$key} - $key ) )  # if new orf larger
                                {
                                  debugmsg ( "Sequence \"$currhdr\": orf $start..$end larger than orf $key..$used{$key}" );
                                  $used{$start} = $end;
                                  delete ( $used{$key} );
                                }
                            } # if ( $fastalargest )
                          $inserted = 1;
                        } # elsif ( ( $start < $key ) and ( $end >= $key ) )
                    } # foreach

                  # no action taken above, so new non-overlapping range found
                  unless ( $inserted )
                    {
                      $used{$start} = $end; 
                      debugmsg ( "Sequence \"$currhdr\": orf $start..$end is new" );
                    } # unless ( $inserted )

                } # if ( ( $fastacollapse ) or ( $fastalargest ) )
              else  # do not collapse overlaps
                {
                  if ( $rowref->[2] =~ m/\-/ ) # if reverse complement
                    {
                      $seq = substr( $currseq, $rowref->[5]-1, $len );
                      $seq = revcomp ( $seq );
                      $header .= " [RC]";
                    }
                  else  # forward orientation
                    {
                      $seq = substr( $currseq, $rowref->[3]-1, $len );
                    }
                  print $OUTF $header, "\n";
                  print $OUTF $seq, "\n";
                } # else do not collapse overlaps
            } # foreach my $rowref ( @outdata )

          if ( ( $fastacollapse ) or ( $fastalargest ) )
            {
              debugmsg ( "Sequence \"$currhdr\" has ".scalar ( keys %used )." orf sequences to save" );
              my $numseq = 0;
              my $prevend = 0;
              foreach my $key ( sort { $a <=> $b } keys %used )
                {
                  $numseq++;
                  my $len = ( $used{$key} - $key + 1 );
                  my $header = ">$currhdr.$numseq $key..$used{$key} = $len b.p.";
                  my $seq = substr( $currseq, $key-1, $len );
                  debugmsg ( "Save FASTA of orf $key..$used{$key} length $len" );
                  print $OUTF $header, "\n";
                  print $OUTF $seq, "\n";
                  if ( $nonorffasta )
                    {
                      my $nolen = ( $key - $prevend - 1 );
                      my $noseq = substr( $currseq, $prevend, $nolen );
                      my $noheader = ">$currhdr.$numseq ".($prevend+1)."..".($key-1)." = $nolen b.p.";
                      debugmsg ( "Save FASTA of non-orf ".($prevend+1)."..".($key-1)." length $nolen" );
                      $prevend = $used{$key};
                      print $OUTF2 $noheader, "\n";
                      print $OUTF2 $noseq, "\n";
                    } # if ( $nonorffasta )
                } # foreach
              if ( $nonorffasta )
                {
                  $numseq++;
                  my $nolen = ( length($currseq) - $prevend );
                  my $noseq = substr( $currseq, $prevend, $nolen );
                  my $noheader = ">$currhdr.$numseq ".($prevend+1)."..".length($currseq)." = $nolen b.p.";
                  debugmsg ( "Save FASTA of last non-orf ".($prevend+1)."..".length($currseq)." length $nolen" );
                  print $OUTF2 $noheader, "\n";
                  print $OUTF2 $noseq, "\n";
                } # if ( $nonorffasta )
            } # if ( ( $fastacollapse ) or ( $fastalargest ) )

        } # if ( $fasta )
      elsif ( $gffformat )
        {
#@@@
          # rowref columns: [0]ID [1]orf [2]Frame [3]Start [4]Codon [5]Stop [6]Codon [7]Length
          my $orfindex = 0;
          foreach my $rowref ( @outdata )
            {
              my $strand = substr ( $rowref->[2], 0, 1 );  # + or -
              my $start = ( $strand eq '+' )?$rowref->[3]:$rowref->[5];
              my $end   = ( $strand eq '+' )?$rowref->[5]:$rowref->[3];
              $orfindex++;
              print $OUTF join ( "\t", $currhdr, "bb.orffinder", $featureid,
                                       $start, $end, '.', $strand, 0,
                                       "ID=$currhdr.$orfindex"
                                       . ";Name=" . $rowref->[7] . $rowref->[2]
                               ), "\n";
            }
        } # if ( $gffformat )

      else  # normal output
        {
          foreach my $rowref ( @outdata )
            { print $OUTF join ( "\t", @{$rowref} ), "\n"; }
        }

      @outdata = ();
    }
#print "Saving ".et()." \"$currhdr\" \n";
} # sub process



############################################################
sub debugmsg { my ( $text, $noreturn, $nolinenum ) = @_;
############################################################
  if ( $debug )
    {
      my ($package, $filename, $line, $sub) = caller(0);
      unless ( $nolinenum ) { $text = "Line $line: " . $text; }
      if ( ! ( $noreturn ) ) { $text .= "\n"; }
      print $text;
    } # if ( $debug )
} # sub debugmsg



###############################################################
sub timestr {
###############################################################
  @_ = localtime(shift || time);
  return(sprintf("%04d/%02d/%02d %02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1]));
} # sub timestr



###############################################################
sub commify {
###############################################################
# http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas
  local $_ = shift;
  1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
  return $_;
} # commify



############################################################
sub revcomp { my ( $dna ) = @_;
############################################################
# standard DNA reverse complement, including degenerate bases
  my $revcomp = reverse ( $dna );
  $revcomp =~ tr/AaCcTtGgMmRrYyKkVvHhDdBb/TtGgAaCcKkYyRrMmBbDdHhVv/;
  return $revcomp;
} # sub revcomp



############################################################
sub dnatoprotein { my ( $dna ) = @_;
############################################################
# translate DNA sequence into protein sequence
  $dna =~ tr/acgtT/ACGUU/;
  if ( ( length($dna) % 3 ) != 0 )
    { die "Error, DNA sequence length not a multiple of 3: \"$dna\"\n"; }
  my $protseq = "";
  foreach my $triplet ( $dna =~ m/(...)/g )
    {
      if ( defined $proteins{$triplet} )
        { $protseq .= $proteins{$triplet}; }
      else
        { $protseq .= "?"; }  # should only happen if any letter is not [ACTGU]
    }
  return $protseq;
} # sub dnatoprotein



###############################################################
sub stdopen { my ( $mode, $filename, $extratext ) = @_;
###############################################################
# a replacement for the three-parameter open which also allows
# the use of "-" as the file name to mean STDIN or STDOUT
  my $fh;  # the file handle
  if ( $filename eq "-" )  # only exact match to "-" has special meaning
    {
      if ( $mode =~ m/>/ )
        { $fh = *STDOUT }
      else
	{ $fh = *STDIN }
    }
  else
    {
      # supplemental passed text for error messages, need one more space
      if ( defined $extratext )
        { $extratext .= " " }
      else
	{ $extratext = "" }

      my $text;  # this is only used for error message
      if ( $mode =~ m/^\+?>>/ )  # ">>" or "+>>"
        { $text = "append" }
      elsif ( $mode =~ m/^\+?>/ )  # ">" or "+>"
        { $text = "output" }
      elsif ( $mode =~ m/^\+?</ )  # "<" or "+<"
        { $text = "input" }
      else
	{ die "Error, unsupported file mode \"$mode\" specified to stdopen( $mode, $filename, $extratext )\n"; }
      open ( $fh, $mode, $filename ) or die ( "Error opening ${extratext}file \"$filename\" for $text: $!\n" );
    }
  # return the opened file handle to the caller
  return $fh;
} # stdopen



###############################################################
sub stdclose { my ( $fh ) = @_;
###############################################################
# same as built-in close, except in case of STDIN or STDOUT,
# and in this case the file handle is not closed

  unless ( fileno($fh) <= 2 )  # if file number is this low, is stdin or stdout or stderr
    { close ( $fh ) or die ( "Error closing file handle: $!\n" ); }

} # sub stdclose



# eof

=test_data
>A1Contig1
TCACACTTTTAGCTAACAGATTTACATATCTCCGTTGGAGATGCTCTAACATGACTAGAATGAGATGGCCCTCGTTAAAA
TTTTAAAATCTCGGCAGAGAAGTTGTTAAAAAGAGCACAATTCATGAGTTCGATAACTGATGAGAAACTGAGTGAACATA
AAGCAACACAGAGGGCAGTACAACTCTTATTTAAGAACGAGATTAGAAATGTCCAACATCTTCACAAATGAAGGACCGAA
GTTTGATATGGAATTCTGACTTGATAGAGACATTAGAGTTGGAAAACTTGTTGATCAATGCAGCTATTACCATGCATTCA
GCTGAAGCAAGAAAAGAGAGCAGAGGAGCTCATGCTCGTGAAGATTTTACGAAAAGAGATGATGAGAATTGGATGAAGCA
TTCATTGGGATACTGGGAGAATGAGAAGGTACGGCTGGACTACAGGCCTGTTCACATGAACACATTAGACGATGAGATTG
AAACGTTCCCACCTAAAGCACGTGTGTACTGAGGTATGGTATTGAAGGATACATGTGGTTGGGGAAAAATATAATATTTT
CTTCTAAGAAGTCCGCAATAAATTTACTGAGACCTAGAAAATTTCTAAATAATGACGTCATTTGTCAAACTGCAGTCGGG
GTGAGTAAGCTTGTCTCTGTCATGAGCAAGGGTGTGAGAATACCTCACTTGTATTGATCATGTTCCTGGCAACAAATATT
TTATACTACTCATAAAAATGACTCCAAACTTTCAATTACTAGTAACAACTGGATTGGAAGTTAAATCAAAGTGTTATCTT
ATTATTGGTAGCTCAGTTGAACTTTGTTTTAGAACAAAATGCTTACTATAATTTCGTGAGCATTATGACTTTGGATGACA
TTTTGGTCATTGTATTTGTGTATCAAATGTGAGATTAGTTGCTGTCTACT

NCBI results from http://www.ncbi.nlm.nih.gov/gorf/orfig.cgi
Frame		from		to	Length
 +3		228	..	512	285
 +2		533	..	697	165
 -1		493	..	621	129
 +3		738	..	851	114
 -3		203	..	313	111
 +1		124	..	231	108

Results from this program
#ID     orf     Frame   Start   Codon   Stop    Codon   Length
A1Contig1       2       +3      228     ATG     512     TGA     285
A1Contig1       3       +2      533     ATG     697     TGA     165
A1Contig1       5       -1      621     ATG     493     TAG     129
A1Contig1       4       +3      738     ATG     851     TAA     114
A1Contig1       6       -3      313     ATG     203     TAA     111
A1Contig1       1       +1      124     ATG     231     TGA     108

=cut test_data
sub et {
  # elapsed time
  my $now = time();
  my $et = $now - $past;
  $past = $now;
  return $et;
} # sub et