#!/usr/bin/perl #----------------------------------------------------------# # Author: Douglas Senalik dsenalik@wisc.edu # #----------------------------------------------------------# # "Black Box" program series =bb Find open reading frames in a DNA or RNA sequence =cut bb use strict; use warnings; use Getopt::Long; # for getting command line parameters use Time::HiRes qw ( time ); #@@@ # 1.0.0 - Mar 9, 2011 # 1.1.0 - June 8, 2011 = Add --trimheader, fix major bug in sorting alpha instead of numeric # 1.2.0 - Aug 28, 2012 = Bug fix in returned nucleotide sequence in table (length too long), added --origorder=s # 1.3.0 - Apr 1, 2013 = Add gff output format, increase efficiency ############################################################ # configuration variables ############################################################ my @strictstartcodons = ( "ATG" ); my @fullstartcodons = ( "ATG", "GTG", "CTG", "TTG" ); my @stopcodons = ( "[TU]AA", "[TU]AG", "[TU]GA" ); # thresholds for guessing orientation my $guessthreshold = 0.10; # "ambiguous" if both directions my $guessminbp = 100; # "tooshort" if below this my %guessstats = (); my %proteins = qw/ UUU F UUC F UUA L UUG L UCU S UCC S UCA S UCG S UAU Y UAC Y UAA * UAG * UGU C UGC C UGA * UGG W CUU L CUC L CUA L CUG L CCU P CCC P CCA P CCG P CAU H CAC H CAA Q CAG Q CGU R CGC R CGA R CGG R AUU I AUC I AUA I AUG M ACU T ACC T ACA T ACG T AAU N AAC N AAA K AAG K AGU S AGC S AGA R AGG R GUU V GUC V GUA V GUG V GCU A GCC A GCA A GCG A GAU D GAC D GAA E GAG E GGU G GGC G GGA G GGG G /; my $defaultfeatureid = "CDS"; ############################################################ # global variables ############################################################ my $ansiup = "\033[1A"; # terminal control (my $prognopath = $0) =~ s/^.*[\/\\]//; my $currhdr = ""; my $currseq = ""; my $guessdata = ""; my $numseq = 0; my %orfperid = (); my $outfilename2 = ""; # --nonorffasta file name my @startcodons; my $past = time(); #@@@ ############################################################ # command line parameters ############################################################ my $infilename = ""; # input file name my $outfilename = ""; # output file name my $anystart = 0; # start also at beginning of sequence my $fullstart = 0; # use all 4 start codons my $minlen = 100; # minimum orf length in b.p. my $guessorient = 0; # different output format: guess orientation my $fasta = 0; # different output format: FASTA my $nonorffasta = 0; # second file with all sequence not in the first one my $fastacollapse = 0; # no duplication in --fasta output my $fastalargest = 0; # only larger of any overlapping orfs is kept my $trimheader = 0; # keep header only up to first white space my $featureid = $defaultfeatureid; my $non; my $origorder; # return in sequence order instead of sorted by size my $nsequence = 0; # include column with nucleotide sequence my $psequence = 0; # include column with protein sequence my $gffformat = 0; # generate output in gff3 format my $help = 0; # print help and exit my $quiet = 0; # only show errors my $debug = 0; # print extra debugging information GetOptions ( "infile=s" => \$infilename, # string "outfile=s" => \$outfilename, # string "fullstart" => \$fullstart, # flag "anystart" => \$anystart, # flag "minlen=i" => \$minlen, # integer "guessorientation" => \$guessorient, # flag "fasta" => \$fasta, # flag "nonorffasta" => \$nonorffasta, # flag "fastacollapse" => \$fastacollapse, # flag "fastalargest" => \$fastalargest, # flag "trimheader" => \$trimheader, # flag "non:i" => \$non, # flag/integer "origorder:s" => \$origorder, # flag "nsequence" => \$nsequence, # flag "psequence" => \$psequence, # flag "gffformat" => \$gffformat, # flag "featureid=s" => \$featureid, # string "help" => \$help, # flag "quiet" => \$quiet, # flag "debug" => \$debug); # flag # required parameters unless ( $infilename ) { $help = 1; } unless ( $outfilename ) { $help = 1; } if ( ( defined $non ) and ( ! $non ) ) { $non = 1; } if ( $gffformat ) { $trimheader = 1; } # debug implies not quiet if ( $debug ) { $quiet = 0; } ############################################################ # print help screen ############################################################ if ( $help ) { print "$prognopath This program will detect open reading frames in FASTA DNA or RNA sequences. This is similar to the NCBI program at http://www.ncbi.nlm.nih.gov/gorf/orfig.cgi Required parameters: --infile=xxx input file name --outfile=xxx output file name, use \"-\" for stdout Optional parameters: --fullstart use full set of start codons: ATG GTG CTG TTG the default is to only use ATG --anystart start of sequence is also a valid orf start --minlen=xxx minimum orf length in b.p., default=$minlen --guessorientation guess orientation based on strand with the most total orfs, this data will be be saved instead of the list of orfs --fasta output file is in FASTA format, each orf is a separate sequence, information is in header --nonorffasta second FASTA file with all sequence not in the first one. File name is --outfile name with \"nonorf\" inserted --fastacollapse combine overlapping sequence in the FASTA file --fastalargest if two orfs overlap, keep only the larger one --trimheader remove any text in the FASTA header after the first occurrence of white space --origorder return list in sequence order instead of the default which is sorted by size --origorder=s same, but do + and - strands separately --nsequence include a column with nucleotide sequence --psequence include a column with protein sequence --gffformat generate output in gff3 format. This also enables --trimheader --featureid=xxx column 3 of gff file, default is \"$defaultfeatureid\" --non do not allow any \"N\"s in an orf --help print this screen --quiet only print error messages --debug print extra debugging information "; exit 1; } # if ( $help ) ############################################################ # perform orf detection ############################################################ if ( $fullstart ) { @startcodons = @fullstartcodons } else { @startcodons = @strictstartcodons } my $OUTF = stdopen ( ">", $outfilename ); my $OUTF2; if ( $nonorffasta ) { ( $outfilename2 = $outfilename ) =~ s/(\.[^\.]*)$/.nonorf$1/; $OUTF2 = stdopen ( ">", $outfilename2 ); } unless ( ( $guessorient ) or ( $fasta ) or ( $gffformat ) ) { print $OUTF join ( "\t", "#ID", "orf", "Frame", "Start", "Codon", "Stop", "Codon", "Length" ); if ( $nsequence ) { print $OUTF "\tDNA Sequence"; } if ( $psequence ) { print $OUTF "\tProtein Sequence"; } print $OUTF "\n"; } if ( $gffformat ) { print $OUTF "##gff-version 3\n##Index-subfeatures 1\n\n"; } my $INF = stdopen ( "<", $infilename ); while ( my $aline = <$INF> ) { $aline =~ s/[\r\n]//g; if ( $aline =~ m/^>(.*)$/ ) { my $seqhdr = $1; if ( $trimheader ) { $seqhdr =~ s/\s.*$//; } # Trim anything after first white space unless ( $seqhdr ) { die "Error, FASTA header is missing, or starts with white space: \"$aline\"\n"; } process(); $currhdr = $seqhdr; $currseq = ""; } else { $currseq .= $aline; } } # while <$INF> stdclose ( $INF ); process(); if ( $guessorient ) { print $OUTF "#ID\tSeq Len\tbp Fwd\tbp Rev\tOrientation\n", $guessdata; print "Guess Orientation Summary:\n"; while ( my ( $key, $value ) = each ( %guessstats ) ) { print commify($value), "\t", $key, "\n"; } } unless ( $quiet ) { print commify($numseq), " sequences processed\n"; } stdclose ( $OUTF ); if ( $nonorffasta ) { stdclose ( $OUTF2 ); } ############################################################ sub process { ############################################################ if ( $currseq ) # first call will have no sequence { $numseq++; # display progress for large input files if ( ( ( $numseq % 10 ) == 0 ) and ( ! $quiet ) ) { print commify($numseq), "\n", $ansiup; } my @outdata = (); my $currseqrc = revcomp($currseq); my @start = (); my @startrc = (); my @stop = (); my @stoprc = (); my $currlen = length($currseq); # variables used for guessing the orientation my $bpplus = 0; my $bpminus = 0; # only use stop codon once, later hits will be for shorter orf my %stopused = (); my %stopusedrc = (); # to avoid column alignment problems, make sure header has no tabs in it $currhdr =~ s/\t/ /g; debugmsg ( "Processing \"$currhdr\", " . commify($currlen) . " b.p." ); # if --anystart, add start of sequence positions in all reading frames if ( $anystart ) { push ( @start, 0, 1, 2 ); push ( @startrc, 0, 1, 2 ); } foreach my $acodon ( @startcodons ) { while ( $currseq =~ m/($acodon)/ig ) { debugmsg ( "Fwd: Start codon $1 at ".(pos($currseq)-2) ); push ( @start, pos($currseq)-2 ); } while ( $currseqrc =~ m/($acodon)/ig ) { debugmsg ( "Rev: Start codon $1 at ".(pos($currseqrc)-2) ); push ( @startrc, pos($currseqrc)-2 ); } } #print "Start codons ".et()." \"$currhdr\" \n"; #@@@ foreach my $acodon ( @stopcodons ) { while ( $currseq =~ m/($acodon)/ig ) { debugmsg ( "Fwd: Stop codon $1 at ".(pos($currseq)-2) ); push ( @stop, pos($currseq)-2 ); } while ( $currseqrc =~ m/($acodon)/ig ) { debugmsg ( "Rev: Stop codon $1 at ".(pos($currseqrc)-2) ); push ( @stoprc, pos($currseqrc)-2 ); } } #print "Stop codons ".et()." \"$currhdr\" \n"; # sort arrays to put positions in increasing order @start = sort ( { $a <=> $b } @start ); @stop = sort ( { $a <=> $b } @stop ); @startrc = sort ( { $a <=> $b } @startrc ); @stoprc = sort ( { $a <=> $b } @stoprc ); #print "Sort ".et()." \"$currhdr\" \n"; # find start-stop pairs my $laststop = 0; foreach my $astart ( @start ) { my $frame = ( $astart % 3 ); for my $s ( $laststop..$#stop ) { my $astop = $stop[$s]; # stop must be > $start if ( $astop <= $astart ) { $laststop = $s; next; } # frame must match next if ( ( $astop % 3 ) != $frame ); # ignore this start codon if we hit a used stop codons ( meaning a start codon existed earlier upstream ) last if ( $stopused{$astop} ); # prohibit N's in orf if requested if ( $non ) { my $orfseq = substr ( $currseq, $astart-1, ( $astop - $astart + 1 ) ); last if ( $orfseq =~ m/[Nn]{$non,}/ ); } # length must be >= --minlen parameter my $orflen = ( $astop - $astart + 3 ); if ( $orflen >= $minlen ) { # [0]ID [1]orf [2]Frame [3]Start [4]Codon [5]Stop [6]Codon [7]Length $orfperid{$currhdr}++; my @outline = ( $currhdr, $orfperid{$currhdr}, "+".($frame?$frame:3), $astart, substr($currseq,$astart-1,3), ($astop+2), substr($currseq,$astop-1,3), $orflen ); if ( $nsequence ) { push ( @outline, substr ($currseq,$astart-1,$orflen) ); } if ( $psequence ) { push ( @outline, dnatoprotein(substr ($currseq,$astart-1,$orflen)) ); } push ( @outdata, \@outline ); $bpplus += $orflen; debugmsg ( "Fwd, Valid orf at $astart to $astop, length=$orflen" ); } else { debugmsg ( "Fwd, Discarding too-short orf ( $orflen b.p. ) at $astart to $astop" ); } # if a stop codon is used, don't use it again as that will be for # a shorter orf inside a larger one. So, set flag to ignore it $stopused{$astop} = 1; # exit inner loop when first orf found, even if too short last; } } # foreach my $astart ( @start ) #print "Pairing Forward ".et()." \"$currhdr\" \n"; $laststop = 0; foreach my $astart ( @startrc ) { my $frame = ( $astart % 3 ); for my $s ( $laststop..$#stoprc ) { my $astop = $stoprc[$s]; # stop must be > $start if ( $astop <= $astart ) { $laststop = $s; next; } # frame must match next if ( ( $astop % 3 ) != $frame ); # ignore this start codon if we hit a used stop codon ( meaning a start codon existed earlier upstream ) last if ( $stopusedrc{$astop} ); # prohibit N's in orf if requested if ( $non ) { my $orfseq = substr ( $currseqrc, $astart-1, ( $astop - $astart + 1 ) ); next if ( $orfseq =~ m/[Nn]/ ); } # length must be >= --minlen parameter my $orflen = ( $astop - $astart + 3 ); if ( $orflen >= $minlen ) { # [0]ID [1]orf [2]Frame [3]Start [4]Codon [5]Stop [6]Codon [7]Length $orfperid{$currhdr}++; my @outline = ( $currhdr, $orfperid{$currhdr}, "-".($frame?$frame:3), ($currlen-$astart+1), substr($currseqrc,$astart-1,3), ($currlen-($astop+2)+1), substr($currseqrc,$astop-1,3), $orflen ); if ( $nsequence ) { push ( @outline, substr ($currseqrc,$astart-1,$orflen) ); } if ( $psequence ) { push ( @outline, dnatoprotein(substr ($currseqrc,$astart-1,$orflen)) ); } push ( @outdata, \@outline ); $bpminus += $orflen; debugmsg ( "Rev, Valid orf at $astart to $astop, length=$orflen" ); } else { debugmsg ( "Rev, Discarding too-short orf ( $orflen b.p. ) at $astart to $astop" ); } # if a stop codon is used, don't use it again as that will be for # a shorter orf inside a larger one. So, set flag to ignore it $stopusedrc{$astop} = 1; # exit inner loop when first orf found, even if too short last; } } # foreach my $astart ( @startrc ) #print "Pairing Reverse ".et()." \"$currhdr\" \n"; # save orf data for later output in appropriate format if ( defined $origorder ) # sort output data by original sequence order { @outdata = sort { $a->[3] <=> $b->[3] } @outdata; if ( $origorder =~ m/s/i ) { @outdata = sort { substr($a->[2],0,1) cmp substr($b->[2],0,1) } @outdata; } } else # sort output data by orf length, largest first { @outdata = sort { $b->[7] <=> $a->[7] } @outdata; } # save guess orientation data if ( $guessorient ) { my $txt; if ( ( $bpplus + $bpminus ) < 1 ) { $txt = "no orfs"; } elsif ( ( $bpplus + $bpminus ) < $guessminbp ) { $txt = "orfs too short"; } elsif ( ( abs( $bpplus - $bpminus ) / ( $bpplus + $bpminus ) ) < $guessthreshold ) { $txt = "ambiguous"; } elsif ( $bpplus > $bpminus ) { $txt = "forward"; } else { $txt = "reverse"; } $guessdata .= join ( "\t", $currhdr, $currlen, $bpplus, $bpminus, $txt ) . "\n"; $guessstats{$txt}++; } # if ( $guessorient ) elsif ( $fasta ) { my %used = (); # for --fastacollapse or --fastalargest only, key is start, value is end foreach my $rowref ( @outdata ) { # [0]ID [1]orf [2]Frame [3]Start [4]Codon [5]Stop [6]Codon [7]Length my $header = ">" . $rowref->[0].".".$rowref->[1] . " " . join ( "; ", @{$rowref}[2..7] ); my $len = $rowref->[7]; my $seq = "error"; if ( ( $fastacollapse ) or ( $fastalargest ) ) { # get range, start always the lower value my $start = $rowref->[3]; my $end = $rowref->[5]; if ( $end < $start ) { $start = $rowref->[5]; $end = $rowref->[3]; } # see if this range overlaps an existing one my $inserted = 0; foreach my $key ( keys %used ) { if ( ( $start >= $key ) and ( $end <= $used{$key} ) ) { # full subset, can ignore it completely for both --fastacollapse and --fastalargest debugmsg ( "Sequence \"$currhdr\": orf $start..$end is subset of orf $key..$used{$key}" ); $inserted = 1; last; } # if ( ( $start >= $key ) and ( $end <= $used{$key} ) ) elsif ( ( $end > $used{$key} ) and ( $start <= $used{$key} ) ) { if ( $fastacollapse ) { debugmsg ( "Sequence \"$currhdr\": orf $start..$end extends end of orf $key..$used{$key}" ); # extend end side $used{$key} = $end; # modify start to detect join events if ( $start > $key ) { $start = $key } } # if ( $fastacollapse ) if ( $fastalargest ) { if ( ( $end - $start ) > ( $used{$key} - $key ) ) # if new orf larger { debugmsg ( "Sequence \"$currhdr\": orf $start..$end larger than orf $key..$used{$key}" ); $used{$start} = $end; delete ( $used{$key} ); } } # if ( $fastalargest ) $inserted = 1; } # elsif ( ( $end > $used{$key} ) and ( $start <= $used{$key} ) ) elsif ( ( $start < $key ) and ( $end >= $key ) ) { if ( $fastacollapse ) { debugmsg ( "Sequence \"$currhdr\": orf $start..$end extends start of orf $key..$used{$key}" ); # extend start side $used{$start} = $used{$key}; delete ( $used{$key} ); # modify end to detect join events if ( $end < $used{$start} ) { $end = $used{$start} } } # if ( $fastacollapse ) if ( $fastalargest ) { if ( ( $end - $start ) > ( $used{$key} - $key ) ) # if new orf larger { debugmsg ( "Sequence \"$currhdr\": orf $start..$end larger than orf $key..$used{$key}" ); $used{$start} = $end; delete ( $used{$key} ); } } # if ( $fastalargest ) $inserted = 1; } # elsif ( ( $start < $key ) and ( $end >= $key ) ) } # foreach # no action taken above, so new non-overlapping range found unless ( $inserted ) { $used{$start} = $end; debugmsg ( "Sequence \"$currhdr\": orf $start..$end is new" ); } # unless ( $inserted ) } # if ( ( $fastacollapse ) or ( $fastalargest ) ) else # do not collapse overlaps { if ( $rowref->[2] =~ m/\-/ ) # if reverse complement { $seq = substr( $currseq, $rowref->[5]-1, $len ); $seq = revcomp ( $seq ); $header .= " [RC]"; } else # forward orientation { $seq = substr( $currseq, $rowref->[3]-1, $len ); } print $OUTF $header, "\n"; print $OUTF $seq, "\n"; } # else do not collapse overlaps } # foreach my $rowref ( @outdata ) if ( ( $fastacollapse ) or ( $fastalargest ) ) { debugmsg ( "Sequence \"$currhdr\" has ".scalar ( keys %used )." orf sequences to save" ); my $numseq = 0; my $prevend = 0; foreach my $key ( sort { $a <=> $b } keys %used ) { $numseq++; my $len = ( $used{$key} - $key + 1 ); my $header = ">$currhdr.$numseq $key..$used{$key} = $len b.p."; my $seq = substr( $currseq, $key-1, $len ); debugmsg ( "Save FASTA of orf $key..$used{$key} length $len" ); print $OUTF $header, "\n"; print $OUTF $seq, "\n"; if ( $nonorffasta ) { my $nolen = ( $key - $prevend - 1 ); my $noseq = substr( $currseq, $prevend, $nolen ); my $noheader = ">$currhdr.$numseq ".($prevend+1)."..".($key-1)." = $nolen b.p."; debugmsg ( "Save FASTA of non-orf ".($prevend+1)."..".($key-1)." length $nolen" ); $prevend = $used{$key}; print $OUTF2 $noheader, "\n"; print $OUTF2 $noseq, "\n"; } # if ( $nonorffasta ) } # foreach if ( $nonorffasta ) { $numseq++; my $nolen = ( length($currseq) - $prevend ); my $noseq = substr( $currseq, $prevend, $nolen ); my $noheader = ">$currhdr.$numseq ".($prevend+1)."..".length($currseq)." = $nolen b.p."; debugmsg ( "Save FASTA of last non-orf ".($prevend+1)."..".length($currseq)." length $nolen" ); print $OUTF2 $noheader, "\n"; print $OUTF2 $noseq, "\n"; } # if ( $nonorffasta ) } # if ( ( $fastacollapse ) or ( $fastalargest ) ) } # if ( $fasta ) elsif ( $gffformat ) { #@@@ # rowref columns: [0]ID [1]orf [2]Frame [3]Start [4]Codon [5]Stop [6]Codon [7]Length my $orfindex = 0; foreach my $rowref ( @outdata ) { my $strand = substr ( $rowref->[2], 0, 1 ); # + or - my $start = ( $strand eq '+' )?$rowref->[3]:$rowref->[5]; my $end = ( $strand eq '+' )?$rowref->[5]:$rowref->[3]; $orfindex++; print $OUTF join ( "\t", $currhdr, "bb.orffinder", $featureid, $start, $end, '.', $strand, 0, "ID=$currhdr.$orfindex" . ";Name=" . $rowref->[7] . $rowref->[2] ), "\n"; } } # if ( $gffformat ) else # normal output { foreach my $rowref ( @outdata ) { print $OUTF join ( "\t", @{$rowref} ), "\n"; } } @outdata = (); } #print "Saving ".et()." \"$currhdr\" \n"; } # sub process ############################################################ sub debugmsg { my ( $text, $noreturn, $nolinenum ) = @_; ############################################################ if ( $debug ) { my ($package, $filename, $line, $sub) = caller(0); unless ( $nolinenum ) { $text = "Line $line: " . $text; } if ( ! ( $noreturn ) ) { $text .= "\n"; } print $text; } # if ( $debug ) } # sub debugmsg ############################################################### sub timestr { ############################################################### @_ = localtime(shift || time); return(sprintf("%04d/%02d/%02d %02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1])); } # sub timestr ############################################################### sub commify { ############################################################### # http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas local $_ = shift; 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; return $_; } # commify ############################################################ sub revcomp { my ( $dna ) = @_; ############################################################ # standard DNA reverse complement, including degenerate bases my $revcomp = reverse ( $dna ); $revcomp =~ tr/AaCcTtGgMmRrYyKkVvHhDdBb/TtGgAaCcKkYyRrMmBbDdHhVv/; return $revcomp; } # sub revcomp ############################################################ sub dnatoprotein { my ( $dna ) = @_; ############################################################ # translate DNA sequence into protein sequence $dna =~ tr/acgtT/ACGUU/; if ( ( length($dna) % 3 ) != 0 ) { die "Error, DNA sequence length not a multiple of 3: \"$dna\"\n"; } my $protseq = ""; foreach my $triplet ( $dna =~ m/(...)/g ) { if ( defined $proteins{$triplet} ) { $protseq .= $proteins{$triplet}; } else { $protseq .= "?"; } # should only happen if any letter is not [ACTGU] } return $protseq; } # sub dnatoprotein ############################################################### sub stdopen { my ( $mode, $filename, $extratext ) = @_; ############################################################### # a replacement for the three-parameter open which also allows # the use of "-" as the file name to mean STDIN or STDOUT my $fh; # the file handle if ( $filename eq "-" ) # only exact match to "-" has special meaning { if ( $mode =~ m/>/ ) { $fh = *STDOUT } else { $fh = *STDIN } } else { # supplemental passed text for error messages, need one more space if ( defined $extratext ) { $extratext .= " " } else { $extratext = "" } my $text; # this is only used for error message if ( $mode =~ m/^\+?>>/ ) # ">>" or "+>>" { $text = "append" } elsif ( $mode =~ m/^\+?>/ ) # ">" or "+>" { $text = "output" } elsif ( $mode =~ m/^\+?</ ) # "<" or "+<" { $text = "input" } else { die "Error, unsupported file mode \"$mode\" specified to stdopen( $mode, $filename, $extratext )\n"; } open ( $fh, $mode, $filename ) or die ( "Error opening ${extratext}file \"$filename\" for $text: $!\n" ); } # return the opened file handle to the caller return $fh; } # stdopen ############################################################### sub stdclose { my ( $fh ) = @_; ############################################################### # same as built-in close, except in case of STDIN or STDOUT, # and in this case the file handle is not closed unless ( fileno($fh) <= 2 ) # if file number is this low, is stdin or stdout or stderr { close ( $fh ) or die ( "Error closing file handle: $!\n" ); } } # sub stdclose # eof =test_data >A1Contig1 TCACACTTTTAGCTAACAGATTTACATATCTCCGTTGGAGATGCTCTAACATGACTAGAATGAGATGGCCCTCGTTAAAA TTTTAAAATCTCGGCAGAGAAGTTGTTAAAAAGAGCACAATTCATGAGTTCGATAACTGATGAGAAACTGAGTGAACATA AAGCAACACAGAGGGCAGTACAACTCTTATTTAAGAACGAGATTAGAAATGTCCAACATCTTCACAAATGAAGGACCGAA GTTTGATATGGAATTCTGACTTGATAGAGACATTAGAGTTGGAAAACTTGTTGATCAATGCAGCTATTACCATGCATTCA GCTGAAGCAAGAAAAGAGAGCAGAGGAGCTCATGCTCGTGAAGATTTTACGAAAAGAGATGATGAGAATTGGATGAAGCA TTCATTGGGATACTGGGAGAATGAGAAGGTACGGCTGGACTACAGGCCTGTTCACATGAACACATTAGACGATGAGATTG AAACGTTCCCACCTAAAGCACGTGTGTACTGAGGTATGGTATTGAAGGATACATGTGGTTGGGGAAAAATATAATATTTT CTTCTAAGAAGTCCGCAATAAATTTACTGAGACCTAGAAAATTTCTAAATAATGACGTCATTTGTCAAACTGCAGTCGGG GTGAGTAAGCTTGTCTCTGTCATGAGCAAGGGTGTGAGAATACCTCACTTGTATTGATCATGTTCCTGGCAACAAATATT TTATACTACTCATAAAAATGACTCCAAACTTTCAATTACTAGTAACAACTGGATTGGAAGTTAAATCAAAGTGTTATCTT ATTATTGGTAGCTCAGTTGAACTTTGTTTTAGAACAAAATGCTTACTATAATTTCGTGAGCATTATGACTTTGGATGACA TTTTGGTCATTGTATTTGTGTATCAAATGTGAGATTAGTTGCTGTCTACT NCBI results from http://www.ncbi.nlm.nih.gov/gorf/orfig.cgi Frame from to Length +3 228 .. 512 285 +2 533 .. 697 165 -1 493 .. 621 129 +3 738 .. 851 114 -3 203 .. 313 111 +1 124 .. 231 108 Results from this program #ID orf Frame Start Codon Stop Codon Length A1Contig1 2 +3 228 ATG 512 TGA 285 A1Contig1 3 +2 533 ATG 697 TGA 165 A1Contig1 5 -1 621 ATG 493 TAG 129 A1Contig1 4 +3 738 ATG 851 TAA 114 A1Contig1 6 -3 313 ATG 203 TAA 111 A1Contig1 1 +1 124 ATG 231 TGA 108 =cut test_data sub et { # elapsed time my $now = time(); my $et = $now - $past; $past = $now; return $et; } # sub et