#!/usr/bin/perl #----------------------------------------------------------# # Author: Douglas Senalik dsenalik@wisc.edu # # http://www.vcru.wisc.edu/simonlab/sdata/software/ # #----------------------------------------------------------# # "Black Box" program series =bb Rearrange the order of sequences in a FASTA file =cut bb use strict; use warnings; use Getopt::Long; # for getting command line parameters # version 1.0 Spetember 3, 2011 ############################################################ # configuration variables ############################################################ my $version = "1.0"; my $defaultprefix = "concatenated"; my $spacerlen = 20; ############################################################ # global variables ############################################################ my $ansiup = "\033[1A"; # terminal control (my $prognopath = $0) =~ s/^.*[\/\\]//; my @inhdr = (); # FASTA headers without ">" my @inseq = (); # sequence my @inqhdr = (); # quality headers without ">" my @inqseq = (); # quality "sequence" my $doqual = 0; # 1 if found and read in quality file my $outqualname = ""; # output quality file name ( if used ) my $currbp = 0; # current number of base pairs written, for $startstop my %excludehash = (); # contigs to exclude, converted to hash for easy lookup ############################################################ # command line parameters ############################################################ my $infilename = ""; # input file name my $outfilename = ""; # output file name my $append = 0; # append to existing output file my $coordinates = ""; # output file for coordinates, where each contig starts and ends my @seq = (); # sequences to keep my @exclude = (); # sequences to exclude my $help = 0; # print help and exit my $oneseq = 0; # to concatenate all sequences my $prefix = $defaultprefix; # if using --oneseq, prefix file with this header my $startstop = 0; # put start and stop coordinates in file my $noqual = 0; # do not attempt to include qual file my $blankline = 0; # blank line between sequences in --onesequence mode my $random = 0; # return this many random contigs my $quiet = 0; # only show errors my $debug = 0; # print extra debugging information GetOptions ( "infile=s" => \$infilename, # string "outfile=s" => \$outfilename, # string "coordinates=s" => \$coordinates, # string "sequence=s" => \@seq, # string array "exclude=s" => \@exclude, # string array "onesequence" => \$oneseq, # flag "prefix=s" => \$prefix, # string "noqual" => \$noqual, # flag "blankline" => \$blankline, # flag "append" => \$append, # flag "startstop" => \$startstop, # flag "random=i" => \$random, # integer "help" => \$help, # flag "quiet" => \$quiet, # flag "debug" => \$debug); # flag unless ( $infilename ) { $help = 1; } unless ( $outfilename ) { $help = 1; } if ( ( ( scalar @seq ) + ( scalar @exclude ) + $random ) < 1 ) { $help = 1; } # debug implies not quiet if ( $debug ) { $quiet = 0; } ############################################################ # print help screen ############################################################ if ( $help ) { print "$prognopath Version $version Rearrange the order of sequences in a FASTA file based on your specified contigs and orientations Required parameters: --infile=xxx input FASTA file name with multiple sequences --outfile=xxx output file name, or \"-\" for stdout --seq=xxx sequences to keep, multiple allowed, a plus \"+\" for forward orientation is optional, or use \"-\" anywhere to indicate reverse complement. Use \"..\" to indicate a range. Use \",\" to separate entries. Examples: --seq=contig45 --seq=46,49,-21..23 --seq=+32 --seq=-65 --seq=76- --seq=00021..45- -seq=45+..47 or use \"s\" for a spacer of $spacerlen Ns e.g. --seq=00021+,S,45- The --seq parameter may be omitted if --exclude or --random is used instead Optional parameters: --exclude=xxx use this in place of the --seq parameter to output all contigs except these. Order will be unchanged from the original file in this case. --random=xxx return this many sequences selected at random and placed in random order --coordinates=xxx create this additional output file, which has the starting and ending position of each contig --onesequence concatenate all sequences into one --blankline for --onesequence mode, put a blank line between each sequence --prefix=xxx if using --onesequence, use this prefix, default = \"$prefix\" --append append to existing --outfile --startstop add starting and ending base position to FASTA headers --noqual if a .qual file is present, a corresponding output .qual file is created. This flag turns off this quality file processing --help print this screen --quiet only print error messages --debug print extra debugging information "; exit 1; } # if ( $help ) ############################################################ # expand ranges in specified sequences ############################################################ @seq = expandseq ( "--seq", @seq ); @exclude = expandseq ( "--exclude", @exclude ); # convert excludes to a hash for easy lookup foreach my $anexclude ( @exclude ) { $anexclude =~ s/\-//g; # remove directional information for excludes $excludehash{$anexclude} = 1; } # foreach my $anexclude ( @exclude ) ############################################################ # read input file into array ############################################################ { debugmsg ( "Reading input file \"$infilename\"" ); my $currcontig = 0; my $nlines = 0; my $nheaders = 0; my $contigindex = 0; my $INF = stdopen ( "<", $infilename ); while ( my $aline = <$INF> ) { $nlines++; $aline =~ s/[\r]//g; # remove DOS returns if ( $aline =~ m/^>(.*)$/ ) { my $hdr = $1; $contigindex++; $nheaders++; $hdr =~ s/[\s\n]+$//g; # header remove newlines and trailing white space if ( $hdr =~ m/contig(\d+)/i ) { $currcontig = $1; $inhdr[$currcontig] = $hdr; # ">" has been removed } else { die "Could not find contig number in header \"$hdr\" line $nlines\n"; } } else { $inseq[$currcontig] .= $aline; # retains newlines } } # while <$INF> stdclose ( $INF ); unless ( $quiet ) { print "Input file read, " . commify($nlines) . " lines, " . commify($nheaders) . " sequences\n"; } } ############################################################ # read quality file into array ############################################################ unless ( $noqual ) { my $inqualname = qualname( $infilename ); if ( -e $inqualname ) { debugmsg ( "Reading input quality file \"$inqualname\", output name will be \"$outqualname\"" ); my $currcontig = 0; my $nlines = 0; my $nheaders = 0; my $contigindex = 0; my $INF = stdopen ( "<", $inqualname, "quality" ); while ( my $aline = <$INF> ) { $nlines++; $aline =~ s/[\r]//g; # remove DOS returns if ( $aline =~ m/^>(.*)$/ ) { my $hdr = $1; $contigindex++; $nheaders++; $hdr =~ s/[\s\n]+$//g; # header remove newlines and trailing white space if ( $hdr =~ m/contig(\d+)/i ) { $currcontig = $1; $inqhdr[$currcontig] = $hdr; # ">" has been removed } else { die "Could not find contig number in header \"$hdr\" line $nlines\n"; } } else { $inqseq[$currcontig] .= $aline; # retains newlines } } # while <$INF> stdclose ( $INF ); unless ( $quiet ) { print "Quality file read, " . commify($nlines) . " lines, " . commify($nheaders) . " sequences\n"; } $doqual = 1; } # if quality file found else # no quality file found { unless ( $quiet ) { print "No corresponding quality file found\n"; } } # else no quality file found } # unless ( $noqual ) ############################################################ # random mode, create list of contigs ############################################################ if ( $random ) { # create a new list of just defined contig indices ( some contig numbers may be blank ) my @definedcontigs = (); for ( my $i=0; $i<=$#inhdr; $i++ ) { if ( defined($inhdr[$i]) ) { push ( @definedcontigs, $i ) } } # for $i # get list of random contigs for ( my $i=1; $i<=$random; $i++ ) { # test to prevent getting more random contigs than are available if ( scalar @definedcontigs < 1 ) { die "Error, requested more random contigs than are in the input file\n"; } # select a random contig and save it to the @seq list my $randomindex = int(rand(@definedcontigs)); my $contigindex = $definedcontigs[$randomindex]; debugmsg ( "Selecting random contig \"$contigindex\"" ); push ( @seq, $contigindex ); # remove selected contig from array to prevent re-use splice ( @definedcontigs, $randomindex, 1 ); } # for $i } # if ( $random ) ############################################################ # create output file ############################################################ { debugmsg ( "Creating output FASTA file \"$outfilename\"" ); my @coordinatesdata = (); my $currposition = 1; my $nseq = 0; my $nlines = 0; my $OUTF = stdopen ( $append?">>":">", $outfilename ); my $OUTQ; if ( $doqual ) { $outqualname = qualname( $outfilename ); debugmsg ( "Creating output quality file \"$outqualname\"" ); $OUTQ = stdopen ( $append?">>":">", $outqualname, "quality" ); } if ( $oneseq ) { print $OUTF ">".$prefix." ".scalar(@seq)." sequences: ", join (",", @seq), "\n"; if ( $doqual ) { print $OUTQ ">".$prefix." ".scalar(@seq)." sequences: ", join (",", @seq), "\n"; } } ############################################################ # --seq mode ############################################################ foreach my $aseqnum ( @seq ) { # strip off reverse complement flag if present my $rc = 0; if ( $aseqnum =~ m/-/ ) { $rc = 1; } $aseqnum =~ s/-//g; # check for special spacer unless ( $aseqnum =~ m/^S$/i ) { # check validity of number unless ( defined $inseq[$aseqnum] ) { die "Error, no such sequence number \"$aseqnum\"\n"; } # skip this one if it is on the exclude list if ( $excludehash{$aseqnum} ) { debugmsg ( "Specified sequence \"$aseqnum\" is on the exclude list, skipping it" ); next; } } # get relevant sequence my $aseq; my $aqual = ""; if ( $aseqnum =~ m/^S$/i ) { $aseq = "N"x$spacerlen; if ( $doqual ) { $aqual = "00 "x$spacerlen; } # not tested this may be wrong @@@ } else { $aseq = $inseq[$aseqnum]; if ( $doqual ) { $aqual = $inqseq[$aseqnum]; } } debugmsg ( "Adding sequence $aseqnum " . ($rc?"=ReverseComplement":"") . ", " . commify(length($aseq)) . " b.p." ); # reverse complement it if appropriate if ( $rc ) { $aseq = revcomp ( $aseq ); if ( $doqual ) { $aqual = qualrev ( $aqual ); } } # strip leading and trailing white space from sequence $aseq =~ s/^\s+//; $aseq =~ s/\s+$//; if ( $doqual ) { $aqual =~ s/^\s+//; $aqual =~ s/\s+$//; } # calculate sequence length for --startstop and for statistics line at end my $seqlen = 0; my $bseq = $aseq; $bseq =~ s/[^A-Za-z]//g; $seqlen = length ( $bseq ); if ( $coordinates ) { push ( @coordinatesdata, join ( "\t", $aseqnum, $rc?"-":"+", $currposition, ( $currposition+$seqlen-1 ) ) ); } $currposition += $seqlen; # print header to ouput ( unless concatenating ) $nseq++; unless ( $oneseq ) { print $OUTF ">", $inhdr[$aseqnum]; if ( $startstop ) { print $OUTF " ",($currbp+1),"..",($currbp+$seqlen); } # append " RC" to end of FASTA header if we reverse complemented it if ( $rc ) { print $OUTF " RC"; } print $OUTF "\n"; $nlines++; if ( $doqual ) { print $OUTQ ">", $inqhdr[$aseqnum], "\n"; } } if ( $startstop ) { $currbp += $seqlen; } # print sequence to output, and follow with a return print $OUTF $aseq, "\n"; if ( $blankline ) { print $OUTF "\n"; } $nlines += ( $aseq =~ tr/\n// + 1 ); if ( $doqual ) { print $OUTQ $aqual, "\n"; } } # foreach my $aseq ( @seq ) ############################################################ # --exclude mode ############################################################ if ( ( scalar @exclude ) and ( ! scalar @seq ) ) # if both --seq and --exclude were specified, we are done already { # loop through all input sequences in their original order for ( my $aseqnum=0; $aseqnum<=$#inhdr; $aseqnum++ ) { # skip undefined sequences next unless ( defined $inhdr[$aseqnum] ); # skip if on the exclude list if ( $excludehash{$aseqnum} ) { debugmsg ( "Sequence \"$aseqnum\" is on the exclude list, skipping it" ); next; } # get relevant sequence my $aseq = $inseq[$aseqnum]; my $aqual = ""; if ( $doqual ) { $aqual = $inqseq[$aseqnum]; } debugmsg ( "Adding sequence $aseqnum, " . commify(length($aseq)) . " b.p." ); # strip leading and trailing white space from sequence $aseq =~ s/^\s+//; $aseq =~ s/\s+$//; if ( $doqual ) { $aqual =~ s/^\s+//; $aqual =~ s/\s+$//; } # calculate sequence length for --startstop and for statistics line at end my $seqlen = 0; my $bseq = $aseq; $bseq =~ s/[^A-Za-z]//g; $seqlen = length ( $bseq ); if ( $coordinates ) { push ( @coordinatesdata, join ( "\t", $aseqnum, "+", $currposition, ( $currposition+$seqlen-1 ) ) ); } $currposition += $seqlen; # print header to ouput ( unless concatenating ) $nseq++; unless ( $oneseq ) { print $OUTF ">", $inhdr[$aseqnum]; if ( $startstop ) { print $OUTF " ",($currbp+1),"..",($currbp+$seqlen); } print $OUTF "\n"; $nlines++; if ( $doqual ) { print $OUTQ ">", $inqhdr[$aseqnum], "\n"; } } if ( $startstop ) { $currbp += $seqlen; } # print sequence to output, and follow with a return print $OUTF $aseq, "\n"; if ( $blankline ) { print $OUTF "\n"; } $nlines += ( $aseq =~ tr/\n// + 1 ); if ( $doqual ) { print $OUTQ $aqual, "\n"; if ( $blankline ) { print $OUTQ "\n"; } } } # for $aseqnum } # if ( ( scalar @exclude ) and ( ! scalar @seq ) ) stdclose ( $OUTF ); unless ( $quiet ) { print "Output file created, " . commify($nlines) . " lines, " . commify($nseq) . " sequences, ".commify($currposition-1)." b.p.\n"; } if ( $doqual ) { stdclose ( $OUTQ ); } if ( $coordinates ) { debugmsg ( "Storing coordinates data to file \"$coordinates\"" ); my $OUTC = stdopen ( ">", $coordinates ); print $OUTC join ( "\n", @coordinatesdata ), "\n"; stdclose ( $OUTC ); } # if ( $coordinates ) } exit 0; ############################################################ sub expandseq { my ( $debugtxt, @unexpanded ) = @_; ############################################################ # expand --seq or --exclude parameter ranges # After parsing, all entries will be an optional "-" followed # by contig number with leading zeroes stripped off # e.g. 456 or -678 my @expanded = (); foreach my $aseq ( @unexpanded ) { # split at commas or white space foreach my $asubseq ( split ( /[,\s]+/, $aseq ) ) { # skip null items next unless ( $asubseq ); # save reverse complement flag my $revcomp = ""; if ( $asubseq =~ m/-/ ) { $revcomp = "-"; } # remove all non [ digits or periods or $ or S ], removes "contig", "-", and "+" $asubseq =~ s/[^\d\.\$Ss]//g; # expand ranges if ( $asubseq =~ m/^(.*)\.\.(.*)$/ ) { ( my $start = $1 ) =~ s/^0+//; ( my $end = $2 ) =~ s/^0+//; for ( my $i=$start; $i<=$end; $i++ ) { push ( @expanded, $revcomp.$i ); } } else # single contig { $asubseq =~ s/^0+//; # trim leading zeroes push ( @expanded, $revcomp.$asubseq ); } } # foreach my $asubseq } # foreach my $aseq ( @unexpanded ) debugmsg ( "Original $debugtxt array contained ".commify(scalar(@unexpanded))." items, after expansion it contains ".commify(scalar(@expanded))." items" ); return @expanded; } # sub expandseq ############################################################ sub debugmsg { my ( $text, $noreturn, $nolinenum ) = @_; ############################################################ if ( $debug ) { my ($package, $filename, $line, $sub) = caller(0); unless ( $nolinenum ) { $text = "Line $line: " . $text; } if ( ! ( $noreturn ) ) { $text .= "\n"; } print $text; } # if ( $debug ) } # sub debugmsg ############################################################### sub timestr { ############################################################### @_ = localtime(shift || time); return(sprintf("%04d/%02d/%02d %02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1])); } # sub timestr ############################################################### sub commify { ############################################################### # http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas local $_ = shift; 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; return $_; } # commify ############################################################ sub qualname { my ( $fastaname ) = @_; ############################################################ # convert a FASTA file name to the corresponding quality file name my $qualname = $fastaname; $qualname =~ s/\.[^\.]+$//; # remove existing extension whatever it is $qualname .= ".qual"; # replace with ".qual"; } # sub qualname ############################################################ sub revcomp { my ( $dna ) = @_; ############################################################ # standard DNA reverse complement, including degenerate bases my $revcomp = reverse ( $dna ); $revcomp =~ tr/AaCcTtGgMmRrYyKkVvHhDdBb/TtGgAaCcKkYyRrMmBbDdHhVv/; return $revcomp; } # sub revcomp ############################################################ sub qualrev { my ( $qual ) = @_; ############################################################ # reverse quality score order, retain returns in string $qual =~ s/\n/ \n /g; # space on both sides of return so we can retain them # convert to array, some elements will be returns, and reverse the order my @qualarr = reverse( split ( /[ \t]+/, $qual ) ); # convert back to string, space between values $qual = join ( " ", @qualarr ); # remove extra spaces around returns that we introduced earlier $qual =~ s/ *\n *//g; return $qual } # sub revcomp ############################################################### sub stdopen { my ( $mode, $filename, $extratext ) = @_; ############################################################### # a replacement for the three-parameter open which also allows # the use of "-" as the file name to mean STDIN or STDOUT my $fh; # the file handle if ( $filename eq "-" ) # only exact match to "-" has special meaning { if ( $mode =~ m/>/ ) { $fh = *STDOUT } else { $fh = *STDIN } } else { # supplemental passed text for error messages, need one more space if ( defined $extratext ) { $extratext .= " " } else { $extratext = "" } my $text; # this is only used for error message if ( $mode =~ m/^\+?>>/ ) # ">>" or "+>>" { $text = "append" } elsif ( $mode =~ m/^\+?>/ ) # ">" or "+>" { $text = "output" } elsif ( $mode =~ m/^\+?</ ) # "<" or "+<" { $text = "input" } else { die "Error, unsupported file mode \"$mode\" specified to stdopen( $mode, $filename, $extratext )\n"; } open ( $fh, $mode, $filename ) or die ( "Error opening ${extratext}file \"$filename\" for $text: $!\n" ); } # return the opened file handle to the caller return $fh; } # stdopen ############################################################### sub stdclose { my ( $fh ) = @_; ############################################################### # same as built-in close, except in case of STDIN or STDOUT, # and in this case the file handle is not closed unless ( fileno($fh) <= 2 ) # if file number is this low, is stdin or stdout or stderr { close ( $fh ) or die ( "Error closing file handle: $!\n" ); } } # sub stdclose # eof