#!/usr/bin/perl #----------------------------------------------------------# # Author: Douglas Senalik dsenalik@wisc.edu # #----------------------------------------------------------# # "Black Box" program series =bb Get information about a particular contig =cut bb use strict; use warnings; use Getopt::Long; # for getting command line parameters # 1.0 - Mar 21, 2012 my $version = "1.0"; ############################################################ # configuration variables ############################################################ my $contiggraph = "454ContigGraph.txt"; # standard name for this file ############################################################ # global variables ############################################################ my $ansiup = "\033[1A"; # terminal control (my $prognopath = $0) =~ s/^.*[\/\\]//; my @output = (); # array with one element for each specified contig containing all output ############################################################ # command line parameters ############################################################ my $infilename = ""; # input file or directory name my $outfilename = ""; # output file name my @contigs = (); # contigs to analyze my $showscaffold = 0; # dump entire scaffold my $help = 0; # print help and exit my $quiet = 0; # only show errors my $debug = 0; # print extra debugging information GetOptions ( "infile=s" => \$infilename, # string "contig=s" => \@contigs, # integer "outfile=s" => \$outfilename, # string "showscaffold" => \$showscaffold, # flag "help" => \$help, # flag "quiet" => \$quiet, # flag "debug" => \$debug); # flag # debug implies not quiet if ( $debug ) { $quiet = 0; } unless ( $infilename ) { $help = 1 } unless ( scalar @contigs ) { $help = 1 } unless ( $outfilename ) { $help = 1 } ############################################################ # print help screen ############################################################ if ( $help ) { print "$prognopath version $version This program analyzes some of the output files from a 454 assembly to find out everything available for a particular contig. This information is all contained in the 454ContigGraph.txt file in the assembly directory. Required parameters: --infile=xxx input 454 assembly directory, or path to 454ContigGraph.txt file --contig=xxx contig to analyze ( multiple allowed ) use just the number e.g. --contig=123 or multiple numbers with , or ; as separator, e.g. --contig=123,16389;599 --outfile=xxx output file name, use \"-\" for stdout Optional parameters: --showscaffold if contig is part of a scaffold, list all contigs and gaps in that scaffold --help print this screen --quiet only print error messages --debug print extra debugging information "; exit 1; } # if ( $help ) ############################################################ # if $infilename is just the assembly directory, append file name ############################################################ unless ( -f $infilename ) { if ( -d $infilename ) { unless ( $infilename =~ m/\/$/ ) { $infilename .= "/"; } $infilename .= $contiggraph; # i.e. "454ContigGraph.txt" } else { die "Error, \"$infilename\" is not a valid file name or directory\n"; } } ############################################################ # expand multiple contigs in --contig parameter separated by "," or ";" ############################################################ { my @tmp = @contigs; @contigs = (); foreach my $acontig ( @tmp ) { my @tmp2 = split ( /[,;]/, $acontig ); push ( @contigs, @tmp2 ); } } ############################################################ # main loop ############################################################ debugmsg ( "Opening input file \"$infilename\"" ); my $INF = stdopen( "<", $infilename ); while ( my $aline = <$INF> ) { $aline =~ s/[\r\n]//g; my @cols = split ( /\t/, $aline ); for ( my $i=0; $i<=$#contigs; $i++ ) { my $acontig = $contigs[$i]; # http://contig.wordpress.com/2010/04/13/newbler-output-iii-the-454contiggraph-txt-file if ( $cols[0] eq "C" ) # Edges { if ( $acontig eq $cols[1] ) { $output[$i] .= "Edge\t$cols[2]\tConnects to contig\t$cols[3]\t$cols[4]\twith\t$cols[5]\treads\n"; } if ( $acontig eq $cols[3] ) { $output[$i] .= "Edge\t$cols[4]\tConnects to contig\t$cols[1]\t$cols[2]\twith\t$cols[5]\treads\n"; } } # "C" elsif ( $cols[0] eq "S" ) # Scaffolds { # sample line for version 2.5.3: #S 158 122973 6245:+;gapBothNoEdges:1965;6246:+;gapBothNoEdges:201;6247:+;gapBothNoEdges:277;6248:+ if ( $aline =~ m/$acontig/ ) # to save time, a quick check before we do more processing, but may be false hit { my $hit = 0; my @scaftxt = (); my @cols = split ( /\t/, $aline ); if ( defined $cols[3] ) { foreach my $asect ( split ( /;/, $cols[3] ) ) { # will be either contig, e.g. "1034:+" or a gap e.g. "gap:118" if ( $asect =~ m/^(\d+):(.*)$/ ) # is a contig number, not a gap { my $ctg = $1; my $dir = $2; if ( $ctg eq $acontig ) # hit { $output[$i] .= "Part of scaffold $cols[1] which is $cols[2] b.p. long\n"; $hit = 1; } # hit if ( $showscaffold ) { $dir =~ s/\+/forward/; $dir =~ s/\-/reverse/; push ( @scaftxt, " Contig\t$ctg\t$dir\n" ); } } # is a contig number else { if ( $showscaffold ) { $asect =~ s/:/\t/; push ( @scaftxt, " $asect\n" ); } } } # foreach } # if ( defined $cols[3] ) else { print "Undef [3] line \"$aline\"\n"; } #@@@ if ( ( $hit ) and ( $showscaffold ) ) { $output[$i] .= join ( "", @scaftxt ); } } # quick preliminary scan pass } # "S" elsif ( $cols[0] eq "I" ) # Through-flow information { if ( $acontig eq $cols[1] ) { $output[$i] .= "Flowthrough\t$cols[2]\t$cols[3]\n"; } } # "I" elsif ( $cols[0] eq "F" ) # Single end read flow information { if ( $acontig eq $cols[1] ) { foreach my $apart ( split ( /;/, $cols[2] ) ) { if ( $apart eq "-" ) { $output[$i] .= "No\treads flow from 5' end of contig$cols[1]\n"; } else { my @f = split ( /\//, $apart ); $output[$i] .= "$f[1]\treads flow from 5' end of contig$cols[1] and terminate in contig\t$f[0]"; if ( $f[2] > 0 ) { $output[$i] .= "\tafter passing through\t$f[2]\tb.p. in other contig(s)\n"; } else { $output[$i] .= "\n"; } } } foreach my $apart ( split ( /;/, $cols[3] ) ) { if ( $apart eq "-" ) { $output[$i] .= "No\treads flow from 3' end of contig$cols[1]\n"; } else { my @f = split ( /\//, $apart ); $output[$i] .= "$f[1]\treads flow from 3' end of contig$cols[1] and terminate in contig\t$f[0]"; if ( $f[2] > 0 ) { $output[$i] .= "\tafter passing through\t$f[2]\tb.p. in other contig(s)\n"; } else { $output[$i] .= "\n"; } } } } } # "F" elsif ( $cols[0] eq "P" ) # Paired end read flow information { if ( $acontig eq $cols[1] ) { foreach my $apart ( split ( /;/, $cols[2] ) ) { if ( $apart eq "-" ) { $output[$i] .= "No\tpaired end reads flow from 5' end of contig$cols[1]\n"; } else { my @f = split ( /\//, $apart ); $output[$i] .= "$f[1]\tpaired end reads flow from 5' end of contig$cols[1] and terminate in contig\t$f[0]"; if ( $f[2] > 0 ) { $output[$i] .= "\tafter passing through\t$f[2]\tb.p. in other contig(s)\n"; } else { $output[$i] .= "\n"; } } } foreach my $apart ( split ( /;/, $cols[3] ) ) { if ( $apart eq "-" ) { $output[$i] .= "No\tpaired end reads flow from 3' end of contig$cols[1]\n"; } else { my @f = split ( /\//, $apart ); $output[$i] .= "$f[1]\tpaired end reads flow from 3' end of contig$cols[1] and terminate in contig\t$f[0]"; if ( $f[2] > 0 ) { $output[$i] .= "\tafter passing through\t$f[2]\tb.p. in other contig(s)\n"; } else { $output[$i] .= "\n"; } } } } } # "P" else # first section { if ( $acontig eq $cols[0] ) { $output[$i] .= "Length\t$cols[2]\n"; $output[$i] .= "Average Coverage\t$cols[3]\n"; } } # first section } # for $i } # while <$INF> stdclose ( $INF ); ############################################################ # write collected data to output file ############################################################ my $OUTF = stdopen ( ">", $outfilename ); for ( my $i=0; $i<=$#contigs; $i++ ) { print $OUTF ">contig$contigs[$i]\n"; if ( defined $output[$i] ) { print $OUTF $output[$i]; } else { print $OUTF "No data found!\n"; } print $OUTF "\n"; } # for $i stdclose ( $OUTF ); unless ( $quiet ) { print "Done\n"; } exit 0; ############################################################ sub debugmsg { my ( $text, $noreturn, $nolinenum ) = @_; ############################################################ if ( $debug ) { my ($package, $filename, $line, $sub) = caller(0); unless ( $nolinenum ) { $text = "Line $line: " . $text; } if ( ! ( $noreturn ) ) { $text .= "\n"; } print $text; } # if ( $debug ) } # sub debugmsg ############################################################### sub timestr { ############################################################### @_ = localtime(shift || time); return(sprintf("%04d/%02d/%02d %02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1])); } # sub timestr ############################################################### sub commify { ############################################################### # http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas local $_ = shift; 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; return $_; } # commify ############################################################### sub stdopen { my ( $mode, $filename, $extratext ) = @_; ############################################################### # a replacement for the three-parameter open which also allows # the use of "-" as the file name to mean STDIN or STDOUT my $fh; # the file handle if ( $filename eq "-" ) # only exact match to "-" has special meaning { if ( $mode =~ m/>/ ) { $fh = *STDOUT } else { $fh = *STDIN } } else { # supplemental passed text for error messages, need one more space if ( defined $extratext ) { $extratext .= " " } else { $extratext = "" } my $text; # this is only used for error message if ( $mode =~ m/^\+?>>/ ) # ">>" or "+>>" { $text = "append" } elsif ( $mode =~ m/^\+?>/ ) # ">" or "+>" { $text = "output" } elsif ( $mode =~ m/^\+?</ ) # "<" or "+<" { $text = "input" } else { die "Error, unsupported file mode \"$mode\" specified to stdopen( $mode, $filename, $extratext )\n"; } open ( $fh, $mode, $filename ) or die ( "Error opening ${extratext}file \"$filename\" for $text: $!\n" ); } # return the opened file handle to the caller return $fh; } # stdopen ############################################################### sub stdclose { my ( $fh ) = @_; ############################################################### # same as built-in close, except in case of STDIN or STDOUT, # and in this case the file handle is not closed unless ( fileno($fh) <= 2 ) # if file number is this low, is stdin or stdout or stderr { close ( $fh ) or die ( "Error closing file handle: $!\n" ); } } # sub stdclose # eof