#!/usr/bin/perl #-------------------------------------------------------------------------# # Author: Douglas Senalik dsenalik@wisc.edu # # http://www.vcru.wisc.edu/simonlab/sdata/software/index.html#contignet # # Modified by: Simon Gladman simon.gladman@csiro.au # # 2011 # #-------------------------------------------------------------------------# # "Black Box" program series =bb Create a network of all interconnected 454 contigs =cut bb use strict; use warnings; use Getopt::Long; # for getting command line parameters # 1.0.1 - Dec 29, 2010 # 1.0.2 - Feb 15, 2011 - removed extra line that prevented count of nodes from working # 1.0.3 - Mar 12, 2011 - Support paired end data output files, add "@" prefix for list files, # add --nospline and --overlapmode parameters # 1.0.3-Simon - May 18, 2011 - Added pseudo-links formed by paired end information instead of just # newbler links # 1.0.4 - September 2, 2011 # Allow "+" or "-" in contig numbers, but it is ignored # Add parameter to specify output image type # Correct error in --tag help description # Add --label as a synonym for --tag # Fix bug in dead end and recursion limit labeling # 1.0.5 - October 18, 2011 # Allow keeping graphviz command file # change default overlap mode to "false" (it was "none" before) # to have same default output with newest version of neato # make scaffold connections optional, add --scaffold to use scaffold connections # 1.0.6 - November 4, 2011 # Filter out null contigs, excludes, etc. from command line # Optional ABySS-Explorer output file added # 1.0.7 - May 4, 2012 # Text output data file is now sorted by contig number # change the --scaffold parameter to --pairlinks, # since I may use --scaffold in the future for the actual scaffold section # Show paired end links and labels ( --pairlinks ) in a different color # Add support for flowthrough links ( --flowthrough ) in a third color # Add support for flowbetween links ( --flowbetween ) in a fourth color # Add paired end support to ABySS-Explorer ouput, and use exclusively the # new ABySS-Explorer 1.3.0 .dot format my $version = "1.0.7"; ############################################################ # configuration variables ############################################################ my $bpabbreviation = "nt"; # set to b.p., bp, nt, or even a null string, as you prefer my $defaultmaxlevel = 2; my $debuglimit = 1000; # how long print debug messages while extending my $numdigits = 5; # contig numbers filled out with leading zeroes to this length my $scalefactor = 0.01; # convert b.p. to graphviz length by multiplying by this value my $defaultoverlapmode = "false"; # neato overlap mode in graph section, "true" to disable my $defaultouttype = "png"; # default image type ( passed to neato ) my $dotheaderid = "adj"; # ABySS-Explorer .dot file header id my $contiggraphtxt = "454ContigGraph.txt"; # this is defined by gsAssembler/Newbler my $allcontigsfna = "454AllContigs.fna"; # this is defined by gsAssembler/Newbler my $defaultminflowbetween = 1; # default color definitions my $deadendcolor = "tomato"; my $recursionlimitcolor = "gold"; my $normallinkcolor = ""; # leave blank for the default color of black my $normalfontcolor = $normallinkcolor; my $forcedlinkcolor = "red"; # links from --force parameter my $forcedfontcolor = $forcedlinkcolor; my $pairedendlinkcolor = "dodgerblue"; # links from paired end information my $pairedendfontcolor = $pairedendlinkcolor; my $flowbetweenlinkcolor = "purple"; # links from flowthrough "F" information my $flowbetweenfontcolor = $flowbetweenlinkcolor; my $flowthroughlinkcolor = "forestgreen"; # links from flowthrough "I" information my $flowthroughfontcolor = $flowthroughlinkcolor; my $abyssk = 1; # we don't use kmers, so set this to 1 so coverage will be direct conversion my $abyssedge = 0; # negative kmer minus 1, thus zero my $abyssevalue = 0; # we don't have a value for this, so just use zero always ############################################################ # global variables ############################################################ my $ansiup = "\033[1A"; # terminal control (my $prognopath = $0) =~ s/^.*[\/\\]//; my @contiglen = (); # contig length my @contigcov = (); # contig average coverage my %ends = (); # key is contig . "3'" or "5'" or "0'", with optional "p", "f", or "i", # values are @[ contig, 3'|5', #reads, flag ] my %pairs = (); #used for temporary storage of paired end information until link ends can be sorted.. [Simon Gladman - 2011] my %deadends = (); # key is contig, value is # of ends with reads extending ( so 1 = dead end ) my %minrl = (); # key is contig, value is lowest recursion level seen for this contig my %seen = (); # key is contig, value is >=1 if we already traversed, undefined if not # actual value reflects recursion level when seen my %edgeseen = (); # key is contig 3'or5' contig 3'or5', value = 1 or undefined my %taghash = (); # key is contig, value is array of tags my %colorhash = (); # key is contig, value is array of colors my %excludehash = (); # key is contig, value is 1 to exclude my %inverthash = (); # key is contig, value is 1 to exclude my %data = (); # subset of %ends that will end up in graph my %flowdata = (); # store flow through read data here my $extensions = 0; # number of auto-extensions so far (for --extend) my @extarr = (); # extensions will apply only to supplied contigs, not auto-added ones my $infilename = ""; # 454ContigGraph.txt path and name my @listofexcl = (); # list of excluded contigs, --listexcluded turns this on my $deletecmdfile = 1; # becomes 0 if a command file name is explicitly specified my $returncode = 0; # return code of this program ############################################################ # command line parameters ############################################################ my $indirname = ""; # input file name my $outfilename = ""; # output file name my $outtype = $defaultouttype; # output image file format my $cmdfilename = ""; # graphviz .dot language command file name my $imgfilename = ""; # image file name my $outfastaname = ""; # create FASTA file of contigs in output my $level = $defaultmaxlevel; # maximum number of levels of recursion my $boldabove = 0; my @contig = (); # starting contig(s) my @tag = (); # tag certain contigs my @color = (); # color certain contigs my @exclude = (); # never go into these contigs (e.g. repeat regions) my @invert = (); # contigs to plot backwards my $len = 1; # neato len parameter my $listexcluded = 0; # list contigs that have been excluded my $extend = 0; # auto extend for best contig my @forcelink = (); # force links where none may exist my $showbp = 0; # include length in b.p. in graph my $showcov = 0; # include contig average coverage in graph my $lowlimit = 0; # ignore links with read limit < this my $highlimit = 0; # ignore links with read number > this my $nolabel = 0; # disable dead end and recursion limit labelling my $overlapmode = $defaultoverlapmode; my $nospline = 0; # to disable splines my $scaffold = 0; # to enable scaffold connection information my $pairlinks = 0; # to enable paired end read connection information my $flowthrough = 0; # to enable flowthrough connection information my $flowbetween; # to enable flow between connection information, has optional value also my $alllinks = 0; # sets --pairlinks --flowthrough --flowbetween and --scaffold my $abyssdotfile; # generate file for use with ABySS-Explorer my $help = 0; # print help and exit my $quiet = 0; # only show errors my $debug = 0; # print extra debugging information GetOptions ( "indir=s" => \$indirname, # string "outfile=s" => \$outfilename, # string "type=s" => \$outtype, # string "cmdfile=s" => \$cmdfilename, # string "imgfile=s" => \$imgfilename, # string "fastaout=s" => \$outfastaname, # string "abyssexplorer=s"=> \$abyssdotfile, # string "level=i" => \$level, # integer "boldabove=i" => \$boldabove, # integer "contig=s" => \@contig, # string array "exclude=s" => \@exclude, # string array "invert=s" => \@invert, # string array "tag|label=s" => \@tag, # string array "color=s" => \@color, # string array "forcelink=s" => \@forcelink, # string array "len=s" => \$len, # real "extend=i" => \$extend, # integer "lowlimit=i" => \$lowlimit, # integer "highlimit=i" => \$highlimit, # integer "nolabel" => \$nolabel, # flag "nospline" => \$nospline, # flag "pairlinks" => \$pairlinks, # flag "scaffold" => \$scaffold, # flag "flowthrough" => \$flowthrough, # flag "flowbetween:s" => \$flowbetween, # flag/string "alllinks" => \$alllinks, # flag "overlapmode=s" => \$overlapmode, # string "listexcluded" => \$listexcluded, # flag "showbp|shownt" => \$showbp, # flag "showcoverage" => \$showcov, # flag "help" => \$help, # flag "quiet" => \$quiet, # flag "debug" => \$debug); # flag # debug implies not quiet if ( $debug ) { $quiet = 0; } unless ( ( $indirname ) and ( $outfilename ) and ( scalar @contig ) ) { $help = 1; } # changing meaning of --scaffold for future use if ( $scaffold ) { die "--scaffold has been changed to --pairlinks\n"; } # $flowbetween is a flag with an optional value, set default value if no value was specified if ( ( defined $flowbetween ) and ( $flowbetween eq "" ) ) { $flowbetween = $defaultminflowbetween; } if ( $alllinks ) { $pairlinks = 1; $flowthrough = 1; $flowbetween = $defaultminflowbetween; $scaffold = 1; # future use } # allow specification of only the directory unless ( ( -d $indirname ) or ( $help ) ) { print "Error, specified input directory \"$indirname\" does not exist or is not a directory\n"; $help = 1; } $infilename = $indirname; unless ( $infilename =~ m/\/$/ ) { $infilename .= "/"; } $infilename .= $contiggraphtxt; # make sure input file exists unless ( ( -e $infilename ) or ( $help ) ) { print "Error, input file \"$infilename\" does not exist\n"; $help = 1; } # if no --imgfile, create name based on --outfile unless ( $imgfilename ) { $imgfilename = $outfilename . "." . $outtype; } # if no --cmdfile, create name based on --outfile if ( $cmdfilename ) { $deletecmdfile = 0; } else { $cmdfilename = $outfilename . ".graphviz"; } if ( $outfilename =~ m/png$/i ) { warn "You probably do not want to append a .png extension on --outfile\n"; } # OBSOLETE # ABySS-Explorer idiosyncracy #if ( ( $abyssexplorer ) and ( $abyssexplorer !~ m/-4.adj/ ) ) # { print "WARNING: ABySS-Explorer versions <= 1.0.1 require that the input file name ends in \"-4.adj\"\n"; } ############################################################ # print help screen ############################################################ if ( $help ) { print "$prognopath version $version Required parameters: --indir=xxx path to 454 assembly directory --outfile=xxx output text file of results --contig=xxx[,xxx]... one or more starting contig numbers, separated by comma, or multiple --contig parameters may be used. Use just the numeric portion of the contig Optional parameters: --type=xxx output file format, default is \"$defaultouttype\" ( anything besides \"png\" is experimental ) --cmdfile=xxx graphviz command file in .dot language will be created using this name. If not specified, a temporary command file will be created, and it will be deleted when done --imgfile=xxx graph image file will be created with this name. If not specified, will be --outfile with .${outtype} extension added --fastaout=xxx create a FASTA file of all contigs in the output, save in this file --abyssexplorer=xxx Generate a .dot file that can be used for visualization with ABySS-Explorer 1.3.0, http://www.bcgsc.ca/platform/bioinfo/software/abyss-explorer"; # for compatibility, the file name must end in \"-4.adj\" # If paired end information is available, and the # --pairlinks parameter is used, corresponding # \"-3.dist\" and \"-contigs.fa\" files will also be created print " --flowthrough include connection information derived from reads that flow through more than two contigs --flowbetween[=x] include connection information derived from reads that flow from one contig into another by default, if the distance value is zero, it will not be shown, the optional value for this parameter is a minimum distance, defaulting to $defaultminflowbetween, set to --flowbetween=0 to show these links also --pairlinks include connection information derived from paired end reads, only applicable for assemblies containing paired end reads --alllinks sets --flowthrough, --flowbetween, and --pairlinks --tag=tagname,contig[,contig]... list of 1 or more contigs will be given this tag. Multiple --tag allowed. tagname is a text label that will be shown in the final image, e.g. --tag=\"ATP1,14,34\" --label a synonym for --tag --showbp show length in b.p. in graph --shownt a synonym for --showbp --showcoverage show average contig read coverage in graph --color=colorname,contig[,contig]... like --tag, but color the contig. for list of valid color names see http://www.graphviz.org/doc/info/colors.html --forcelink=xxx-5:yyy-3 force a link where none exists between specified ends, xxx and yyy are contig numbers --level=xxx maximum recursion level, default=$level --boldabove=xxx lines with read coverage >= this value will be drawn in bold. no default value --exclude=xxx[,xxx]... one or contigs to never traverse past, for example a repeated region contig --listexcluded print out a list of which excluded contigs are being ignored --invert=xxx[,xxx]... one or more contigs to plot backwards on the graph, i.e. 3' to 5' direction --extend=xxx auto extension for the single best path, value is maximum steps, default=$extend --lowlimit=xxx ignore connections < this number of reads --highlimit=xxx ignore connections > this number of reads --len=xxx len parameter to neato, default=$len --nolabel disable highlighting of dead ends, and limit of recursion contigs --overlapmode neato paramter, default is $overlapmode, one of none, true, scale --nospline disable spline when edges would overlap --help print this screen --quiet only print error messages --debug print extra debugging information In place of lists of contigs, you can use \@filename to read in values for that parameter from a file, e.g. --exclude=\@excl.txt This program requires that the graphviz program \"neato\" be available in the default PATH. The graphviz web site is http://www.graphviz.org/ "; exit 1; } # if ( $help ) ############################################################ # expand --contig lists separated by commas into single array ############################################################ { my @tmp = (); foreach my $acontig (@contig) { $acontig = expandatprefix ( $acontig ); push ( @tmp, split ( /\s*[,;]\s*/, $acontig ) ); } # cleaning and validation @contig = (); foreach my $item ( @tmp ) { $item = expandatprefix ( $item ); $item =~ s/^0+//; # remove leading zeroes $item =~ s/\s//g; # remove any white space $item =~ s/[\+\-]//g; # allow "+" or "-" in contig numbers, but it is ignored if ( $item =~ m/[^\d]/ ) { die "Error, non-numeric character used for --contig \"$item\"\n"; } unless ( $item =~ m/^$/ ) # skip null items { push ( @contig, $item ); } } debugmsg ( "Supplied contig list of ".scalar(@contig)." contigs = \"".join ("\" \"", @contig)."\"" ); } ############################################################ # expand --tag lists separated by commas into hash of arrays ############################################################ { my $ntags = 0; foreach my $atag (@tag) { my @tmp = split ( /\s*[,;]\s*/, $atag ); my $taglabel = shift ( @tmp ); foreach my $item ( @tmp ) { $item = expandatprefix ( $item ); $item =~ s/^0+//; # remove leading zeroes unless ( $item =~ m/^$/ ) # skip null items { push ( @{$taghash{$item}}, $taglabel ); $ntags++; } } } # foreach (@tag) debugmsg ( "Stored ".commify($ntags)." tags" ); } ############################################################ # expand --color lists separated by commas into hash of arrays ############################################################ { my $ncolors = 0; foreach my $acolor (@color) { my @tmp = split ( /\s*[,;]\s*/, $acolor ); my $colorlabel = shift ( @tmp ); foreach my $item ( @tmp ) { $item = expandatprefix ( $item ); $item =~ s/^0+//; # remove leading zeroes $item =~ s/[\-\+]//g; # remove plus or minus - has no meaning, but this is a convenience to be compatible with bb.fastareorder unless ( $item =~ m/^$/ ) # skip null items { push ( @{$colorhash{$item}}, $colorlabel ); $ncolors++; } } } # foreach (@color) debugmsg ( "Stored ".commify($ncolors)." colors" ); } ############################################################ # expand --forcelink lists separated by commas into hash of arrays ############################################################ { my $nforce = 0; foreach my $aforce (@forcelink) { my @tmp = split ( /\s*[,;]\s*/, $aforce ); foreach my $link ( @tmp ) { $link = expandatprefix ( $link ); my @parts = split ( /\s*[:-]\s*/, $link ); unless ( scalar @parts == 4 ) { die "Invalid format for --forcelink \"$link\"\n"; } foreach ( @parts ) { s/^0+//; # remove leading zeroes s/'//g; # remove primes (they are optional) } # add this artificial link to the list push ( @{$ends{$parts[0].$parts[1]."'"}}, [ $parts[2], $parts[3]."'", 0 ] ); push ( @{$ends{$parts[2].$parts[3]."'"}}, [ $parts[0], $parts[1]."'", 0 ] ); } $nforce++; } # foreach (@forcelink) debugmsg ( "Stored ".commify($nforce)." forced links" ); } ############################################################ # convert --exclude lists to a simple hash ############################################################ { foreach my $aexclude (@exclude) { foreach my $item ( split ( /\s*[,;]\s*/, $aexclude ) ) { $item = expandatprefix ( $item ); # cleaning and validation $item =~ s/^0+//; # remove leading zeroes $item =~ s/\s//g; # remove any white space unless ( $item =~ m/^$/ ) # skip null items { if ( $item =~ m/[^\d]/ ) { die "Error, non-numeric character used for --exclude \"$item\"\n"; } $excludehash{$item} = 1; } } } debugmsg ( "Stored ".commify(scalar keys %excludehash)." exclude contigs" ); } ############################################################ # convert --invert lists to a simple hash ############################################################ { foreach my $ainvert (@invert) { $ainvert = expandatprefix ( $ainvert ); foreach my $item ( split ( /\s*[,;]\s*/, $ainvert ) ) { $item = expandatprefix ( $item ); # cleaning and validation $item =~ s/^0+//; # remove leading zeroes $item =~ s/\s//g; # remove any white space unless ( $item =~ m/^$/ ) # skip null items { if ( $item =~ m/[^\d]/ ) { die "Error, non-numeric character used for --invert \"$item\"\n"; } $inverthash{$item} = 1; } } } debugmsg ( "Stored ".commify(scalar keys %inverthash)." invert contigs" ); } ############################################################ # parse 454ContigGraph.txt ############################################################ # sample content: refer to this excellent description for more info: # http://contig.wordpress.com/2010/04/13/newbler-output-iii-the-454contiggraph-txt-file #1 contig00001 588 2.6 #2 contig00002 1072 6.8 #3 contig00003 644 4.1 #... #C 7 3' 14770 5' 3 #C 12 3' 14824 5' 5 #C 12 3' 52148 5' 4 #... #S 1 84148 1:+;2:+;gapOneNoEdges:186;3:+;4:+;5:+;6:+ #S 2 17530 7:+ #S 3 25222 8:+;9:-;10:-;11:+;12:+;14:+;gapMultiEdges:4733;15:+;gapMultiEdges:4241;16:+;17:+;19:+ #... #I 7 aCAACaTTATCATTGtATTTatATTCcTGTTtGAGATACGTGTGGACAGAGAATGTTGGTTTTTTGGACTAGAATCGGATTTATCATTATTATAATGT... #I 48 AGTTCGTCCTGGACGACTTGAGTT 11:19543-5'..16159-5';6:19543-5'..60104-5' #I 50 GGgTAATAGTTGACCGTCTTACGAAATtGGCACATTTTCTTCCAATTAACGAGAAATCTTCggtAGACAGACTAGTTCATATGTATGTGCGtGAAATC... #... #F 7 - 14770/3/0.0 #F 8 56895/6/0.0;54006/2/231.5 - #F 12 - 14824/5/0.0;52148/4/0.0 #... #P 1 10130/2/0.0;33848/3/0.0;34537/2/397.0;25104/2/679.5;15/170/698.4;6/2/1600.0;209/175/2352.0;9364/2/3929.5;19/89/4351.2;16/128/5345.5;17/25/5380.7;14/15/603... #P 2 1/284/927.3;20341/2/7324.0;23108/2/8174.5 4/210/2787.7;3/22/3918.7;39/2/4345.5 #P 3 4/183/1037.9;24637/4/6940.0 1/156/3004.0;2/22/3918.7;1675/2/4906.0;29922/3/5867.0;97/2/6139.5;7360/2/7464.0 { my $lines = 0; my $clines = 0; my $ilines = 0; my $flines = 0; my $slines = 0; my $plines = 0; my $endsstored = 0; open ( my $INF, "<", $infilename ) or die ( "Error opening input file \"$infilename\": $!\n" ); while ( my $aline = <$INF> ) { $lines++; # progress indicator unless ( $quiet ) { if ( ( $lines % 1000 ) == 0 ) { print commify($lines), "\n", $ansiup; } } $aline =~ s/[\r\n]//g; my @cols = split ( /\t/, $aline ); if ( $cols[0] eq "C" ) { $clines++; # store both orientations in hash my $keep = 1; if ( $cols[5] < $lowlimit ) { $keep = 0; } if ( ( $highlimit ) and ( $cols[5] > $highlimit ) ) { $keep = 0; } if ( $keep ) { # store data from each end, the three columns are [0]=nextcontig [1]=5'|3'|5'p|3'p|5'f|3'f|5'i... [2]=readnum push ( @{$ends{$cols[1].$cols[2]}}, [ $cols[3], $cols[4], $cols[5] ] ); # and store reciprocal end push ( @{$ends{$cols[3].$cols[4]}}, [ $cols[1], $cols[2], $cols[5] ] ); $endsstored+=2; } # if $keep if ( $endsstored < $debuglimit ) { my $txt = $keep?" Storing":"Not storing"; debugmsg ( "$txt ends: \$ends{$cols[1]$cols[2]} [ $cols[3], $cols[4], $cols[5] ];" ); debugmsg ( " : \$ends{$cols[3]$cols[4]} [ $cols[1], $cols[2], $cols[5] ];" ); } } # "C" elsif ( $cols[0] eq "I" ) # flowthrough information { $ilines++; if ( $flowthrough ) { # cols[1] is contig number # cols[2] is contig sequence ( if <= 256 b.p.) # cols[3] is the through-flow information, a ";" delimited list of # the format 15:1805-3'..207-3' # cols[3] will sometimes be null if ( $cols[3] ) { my @parts = split ( /;/, $cols[3] ); foreach my $apart ( @parts ) { my @subparts = split ( /[:\-\.]/, $apart ); # @subparts columns become [0]=15 [1]=1805 [2]=3' [3]=null [4]=207 [5]=3' push ( @{$ends{$cols[1]."0'"."i"}}, [ $subparts[1], $subparts[2], $subparts[0] ] ); push ( @{$ends{$cols[1]."0'"."i"}}, [ $subparts[3], $subparts[4], $subparts[0] ] ); } # foreach @leftparts } # if ( $cols[3] ) } # if ( $flowthrough ) } # "I" elsif ( $cols[0] eq "F" ) # flowbetween information { $flines++; if ( defined $flowbetween ) { # cols[1] is contig number # cols[2] is flow information for reads flowing from the 5' end of the contig # cols[3] is flow information for reads flowing from the 3' end of the contig my @leftparts = split ( /;/, $cols[2] ); my @rightparts = split ( /;/, $cols[3] ); # each part is of the format xx/yy/z.z where xx=contig yy=number of reads z.z=distance in b.p. if ( $cols[2] ne "-" ) # "-" is the indicator for null entry { foreach my $apart ( @leftparts ) { my @subparts = split ( /\//, $apart ); # this, and paired links is the only case where we save a fourth column in # the %ends hash, a distance in b.p. value # $flowbetween is also used as a minimum distance cutoff for filtering, so # filter by this distance value, and ignore if too short if ( $subparts[2] >= $flowbetween ) { push ( @{$ends{$cols[1]."5'f"}}, [ $subparts[0], "0'", $subparts[1], $subparts[2] ] ); } } # foreach @leftparts } # if ne "-" if ( $cols[3] ne "-" ) # "-" is the indicator for null entry { foreach my $apart ( @rightparts ) { my @subparts = split ( /\//, $apart ); if ( $subparts[2] >= $flowbetween ) { push ( @{$ends{$cols[1]."3'f"}}, [ $subparts[0], "0'", $subparts[1], $subparts[2] ] ); } } # foreach @leftparts } # if ne "-" } # if ( defined $flowbetween ) } # "F" elsif ( $cols[0] eq "S" ) { $slines++; } # "S" elsif ( $cols[0] eq "P" ) { $plines++; if ( $pairlinks ) { my $con_number = $cols[1]; my $fiveprime_connects = $cols[2]; my $threeprime_connects = $cols[3]; my @tmp = split ";", $fiveprime_connects; foreach my $connection (@tmp){ #now split up the connection. next if $connection eq "-"; my @x = split "/", $connection; my $termcontig = $x[0]; my $num_connects = $x[1]; my $distance = $x[2]; if($num_connects >= $lowlimit){ #store in temp pairs var and search through later for termcontig details. push( @{$pairs{$con_number}}, [ $termcontig, "5'", $num_connects, $distance ] ); $endsstored += 2; if ( $endsstored < $debuglimit ) { debugmsg ( "Storing ends: \$ends{".$con_number."3'p} [ $termcontig, 5', $num_connects];" ); debugmsg ( " : \$ends{".$termcontig."5'p} [ $con_number, 3', $num_connects];" ); } } } @tmp = split ";", $threeprime_connects; foreach my $connection (@tmp){ #now split up the connection next if $connection eq "-"; my @x = split "/", $connection; my $termcontig = $x[0]; my $num_connects = $x[1]; my $distance = $x[2]; if($num_connects >= $lowlimit){ #store in temp pairs var and search through later for termcontig details. push( @{$pairs{$con_number}}, [ $termcontig, "3'", $num_connects, $distance ] ); $endsstored += 2; if ( $endsstored < $debuglimit ) { debugmsg ( "Storing ends: \$ends{".$con_number."3'p} [ $termcontig, 5', $num_connects];" ); debugmsg ( " : \$ends{".$termcontig."5'p} [ $con_number, 3', $num_connects];" ); } } } } # if ( $pairlinks ) } # "P" elsif ( $cols[0] =~ m/^\d+$/ ) # first section of file { $contiglen[$cols[0]] = $cols[2]; $contigcov[$cols[0]] = $cols[3]; } # section 1 else { die ( "Error on line $lines of file \"$infilename\", unknown type of content:\n$aline\n" ); } } # while <$INF> close $INF; debugmsg ( commify($lines) . " lines read from input file \"$infilename\"" ); debugmsg ( commify($#contiglen) . " contig lengths were stored" ); debugmsg ( commify($clines) . " \"C\" lines were found" ); debugmsg ( commify($endsstored) . " ends were stored" ); debugmsg ( commify($ilines) . " \"I\" lines were found" ); debugmsg ( commify($flines) . " \"F\" lines were found" ); debugmsg ( commify($plines) . " \"P\" lines were found" ); debugmsg ( commify($slines) . " \"S\" lines were found (and ignored)" ); } ############################################################ # make paired end information links ############################################################ # Added by Simon Gladman - CSIRO - 2011 # Adds paired end links to the link data variable "%ends" foreach my $key (keys %pairs){ my @contig = @{$pairs{$key}}; foreach my $tmp (@contig){ my @x = @{$tmp}; my $term_contig = $x[0]; my @tcontig; # 10/11/2011 is this a bug? occassional not defined state here if ( $pairs{$term_contig} ) { @tcontig = @{$pairs{$term_contig}}; } # end of fix #my @tcontig = @{$pairs{$term_contig}}; foreach my $ttmp (@tcontig){ my @y = @{$ttmp}; if($y[0] == $key){ push( @{$ends{$key."$x[1]"."p"}}, [ $term_contig, "$y[1]", $x[2], $x[3] ] ); push( @{$ends{$term_contig."$y[1]"."p"}}, [ $key, "$x[1]", $y[2], $y[3] ] ); last; } } } } ############################################################ # sort data ############################################################ # sort data so that higher read coverage links come first # sorting is only needed if we will automatically extend network if ( $extend ) { # extend applies only to command line contigs and not auto-generated ones foreach ( @contig ) { push ( @extarr, $extend ) } # but the $extend value just evaluates to "true" later debugmsg ( "--extend=$extend, Sorting data" ); foreach my $key ( keys %ends ) { @{$ends{$key}} = sort {$b->[2] <=> $a->[2]} @{$ends{$key}}; } # foreach my $key ( keys %ends ) debugmsg ( "Extend contig list of ".scalar(@extarr)." contigs" ); } # if ( $extend ) ############################################################ # dead end detection ############################################################ # dead ends are contigs which might have reads extending to another contig # from either the 5' or 3' end, but not both. Both ends could be dead ends, too. # %deadends{contigid} is # of ends extending, so 1 = dead end, # 2 = continues both ends, 0=isolated contig without any connections unless ( $nolabel ) { debugmsg ( "Dead end detection" ); foreach my $key ( keys %ends ) { ( my $contig = $key ) =~ s/[530]'[pfis]?$//; # remove 5' or 3' or 5'p etc at end of string # note that this version of $contig does not have leading zeroes $deadends{$contig}++; @{$ends{$key}} = sort {$b->[2] <=> $a->[2]} @{$ends{$key}}; } # foreach my $key ( keys %ends ) } # unless ( $nolabel ) ############################################################ # construct network from starting point(s) ############################################################ my $totalnodes = 0; my $index = 0; foreach my $acontig (@contig) { my $dbgtxt = (defined $extarr[$index])?"user-defined":"auto-extend"; debugmsg ( "Contig #".($index+1)."=$dbgtxt, Recursion starting at \"$acontig\"" ); # when hit end of our specified contigs unless ( defined $extarr[$index] ) { if ( $extend ) { debugmsg ( "At end of specified contigs, turning off extend" ); } $extend = 0; } recurse ( $acontig, "", "", 0, $extend ); $index++; } # foreach my $acontig (@contig) unless ( $quiet ) { print commify($totalnodes), " nodes present in output\n"; } ############################################################ sub recurse { my ( $startcontig, $camefrom, $fromend, $recurselevel, $followlevel ) = @_; ############################################################ unless ( $totalnodes > $debuglimit ) { debugmsg ( "recurse \"$startcontig\", camefrom=\"$camefrom\" end=\"$fromend\" recurselevel=$recurselevel" ); } # data for this contig is in @ { %ends{contig . 5'|3'|5'p|3'p|5'f|3'f|5'i... } } [0]=nextcontig [1]=5'|3'|5'p|3'p"5'f|3'f|5'i... [2]=readnum # store lowest recursion level seen for this contig unless ( $nolabel ) { if ( ( ! defined $minrl{$startcontig} ) or ( $recurselevel < $minrl{$startcontig} ) ) { $minrl{$startcontig} = $recurselevel; } } # unless ( $nolabel ) # here is the limit to recursion my $stophere = 0; if ( $recurselevel > $level ) { unless ( $totalnodes > $debuglimit ) { debugmsg ( "recursion for \"$startcontig\" at limit level=$level, returning" ); } $stophere = 1; } # return if this contig is on the exclude list elsif ( $excludehash{$startcontig} ) { unless ( $totalnodes > $debuglimit ) { debugmsg ( "contig \"$startcontig\" on exclude list, returning" ); } # save information for list of excluded option my @row = ( $startcontig, $camefrom, $fromend ); push ( @listofexcl, \@row ); return 0; ## was this wrong? always return if on exclude list $stophere = 1; } # skip return if follow allows it if ( ( $stophere ) and ( $followlevel < 1 ) ) { return 0; } # count nodes unless ( defined $seen{$startcontig} ) { if ( $startcontig =~ m/^\s*$/ ) { die "Error, null contig in sub recurse($startcontig, $camefrom, $fromend, $recurselevel, $followlevel)\n"; } $seen{$startcontig} = 1; $totalnodes++; } foreach my $end ( "5'", "3'", "5'p", "3'p", "5'f", "3'f", "5'i", "3'i", "0'" ) { my $first = 1; foreach my $contigref ( @{$ends{$startcontig.$end}} ) { unless ( $totalnodes > $debuglimit ) { debugmsg ( "from \"$startcontig\" end \"$end\" find linked contig ".join(";",@$contigref) ); } # always skip links back to where we just came from ( and don't clear $first flag ) if ( $contigref->[0] eq $camefrom ) { unless ( $totalnodes > $debuglimit ) { debugmsg ( "\"$startcontig\": skipping link back to source contig \"$camefrom\"" ); } next; } # skip data storing if this edge was already stored unless ( defined $edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} ) { debugmsg ( "Storing in \@data at key \"$startcontig$end\": [ \"".join ( "\", \"", @$contigref )."\" ]" ); push ( @{$data{$startcontig.$end}}, $contigref ); # $contigref is array reference if ( ( $first ) and ( $followlevel ) ) { unless ( $totalnodes > $debuglimit ) { debugmsg ( "auto extension following contig \"$contigref->[0]\", \$extensions=$extensions" ); } $extensions++; $edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} = $recurselevel; recurse ( $contigref->[0], $startcontig, $end, $recurselevel, $followlevel-1 ); } # if } # unless $edgeseen else { debugmsg ( "Seen, not storing in \@data at key \"$startcontig$end\": [ \"".join ( "\", \"", @$contigref )."\" ]" ); } # recurse if edge seen level is higher than current level or not seen before if ( ( ! defined $edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} ) or ( $edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} > $recurselevel ) ) { $edgeseen{$startcontig.$end.$contigref->[0].$contigref->[1]} = $recurselevel; recurse ( $contigref->[0], $startcontig, $end, $recurselevel+1, 0 ); } $first = 0; } # foreach $contigref } # foreach $end } # sub recurse ############################################################ # consolidate pairs of flow between links ############################################################ collapseflowbetween(); ############################################################ # create output table text file ############################################################ createoutputtable(); ############################################################ # generate optional ABySS-Explorer .dot file ############################################################ if ( $abyssdotfile ) { createabyssfile(); } ############################################################ # generate optional FASTA file ############################################################ if ( $outfastaname ) { createfastafile(); } ############################################################ # create graphviz command file (.dot file) ############################################################ my $savednodes = 0; my $savededges = 0; debugmsg ( "Creating graphviz command file \"$cmdfilename\"" ); open ( my $OUTF, ">", $cmdfilename ) or die ( "Error creating graphviz command file \"$cmdfilename\": $!\n" ); print $OUTF "graph G\n"; print $OUTF " {\n"; print $OUTF " edge [len=$len];\n"; my $splinemode = $nospline?"false":"true"; print $OUTF " graph [overlap=$overlapmode,splines=$splinemode];\n"; print $OUTF " node [shape=plaintext];\n"; # table of nodes print $OUTF "\n // Nodes\n"; foreach my $seencontig ( keys %seen ) { my $contigid = sprintf ("%0${numdigits}d", $seencontig); # box is proportional to size, except if contig is small, # the box is still as large as the labels it contains my $scaledcontiglen = sprintf ( "%1d", $contiglen[$seencontig] * $scalefactor ); # define the box representing the contig, implemented as a table in HTML print $OUTF " c$contigid [label=< <TABLE BORDER=\"1\" CELLBORDER=\"0\" CELLSPACING=\"0\" CELLPADDING=\"0\""; if ( $colorhash{$seencontig}->[0] ) { print $OUTF " BGCOLOR=\"$colorhash{$seencontig}->[0]\""; } # top row is always present print $OUTF "><TR>"; if ( $inverthash{$seencontig} ) { print $OUTF "<TD PORT=\"R\">3'</TD>"; } else { print $OUTF "<TD PORT=\"L\">5'</TD>"; } print $OUTF "<TD PORT=\"C\" WIDTH=\"$scaledcontiglen\""; print $OUTF ">c$contigid</TD>"; if ( $inverthash{$seencontig} ) { print $OUTF "<TD PORT=\"L\">5'</TD>"; } else { print $OUTF "<TD PORT=\"R\">3'</TD>"; } print $OUTF "</TR>"; # optional table row for contig size if ( $showbp ) { print $OUTF "<TR><TD COLSPAN=\"3\">",commify($contiglen[$contigid])," $bpabbreviation</TD></TR>"; } # optional table row for coverage if ( $showcov ) { print $OUTF "<TR><TD COLSPAN=\"3\">cov=",commify($contigcov[$contigid]),"</TD></TR>"; } # one or more optional rows for labels specified on the command line if ( $taghash{$seencontig}->[0] ) { foreach my $tag ( @{$taghash{$seencontig}} ) { print $OUTF "<TR><TD COLSPAN=\"3\">$tag</TD></TR>"; } } # optional final rows for dead end contigs, or recursion limit contigs unless ( $nolabel ) { if ( ( ! defined $deadends{$seencontig} ) or ( $deadends{$seencontig} <= 1 ) ) { print $OUTF "<TR><TD COLSPAN=\"3\" BGCOLOR=\"$deadendcolor\">Dead End</TD></TR>"; } if ( ( defined $minrl{$seencontig} ) and ( $minrl{$seencontig} >= $level ) ) { print $OUTF "<TR><TD COLSPAN=\"3\" BGCOLOR=\"$recursionlimitcolor\">Recursion limit</TD></TR>"; } } # unless ( $nolabel ) # and the end of this huge mess of HTML print $OUTF "</TABLE> >];\n"; } # foreach my $seencontig ( keys %seen ) debugmsg ( "Saved ".commify($savednodes)." nodes" ); # table of edges ( connections ) print $OUTF "\n // Adjacency Edges\n"; my %alreadydrawn = (); foreach my $key ( keys %data ) { $key =~ m/^(.*)([530]'[pfis]?)$/; my $srcend = $2; my $srccontig = sprintf ("%0${numdigits}d", $1); my $srclr = "C"; if ( $srcend =~ m/5'/ ) { $srclr = "L"; } if ( $srcend =~ m/3'/ ) { $srclr = "R"; } unless ( $srclr ) { die "Error, no valid end at \"$key\"\n"; } if ( $savededges < $debuglimit ) { debugmsg ( "Key=\"$key\" Contig=\"$srccontig\" End=\"$srcend\" Port=\"$srclr\"" ); } foreach my $edgeref (@{$data{$key}}) # edgeref elements: [0]=contig(no leading 0) [1]=5'|3'|5'p|3'p|5'f|3'f|5'i... [2]=readnumber { # skip those final edges not leading to a node in the %seen list, # or those we have deleted by marking with "X" if ( ( $seen{$edgeref->[0]} ) and ( $edgeref->[1] ne "X" ) ) { if ( $savededges < $debuglimit ) { debugmsg ( "Edge=[ \"$edgeref->[0]\" \"$edgeref->[1]\" \"$edgeref->[2]\" ]" ); } my $contigid = sprintf ("%0${numdigits}d", $edgeref->[0]); # format for graph has leading zeroes my $lr = "C"; if ( $edgeref->[1] =~ m/5'/ ) { $lr = "L"; } if ( $edgeref->[1] =~ m/3'/ ) { $lr = "R"; } unless ( $lr ) { die "Error no valid end at \"$edgeref->[0]\"\n"; } unless ( $alreadydrawn{$srccontig.$srclr."-".$contigid.$lr} ) { my $linklabel = $edgeref->[2]; if ( $edgeref->[3] ) { # if the distance is 10 b.p. or greater, remove the decimal places to eliminate clutter if ( $edgeref->[3] >= 10 ) { $edgeref->[3] =~ s/\..*$//; } $linklabel .= "/" . $edgeref->[3] . $bpabbreviation; } print $OUTF " \"c$srccontig\":$srclr -- \"c$contigid\":$lr [label=\"$linklabel\""; # bold for high coverage links if ( ( $boldabove ) and ( $edgeref->[2] >= $boldabove ) ) { print $OUTF " style=bold"; } # specify colors of lines and labels connecting contigs if ( $edgeref->[2] == 0 ) # forced link { print $OUTF " color=$forcedlinkcolor fontcolor=$forcedfontcolor"; } elsif ( ( $srcend =~ m/p/ ) or ( $edgeref->[1] =~ m/p/ ) ) # paired end link { print $OUTF " color=$pairedendlinkcolor fontcolor=$pairedendfontcolor"; } elsif ( ( $srcend =~ m/f/ ) or ( $edgeref->[1] =~ m/f/ ) ) # flowbetween link { print $OUTF " color=$flowbetweenlinkcolor fontcolor=$flowbetweenfontcolor"; } elsif ( ( $srcend =~ m/i/ ) or ( $edgeref->[1] =~ m/i/ ) ) # flowthrough link { print $OUTF " color=$flowthroughlinkcolor fontcolor=$flowthroughfontcolor"; } else # if $normallinkcolor is a null string, don't specify any color, use default of black { if ( $normallinkcolor ) { print $OUTF " color=$normallinkcolor"; } if ( $normalfontcolor ) { print $OUTF " fontcolor=$normalfontcolor"; } } print $OUTF "];\n"; $alreadydrawn{$contigid.$lr."-".$srccontig.$srclr} = 1; # note we store in reverse orientation here $savededges++; } } } # foreach my $edgeref (@$arrayref) } # foreach my $arrayref ( keys %data ) debugmsg ( "Saved ".commify($savededges)." edges" ); print $OUTF " }\n"; close $OUTF; ############################################################ # run graphviz program neato ############################################################ my $cmd = "neato -T${outtype} -o\"$imgfilename\" \"$cmdfilename\""; debugmsg ( "running command \"$cmd\"" ); my $result = system ( $cmd ); if ( $result ) { print "Problem creating image. Error code $result returned from command \"$cmd\"\n"; $returncode = $result; } else { unless ( $quiet ) { print "Success\n"; } } # remove graphviz command file if ( $deletecmdfile ) { unlink $cmdfilename; } ############################################################ # print out list of excluded contigs ############################################################ if ( $listexcluded ) { print "List of excluded contigs\n"; print "Contig\tLinked from\tFrom End\n"; foreach my $rowref ( @listofexcl ) { print join ( "\t", @{$rowref} ), "\n"; } } # if ( $listexcluded ) ############################################################ # end of program ############################################################ exit $returncode; ############################################################ sub collapseflowbetween { ############################################################ # flowbetween links do not have a ending end, so are designated 0' there, # but since they come in pairs, we can figure out the ends that way # and consolidate the two links into one to eliminate clutter # global variables (command line parameters) used: # # other global variables uses # %data my %finder; # first step is to create an index foreach my $key ( keys %data ) { next unless ( $key =~ m/f/ ); unless ( $key =~ m/^(.*)([530]'[pfis]?)$/ ) { die "Program bug parsing key \"$key\"\n"; } my $srcend = $2; my $srccontig = $1; #sprintf ("%0${numdigits}d", $1); foreach my $edgeref (@{$data{$key}}) # [0]contig(no leading 0) [1]5'|3'|5'p|3'p|5'f|3'f|5'i... [2]readnumber [3]distance { if ( $edgeref->[1] =~ m/0'/ ) { my $bothkey = $srccontig . ":" . $edgeref->[0] . ":" . $edgeref->[3]; # omit source end in this key debugmsg ( "Save collapsible reference \"$bothkey\" from key \"$key\"" ); # save the source end as array element [4] $edgeref->[4] = $srcend; push ( @{$finder{$bothkey}}, $edgeref ); } } # foreach $edgeref } # foreach my $key ( keys %data ) # second step is to check the index for unique reciprocal links foreach my $key ( keys %data ) { next unless ( $key =~ m/f/ ); unless ( $key =~ m/^(.*)([530]'[pfis]?)$/ ) { die "Program bug parsing key \"$key\"\n"; } my $srcend = $2; my $srccontig = $1; foreach my $edgeref (@{$data{$key}}) # [0]contig(no leading 0) [1]5'|3'|5'p|3'p|5'f|3'f|5'i... [2]readnumber [3]distance { my $bothkey = $srccontig . ":" . $edgeref->[0] . ":" . $edgeref->[3]; my $reversekey = $edgeref->[0] . ":" . $srccontig . ":" . $edgeref->[3]; if ( ( $finder{$bothkey} ) and ( $finder{$reversekey} ) ) { # if by chance multiple matches, do not merge links if ( ( scalar @{$finder{$bothkey}} == 1 ) and ( scalar @{$finder{$reversekey}} == 1 ) ) { debugmsg ( "Collapsing link \"$bothkey\" <=> \"$reversekey\"" ); my @parts = split ( /:/, $bothkey ); # look up the correct other end of this link from the other member of the reciprocal pair my $correctotherend = $finder{$reversekey}->[0]->[4]; # save new link push @{$data{$key}}, [ $edgeref->[0], $correctotherend, $edgeref->[2], $edgeref->[3] ]; # set flag on old links to indicate that they should be ignored later # the value from %finder is a reference back to the $edgeref from %data, # so here we are modifying the %data hash indirectly $finder{$bothkey}->[0]->[1] = "X"; $finder{$reversekey}->[0]->[1] = "X"; # finished processing, remove both original links from %finder hash # to avoid hitting the same link again in the reverse orientation undef ( $finder{$bothkey} ); undef ( $finder{$reversekey} ); } else { debugmsg ( "Ignoring multiple collapsible links for \"$bothkey\" or \"$reversekey\"" ); } } } # foreach $edgeref } # foreach my $key ( keys %data ) } # sub collapseflowbetween ############################################################ sub createoutputtable { ############################################################ # global variables (command line parameters) used: # $outfilename, $pairlinks, $flowthrough, $flowbetween, $scaffold # other global variables uses # %data my %alreadyprinted = (); my $printededges = 0; open ( my $OUTF, ">", $outfilename ) or die ( "Error opening output file \"$outfilename\": $!\n" ); print $OUTF join ( "\t", "#contig", "contiglen", "avg.cov.", "5'or3'", "is linked to", "5'or3'", "by read num", "contiglen", "avg.cov." ); if ( ( $pairlinks ) or ( $flowthrough ) or ( defined $flowbetween ) or ( $scaffold ) ) { print $OUTF "\tlink type"; } print $OUTF "\n"; foreach my $key ( sort { ($a=~m/^(\d+)[530]'/)[0] <=> ($b=~m/^(\d+)[530]'/)[0] } keys %data ) { unless ( $key =~ m/^(.*)([530]'[pfis]?)$/ ) { die "Program bug, unparsable key \"$key\"\n"; } my $srccontig = $1; my $srcend = $2; foreach my $edgeref (@{$data{$key}}) # [0]contig(no leading 0) [1]5'|3'|5'p|3'p|5'f|3'f|5'i... [2]readnumber { unless ( ( $alreadyprinted{$srccontig.$srcend."-".$edgeref->[0].$edgeref->[1]} ) or ( $edgeref->[1] eq "X" ) ) { print $OUTF join ( "\t", $srccontig, $contiglen[$srccontig], $contigcov[$srccontig], $srcend, $edgeref->[0], $edgeref->[1], $edgeref->[2], $contiglen[$edgeref->[0]], $contigcov[$edgeref->[0]] ); if ( ( $pairlinks ) or ( $flowthrough ) or ( defined $flowbetween ) or ( $scaffold ) ) { my $t = "direct"; if ( $srcend =~ m/p/ ) { $t = "paired-end"; } if ( $srcend =~ m/f/ ) { $t = "flowbetween"; } if ( $srcend =~ m/i/ ) { $t = "flowthrough"; } if ( $srcend =~ m/s/ ) { $t = "scaffold"; } print $OUTF "\t", $t; } print $OUTF "\n"; $alreadyprinted{$edgeref->[0].$edgeref->[1]."-".$srccontig.$srcend} = 1; # note we store in reverse orientation here $printededges++; } } # foreach my $edgeref (@{$data{$key}}) } # foreach my $arrayref ( keys %data ) debugmsg ( "Printed ".commify($printededges)." edges to output file \"$outfilename\"" ); close $OUTF; } # sub createoutputtable ############################################################ sub createabyssfile { ############################################################ # global variables (command line parameters) used: # $abyssdotfile, $pairlinks, $flowthrough, $flowbetween, $scaffold # other global variables uses # %data # collect connection data my %aedata; # direct links my %aepdata; # paired end links foreach my $key ( keys %data ) { unless ( $key =~ m/^(.*)([530]'[pfis]?)$/ ) { die "Program bug parsing key \"$key\"\n"; } my $srccontig = $1; my $srcend = $2; debugmsg ( "AE: key=\"$key\" becomes srccontig=\"$srccontig\" srcend=\"$srcend\"" ); foreach my $edgeref (@{$data{$key}}) # edgeref elements: [0]=contig(no leading 0) [1]=5'|3'|5'p|3'p|5'i... [2]=readnumber [3]=distance { if ( $edgeref->[1] eq "X" ) # masked entry, ignore it { debugmsg ( "AE: masked edgeref \"$edgeref->[0]\" \"$edgeref->[1]\" \"$edgeref->[2]\"" ); next; } if ( $key !~ m/[pfis]/ ) # direct connection { my $nreads = $edgeref->[2]; my $distance = 0; debugmsg ( "AE: direct edgeref \"$edgeref->[0]\" \"$edgeref->[1]\" \"$edgeref->[2]\"" ); # values assigned here are not currently used, just the state of being defined $aedata{$srccontig}->{$srcend}->{$edgeref->[0]}->{$edgeref->[1]} = [ $distance, $nreads ]; # $nreads would be zero for forced links # if no links otherwise show up, still need a dummy line in the .adj file $aedata{$edgeref->[0]}->{used} = 1; } elsif ( $key =~ m/p/ ) # paired end connection { my $nreads = $edgeref->[2]; my $distance = $edgeref->[3]; debugmsg ( "AE: paired-end edgeref \"$edgeref->[0]\" \"$edgeref->[1]\" \"$edgeref->[2]\" \"$edgeref->[3]\"" ); unless ( $distance ) { die "Program bug distance = \"$distance\" key $key\n"; } $aepdata{$srccontig}->{$srcend}->{$edgeref->[0]}->{$edgeref->[1]} = [ $distance, $nreads ]; $aepdata{$edgeref->[0]}->{used} = 1; } } # foreach my $edgeref (@{$data{$key}}) } # foreach my $arrayref ( keys %data ) ##### ABySS-Explorer 1.3.0 .dot output file example lines # parts of the example file SRP000220-6.dot #digraph adj { #graph [k=32] #edge [d=-31] #"18+" [l=334 C=32002] #"18-" [l=334 C=32002] #... #"1807+" -> "1811-" [d=-706] #"1807+" -> "1825+" [d=-706] # # from SRP000220-6.path1.dot # "1861+" -> "1863+" [d=35 e=1.6 n=172] # "1861+" -> "1886-" [d=-13 e=1.8 n=135] # not sure about use of this file, not loadable SRX000430-6.dist.dot #digraph dist { #graph [k=32 s=100 n=10] #"18+" -> "71-" [d=320 e=2.6 n=62] debugmsg ( "Creating ABySS-Explorer .dot file \"$abyssdotfile\"" ); open ( my $OUTF, ">", $abyssdotfile ) or die ( "Error creating ABySS-Explorer .dot file \"$abyssdotfile\": $!\n" ); # start of .dot file, the header line, I don't know if other names than "adj" are valid, I didn't check print $OUTF "digraph $dotheaderid \{\n"; print $OUTF "graph [k=", $abyssk, "]\n"; # this will be 1 because we don't use kmers print $OUTF "edge [d=", $abyssedge, "]\n"; ### Vertices { my %list; # make a list of just the contig numbers, but from both shotgun and paired end foreach my $acontig ( keys %aedata ) { if ( $aedata{$acontig}->{used} ) #$acontig =~ s/[530]'[pfis]?//; { $list{$acontig} = 1; } } foreach my $acontig ( keys %aepdata ) { if ( $aepdata{$acontig}->{used} ) { $list{$acontig} = 1; } } foreach my $acontig ( sort { $a <=> $b } keys %list ) { # coverage must be an integer for ABySS-Explorer, so round to nearest integer # multiply coverage by length because ABySS-Explorer uses kmer coverage, we need to simulate that my $avgcov = sprintf( "%0d", ( $contigcov[$acontig] * $contiglen[$acontig] ) ); # print contig here print $OUTF "\"" . $acontig . "+\" [l=" . $contiglen[$acontig] . " C=" . $avgcov . "]\n"; print $OUTF "\"" . $acontig . "-\" [l=" . $contiglen[$acontig] . " C=" . $avgcov . "]\n"; } # foreach my $acontig %data } ### adj pattern foreach my $acontig ( sort { $a <=> $b } keys %aedata ) # $acontig is just a contig number { debugmsg ( "AE: write adj data for contig \"$acontig\"" ); foreach my $fend ( sort keys %{$aedata{$acontig}} ) { next if ( $fend !~ m/\d/ ); # i.e., ne 'used' foreach my $rcontig ( sort { $a <=> $b } keys %{$aedata{$acontig}->{$fend}} ) { foreach my $rend ( sort keys %{$aedata{$acontig}->{$fend}->{$rcontig}} ) { # currently only use defined state, values ignored for direct connections # my ( $distance, $nreads ) = @{$aepdata{$acontig}->{$fend}->{$rcontig}->{$rend}}; # + and - are backwards from what looks natural, so that ABySS-Explorer # defaults to showing arrows 5' to 3' # 3'->5' = - - 3'->3' = - + # 5'->5' = + - 5'->3' = + + my $fdir = ($fend=~m/5/)?"+":"-"; my $rdir = ($rend=~m/3/)?"+":"-"; # print link here print $OUTF "\"${acontig}${fdir}\" -> \"${rcontig}${rdir}\"\n"; debugmsg ( "AE: rcontig=$rcontig pm=$rdir link is \"${acontig}${fdir}\" -> \"${rcontig}${rdir}\"" ); } # foreach $rend } # foreach $rcontig } # foreach $fend } # foreach my $acontig %aedata ### dist pattern foreach my $acontig ( sort { $a <=> $b } keys %aepdata ) { debugmsg ( "AE: write dist data for contig \"$acontig\"" ); foreach my $fend ( sort keys %{$aepdata{$acontig}} ) { next if ( $fend !~ m/\d/ ); # i.e., ne 'used' foreach my $rcontig ( sort { $a <=> $b } keys %{$aepdata{$acontig}->{$fend}} ) { if ( $fend !~ m/p/ ) { die "Program bug: \%aepdata key for $acontig has no \"p\": \"$fend\"\n"; } foreach my $rend ( sort keys %{$aepdata{$acontig}->{$fend}->{$rcontig}} ) { my ( $distance, $nreads ) = @{$aepdata{$acontig}->{$fend}->{$rcontig}->{$rend}}; unless ( $distance ) { die "Program bug, paired end distance not defined contig $acontig end $fend to end $rcontig end $rend\n"; } # 3'->5' = - - 3'->3' = - + # 5'->5' = + - 5'->3' = + + my $fdir = ($fend=~m/5/)?"+":"-"; my $rdir = ($rend=~m/3/)?"+":"-"; # special precise formatting needed by ABySS-Explorer $distance = int ( $distance + 0.5 ); # must be integer my $e = sprintf ( "%0.1f", $abyssevalue ); # units are b.p., required 1 decimal place my $n = int($nreads); # n is number of mates, must be integer, here we use the number of reads, same thing # print link here print $OUTF "\"${acontig}${fdir}\" -> \"${rcontig}${rdir}\" [d=$distance e=$e n=$n]\n"; debugmsg ( "AE: write pe link \"${acontig}${fdir}\" -> \"${rcontig}${rdir}\" [d=$distance e=$e n=$n]" ); } # foreach $rend } # foreach $rcontig } # foreach $fend } # foreach my $acontig %aepdata # end of file print $OUTF "\}\n"; close $OUTF; } # sub createabyssfile ############################################################ sub createfastafile { ############################################################ # global variables (command line parameters) used: # $indirname, $outfastaname # other global variables uses # %seen, $bpabbreviation my %fasta = (); # store all contigs in memory my $fastainfilename = $indirname; unless ( $fastainfilename =~ m/\/$/ ) { $fastainfilename .= "/"; } $fastainfilename .= $allcontigsfna; my $lines = 0; my $sequences = 0; my $seqsaved = 0; my $bpsaved = 0; my $saveflag = 0; my $id = ""; open ( my $INF, "<", $fastainfilename ) or die ( "Error opening input file \"$fastainfilename\": $!\n" ); while ( my $aline = <$INF> ) { $lines++; $aline =~ s/[\r\n]//g; if ( $aline =~ m/^>([^\s]*)/ ) { $id = $1; # up to first white space $id =~ s/contig0*//; # remove "contig" and any leading zeroes if ( $id =~ m/^\s*$/ ) { die "Error, null contig name from line $lines of file \"$fastainfilename\"\n"; } $sequences++; $saveflag = $seen{$id}; } # if if ( $saveflag ) { $fasta{$id} .= $aline . "\n"; unless ( $aline =~ m/^>/ ) { $aline =~ s/[^AaCcTtGgMmRrYyKkVvHhDdBb]//g; $bpsaved += length ( $aline ); } } # if } # while close $INF; unless ( $quiet ) { print "Input FASTA file contained ", commify($lines), " lines and ", commify($sequences), " sequences\n"; } open ( my $OUTF, ">", $outfastaname ) or die ( "Error opening output file \"$outfastaname\": $!\n" ); foreach my $acontig ( sort { $a <=> $b } keys %seen ) { my $seq = exists($fasta{$acontig}) ? $fasta{$acontig} : ""; if ( $seq ) { print $OUTF $seq; } else { print "Warning, empty sequence for contig \"$acontig\"\n"; } $seqsaved++; } close $OUTF; unless ( $quiet ) { print commify($seqsaved), " sequences, ", commify($bpsaved), " $bpabbreviation saved in \"$outfastaname\"\n"; } } # sub createfastafile ############################################################ sub debugmsg { my ( $text, $noreturn, $nolinenum ) = @_; ############################################################ if ( $debug ) { my ($package, $filename, $line, $sub) = caller(0); unless ( $nolinenum ) { $text = "Line $line: " . $text; } if ( ! ( $noreturn ) ) { $text .= "\n"; } print $text; } # if ( $debug ) } # sub debugmsg ############################################################### sub expandatprefix { my ( $string ) = @_; ############################################################### # some parameters with contigs can have "@xxx" used instead, # the text following "@" is a filename. # This file contains the parameters, which will be # substituted in. Otherwise return the string unmodified if ( $string =~ m/^\@(.*)$/ ) { my $filename = $1; open my $EFILE,"<",$filename or die ( "Error opening file \"$filename\": $!\n" ); my @contents = <$EFILE>; close $EFILE; $string = join ( ",", @contents ); $string =~ s/[\r\n\s]//g; } # if ( $string =~ m/^\@/ ) return $string; } # sub expandatprefix ############################################################### sub timestr { ############################################################### @_ = localtime(shift || time); return(sprintf("%04d/%02d/%02d %02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1])); } # sub timestr ############################################################### sub commify { ############################################################### # http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas local $_ = shift; 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; return $_; } # commify # eof =pod sample graph G { edge [len=1]; graph [overlap=none,splines=true]; node [shape=plaintext]; c12345 [label=< <TABLE BORDER="1" CELLBORDER="0" CELLSPACING="0"><TR><TD PORT="L">5'</TD><TD WIDTH="200">c12345</TD... c23456 [label=< <TABLE BORDER="1" CELLBORDER="0" CELLSPACING="0"><TR><TD PORT="L">5'</TD><TD WIDTH="10">c23456</TD>... c34567 [label=< <TABLE BORDER="1" CELLBORDER="0" CELLSPACING="0"><TR><TD PORT="L">5'</TD><TD WIDTH="1">c34567</TD><... c45678 [label=< <TABLE BORDER="1" CELLBORDER="0" CELLSPACING="0"><TR><TD PORT="L">5'</TD><TD WIDTH="100">c45678</TD... "c23456":L -- "c34567":R [label="31"]; "c34567":L -- "c45678":R [label="12"]; "c45678":L -- "c12345":R [label="18"]; "c34567":L -- "c12345":R [label="1"]; } =end