#!/usr/bin/perl -w use strict; # place_features.plx, last update: 8/14/03 # places contigs, clones, and loci into a GFF file # by Chet Langin, clangin@siu.edu # SIU Plant Biotechnology and Genome Core-facility # start timer use Time::HiRes; my $start_time = Time::HiRes::time; # predeclare subroutines sub get_most_misplaced(); # for spreading contigs sub max($ $); sub min($ $); sub process_clone(); # for dropping clone anchors sub process_contig(); # for dropping contig anchors sub process_group(); # for spreading contigs sub push_left(); # for spreading contigs # global constants use constant AVE_CLONE_LENGTH => 145000; my $ave_band_length = 0; # set dynamically my $offset = 0; # set dynamically # filenames -- given my $clone2locus_file_str = "given_data/sorted_clone2locus_relations.txt"; my $locus2clone_file_str = "given_data/sorted_locus2clone_relations.txt"; my $contig2clone_file_str = "given_data/sorted_contig2clone_relations.txt"; my $locus_anchors_file_str = "given_data/sorted_locus_anchors.txt"; my $qtl_file_str = "given_data/qtl.gff"; my $mlg_file_str = "given_data/mlg.gff"; # filenames -- new my $clone_anchors_file_str = "new_data/clone_anchors.txt"; my $clone_placements_file_str = "new_data/clone_placements.txt"; my $clones_b1_file_str = "new_data/clones_b1.txt"; my $clones_b2_file_str = "new_data/clones_b2.txt"; my $clones_g_file_str = "new_data/clones_g.txt"; my $contig_anchors_file_str = "new_data/contig_anchors.txt"; my $contig_placements_file_str = "new_data/sorted_contig_placements.txt"; my $gff_file_str = "new_data/soybean.gff"; my $locus_placements_file_str = "new_data/locus_placements.txt"; my $sorted_clone_anchors_file_str = "new_data/sorted_clone_anchors.txt"; my $sorted_clone_placements_file_str = "new_data/sorted_clone_placements.txt"; my $sorted_contig_anchors_file_str = "new_data/sorted_contig_anchors.txt"; my $sorted_locus_placements_file_str = "new_data/sorted_locus_placements.txt"; # filenames -- error my $clone_errors_file_str = "errors/drop_clone_anchors_errors.txt"; my $contig_errors_file_str = "errors/drop_contig_anchors_errors.txt"; my $spread_clones_errors_file_str = "errors/spread_clones_errors.txt"; # global variables my %locus_mlg = (); # locus data... my %locus_start = (); # this is used only once at the start my %locus_end = (); my %locus_midpoint = (); my $locus_mlg = ""; my $locus_start = 0; my $locus_end = 0; my $locus_midpoint = 0; my %clone_mlg = (); # clone data... my %clone_midpoint = (); # used in drop_contig_anchors and spread_loci -- use different data! my $clone_start = 0; # the first is anchors; the second is placements my $clone_end = 0; my $clone_midband = 0; my $clone_midband_length = 0; my $clone_mlg = ""; my $clone_midpoint = 0; my %b1g_clones = (); my %b2g_clones = (); my %contig_mlg = (); # contig data... my %contig_start = (); # used in spreading clones my @contig_array = (); # used to spread contigs my $contig_counter = 0; my $contig_mlg = ""; my $contig_end = 0; my $contig_start = 0; my %lengths = (); # length variables used to determine average band length... my $length_sum = 0; my $length_divisor = 0; my $ave_length = 0; my $biggest_distance = 0; # temporary variables that may be used more than once... my $biggest_neg_distance = 0; my $comparative_displacement = 0; my $current_clone = ""; my $current_contig = ""; my $current_contig_start = 0; my $current_locus = ""; my $current_mlg = ""; my $double_matches = 0; my $doubles_count = 0; my $dump_contig = 0; # (boolean) my $error_counter = 0; my @fields = (); my @file_list = (); my $largest_contig = 0; my $largest_contig_name = ""; my %largest_end = (); my $largest_group = 1; my $midpoint_summation = 0; my $previous_clone = ""; my $previous_contig = ""; my $previous_end = 0; my $previous_locus = ""; my $previous_mlg = ""; my $previous_start = 0; my $midpoint_sum = 0; my $midpoint_divisor = 0; my @mlg_list = (); my $most_negative_band = 0; my $record_number = 0; my $single_matches = 0; my $start_sum = 0; my $start_divisor = 0; my $track = ""; my $valid_locus = 1; # (boolean) # start the display print "---------\n"; print "------------------\n"; print "------------------------------------\n"; print "Dropping clone anchors...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # create a hash of the loci locations open(LOCUS_ANCHORS_FILE, "<", $locus_anchors_file_str) or die "Cannot open locus input file: $!\n"; while() { chomp; my @fields = split /\t/; my $mlg = $fields[0]; my $start = $fields[1]; my $end = $fields[2]; my $midpoint = $fields[3]; my $locus_name = $fields[4]; $locus_mlg{$locus_name} = $mlg; $locus_start{$locus_name} = $start; $locus_end{$locus_name} = $end; $locus_midpoint{$locus_name} = $midpoint; } # while close LOCUS_ANCHORS_FILE; # prepare the hash of clone anchors %clone_mlg = (); %clone_midpoint = (); # open some clone files and process them open(CLONE2LOCUS_FILE, "<", $clone2locus_file_str) or die "Cannot open relation input file: $!\n"; open(CLONE_ANCHORS_FILE, ">", $clone_anchors_file_str) or die "Cannot open clone anchors file: $!\n"; open(CLONE_ERRORS_FILE, ">", $clone_errors_file_str) or die "Cannot open error file: $!\n"; LINE: while() { chomp; @fields = split /\t/; $current_clone = $fields[0]; $current_locus = $fields[1]; $current_mlg = $fields[2]; $locus_mlg = ""; $locus_start = 0; $locus_end = 0; $locus_midpoint = 0; if(exists($locus_midpoint{$current_locus})) { $locus_mlg = $locus_mlg{$current_locus}; $locus_start = $locus_start{$current_locus}; $locus_end = $locus_end{$current_locus}; $locus_midpoint = $locus_midpoint{$current_locus}; if($locus_mlg ne $current_mlg) { print CLONE_ERRORS_FILE "$current_clone: $current_locus MLG's do not match\n"; $error_counter++; next LINE; } # if if($current_clone eq $previous_clone) { if($current_mlg ne $previous_mlg) { print CLONE_ERRORS_FILE "$current_clone: markers on different MLG's\n"; print CLONE_ERRORS_FILE "$current_clone: markers on different MLG's\n"; $error_counter++; $error_counter++; $previous_clone = ""; next LINE; } # if $midpoint_sum += $locus_midpoint; $midpoint_divisor++; $previous_start = min($previous_start, $locus_start); $previous_end = max($previous_end, $locus_end); $largest_group = $midpoint_divisor if($midpoint_divisor > $largest_group); $doubles_count++; } # if else { # current clone does not equal previous clone &process_clone; $midpoint_sum = $locus_midpoint; $midpoint_divisor = 1; $previous_clone = $current_clone; $previous_mlg = $current_mlg; $previous_start = $locus_start; $previous_end = $locus_end; } # else } # if else { print CLONE_ERRORS_FILE "$current_clone: $current_locus does not exist\n"; $error_counter++; } # else } # while &process_clone; # close the files close CLONE2LOCUS_FILE; close CLONE_ANCHORS_FILE; close CLONE_ERRORS_FILE; system ("sort $clone_anchors_file_str > $sorted_clone_anchors_file_str"); # display clone anchor output print " Largest group: $largest_group\n"; print " Single matches: $single_matches\n"; print " Doubles: $doubles_count\n"; print " Double matches (same MLG): $double_matches\n"; print " Errors: $error_counter\n"; print "Determining average band size...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX open(CONTIG2CLONE_FILE, "<", $contig2clone_file_str) or die "Cannot open contig2clone file: $!\n"; while() { chomp; my @fields = split /\t/; my $start = $fields[2]; my $end = $fields[3]; my $length = $end - $start; $length_sum += $length; $length_divisor++; if(exists($lengths{$length})) { $lengths{$length}++; } # if else { $lengths{$length} = 1; } # else } # while close CONTIG2CLONE_FILE; $length_divisor = 1 if ($length_divisor == 0); $ave_length = $length_sum / $length_divisor; print " Average length: $ave_length bands\n"; $ave_band_length = AVE_CLONE_LENGTH / $ave_length; print " Average band length: $ave_band_length bases\n"; print "Dropping contig anchors...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # global variables $current_clone = ""; $clone_start = 0; $clone_end = 0; $clone_midband = 0; $clone_midband_length = 0; $clone_mlg = ""; $clone_midpoint = 0; $dump_contig = 0; # boolean @mlg_list = (); $largest_contig = 0; $largest_contig_name = ""; $most_negative_band = 0; # open the contig files open(CONTIG2CLONE_FILE, "<", $contig2clone_file_str) or die "Cannot open contig2clone file: $!\n"; open(CONTIG_ANCHORS_FILE, ">", $contig_anchors_file_str) or die "Cannot open contig anchors file: $!\n"; open(CONTIG_ERRORS_FILE, ">", $contig_errors_file_str) or die "Cannot open contig errors file: $!\n"; $previous_contig = ""; $previous_mlg = ""; $start_sum = 0; $start_divisor = 0; $contig_end = 0; $largest_group = 1; $doubles_count = 0; %largest_end = (); LINE: while() { chomp; @fields = split /\t/; $current_contig = $fields[0]; $current_clone = $fields[1]; $clone_start = $fields[2]; $clone_end = $fields[3]; if(exists($largest_end{$current_contig})) { $largest_end{$current_contig} = $clone_end if($clone_end > $largest_end{$current_contig}); } # if else { $largest_end{$current_contig} = $clone_end; } # else $clone_midband = ($clone_end + $clone_start) / 2; $clone_midband_length = $clone_midband * $ave_band_length; $clone_mlg = ""; $clone_midpoint = 0; # see if we have a location for the clone if(exists($clone_midpoint{$current_clone})) { $clone_mlg = $clone_mlg{$current_clone}; $clone_midpoint = $clone_midpoint{$current_clone}; $current_contig_start = $clone_midpoint - $clone_midband_length; if($current_contig eq $previous_contig) { $dump_contig = 1 if($clone_mlg ne $previous_mlg); $most_negative_band = $clone_start if($clone_start < $most_negative_band); $start_sum += $current_contig_start; $start_divisor++; push @mlg_list, $clone_mlg; $previous_mlg = $clone_mlg; $largest_group = $start_divisor if($start_divisor > $largest_group); $contig_end = $clone_end if($clone_end > $contig_end); $doubles_count++; } # if else { # current contig does not equal previous contig &process_contig; $previous_contig = $current_contig; $start_divisor = 1; @mlg_list = (); $contig_end = $clone_end; $dump_contig = 0; $start_sum = $current_contig_start; $previous_mlg = $clone_mlg; $previous_clone = $current_clone; push @mlg_list, $clone_mlg; $contig_end = $clone_end; } # else } # if else { # the clone location is unknown print CONTIG_ERRORS_FILE "$current_contig: location of $current_clone unknown\n"; $error_counter++; } # else } # while &process_contig; # close the files close CONTIG2CLONE_FILE; close CONTIG_ANCHORS_FILE; close CONTIG_ERRORS_FILE; system "sort $contig_anchors_file_str > $sorted_contig_anchors_file_str"; # display output print " Largest group: $largest_group\n"; print " Largest contig: $largest_contig ($largest_contig_name)\n"; print " Single matches: $single_matches\n"; print " Doubles: $doubles_count\n"; print " Double matches (same MLG): $double_matches\n"; print " Most negative band: $most_negative_band\n"; $offset = sprintf("%.0f", abs($most_negative_band * $ave_band_length + 1000)); print " Recommended offset: $offset bases\n"; print " Errors: $error_counter\n"; print "Spreading contigs...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # contig_array fields # 0: mlg # 1: start # 2: end # 3: ctg # 4: number of associated clones # 5: original start # prepare variables $previous_mlg = ""; @contig_array = (); $record_number = 0; $biggest_distance = 0; $biggest_neg_distance = 0; $comparative_displacement = 0; %contig_mlg = (); %contig_start = (); $contig_counter = 0; open(CONTIG_ANCHORS_FILE, "<", $sorted_contig_anchors_file_str) or die "Cannot open contig anchors file: $!\n"; open(CONTIG_PLACEMENTS_FILE, ">", $contig_placements_file_str) or die "Cannot open contig placements file: $!\n"; while() { chomp; # fields: mlg start end contig_name no_of_clone_matches my @fields = split/\t/; # process one MLG at a time my $current_mlg = $fields[0]; if($current_mlg eq $previous_mlg) { # add this record to current MLG group $record_number++; $contig_array[$record_number] = [@fields]; $contig_array[$record_number][5] = $fields[1]; } # if else { # process previous MLG group if($previous_mlg ne "") { &process_group; # save to the output file for(my $lcv = 1; $lcv <= $record_number; $lcv++) { print CONTIG_PLACEMENTS_FILE "$contig_array[$lcv][0]\t$contig_array[$lcv][1]\t$contig_array[$lcv][2]\t$contig_array[$lcv][3]\tGamma\n"; # store the data into hashes $contig_mlg{$contig_array[$lcv][3]} = $contig_array[$lcv][0]; $contig_start{$contig_array[$lcv][3]} = $contig_array[$lcv][1]; $contig_counter++; } # for } # if # start a new MLG group (add first record to it) $previous_mlg = $current_mlg; @contig_array = (); $record_number = 1; $contig_array[$record_number] = [@fields]; $contig_array[$record_number][5] = $fields[1]; } # else } # while &process_group; # save to the output file for(my $lcv = 1; $lcv <= $record_number; $lcv++) { print CONTIG_PLACEMENTS_FILE "$contig_array[$lcv][0]\t$contig_array[$lcv][1]\t$contig_array[$lcv][2]\t$contig_array[$lcv][3]\tGamma\n"; # store the data into hashes $contig_mlg{$contig_array[$lcv][3]} = $contig_array[$lcv][0]; $contig_start{$contig_array[$lcv][3]} = $contig_array[$lcv][1]; $contig_counter++; } # for close CONTIG_ANCHORS_FILE; close CONTIG_PLACEMENTS_FILE; print "Spreading clones...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # prepare the data $previous_contig = ""; $contig_mlg = ""; $contig_start = 0; %clone_mlg = (); %clone_midpoint = (); # open some more files open(CONTIG2CLONE_FILE, "<", $contig2clone_file_str) or die "Cannot open contig2clone file: $!\n"; open(CLONE_PLACEMENTS_FILE, ">", $clone_placements_file_str) or die "Cannot open clone placements file: $!\n"; open(SPREAD_CLONES_ERRORS_FILE, ">", $spread_clones_errors_file_str) or die "Cannot open spread clones errors file: $!\n"; open(CLONES_B2, ">", $clones_b2_file_str) or die "Cannot open clones b2 file: $!\n"; open(CLONES_G, ">", $clones_g_file_str) or die "Cannot open clones g file: $!\n"; # place each clone while() { chomp; my @fields = split/\t/; my $contig = $fields[0]; my $clone = $fields[1]; my $start_band = $fields[2]; my $end_band = $fields[3]; if($contig ne $previous_contig) { if(exists($contig_mlg{$contig})) { $contig_mlg = $contig_mlg{$contig}; $contig_start = $contig_start{$contig}; } # if else { print SPREAD_CLONES_ERRORS_FILE "No location known for $contig\n"; $contig_mlg = ""; } # else } # if if($contig_mlg ne "") { my $clone_start = sprintf("%08.0f", $contig_start + ($start_band * $ave_band_length)); my $clone_end = sprintf("%08.0f", $contig_start + ($end_band * $ave_band_length)); print CLONE_PLACEMENTS_FILE "$contig_mlg\t$clone_start\t$clone_end\t$clone\n"; # subdivide into beta 2 and gamma clones if(exists($b1g_clones{$clone})) { # is a gamma clone print CLONES_G "$contig_mlg\t$clone_start\t$clone_end\t$clone\tGamma\n"; } # if else { # is a beta 2 clone print CLONES_B2 "$contig_mlg\t$clone_start\t$clone_end\t$clone\tBeta 2\n"; } # else # put the data into a hash for spreading the loci if(exists($clone_mlg{$clone})) { print "Clone already exists in hash: $clone\n"; } # if else { $clone_mlg{$clone} = $contig_mlg; $clone_midpoint{$clone} = ($clone_start + $clone_end) / 2; } # else } # if $previous_contig = $contig; } # while close CONTIG2CLONE_FILE; close CLONE_PLACEMENTS_FILE; close SPREAD_CLONES_ERRORS_FILE; close CLONES_B2; close CLONES_G; print " Processing $contig_counter contigs.\n"; system "sort $clone_placements_file_str > $sorted_clone_placements_file_str"; print "Spreading loci...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # prepare variables $previous_locus = ""; $previous_mlg = ""; $valid_locus = 1; $midpoint_summation = 0; $midpoint_divisor = 1; # determine the location for each locus open(LOCUS2CLONE_FILE, "<", $locus2clone_file_str) or die "Cannot open locus2clone file: $!\n"; open(LOCUS_PLACEMENTS_FILE, ">", $locus_placements_file_str) or die "Cannot open locus placements file: $!\n"; while() { chomp; my @fields = split /\t/; my $current_locus = $fields[0]; my $current_clone = $fields[1]; my $current_mlg = $fields[2]; my $clone_mlg = ""; my $clone_midpoint = 0; if(exists($clone_mlg{$current_clone})) { $clone_mlg = $clone_mlg{$current_clone}; $clone_midpoint = $clone_midpoint{$current_clone}; } # if else { next; } # else if($current_locus eq $previous_locus) { $valid_locus = 0 if($clone_mlg ne $previous_mlg); if($valid_locus) { $midpoint_summation += $clone_midpoint; $midpoint_divisor++; } # if } # if else { if($valid_locus && $previous_locus ne "") { my $current_midpoint = $midpoint_summation / $midpoint_divisor; my $start = sprintf("%08.0f", $current_midpoint - 100); my $end = sprintf("%08.0f", $current_midpoint + 100); print LOCUS_PLACEMENTS_FILE "$previous_mlg\t$start\t$end\t$previous_locus\tGamma\n"; } # if $previous_locus = $current_locus; $previous_mlg = $clone_mlg; $valid_locus = 1; $midpoint_summation = $clone_midpoint; $midpoint_divisor = 1; } # else } # while if($valid_locus) { my $current_midpoint = $midpoint_summation / $midpoint_divisor; my $start = sprintf("%08.0f", $current_midpoint - 100); my $end = sprintf("%08.0f", $current_midpoint + 100); print LOCUS_PLACEMENTS_FILE "$previous_mlg\t$start\t$end\t$previous_locus\tGamma\n"; } # if close LOCUS2CLONE_FILE; close LOCUS_PLACEMENTS_FILE; system "sort $locus_placements_file_str > $sorted_locus_placements_file_str"; print "Creating GFF file...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX open(GFF_FILE, ">", $gff_file_str) or die "Cannot open GFF output file: $!\n"; print " Current file being processed: MLG\n"; open(MLG_FILE, "<", $mlg_file_str) or die "Cannot open MLG file: $!\n"; # copy each line as is to the output file while() { print GFF_FILE; } # while close MLG_FILE; print " Current file being processed: QTL\n"; open(QTL_FILE, "<", $qtl_file_str) or die "Cannot open QTL file: $!\n"; # copy each line as is to the output file while() { print GFF_FILE; } # while close QTL_FILE; @file_list = ($sorted_locus_placements_file_str, $clones_b2_file_str, $clones_g_file_str, $contig_placements_file_str); $track = ""; foreach my $current_file(@file_list) { if($current_file =~ /locus/) { $track = "Loci"; } # if elsif($current_file =~ /clone/) { $track = "Clone"; } # elsif else { $track = "Ctg"; } # else print " Current file being processed: $track\n"; open(CURRENT_INPUT, "<", $current_file) or die "Cannot open $current_file: $!\n"; while() { chomp; my @fields = split /\t/; my $mlg = $fields[0]; my $start = $fields[1]; my $end = $fields[2]; my $name = $fields[3]; my $type = $fields[4]; print GFF_FILE "$mlg\tvarious\t$track\t$start\t$end\t.\t+\t.\tSequence \"$name\" ; Note \"$type\"\n"; } close CURRENT_INPUT; } # foreach close GFF_FILE; # determine and display run time my $run_time = Time::HiRes::time - $start_time; print "\nRun time: $run_time seconds\n"; print "------------------------------------\n"; print "------------------\n"; print "---------\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX subroutines # for spreading contig sub process_group() { my $conflicts = 0; my $consecutive_conflicts = 0; my $mlg = ""; for(my $lcv = 1; $lcv < $record_number; $lcv++) { $mlg = $contig_array[$lcv][0]; my $start = $contig_array[$lcv][1]; my $end = $contig_array[$lcv][2]; my $next_start = $contig_array[$lcv + 1][1]; my $next_end = $contig_array[$lcv + 1][2]; my $overlap = $end + $offset - $next_start; if($overlap > 0) { # a conflict $conflicts++; $consecutive_conflicts++; $contig_array[$lcv + 1][1] += $overlap + $offset; $contig_array[$lcv + 1][2] += $overlap + $offset; } # if else { # not a conflict $consecutive_conflicts = 0; } # else } # for print " $mlg conflicts: $conflicts\n"; my $most_misplaced = &get_most_misplaced; # do the contigs one at a time from the right for(my $lcv = $record_number; $lcv > 0; $lcv--) { $comparative_displacement = $contig_array[$lcv][1] - $contig_array[$lcv][5]; PUSH: while($comparative_displacement > $offset) { last PUSH if(!&push_left($lcv)); $comparative_displacement = $contig_array[$lcv][1] - $contig_array[$lcv][5]; } # if } # for $most_misplaced = &get_most_misplaced; } # process_group() # for spreading contigs sub get_most_misplaced() { $biggest_distance = 0; $biggest_neg_distance = 0; my $current_winner = 0; for(my $lcv = 1; $lcv < $record_number; $lcv++) { my $distance = $contig_array[$lcv][1] - $contig_array[$lcv][5]; if($distance > $biggest_distance) { $biggest_distance = $distance; $current_winner = $lcv; } # if $biggest_neg_distance = $distance if($distance < $biggest_neg_distance); } # for print " Biggest distance: $biggest_distance $biggest_neg_distance\n"; $current_winner; } # get_most_misplaced() # for spreading contigs sub push_left() { my $ok = 0; my $current_contig = $_[0]; if($current_contig == 1) { if($contig_array[$current_contig][1] > $offset) { # it is the leftmost contig and there is room to move left $ok = 1; } # if } # if elsif($current_contig > 1 && $contig_array[$current_contig][1] > $contig_array[$current_contig - 1][2] + $offset) { # there is room to move without bumping into another contig $ok = 1; } # elsif elsif(&push_left($current_contig - 1)) { # another contig can be bumped to the left $ok = 1; } # elsif my $current_displacement = $contig_array[$current_contig][1] - $contig_array[$current_contig][5]; if($ok && abs($current_displacement - $offset) > $comparative_displacement) { $ok = 0; } # if if($ok) { $contig_array[$current_contig][1] = sprintf("%08.0f", $contig_array[$current_contig][1] - $offset); $contig_array[$current_contig][2] = sprintf("%08.0f", $contig_array[$current_contig][2] - $offset); } # if $ok; } # push_left() # used for dropping contig anchors sub process_contig() { my $start = 0; my $end_str = ""; my $end = 0; my $length = 0; if($previous_contig ne "") { # was not the first entry nor thrown out for a mismatch if(exists($largest_end{$previous_contig})) { $contig_end = $largest_end{$previous_contig}; } # if if($start_divisor == 1) { # a single match $single_matches++; $start = sprintf("%08.0f", $start_sum); $end = sprintf("%08.0f", $contig_end * $ave_band_length + $start); print CONTIG_ANCHORS_FILE "$previous_mlg\t$start\t$end\t$previous_contig\t$start_divisor\n"; $length = $end - $start; if($length > $largest_contig) { $largest_contig = $length; $largest_contig_name = $previous_contig; } # if } # if else { # two or more matches if($dump_contig) { $error_counter += $start_divisor; for(my $lcv = 0; $lcv < $start_divisor; $lcv++) { my $mlg = shift @mlg_list; print CONTIG_ERRORS_FILE "$previous_contig: MLG's do not match, $mlg\n"; } # for } # if else { $double_matches++; my $start_str = $start_sum / $start_divisor; $start = sprintf("%08.0f", $start_str); $end = sprintf("%08.0f", ($contig_end * $ave_band_length) + $start); print CONTIG_ANCHORS_FILE "$previous_mlg\t$start\t$end\t$previous_contig\t$start_divisor\n"; $length = $end - $start; if($length > $largest_contig) { $largest_contig = $length; $largest_contig_name = $previous_contig; } # if } # else } # else $b2g_clones{$previous_clone} = 1; } # if } # process_contig() # used for dropping clone anchors sub process_clone() { if($previous_clone ne "") { # was not the first entry nor thrown out for a mismatch if($midpoint_divisor == 1) { # a single match $single_matches++; print CLONE_ANCHORS_FILE "$previous_mlg\t$midpoint_sum\t$previous_clone\n"; # save the clone data to hashes $clone_mlg{$previous_clone} = $previous_mlg; $clone_midpoint{$previous_clone} = $midpoint_sum; } # if else { # two or more matches $double_matches++; my $length = $previous_end - $previous_start; if($length > 2000000) { print CLONE_ERRORS_FILE "$current_clone: $length is too long\n"; $error_counter++; } # if else { my $midpoint_str = $midpoint_sum / $midpoint_divisor; my $midpoint = sprintf("%.0f", $midpoint_str); print CLONE_ANCHORS_FILE "$previous_mlg\t$midpoint\t$previous_clone\t$midpoint_divisor\n"; # save the clone data to hashes $clone_mlg{$previous_clone} = $previous_mlg; $clone_midpoint{$previous_clone} = $midpoint; } # else } # else # store as a b1 or g clone $b1g_clones{$previous_clone} = 1; } # if } # process_clone() sub min($ $) { if($_[0] < $_[1]) { $_[0]; } # if else { $_[1]; } # else } # min() sub max($ $) { if($_[0] > $_[1]) { $_[0]; } # if else { $_[1]; } # else } # max()