#!/usr/bin/perl -w use strict; # place_features.plx, last update: 11/07/03 # places contigs, clones, and loci into a GFF file # by Chet Langin, clangin@siu.edu # SIU Plant Biotechnology and Genome Core-facility # Table of contents # print "Getting MLG lengths...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXX # print "Dropping clone anchors...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXX # print "Determining average band size...\n"; # XXXXXXXXXXXXXXXXXXXXX # print "Dropping contig anchors...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXX # print "Spreading contigs...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # print "Loading clone comments...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXX # print "Spreading clones...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # print "Spreading loci...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # print "Creating GFF file...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX subroutines # start timer use Time::HiRes; my $start_time = Time::HiRes::time; # predeclare subroutines sub get_most_misplaced(); # for spreading contigs sub max($ $); sub min($ $); sub process_clone(); # for dropping clone anchors sub process_contig(); # for dropping contig anchors sub process_group(); # for spreading contigs sub push_left(); # for spreading contigs # global constants use constant AVE_CLONE_LENGTH => 130500; my $ave_band_length = 0; # set dynamically my $offset = 10000; # an arbitrary number # filenames -- given my $clone2locus_file_str = "given_data/sorted_clone2locus_relations.txt"; my $clone2comments_file_str = "given_data/clone2comment.txt"; my $locus2clone_file_str = "given_data/sorted_locus2clone_relations.txt"; my $contig2clone_file_str = "given_data/sorted_contig2clone_relations.txt"; my $locus_anchors_file_str = "given_data/sorted_locus_anchors.txt"; my $qtl_file_str = "given_data/qtl.gff"; my $mlg_file_str = "given_data/mlg.gff"; # filenames -- new my $loci_a_file_str = "new_data/loci_a.txt"; my $loci_b_file_str = "new_data/loci_b.txt"; my $loci_g_file_str = "new_data/loci_g.txt"; my $clone_anchors_file_str = "new_data/clone_anchors.txt"; my $clones_b1_file_str = "new_data/clones_b1.txt"; my $clones_b2_file_str = "new_data/clones_b2.txt"; my $clones_g_file_str = "new_data/clones_g.txt"; my $contig_anchors_file_str = "new_data/contig_anchors.txt"; my $contigs_b_file_str = "new_data/sorted_contigs_b.txt"; my $contigs_g_file_str = "new_data/sorted_contigs_g.txt"; my $gff_file_str = "new_data/soybean.gff"; my $sorted_clone_anchors_file_str = "new_data/sorted_clone_anchors.txt"; my $sorted_contig_anchors_file_str = "new_data/sorted_contig_anchors.txt"; my $sorted_loci_a_file_str = "new_data/sorted_loci_a.txt"; my $sorted_loci_b_file_str = "new_data/sorted_loci_b.txt"; my $sorted_loci_g_file_str = "new_data/sorted_loci_g.txt"; my $sorted_clones_b1_file_str = "new_data/sorted_clones_b1.txt"; my $sorted_clones_b2_file_str = "new_data/sorted_clones_b2.txt"; my $sorted_clones_g_file_str = "new_data/sorted_clones_g.txt"; my $dups_file_str = "new_data/dups.txt"; # filenames -- error my $clone_errors_file_str = "errors/drop_clone_anchors_errors.txt"; my $contig_errors_file_str = "errors/drop_contig_anchors_errors.txt"; my $spread_clones_errors_file_str = "errors/spread_clones_errors.txt"; # global variables my %locus_mlg = (); # locus data... my %locus_start = (); # this is used only once at the start my %locus_end = (); my %locus_midpoint = (); my $locus_mlg = ""; my $locus_start = 0; my $locus_end = 0; my $locus_midpoint = 0; my %clone_mlg = (); # clone data... my %clone_midpoint = (); # used in drop_contig_anchors and spread_loci # -- they use different data! my %clone_contig = (); # the contig, if any, that a clone is in my $clone_start = 0; # the first is anchors; # the second is placements my $clone_end = 0; my $clone_midband = 0; my $clone_midband_length = 0; my $clone_mlg = ""; my $clone_midpoint = 0; my %b1g_clones = (); my %b2g_clones = (); my %clone2comments = (); my %contig_mlg = (); # contig data... my %contig_start = (); # used in spreading clones my @contig_array = (); # used to spread contigs my $contig_counter = 0; my $contig_mlg = ""; my $contig_end = 0; my $contig_start = 0; my $contig_negatives = 0; my %lengths = (); # length variables used to determine # average band length... my $length_sum = 0; my $length_divisor = 0; my $ave_length = 0; my $q_start = 0; # to keep tracking of contig locations in Queue my $no_matches = 0; # how many contigs with no anchors my $biggest_distance = 0; # temporary variables that # may be used more than once... my $biggest_neg_distance = 0; my $comparative_displacement = 0; my $current_clone = ""; my $current_contig = ""; my $current_contig_start = 0; my $current_locus = ""; my $current_mlg = ""; my $double_matches = 0; my $doubles_count = 0; my $dump_contig = 0; # (boolean) my $error_counter = 0; my @fields = (); my @file_list = (); my $largest_contig = 0; my $largest_contig_name = ""; my %largest_end = (); my %smallest_start = (); my $largest_group = 1; my %midpoint_summation = (); my $previous_clone = ""; my $previous_contig = ""; my $previous_end = 0; my $previous_locus = ""; my $previous_mlg = ""; my $previous_start = 0; my $midpoint_sum = 0; my %midpoint_divisor = (); my $midpoint_divisor = 0; my @mlg_list = (); my $most_negative_band = 0; my $record_number = 0; my $single_matches = 0; my $start_sum = 0; my $start_divisor = 0; my $track = ""; my $dup_counter = 0; my %mlg_length = (); my $right_end_limit = 0; my $right_end_limit_exceeded = 0; # boolean my $most_misplaced = 0; # start the display print "---------\n"; print "------------------\n"; print "------------------------------------\n"; print "Getting MLG lengths...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXX open(MLG_FILE, "<", $mlg_file_str) or die "Cannot open MLG file: $!\n"; # copy each line as is to the output file while() { chomp; @fields = split /\t/; my $name = $fields[0]; my $length = $fields[4]; $mlg_length{$name} = $length; } # while close MLG_FILE; $mlg_length{"Queue"} = 100000000000; print "Dropping clone anchors...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXX # create a hash of the loci locations open(LOCUS_ANCHORS_FILE, "<", $locus_anchors_file_str) or die "Cannot open locus input file: $!\n"; while() { chomp; my @fields = split /\t/; my $mlg = $fields[0]; my $start = $fields[1]; my $end = $fields[2]; my $midpoint = $fields[3]; my $locus_name = $fields[4]; $locus_mlg{$locus_name} = $mlg; $locus_start{$locus_name} = $start; $locus_end{$locus_name} = $end; $locus_midpoint{$locus_name} = $midpoint; } # while close LOCUS_ANCHORS_FILE; # prepare the hash of clone anchors %clone_mlg = (); %clone_midpoint = (); # open some clone files and process them open(CLONE2LOCUS_FILE, "<", $clone2locus_file_str) or die "Cannot open relation input file: $!\n"; open(CLONE_ANCHORS_FILE, ">", $clone_anchors_file_str) or die "Cannot open clone anchors file: $!\n"; open(CLONE_ERRORS_FILE, ">", $clone_errors_file_str) or die "Cannot open error file: $!\n"; LINE: while() { chomp; @fields = split /\t/; $current_clone = $fields[0]; $current_locus = $fields[1]; $current_mlg = $fields[2]; $locus_mlg = ""; $locus_start = 0; $locus_end = 0; $locus_midpoint = 0; if(exists($locus_midpoint{$current_locus})) { $locus_mlg = $locus_mlg{$current_locus}; $locus_start = $locus_start{$current_locus}; $locus_end = $locus_end{$current_locus}; $locus_midpoint = $locus_midpoint{$current_locus}; if($locus_mlg ne $current_mlg) { $current_mlg = $locus_mlg; } # if if($current_clone eq $previous_clone) { if($current_mlg ne $previous_mlg) { $current_mlg = $previous_mlg; } # if $midpoint_sum += $locus_midpoint; $midpoint_divisor++; $previous_start = min($previous_start, $locus_start); $previous_end = max($previous_end, $locus_end); $largest_group = $midpoint_divisor if($midpoint_divisor > $largest_group); $doubles_count++; } # if else { # current clone does not equal previous clone &process_clone; $midpoint_sum = $locus_midpoint; $midpoint_divisor = 1; $previous_clone = $current_clone; $previous_mlg = $current_mlg; $previous_start = $locus_start; $previous_end = $locus_end; } # else } # if else { print CLONE_ERRORS_FILE "$current_clone: $current_locus does not exist\n"; $error_counter++; } # else } # while &process_clone; # close the files close CLONE2LOCUS_FILE; close CLONE_ANCHORS_FILE; close CLONE_ERRORS_FILE; system ("sort $clone_anchors_file_str > $sorted_clone_anchors_file_str"); # display clone anchor output print " Largest group: $largest_group\n"; print " Single matches: $single_matches\n"; print " Doubles: $doubles_count\n"; print " Double matches (same MLG): $double_matches\n"; print " Errors: $error_counter\n"; print "Determining average band size...\n"; # XXXXXXXXXXXXXXXXXXXXXXX open(CONTIG2CLONE_FILE, "<", $contig2clone_file_str) or die "Cannot open contig2clone file: $!\n"; while() { chomp; my @fields = split /\t/; my $start = $fields[2]; my $end = $fields[3]; my $length = $end - $start; $length_sum += $length; $length_divisor++; if(exists($lengths{$length})) { $lengths{$length}++; } # if else { $lengths{$length} = 1; } # else } # while close CONTIG2CLONE_FILE; $length_divisor = 1 if ($length_divisor == 0); $ave_length = $length_sum / $length_divisor; print " Average length: $ave_length bands\n"; $ave_band_length = AVE_CLONE_LENGTH / $ave_length; print " Average band length: $ave_band_length bases\n"; print "Dropping contig anchors...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXX # global variables $current_clone = ""; $clone_start = 0; $clone_end = 0; $clone_midband = 0; $clone_midband_length = 0; $clone_mlg = ""; $clone_midpoint = 0; $dump_contig = 0; # boolean @mlg_list = (); $largest_contig = 0; $largest_contig_name = ""; $most_negative_band = 0; # open the contig files open(CONTIG2CLONE_FILE, "<", $contig2clone_file_str) or die "Cannot open contig2clone file: $!\n"; open(CONTIG_ANCHORS_FILE, ">", $contig_anchors_file_str) or die "Cannot open contig anchors file: $!\n"; open(CONTIG_ERRORS_FILE, ">", $contig_errors_file_str) or die "Cannot open contig errors file: $!\n"; $previous_contig = ""; $previous_mlg = "unknown"; $start_sum = 0; $start_divisor = 0; $contig_end = 0; $largest_group = 1; $doubles_count = 0; %largest_end = (); %smallest_start = (); LINE: while() { chomp; @fields = split /\t/; $current_contig = $fields[0]; $current_clone = $fields[1]; $clone_start = $fields[2]; $most_negative_band = $clone_start if($clone_start < $most_negative_band); if(exists($smallest_start{$current_contig})) { $smallest_start{$current_contig} = $clone_start if($clone_start < $smallest_start{$current_contig}); } # if else { $smallest_start{$current_contig} = 0; $smallest_start{$current_contig} = $clone_start if($clone_start < $smallest_start{$current_contig}); } # else $clone_end = $fields[3]; if(exists($largest_end{$current_contig})) { $largest_end{$current_contig} = $clone_end if($clone_end > $largest_end{$current_contig}); } # if else { $largest_end{$current_contig} = $clone_end; } # else $clone_midband = ($clone_end + $clone_start) / 2; $clone_midband_length = $clone_midband * $ave_band_length; $clone_mlg = ""; $clone_midpoint = 0; # see if we have a location for the clone $clone_mlg = "unknown"; if(exists($clone_midpoint{$current_clone})) { $clone_mlg = $clone_mlg{$current_clone}; $clone_midpoint = $clone_midpoint{$current_clone}; # bases $current_contig_start = $clone_midpoint - $clone_midband_length; $doubles_count++; } # if if($current_contig eq $previous_contig) { # this is another clone in the same contig if($clone_mlg ne "unknown") { if($clone_mlg ne $previous_mlg && $previous_mlg ne "unknown") { # this is it xxx # $dump_contig = 1 } # if else { $start_sum += $current_contig_start; $start_divisor++; push @mlg_list, $clone_mlg; $previous_mlg = $clone_mlg; $largest_group = $start_divisor if($start_divisor > $largest_group); } # else } # if } # if else { # current contig does not equal previous contig &process_contig; $previous_contig = $current_contig; @mlg_list = (); if($clone_mlg eq "unknown") { $start_divisor = 0; $start_sum = 0; $previous_mlg = "unknown"; } # if else { $start_divisor = 1; $start_sum = $current_contig_start; $previous_mlg = $clone_mlg; push @mlg_list, $clone_mlg; } # else $dump_contig = 0; $contig_end = $clone_end; $previous_clone = $current_clone; } # else } # while &process_contig; # close the files close CONTIG2CLONE_FILE; close CONTIG_ANCHORS_FILE; close CONTIG_ERRORS_FILE; system "sort $contig_anchors_file_str > $sorted_contig_anchors_file_str"; # display output print " Largest group: $largest_group\n"; print " Largest contig: $largest_contig ($largest_contig_name)\n"; print " No matches: $no_matches\n"; print " Single matches: $single_matches\n"; print " Doubles: $doubles_count\n"; print " Double matches (same MLG): $double_matches\n"; print " Most negative band: $most_negative_band\n"; print " Errors: $error_counter\n"; print "Spreading contigs...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # contig_array fields # 0: mlg # 1: start # 2: end # 3: ctg # 4: number of associated clones # 5: original start # 6: boolean: is in a duplicated area # prepare variables $previous_mlg = ""; @contig_array = (); $record_number = 0; $biggest_distance = 0; $biggest_neg_distance = 0; $comparative_displacement = 0; %contig_mlg = (); %contig_start = (); $contig_counter = 0; open(CONTIG_ANCHORS_FILE, "<", $sorted_contig_anchors_file_str) or die "Cannot open contig anchors file: $!\n"; open(CONTIGS_B, ">", $contigs_b_file_str) or die "Cannot open contigs b file: $!\n"; open(CONTIGS_G, ">", $contigs_g_file_str) or die "Cannot open contigs g file: $!\n"; open(DUPS_FILE, ">", $dups_file_str) or die "Cannot open dups file: $!\n"; while() { chomp; # fields: mlg start end contig_name no_of_clone_matches my @fields = split /\t/; # process one MLG at a time my $current_mlg = $fields[0]; if($current_mlg eq $previous_mlg) { # add this record to current MLG group $record_number++; $contig_array[$record_number] = [@fields]; $contig_array[$record_number][5] = $fields[1]; $contig_array[$record_number][6] = 0; # false } # if else { # process previous MLG group if($previous_mlg ne "") { &process_group; # save to the output file for(my $lcv = 1; $lcv <= $record_number; $lcv++) { if($contig_array[$lcv][0] eq "Queue") { print CONTIGS_B "$contig_array[$lcv][0]\t$contig_array[$lcv][1]\t" . "$contig_array[$lcv][2]\t$contig_array[$lcv][3]\tBeta\n"; } # if else { print CONTIGS_G "$contig_array[$lcv][0]\t$contig_array[$lcv][1]\t" . "$contig_array[$lcv][2]\t$contig_array[$lcv][3]\tGamma\n"; } # else # store the data into hashes $contig_mlg{$contig_array[$lcv][3]} = $contig_array[$lcv][0]; $contig_start{$contig_array[$lcv][3]} = $contig_array[$lcv][1]; $contig_counter++; } # for } # if # start a new MLG group (add first record to it) $previous_mlg = $current_mlg; @contig_array = (); $record_number = 1; $contig_array[$record_number] = [@fields]; $contig_array[$record_number][5] = $fields[1]; $contig_array[$record_number][6] = 0; # false } # else } # while &process_group; # save to the output file for(my $lcv = 1; $lcv <= $record_number; $lcv++) { if($contig_array[$lcv][0] eq "Queue") { print CONTIGS_B "$contig_array[$lcv][0]\t$contig_array[$lcv][1]\t" . "$contig_array[$lcv][2]\t$contig_array[$lcv][3]\tBeta\n"; } # if else { print CONTIGS_G "$contig_array[$lcv][0]\t$contig_array[$lcv][1]\t" . "$contig_array[$lcv][2]\t$contig_array[$lcv][3]\tGamma\n"; } # else # store the data into hashes $contig_mlg{$contig_array[$lcv][3]} = $contig_array[$lcv][0]; $contig_start{$contig_array[$lcv][3]} = $contig_array[$lcv][1]; $contig_counter++; } # for close CONTIG_ANCHORS_FILE; close CONTIGS_B; close CONTIGS_G; close DUPS_FILE; print "Loading clone comments...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXX my @clone2comments_file_array = (); # open(CLONE2COMMENTS, "<", $clone2comments_file_str) # or die "Cannot open clone2comment file: $!\n"; # @clone2comments_file_array = ; # close CLONE2COMMENTS; for(my $lcv= 0; $lcv < scalar(@clone2comments_file_array); $lcv++) { my $current_line = $clone2comments_file_array[$lcv]; chomp $current_line; my @line_array = split /\t/, $current_line; my $clone_name = $line_array[0]; my $comment = $line_array[1]; if(exists($clone2comments{$line_array[0]})) { print "$line_array[0] is duplicated in comments hash\n"; } # if else { $clone2comments{$line_array[0]} = $line_array[1]; } # else } # for print "Spreading clones...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # prepare the data $previous_contig = ""; $contig_mlg = ""; $contig_start = 0; $contig_negatives = 0; %clone_mlg = (); %clone_midpoint = (); # open some more files open(CONTIG2CLONE_FILE, "<", $contig2clone_file_str) or die "Cannot open contig2clone file: $!\n"; open(SPREAD_CLONES_ERRORS_FILE, ">", $spread_clones_errors_file_str) or die "Cannot open spread clones errors file: $!\n"; open(CLONES_B2, ">", $clones_b2_file_str) or die "Cannot open clones b2 file: $!\n"; open(CLONES_G, ">", $clones_g_file_str) or die "Cannot open clones g file: $!\n"; # place each clone while() { chomp; my @fields = split /\t/; my $contig = $fields[0]; my $clone = $fields[1]; my $start_band = $fields[2]; my $end_band = $fields[3]; if($contig ne $previous_contig) { if(exists($contig_mlg{$contig})) { $contig_mlg = $contig_mlg{$contig}; $contig_start = $contig_start{$contig}; $contig_negatives = $smallest_start{$contig}; $clone_contig{$clone} = $contig; } # if else { print SPREAD_CLONES_ERRORS_FILE "No location known for $contig\n"; $contig_mlg = ""; } # else } # if if($contig_mlg ne "") { my $clone_start = sprintf("%08.0f", $contig_start + ($start_band * $ave_band_length)); my $clone_end = sprintf("%08.0f", $contig_start + (($end_band + $contig_negatives) * $ave_band_length)); # subdivide into beta 2 and gamma clones if(exists($b1g_clones{$clone})) { # is a gamma clone my $comment = "Gamma"; if(exists($clone2comments{$clone})) { $comment .= " $clone2comments{$clone}"; } # if print CLONES_G "$contig_mlg\t$clone_start\t$clone_end\t$clone\t$comment\n"; } # if else { # is a beta 2 clone my $comment = "Beta 2"; if(exists($clone2comments{$clone})) { $comment .= " $clone2comments{$clone}"; } # if print CLONES_B2 "$contig_mlg\t$clone_start\t$clone_end\t$clone\t$comment\n"; } # else # put the data into a hash for spreading the loci if(exists($clone_mlg{$clone})) { print "Clone already exists in hash: $clone\n"; } # if else { $clone_mlg{$clone} = $contig_mlg; # below measurement is in bases $clone_midpoint{$clone} = ($clone_start + $clone_end) / 2; $clone_contig{$clone} = $contig; } # else } # if $previous_contig = $contig; } # while close CONTIG2CLONE_FILE; close SPREAD_CLONES_ERRORS_FILE; close CLONES_B2; close CLONES_G; print " Processing $contig_counter contigs.\n"; system "sort $clones_b2_file_str > $sorted_clones_b2_file_str"; system "sort $clones_g_file_str > $sorted_clones_g_file_str"; print "Spreading loci...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # prepare variables $previous_locus = ""; $previous_mlg = ""; %midpoint_summation = (); # one for each contig %midpoint_divisor = (); # one for each contig %contig_mlg = (); # one for each contig in a loop below my @clone_array = (); # for storing clones without locations # determine the location for each locus open(LOCUS2CLONE_FILE, "<", $locus2clone_file_str) or die "Cannot open locus2clone file: $!\n"; open(LOCI_A, ">", $loci_a_file_str) or die "Cannot open loci a file: $!\n"; open(LOCI_B, ">", $loci_b_file_str) or die "Cannot open loci b file: $!\n"; open(LOCI_G, ">", $loci_g_file_str) or die "Cannot open loci g file: $!\n"; open(CLONES_B1, ">", $clones_b1_file_str) or die "Cannot open clones b1 file: $!\n"; while() { chomp; my @fields = split /\t/; my $current_locus = $fields[0]; my $current_clone = $fields[1]; my $current_mlg = $fields[2]; my $clone_mlg = ""; my $clone_midpoint = 0; my $clone_contig = ""; my $skip = 0; # boolean if(exists($clone_mlg{$current_clone})) { # The clone has a location, the locus can be placed as gamma $clone_mlg = $clone_mlg{$current_clone}; $clone_midpoint = $clone_midpoint{$current_clone}; $clone_contig = $clone_contig{$current_clone}; $contig_mlg{$clone_contig} = $clone_mlg; } # if else { # the clone's location is unknown, the locus can't be placed by it # the locus might be alpha or beta, though $skip = 1; } # else if($current_locus eq $previous_locus) { if($skip) { push @clone_array, $current_clone; next; } # if # two hits for the same locus # might be in the same or different contigs if(exists($midpoint_summation{$clone_contig})) { $midpoint_summation{$clone_contig} += $clone_midpoint; $midpoint_divisor{$clone_contig}++; } # if else { $midpoint_summation{$clone_contig} = $clone_midpoint; $midpoint_divisor{$clone_contig} = 1; } # else } # if else { # a new locus, save the previous one if($previous_locus ne "") { my $duplicates = 0; my $duplicate_counter = 0; my $duplicate_character = ""; $duplicates = 1 if(scalar(keys %midpoint_summation) > 1); if(%midpoint_summation) { foreach my $this_contig(keys %midpoint_summation) { my $this_mlg = $contig_mlg{$this_contig}; if($this_mlg) { my $current_midpoint = $midpoint_summation{$this_contig} / $midpoint_divisor{$this_contig}; my $start = sprintf("%08.0f", $current_midpoint); my $end = sprintf("%08.0f", $current_midpoint + 200); if($duplicates) { $duplicate_counter++; if($duplicate_counter == 1) { $duplicate_character = "a"; } # if else { $duplicate_character = chr(ord($duplicate_character) + 1); } # else my $extended_locus = $previous_locus . $duplicate_character; print LOCI_G "$this_mlg\t$start\t$end\t" . "$extended_locus\tGamma\n"; } # if else { print LOCI_G "$this_mlg\t$start\t$end\t" . "$previous_locus\tGamma\n"; } # else } # if } # foreach } # if } # if if(!%midpoint_summation) { # there were no clone hits with locations # see if this locus had an anchor if(exists($locus_mlg{$previous_locus})) { my $mlg = $locus_mlg{$previous_locus}; my $start = $locus_start{$previous_locus}; my $end = $locus_end{$previous_locus}; if(scalar(@clone_array) == 0) { # no clones were associated with this locus print LOCI_A "$mlg\t$start\t$end\t$previous_locus\tAlpha\n"; } # if else { # unlocatable clones were associated with this locus print LOCI_B "$mlg\t$start\t$end\t$previous_locus\tBeta\n"; # change its location in the hash $locus_mlg{$previous_locus} = $mlg; $locus_midpoint{$previous_locus} = $start + 100; } # else } # if } # if # do the Beta 1 clones foreach my $each_clone(@clone_array) { if(exists($locus_midpoint{$previous_locus})) { my $start = 0; my $end = 0; my $mlg = $locus_mlg{$previous_locus}; my $midpoint = $locus_midpoint{$previous_locus}; if($midpoint - 50000 < 1) { $start = $midpoint; $end = $midpoint + 100000; } # if elsif($midpoint + 50000 > $mlg_length{$mlg}) { $start = $midpoint - 100000; $end = $midpoint; } #elsif else { $start = $midpoint - 50000; $end = $midpoint + 50000; } # else print CLONES_B1 "$mlg\t$start\t$end\t$each_clone\tBeta 1\n"; } # if } # foreach $previous_locus = $current_locus; $previous_mlg = $clone_mlg; %midpoint_summation = (); %midpoint_divisor = (); %contig_mlg = (); @clone_array = (); push(@clone_array, $current_clone) if($skip); $midpoint_summation{$clone_contig} = $clone_midpoint; $midpoint_divisor{$clone_contig} = 1; $contig_mlg{$clone_contig} = $clone_mlg; } # else } # while my $duplicates = 0; my $duplicate_counter = 0; my $duplicate_character = ""; $duplicates = 1 if(scalar(keys %midpoint_summation) > 1); if(%midpoint_summation) { foreach my $this_contig(keys %midpoint_summation) { my $this_mlg = $contig_mlg{$this_contig}; if($this_mlg) { my $current_midpoint = $midpoint_summation{$this_contig} / $midpoint_divisor{$this_contig}; my $start = sprintf("%08.0f", $current_midpoint); my $end = sprintf("%08.0f", $current_midpoint + 200); if($duplicates) { $duplicate_counter++; if($duplicate_counter == 1) { $duplicate_character = "a"; } # if else { $duplicate_character = chr(ord($duplicate_character) + 1); } # else my $extended_locus = $previous_locus . $duplicate_character; print LOCI_G "$this_mlg\t$start\t$end\t" . "$extended_locus\tGamma\n"; } # if else { print LOCI_G "$this_mlg\t$start\t$end\t$previous_locus\tGamma\n"; } # else } # if } # foreach } # if if(!%midpoint_summation) { # there were no clone hits with locations # see if this locus had an anchor if(exists($locus_mlg{$previous_locus})) { my $mlg = $locus_mlg{$previous_locus}; my $start = $locus_start{$previous_locus}; my $end = $locus_end{$previous_locus}; if(scalar(@clone_array) == 0) { # no clones were associated with this locus print LOCI_A "$mlg\t$start\t$end\t$previous_locus\tAlpha\n"; } # if else { # unlocatable clones were associated with this locus print LOCI_B "$mlg\t$start\t$end\t$previous_locus\tBeta\n"; # change its location in the hash $locus_mlg{$previous_locus} = $mlg; $locus_midpoint{$previous_locus} = $start + 100; } # else } # if } # if # do the Beta 1 clones foreach my $each_clone(@clone_array) { if(exists($locus_midpoint{$previous_locus})) { my $start = 0; my $end = 0; my $mlg = $locus_mlg{$previous_locus}; my $midpoint = $locus_midpoint{$previous_locus}; if($midpoint - 50000 < 1) { $start = $midpoint; $end = $midpoint + 100000; } # if elsif($midpoint + 50000 > $mlg_length{$mlg}) { $start = $midpoint - 100000; $end = $midpoint; } #elsif else { $start = $midpoint - 50000; $end = $midpoint + 50000; } # else print CLONES_B1 "$mlg\t$start\t$end\t$each_clone\tBeta 1\n"; } # if else { # alpha clones will go here xxx } # else } # foreach close LOCUS2CLONE_FILE; close LOCI_A; close LOCI_B; close LOCI_G; close CLONES_B1; system ("sort $loci_a_file_str > $sorted_loci_a_file_str"); system ("sort $loci_b_file_str > $sorted_loci_b_file_str"); system ("sort $loci_g_file_str > $sorted_loci_g_file_str"); system "sort $clones_b1_file_str > $sorted_clones_b1_file_str"; print "Creating GFF file...\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX my $total_features = 0; open(GFF_FILE, ">", $gff_file_str) or die "Cannot open GFF output file: $!\n"; print " Current file being processed: MLG\n"; open(MLG_FILE, "<", $mlg_file_str) or die "Cannot open MLG file: $!\n"; # copy each line as is to the output file while() { print GFF_FILE; $total_features++; } # while close MLG_FILE; print GFF_FILE "Queue\tchromosome\tComponent\t1\t$q_start" . "\t.\t.\t.\tSequence \"Queue\"\n"; $total_features++; print " Current file being processed: QTL\n"; open(QTL_FILE, "<", $qtl_file_str) or die "Cannot open QTL file: $!\n"; # copy each line as is to the output file while() { print GFF_FILE; $total_features++; } # while close QTL_FILE; @file_list = ($sorted_loci_a_file_str, $sorted_loci_b_file_str, $sorted_loci_g_file_str, $sorted_clones_b1_file_str, $sorted_clones_b2_file_str, $sorted_clones_g_file_str, $contigs_b_file_str, $contigs_g_file_str, $dups_file_str ); $track = ""; my $count_negatives = 0; my $count_end_runs = 0; my $longest_end_run = 0; my $biggest_negative = 0; my $biggest_negative_name = ""; my $total_a1_contig_length = 0; my $total_contig_length = 0; my $total_clone_length = 0; foreach my $current_file(@file_list) { if($current_file =~ /loci/) { $track = "Loci"; } # if elsif($current_file =~ /clone/) { $track = "Clone"; } # elsif elsif($current_file =~ /dups/) { $track = "Dups"; } # elsif else { $track = "Ctg"; } # else print " Current file being processed: $track\n"; open(CURRENT_INPUT, "<", $current_file) or die "Cannot open $current_file: $!\n"; while() { chomp; my @fields = split /\t/; my $mlg = $fields[0]; my $start = $fields[1]; if($start < 1) { print "Negative location: $fields[3]\n"; $count_negatives++; } # if my $end = $fields[2]; if(!$mlg) { print "$current_file\n"; exit; } # if if($end > $mlg_length{$mlg}) { my $margin = $end - $mlg_length{$mlg}; print "End run: $fields[3] $margin $fields[0]\n"; $count_end_runs++; $longest_end_run = $margin if($margin > $longest_end_run); } # if if($start < $biggest_negative) { $biggest_negative = $start if($start < $biggest_negative); $biggest_negative_name = $mlg; } # if my $name = $fields[3]; my $type = $fields[4]; if($type eq " ") { print GFF_FILE "$mlg\tvarious\t$track\t$start\t$end" . "\t.\t+\t.\tSequence \"$name\" ; Note\n"; $total_features++; } # if else { print GFF_FILE "$mlg\tvarious\t$track\t$start\t$end" . "\t.\t+\t.\tSequence \"$name\" ; Note \"$type\"\n"; $total_features++; } # else if($mlg eq "A1" && $track eq "Ctg") { $total_a1_contig_length += ($end - $start); } # if $total_contig_length += ($end - $start) if($track eq "Ctg"); $total_clone_length += ($end - $start) if($track eq "Clone"); } # while close CURRENT_INPUT; } # foreach print "Total negatives: $count_negatives\n"; print "Biggest negative: $biggest_negative ($biggest_negative_name)\n"; print "Total end runs: $count_end_runs\n"; print "Longest end run: $longest_end_run\n"; print "Total A1 Contig lengths = $total_a1_contig_length\n"; print "Total contig lengths = $total_contig_length\n"; print "Total clone lengths = $total_clone_length\n"; print "Total features = $total_features\n"; close GFF_FILE; # determine and display run time my $run_time = Time::HiRes::time - $start_time; if($run_time < 60) { print "\nRun time: $run_time seconds\n"; } # if else { my $minutes = int($run_time / 60); $run_time %= 60; print "\nRun time: $minutes minutes, $run_time seconds\n"; } # else print "------------------------------------\n"; print "------------------\n"; print "---------\n"; # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX subroutines # for spreading contig sub process_group() { my $conflicts = 0; my $consecutive_conflicts = 0; my $mlg = ""; for(my $lcv = 1; $lcv < $record_number; $lcv++) { $mlg = $contig_array[$lcv][0]; return if($mlg eq "Queue"); my $start = $contig_array[$lcv][1]; if($start < $offset + 1) { my $margin = $offset + 1 - $start; $start += $margin; $contig_array[$lcv][1] += $margin; $contig_array[$lcv][2] += $margin; } # if my $end = $contig_array[$lcv][2]; my $next_start = $contig_array[$lcv + 1][1]; my $next_end = $contig_array[$lcv + 1][2]; my $overlap = $end + $offset - $next_start; if($overlap > 0) { # a conflict $conflicts++; $consecutive_conflicts++; $contig_array[$lcv + 1][1] += $overlap + $offset; $contig_array[$lcv + 1][2] += $overlap + $offset; $contig_array[$lcv][6] = 1; # set to true $contig_array[$lcv + 1][6] = 1; } # if else { # not a conflict $consecutive_conflicts = 0; } # else } # for print " $mlg conflicts: $conflicts\n"; $most_misplaced = &get_most_misplaced; # do the contigs one at a time from the right for(my $lcv = $record_number; $lcv > 0; $lcv--) { $mlg = $contig_array[$lcv][0]; $right_end_limit = $mlg_length{$mlg}; $comparative_displacement = $contig_array[$lcv][1] - $contig_array[$lcv][5]; PUSH: while($comparative_displacement > $offset || $contig_array[$lcv][2] >= $right_end_limit) { if($contig_array[$lcv][2] >= $right_end_limit) { $right_end_limit_exceeded = 1; } #if else { $right_end_limit_exceeded = 0; } # else last PUSH if(!&push_left($lcv, 0)); $comparative_displacement = $contig_array[$lcv][1] - $contig_array[$lcv][5]; } # if } # for $most_misplaced = &get_most_misplaced; # notate the duplicated areas my $begin = -1; my $end = -1; for(my $lcv = 0; $lcv < $record_number; $lcv++) { if($contig_array[$lcv][6]) { # part of a duplicated area if($begin == -1) { # the left part of a duplicated area $begin = $contig_array[$lcv][1]; $end = $contig_array[$lcv][2]; } # if else { # could be a continuation or a new duplicated area if($contig_array[$lcv - 1][2] + $offset + 1 >= $contig_array[$lcv][1]) { # continuation of a duplicated area $end = $contig_array[$lcv][2]; } # if else { # a new duplicated area # record the old one $dup_counter++; my $dup_name = "Dup$dup_counter"; print DUPS_FILE "$mlg\t$begin\t$end\t$dup_name\t \n"; # reset $begin = $contig_array[$lcv][1]; $end = $contig_array[$lcv][2]; } # else } # else } # if else { # not part of a duplicated area if($begin != -1) { # a duplicated area need to be recorded $dup_counter++; my $dup_name = "Dup$dup_counter"; print DUPS_FILE "$mlg\t$begin\t$end\t$dup_name\t \n"; # reset $begin = -1; $end = -1; } # if } # else } # for } # process_group() # for spreading contigs sub get_most_misplaced() { $biggest_distance = 0; $biggest_neg_distance = 0; my $current_winner = 0; for(my $lcv = 1; $lcv < $record_number; $lcv++) { my $distance = $contig_array[$lcv][1] - $contig_array[$lcv][5]; if($distance > $biggest_distance) { $biggest_distance = $distance; $current_winner = $lcv; } # if $biggest_neg_distance = $distance if($distance < $biggest_neg_distance); } # for print " Biggest distance: $biggest_distance $biggest_neg_distance\n"; $current_winner; } # get_most_misplaced() # for spreading contigs sub push_left() { my $ok = 0; my $proceed = 0; $proceed = 1 if($right_end_limit_exceeded); my $current_contig = $_[0]; my $maximum_displacement = $_[1]; my $current_displacement = 0; $current_displacement = abs($contig_array[$current_contig][1] - # the start $contig_array[$current_contig][5]); # the original start my $intended_displacement = 0; $intended_displacement = abs(($contig_array[$current_contig][1] - $offset) - $contig_array[$current_contig][5]); # $current_displacement = 0 if(!defined($current_displacement)); # $intended_displacement = 0 if(!defined($intended_displacement)); # exit if(!$current_displacement); if($intended_displacement < $current_displacement) { $proceed = 1; $maximum_displacement = $current_displacement if($current_displacement > $maximum_displacement); } # if elsif ($current_displacement < $maximum_displacement) { # it is going further away # but is still less than the max $proceed = 1; } # else return $ok if(!$proceed); if($current_contig == 1) { if($contig_array[$current_contig][1] > $offset * 2) { # it is the leftmost contig and there is room to move left $ok = 1; } # if } # if elsif($current_contig > 1 && $contig_array[$current_contig][1] > $contig_array[$current_contig - 1][2] + $offset) { # there is room to move without bumping into another contig $ok = 1; } # elsif elsif(&push_left($current_contig - 1, $maximum_displacement)) { # another contig can be bumped to the left $ok = 1; } # elsif # # see if this contig is moving further away instead of closer # if($ok && abs($current_displacement - $offset) > # $comparative_displacement # && !$right_end_limit_exceeded) { # $ok = 0; # } # if if($ok) { $contig_array[$current_contig][1] = sprintf("%08.0f", $contig_array[$current_contig][1] - $offset); $contig_array[$current_contig][2] = sprintf("%08.0f", $contig_array[$current_contig][2] - $offset); $contig_array[$current_contig][6] = 1; # set to true } # if $ok; } # push_left() # used for dropping contig anchors sub process_contig() { my $start = 0; my $end_str = ""; my $end = 0; my $length = 0; if($previous_contig ne "") { # was not the first entry if(exists($largest_end{$previous_contig})) { $contig_end = $largest_end{$previous_contig}; # in bands $contig_end += abs($smallest_start{$previous_contig}); } # if else { print "contig_end undefined $previous_contig\n"; exit; } # else if($start_divisor == 0 || $dump_contig) { # these contigs are not anchored $no_matches++; # below calculates the largest offset, so far $q_start += $offset; $start = $q_start; $end = sprintf("%08.0f", $contig_end * $ave_band_length + $start); $q_start = $end + 1; print CONTIG_ANCHORS_FILE "Queue\t$start\t$end\t$previous_contig\t$start_divisor\n"; if($length > $largest_contig) { $largest_contig = $length; $largest_contig_name = $previous_contig; } # if } # if elsif($start_divisor == 1) { # anchored with a single clone match $single_matches++; $start = sprintf("%08.0f", $start_sum); $end = sprintf("%08.0f", $contig_end * $ave_band_length + $start); if($end >= $mlg_length{$previous_mlg}) { my $margin = $end - $mlg_length{$previous_mlg} + 1; $start -= $margin; $end -= $margin; } # if print CONTIG_ANCHORS_FILE "$previous_mlg\t$start\t$end\t" . "$previous_contig\t$start_divisor\n"; $length = $end - $start; if($length > $largest_contig) { $largest_contig = $length; $largest_contig_name = $previous_contig; } # if } # if else { # anchored with two or more clone matches if($dump_contig) { $error_counter += $start_divisor; for(my $lcv = 0; $lcv < $start_divisor; $lcv++) { my $mlg = shift @mlg_list; print CONTIG_ERRORS_FILE "$previous_contig: MLG's do not match, $mlg\n"; } # for } # if else { $double_matches++; my $start_str = $start_sum / $start_divisor; $start = sprintf("%08.0f", $start_str); $end = sprintf ("%08.0f", ($contig_end * $ave_band_length) + $start); if($end >= $mlg_length{$previous_mlg}) { my $margin = $end - $mlg_length{$previous_mlg} + 1; $start -= $margin; $end -= $margin; } # if print CONTIG_ANCHORS_FILE "$previous_mlg\t$start\t$end\t" . "$previous_contig\t$start_divisor\n"; $length = $end - $start; if($length > $largest_contig) { $largest_contig = $length; $largest_contig_name = $previous_contig; } # if } # else } # else $b2g_clones{$previous_clone} = 1; } # if } # process_contig() # used for dropping clone anchors sub process_clone() { if($previous_clone ne "") { # was not the first entry nor thrown out for a mismatch if($midpoint_divisor == 1) { # a single match $single_matches++; print CLONE_ANCHORS_FILE "$previous_mlg\t$midpoint_sum\t$previous_clone\n"; # save the clone data to hashes $clone_mlg{$previous_clone} = $previous_mlg; $clone_midpoint{$previous_clone} = $midpoint_sum; } # if else { # two or more matches $double_matches++; my $length = $previous_end - $previous_start; if($length > 2000000) { print CLONE_ERRORS_FILE "$current_clone: $length is too long\n"; $error_counter++; } # if else { my $midpoint_str = $midpoint_sum / $midpoint_divisor; my $midpoint = sprintf("%.0f", $midpoint_str); print CLONE_ANCHORS_FILE "$previous_mlg\t$midpoint\t" . "$previous_clone\t$midpoint_divisor\n"; # save the clone data to hashes $clone_mlg{$previous_clone} = $previous_mlg; $clone_midpoint{$previous_clone} = $midpoint; } # else } # else # store as a b1 or g clone $b1g_clones{$previous_clone} = 1; } # if } # process_clone() sub min($ $) { if($_[0] < $_[1]) { $_[0]; } # if else { $_[1]; } # else } # min() sub max($ $) { if($_[0] > $_[1]) { $_[0]; } # if else { $_[1]; } # else } # max()