#!/usr/bin/perl -w use strict; # drop_contig_anchors.pl, last update: 7/29/03 # for each contig # identify clone anchors # get clone anchor locations # determine contig location # by Chet Langin, clangin@siu.edu # SIU Plant Biotechnology and Genome Core-facility # start timer use Time::HiRes; my $start_time = Time::HiRes::time; # subroutines sub process_contig(); sub min($ $); sub max($ $); use constant AVE_BAND_LENGTH => 3881.89350119645; # global variables my $error_counter = 0; my $relation_input_file_str = ""; my $clone_file_str = ""; my $output_file_str = ""; my $single_matches = 0; my $double_matches = 0; my @fields = 0; my $current_contig = ""; my $current_contig_start = 0; my $current_clone = ""; my $clone_start = 0; my $clone_end = 0; my $clone_midband = 0; my $clone_midband_length = 0; my $clone_mlg = ""; my $clone_midpoint = 0; my $dump_contig = 0; # boolean my @mlg_list = (); my $largest_contig = 0; # start the display print "---------\n"; print "------------------\n"; print "------------------------------------\n"; # check the parameters if($ARGV[0] && $ARGV[0] eq "1") { # a shortcut while programming $relation_input_file_str = "sorted_contig2clone_relations.txt"; $clone_file_str = "sorted_clone_anchors.txt"; $output_file_str = "contig_anchors.txt"; } # if elsif(scalar(@ARGV) != 3) { print "Usage: ./drop_contig_anchors relation_input_file clone_anchor_input_file output_file\n"; print "Example: ./drop_contig_anchors sorted_contig2clone_relations.txt sorted_clone_anchors.txt contig_anchors.txt\n"; print "------------------------------------\n"; print "------------------\n"; print "---------\n"; exit(); } # if else { $relation_input_file_str = $ARGV[0]; $clone_file_str = $ARGV[1]; $output_file_str = $ARGV[2]; } # else # open the files open(RELATION_INPUT_FILE, "<", $relation_input_file_str) or die "Cannot open relation input file: $!\n"; open(CLONE_INPUT_FILE, "<", $clone_file_str) or die "Cannot open locus input file: $!\n"; open(OUTPUT_FILE, ">", $output_file_str) or die "Cannot open output file: $!\n"; open ERROR_FILE, ">drop_contig_anchors_errors.txt" or die "Cannot open error file: $!\n"; # create a hash of the clone locations my %clone_mlg = (); my %clone_midpoint = (); while() { chomp; my @fields = split /\t/; my $mlg = $fields[0]; my $midpoint = $fields[1]; my $clone_name = $fields[2]; $clone_mlg{$clone_name} = $mlg; $clone_midpoint{$clone_name} = $midpoint; } # while close CLONE_INPUT_FILE; my $previous_contig = ""; my $previous_mlg = ""; my $start_sum = 0; my $start_divisor = 0; my $contig_end = 0; my $largest_group = 1; my $doubles_count = 0; LINE: while() { chomp; @fields = split /\t/; $current_contig = $fields[0]; $current_clone = $fields[1]; $clone_start = $fields[2]; $clone_end = $fields[3]; $clone_midband = ($clone_end + $clone_start) / 2; $clone_midband_length = $clone_midband * AVE_BAND_LENGTH; $clone_mlg = ""; $clone_midpoint = 0; if(exists($clone_midpoint{$current_clone})) { $clone_mlg = $clone_mlg{$current_clone}; $clone_midpoint = $clone_midpoint{$current_clone}; $current_contig_start = $clone_midpoint - $clone_midband_length; if($current_contig eq $previous_contig) { $dump_contig = 1 if($clone_mlg ne $previous_mlg); $start_sum += $current_contig_start; $start_divisor++; $contig_end = $clone_end if($clone_end > $contig_end); push @mlg_list, $clone_mlg; $previous_mlg = $clone_mlg; $largest_group = $start_divisor if($start_divisor > $largest_group); $doubles_count++; } # if else { # current clone does not equal previous clone &process_contig; $start_sum = $current_contig_start; $start_divisor = 1; $contig_end = $clone_end; $previous_contig = $current_contig; $previous_mlg = $clone_mlg; $dump_contig = 0; @mlg_list = (); push @mlg_list, $clone_mlg; } # else } # if else { print ERROR_FILE "$current_contig: $current_clone does not exist\n"; $error_counter++; } # else } # while &process_contig; # close the files close RELATION_INPUT_FILE; close CLONE_INPUT_FILE; close OUTPUT_FILE; close ERROR_FILE; # display output print "Largest group: $largest_group\n"; print "Largest contig: $largest_contig\n"; print "Single matches: $single_matches\n"; print "Doubles: $doubles_count\n"; print "Double matches (same MLG): $double_matches\n"; print "Errors: $error_counter\n"; # determine and display run time my $run_time = Time::HiRes::time - $start_time; print "Run time: $run_time\n"; print "------------------------------------\n"; print "------------------\n"; print "---------\n"; exit; sub process_contig() { my $start = 0; my $end_str = ""; my $end = 0; my $length = 0; if($previous_contig ne "") { # was not the first entry nor thrown out for a mismatch if($start_divisor == 1) { # a single match $single_matches++; $start = sprintf("%08.0f", $start_sum); $end = sprintf("%08.0f", $contig_end * AVE_BAND_LENGTH + $start); print OUTPUT_FILE "$previous_mlg\t$start\t$end\t$previous_contig\t$start_divisor\n"; $length = $end - $start; $largest_contig = $length if($length > $largest_contig); } # if else { # two or more matches if($dump_contig) { $error_counter += $start_divisor; for(my $lcv = 0; $lcv < $start_divisor; $lcv++) { my $mlg = shift @mlg_list; print ERROR_FILE "$previous_contig: MLG's do not match, $mlg\n"; } # for } # if else { $double_matches++; my $start_str = $start_sum / $start_divisor; $start = sprintf("%08.0f", $start_str); $end = sprintf("%08.0f", $contig_end * AVE_BAND_LENGTH + $start); print OUTPUT_FILE "$previous_mlg\t$start\t$end\t$previous_contig\t$start_divisor\n"; $length = $end - $start; $largest_contig = $length if($length > $largest_contig); } # else } # else } # if } # process_contig() sub min($ $) { if($_[0] < $_[1]) { $_[0]; } # if else { $_[1]; } # else } # min() sub max($ $) { if($_[0] > $_[1]) { $_[0]; } # if else { $_[1]; } # else } # max()