#!/usr/bin/perl -w use strict; # drop_clone_anchors.pl, last update: 7/28/03 # for each clone # identify loci anchors # get loci anchor locations # determine clone location # by Chet Langin, clangin@siu.edu # SIU Plant Biotechnology and Genome Core-facility # start timer use Time::HiRes; my $start_time = Time::HiRes::time; # subroutines sub process_clone(); sub min($ $); sub max($ $); # global variables my $error_counter = 0; my $relation_input_file_str = ""; my $locus_file_str = ""; my $output_file_str = ""; my $single_matches = 0; my $double_matches = 0; my @fields = 0; my $current_clone = ""; my $current_locus = ""; my $current_mlg = ""; my $locus_mlg = ""; my $locus_start = 0; my $locus_end = 0; my $locus_midpoint = 0; # start the display print "---------\n"; print "------------------\n"; print "------------------------------------\n"; # check the parameters if($ARGV[0] eq "1") { # a shortcut while programming $relation_input_file_str = "sorted_clone2locus_relations.txt"; $locus_file_str = "sorted_locus_anchors.txt"; $output_file_str = "clone_anchors.txt"; } # if elsif(scalar(@ARGV) != 3) { print "Usage: ./drop_clone_anchors relation_input_file locus_anchor_input_file output_file\n"; print "Example: ./drop_clone_anchors sorted_clone2locus_relations.txt sorted_locus_anchors.txt clone_anchors.txt\n"; print "------------------------------------\n"; print "------------------\n"; print "---------\n"; exit(); } # if else { $relation_input_file_str = $ARGV[0]; $locus_file_str = $ARGV[1]; $output_file_str = $ARGV[2]; } # else # open the files open(RELATION_INPUT_FILE, "<", $relation_input_file_str) or die "Cannot open relation input file: $!\n"; open(LOCUS_INPUT_FILE, "<", $locus_file_str) or die "Cannot open locus input file: $!\n"; open(OUTPUT_FILE, ">", $output_file_str) or die "Cannot open output file: $!\n"; open ERROR_FILE, ">drop_clone_anchors_errors.txt" or die "Cannot open error file: $!\n"; # create a hash of the loci locations my %locus_mlg = (); my %locus_start = (); my %locus_end = (); my %locus_midpoint = (); while() { chomp; my @fields = split /\t/; my $mlg = $fields[0]; my $start = $fields[1]; my $end = $fields[2]; my $midpoint = $fields[3]; my $locus_name = $fields[4]; $locus_mlg{$locus_name} = $mlg; $locus_start{$locus_name} = $start; $locus_end{$locus_name} = $end; $locus_midpoint{$locus_name} = $midpoint; } # while close LOCUS_INPUT_FILE; my $previous_clone = ""; my $previous_mlg = ""; my $previous_start = 0; my $previous_end = 0; my $midpoint_sum = 0; my $midpoint_divisor = 0; my $largest_group = 1; my $doubles_count = 0; LINE: while() { chomp; @fields = split /\t/; $current_clone = $fields[0]; $current_locus = $fields[1]; $current_mlg = $fields[2]; $locus_mlg = ""; $locus_start = 0; $locus_end = 0; $locus_midpoint = 0; if(exists($locus_midpoint{$current_locus})) { $locus_mlg = $locus_mlg{$current_locus}; $locus_start = $locus_start{$current_locus}; $locus_end = $locus_end{$current_locus}; $locus_midpoint = $locus_midpoint{$current_locus}; if($locus_mlg ne $current_mlg) { print ERROR_FILE "$current_clone: $current_locus MLG's do not match\n"; $error_counter++; next LINE; } # if if($current_clone eq $previous_clone) { if($current_mlg ne $previous_mlg) { print ERROR_FILE "$current_clone: markers on different MLG's\n"; print ERROR_FILE "$current_clone: markers on different MLG's\n"; $error_counter++; $error_counter++; $previous_clone = ""; next LINE; } # if $midpoint_sum += $locus_midpoint; $midpoint_divisor++; $previous_start = min($previous_start, $locus_start); $previous_end = max($previous_end, $locus_end); $largest_group = $midpoint_divisor if($midpoint_divisor > $largest_group); $doubles_count++; } # if else { # current clone does not equal previous clone &process_clone; $midpoint_sum = $locus_midpoint; $midpoint_divisor = 1; $previous_clone = $current_clone; $previous_mlg = $current_mlg; $previous_start = $locus_start; $previous_end = $locus_end; } # else } # if else { print ERROR_FILE "$current_clone: $current_locus does not exist\n"; $error_counter++; } # else } # while &process_clone; # close the files close RELATION_INPUT_FILE; close LOCUS_INPUT_FILE; close OUTPUT_FILE; close ERROR_FILE; # display output print "Largest group: $largest_group\n"; print "Single matches: $single_matches\n"; print "Doubles: $doubles_count\n"; print "Double matches (same MLG): $double_matches\n"; print "Errors: $error_counter\n"; # determine and display run time my $run_time = Time::HiRes::time - $start_time; print "Run time: $run_time\n"; print "------------------------------------\n"; print "------------------\n"; print "---------\n"; exit; sub process_clone() { if($previous_clone ne "") { # was not the first entry nor thrown out for a mismatch if($midpoint_divisor == 1) { # a single match $single_matches++; print OUTPUT_FILE "$previous_mlg\t$midpoint_sum\t$previous_clone\n"; } # if else { # two or more matches $double_matches++; my $length = $previous_end - $previous_start; if($length > 2000000) { print ERROR_FILE "$current_clone: $length is too long\n"; $error_counter++; } # if else { my $midpoint_str = $midpoint_sum / $midpoint_divisor; my $midpoint = sprintf("%.0f", $midpoint_str); print OUTPUT_FILE "$previous_mlg\t$midpoint\t$previous_clone\t$midpoint_divisor\n"; } # else } # else } # if } # process_clone() sub min($ $) { if($_[0] < $_[1]) { $_[0]; } # if else { $_[1]; } # else } # min() sub max($ $) { if($_[0] > $_[1]) { $_[0]; } # if else { $_[1]; } # else } # max()