package UpdateContigAnchors; =head1 NAME UpdateContigAnchors -- Updates the contig anchors in the contig_anchors table so that differences involved with multiple anchors are resolved. =head1 SYNOPSIS UpdateContigAnchors -> new; =head1 DESCRIPTION UpdateContigAnchors -- Updates the contig anchors in the contig_anchors table so that differences involved with multiple anchors are resolved. The updating is done via a file, ../working_data/contig_anchors.txt, provided by the user. This is a tab-delimited text file with the first column being the contig name, the second column being the anchor, a location, the third column indicates the index number of a duplicated clone with multiple references, the fourth column indicates the total number of duplications for a clone, and additional columns provide more information for the benefit of the user on direct inspection. =head1 VERSION 0.001 (last update: 6/30/04) =head1 AUTHOR Chet Langin, clangin@siu.edu SIU Plant Biotechnology and Genomics Core-facility =head1 BUGS None known. =head1 SEE ALSO extropy ExtropyConstants ExtropyUtils Extropy::MenuMain =head1 COPYRIGHT Copyright 2004, Chet Langin, All Rights Reserved. This program is free software. You may copy or redistribute it under the same terms as Perl itself. =head1 METHODS The remainder of this document describes the methods available to the programmer. =cut # load the pragmas use warnings; use strict; # load other modules use ExtropyConstants; use ExtropyUtils; =head2 new() UpdateContigAnchors->new; Updates the contig anchors in the contig_anchors table so that they resolve multiple references to anchors. =cut # package variables my @contig_array = (); my @anchor_array = (); # ******************************** new ****************************** sub new { my $self = shift; my $configuration = shift; my $db_manager = shift; my $project = shift; my $number_lines = 0; message_start; if($project->{current_project} eq "") { message("You must activate a project, first."); } # if elsif(!$project->{contig_anchors_counted}) { message("You must count contig anchors, first."); } # elsif else { # The updates can be made message("Connecting to the database."); $db_manager->connect; # force unbuffered output my $old_fh = select(STDOUT); $| = 1; select($old_fh); # Open and read the update file message("Opening ../working_data/contig_anchors.txt."); my $open_error = FALSE; open(INPUT_FILE, "<", "../working_data/contig_anchors.txt") or $open_error = TRUE; if($open_error) { message_start; message("Could not open ../working_data/contig_anchors.txt"); message("$!"); } # if else { # input file was opened ok. message("Deleting contig_anchors table for new data"); $db_manager->execute("delete from contig_anchors"); message("Obtaining data for contigs with single anchors."); my $sth1 = $db_manager->execute("select contigs.ctg, clone_anchors.mlg, clone_anchors.anchor, contigs.start, contigs.length, contig2clone.start, contig2clone.length from contigs, clone_anchors, contig2clone where contigs.anchors = 1 and contigs.ctg = contig2clone.ctg and contig2clone.clone = clone_anchors.clone"); message("Saving the single-anchored contig data into contig_anchors."); my @row1 = (); while(@row1 = $sth1->fetchrow_array()) { my $db_eid = $db_manager->{dbh}->quote(""); my $ctg = shift @row1; my $db_contig = $db_manager->{dbh}->quote($ctg); my $db_mlg = $db_manager->{dbh}->quote(shift @row1); my $anchor = shift @row1; my $db_anchor = $db_manager->{dbh}->quote($anchor); # To determine the displacement of a contig my $ctg_start = shift @row1; my $ctg_length = shift @row1; my $clone_start = shift @row1; my $clone_length = shift @row1; my $clone_mid = clone_mid($anchor, $clone_start, $ctg_start, $clone_length, $project); # determine the start and end my $base_length = int($ctg_length * $project->{band_factor}); my $start = $anchor - $clone_mid; my $end = $start + $base_length; # determine the reverse start and end my $rev_end = $anchor + $clone_mid; my $rev_start = $rev_end - $base_length; if($start < 0) { my $offset = ((-1) * $start) + 1; $start += $offset; $end += $offset; } # if if($rev_start < 0) { my $offset = ((-1) * $rev_start) + 1; $rev_start += $offset; $rev_end += $offset; } # if my $db_dup = $db_manager->{dbh}->quote("1"); $db_manager->execute("insert into contig_anchors values($db_eid, $db_contig, $db_mlg, $db_anchor, '$start', '$end', '$rev_start', '$rev_end', -1, -1, $db_dup, $db_dup, 'no')"); } # while message("Getting and saving data from the input file."); my @file_array = ; close INPUT_FILE; my $number_lines = scalar(@file_array); message("Placing contigs with multiple anchors"); # look at each line in the input file for(my $current_line=0; $current_line < $number_lines; $current_line++) { my $this_line = $file_array[$current_line]; chomp $this_line; my @field_array = split /\t/, $this_line; my $db_eid = $db_manager->{dbh}->quote(""); my $ctg = $field_array[0]; # next if($ctg ne 'ctg9406'); my $db_contig = $db_manager->{dbh}->quote($field_array[0]); my $num_anchors = $field_array[1]; my $db_dup1 = $db_manager->{dbh}->quote($field_array[2]); my $db_dup2 = $db_manager->{dbh}->quote($field_array[3]); my $clone_name = $field_array[4]; my $db_clone = $db_manager->{dbh}->quote($clone_name); my $db_mlg = $db_manager->{dbh}->quote($field_array[5]); my $anchor = $field_array[6]; my $db_anchor = $db_manager->{dbh}->quote($field_array[6]); my $flip = 'no'; # Mark and skip the bad relationships if($field_array[2] == 0) { # this was determined to be a bad relationship $db_manager->execute("update contig2clone set valid = 'no' where ctg = $db_contig and clone = $db_clone"); next; } # if # get the ctg_length and determine the start and end my $sth1b = $db_manager->execute("select start, length from contigs where ctg = '$ctg'"); my @row1b = (); my $ctg_start = -1; my $ctg_length = -1; while(@row1b = $sth1b->fetchrow_array()) { $ctg_start = shift @row1b; $ctg_length = shift @row1b; } # while # get the clone_start and clone_length my $sth1c = $db_manager->execute("select start, length from contig2clone where clone = '$clone_name'"); my @row1c = (); my $clone_start = -1; my $clone_length = -1; while(@row1c = $sth1c->fetchrow_array()) { $clone_start = shift @row1c; $clone_length = shift @row1c; } # while my $clone_mid = clone_mid($anchor, $clone_start, $ctg_start, $clone_length, $project); my $displacement = -1; my $rev_displacement = -1; if($num_anchors == 1) { $displacement = $anchor - $clone_mid; # for only one anchor $rev_displacement = $anchor + $clone_mid - int($ctg_length * $project->{band_factor}); #convert $ctg_length from bands to bases $ctg_length = int($ctg_length * $project->{band_factor}); # if($ctg eq 'ctg9406') { # print "clone start: $clone_start\n"; # print "contig start: $ctg_start\n"; # print "clone length: $clone_length\n"; # print "clone mid: $clone_mid\n"; # print "reverse: $rev_displacement\n"; # } # if } # if else { # There is more than one clone anchor # Fill the anchor array # 0: clone_mid # 1: anchor @anchor_array = (); # Index 0 will not be used $anchor_array[0][0] = -1; $anchor_array[0][1] = -1; $anchor_array[1][0] = $clone_mid; $anchor_array[1][1] = $anchor; # get the remaining anchors print"."; for(my $lcv = 2; $lcv <= $num_anchors; $lcv++) { $current_line++; my $this_line = $file_array[$current_line]; chomp $this_line; my @field_array = split /\t/, $this_line; my $clone_name = $field_array[4]; my $anchor = $field_array[6]; # get the clone_start and clone_length my $sth1d = $db_manager->execute("select start, length from contig2clone where clone = '$clone_name'"); my @row1d = (); my $clone_start = -1; my $clone_length = -1; while(@row1d = $sth1d->fetchrow_array()) { $clone_start = shift @row1d; $clone_length = shift @row1d; } # while my $clone_mid = clone_mid($anchor, $clone_start, $ctg_start, $clone_length, $project); $anchor_array[$lcv][0] = $clone_mid; $anchor_array[$lcv][1] = $anchor; } # for #convert $ctg_length from bands to bases $ctg_length = int($ctg_length * $project->{band_factor}); # get the displacement (starting point of the ctg) $flip = 'no'; $displacement = get_displacement($ctg_start, $ctg_length, $num_anchors, $ctg, \$flip); # above sub also uses @anchor_array # Resolve the reverse displacement # See if all of the anchors are in the same location my $comparison_anchor = $anchor_array[1][1]; my $the_same = TRUE; for(my $lcv2 = 2; $lcv2 <= $num_anchors; $lcv2++) { if($anchor_array[$lcv2][1] != $comparison_anchor) { $the_same = FALSE; } # if } # for if($the_same) { $rev_displacement = ($comparison_anchor + ($comparison_anchor - $displacement)) - $ctg_length; } # if else { $rev_displacement = $displacement; } # else } # else my $base_length = $ctg_length; my $start = $displacement; my $end = $start + $base_length; my $rev_start = $rev_displacement; my $rev_end = $rev_start + $base_length; if($start < 0) { my $offset = ((-1) * $start) + 1; $start += $offset; $end += $offset; } # if if($rev_start < 0) { my $offset = ((-1) * $start) + 1; $rev_start += $offset; $rev_end += $offset; } # if # Save the data in the contig_anchors table $db_manager->execute("insert into contig_anchors values($db_eid, $db_contig, $db_mlg, $db_anchor, '$start', '$end', '$rev_start', '$rev_end', -1, -1, $db_dup1, $db_dup2, '$flip')"); } # for print "\n"; message("Clearing contigs_onQ table for new data"); $db_manager->execute("delete from contigs_onQ"); message("Assigning contigs to Queue"); my $current_location = OFFSET; my $sth2 = $db_manager->execute("select ctg, length from contigs where type = 'beta' order by ctg"); my @row2 = (); while(@row2 = $sth2->fetchrow_array()) { my $ctg = shift @row2; my $band_length = shift @row2; my $base_length = int($band_length * $project->{band_factor}); my $start = $current_location; $current_location += $base_length; my $end = $current_location; $current_location += OFFSET; $db_manager->execute("insert into contigs_onQ values('', '$ctg', '$start', '$end')"); } # while # The alpha clones are put into Queue a few lines below message("Getting the beta loci"); my $sth4 = $db_manager->execute("select distinct locus from clone2locus3"); message("Marking the beta loci"); my @row4 = (); while(@row4 = $sth4->fetchrow_array()) { my $locus = shift @row4; $db_manager->execute("update loci set type = 'beta' where locus = '$locus'"); } # while message("Getting the gamma loci"); my $sth5 = $db_manager->execute("select loci.locus from loci, clone2locus3, contig2clone where loci.type = 'beta' and loci.locus = clone2locus3.locus and clone2locus3.clone = contig2clone.clone"); message("Marking the gamma loci"); my @row5 = (); while(@row5 = $sth5->fetchrow_array()) { my $locus = shift @row5; $db_manager->execute("update loci set type = 'gamma' where locus = '$locus'"); } # while message("Getting the beta 1 clones"); my $sth6 = $db_manager->execute("select distinct clone from clone2locus3"); message("Marking the beta 1 clones"); my @row6 = (); while(@row6 = $sth6->fetchrow_array()) { my $clone = shift @row6; $db_manager->execute("update clones set type = 'beta 1' where clone = '$clone'"); } # while message("Getting the gamma clones"); my $sth7 = $db_manager->execute("select distinct contig2clone.clone from clones, contig2clone where clones.type = 'beta 1' and clones.clone = contig2clone.clone"); message("Marking the gamma clones"); my @row7 = (); while(@row7 = $sth7->fetchrow_array()) { my $clone = shift @row7; $db_manager->execute("update clones set type = 'gamma' where clone = '$clone'"); } # while message("Getting the beta 2 clones"); my $sth8 = $db_manager->execute("select distinct contig2clone.clone from clones, contig2clone where clones.type = 'alpha' and clones.clone = contig2clone.clone"); message("Marking the beta 2 clones"); my @row8 = (); while(@row8 = $sth8->fetchrow_array()) { my $clone = shift @row8; $db_manager->execute("update clones set type = 'beta 2' where clone = '$clone'"); } # while message("Deleting clone_locations table info for new input"); $db_manager->execute("delete from clone_locations"); message("Assigning clones to Queue"); # xxx my $sth3 = $db_manager->execute("select clone from clones where type = 'alpha' order by clone"); my @row3 = (); while(@row3 = $sth3->fetchrow_array()) { my $clone = shift @row3; my $start = $current_location; $current_location += AVE_CLONE_LENGTH; my $end = $current_location; $current_location += OFFSET; $db_manager->execute("insert into clone_locations values('', '$clone', 'Queue', '$start', '$end', 'alpha')"); } # while $current_location += OFFSET * 9; $db_manager->execute("update mlg set length = '$current_location' where mlg = 'Queue'"); message("Queue is $current_location bases long"); ##################################### ###### Spreading the contigs ###### ##################################### message("Spreading the contigs"); @contig_array = (); # contig_array fields: # 0: mlg # 1: start # 2: end # 3: ctg # 4: number of associated clones # 5: original start # 6: boolean: is in a duplicated area, initialized to FALSE # fill the contig_array one MLG at a time my @mlgs = qw/A1 A2 B1 B2 C1 C2 D1AQ D1BW D2 E F G H I J K L M N O/; for(my $mlg_counter = 0; $mlg_counter < 20; $mlg_counter++) { # get the mlg, start, end, ctg, no_of_clone_matches for each ctg in the MLG my $mlg = $mlgs[$mlg_counter]; my $record_number = 0; # counting starts at 1 message("Getting the contig info for $mlg"); my $sth9 = $db_manager->execute("select mlg, start, end, ctg from contig_anchors where mlg = '$mlg' order by start"); my @row9 = (); while(@row9 = $sth9->fetchrow_array()) { $record_number++; # counting starts at 1 my $mlg = shift @row9; my $start = shift @row9; my $end = shift @row9; my $ctg = shift @row9; $contig_array[$record_number][0] = $mlg; $contig_array[$record_number][1] = $start; $contig_array[$record_number][2] = $end; $contig_array[$record_number][3] = $ctg; $contig_array[$record_number][4] = -1; $contig_array[$record_number][5] = $start; $contig_array[$record_number][6] = FALSE; } # while message("Getting the length of $mlg"); my $sth10 = $db_manager->execute("select length from mlg where mlg = '$mlg'"); my @row10 = (); my $right_end_limit = 0; while(@row10 = $sth10->fetchrow_array()) { $right_end_limit = shift @row10; } # while # call the subroutine to spread the contigs out message("Spreading $mlg"); process_group($record_number, $right_end_limit); # save the results to contig_anchors message("Saving spreads to $mlg"); for(my $lcv = 1; $lcv <= $record_number; $lcv++) { my $mlg = $contig_array[$lcv][0]; my $start = $contig_array[$lcv][1]; my $end = $contig_array[$lcv][2]; my $ctg = $contig_array[$lcv][3]; my $old_start = $contig_array[$lcv][5]; $db_manager->execute("update contig_anchors set spread_start = '$start', spread_end = '$end' where ctg = '$ctg' and mlg = '$mlg' and start = '$old_start'"); } # for } # for (each MLG) $project->{contig_anchors_updated} = TRUE; message("Disconnecting from the database."); $db_manager->disconnect; } # else } # else press_enter; } # new ####################################### ######## Subroutines ################## ####################################### # for spreading contig sub process_group() { my $record_number = shift; my $right_end_limit = shift; my $comparative_displacement = 0; my $right_end_limit_exceeded = 0; my $dup_counter = 0; my $conflicts = 0; my $consecutive_conflicts = 0; my $mlg = ""; for(my $lcv = 1; $lcv < $record_number; $lcv++) { $mlg = $contig_array[$lcv][0]; return if($mlg eq "Queue"); my $start = $contig_array[$lcv][1]; if($start < OFFSET + 1) { my $margin = OFFSET + 1 - $start; $start += $margin; $contig_array[$lcv][1] += $margin; $contig_array[$lcv][2] += $margin; } # if my $end = $contig_array[$lcv][2]; my $next_start = $contig_array[$lcv + 1][1]; my $next_end = $contig_array[$lcv + 1][2]; my $overlap = $end + OFFSET - $next_start; if($overlap > 0) { # my $other_ctg = $contig_array[$lcv][3]; # my $ctg = $contig_array[$lcv + 1][3]; # print "here $other_ctg\n" if($ctg eq 'ctg1137'); # a conflict $conflicts++; $consecutive_conflicts++; $contig_array[$lcv + 1][1] += $overlap + OFFSET; $contig_array[$lcv + 1][2] += $overlap + OFFSET; $contig_array[$lcv][6] = 1; # set to true $contig_array[$lcv + 1][6] = 1; } # if else { # not a conflict $consecutive_conflicts = 0; } # else } # for message("$mlg conflicts: $conflicts"); # return; # $most_misplaced = &get_most_misplaced; # do the contigs one at a time from the right for(my $lcv = $record_number; $lcv > 0; $lcv--) { $mlg = $contig_array[$lcv][0]; $comparative_displacement = $contig_array[$lcv][1] - $contig_array[$lcv][5]; PUSH: while($comparative_displacement > OFFSET || $contig_array[$lcv][2] >= $right_end_limit) { if($contig_array[$lcv][2] >= $right_end_limit) { $right_end_limit_exceeded = 1; } #if else { $right_end_limit_exceeded = 0; } # else last PUSH if(!&push_left($lcv, 0, $right_end_limit_exceeded)); $comparative_displacement = $contig_array[$lcv][1] - $contig_array[$lcv][5]; } # if } # for # $most_misplaced = &get_most_misplaced; # notate the duplicated areas my $begin = -1; my $end = -1; for(my $lcv = 0; $lcv < $record_number; $lcv++) { if($contig_array[$lcv][6]) { # part of a duplicated area if($begin == -1) { # the left part of a duplicated area $begin = $contig_array[$lcv][1]; $end = $contig_array[$lcv][2]; } # if else { # could be a continuation or a new duplicated area if($contig_array[$lcv - 1][2] + OFFSET + 1 >= $contig_array[$lcv][1]) { # continuation of a duplicated area $end = $contig_array[$lcv][2]; } # if else { # a new duplicated area # record the old one $dup_counter++; my $dup_name = "Dup$dup_counter"; # print DUPS_FILE "$mlg\t$begin\t$end\t$dup_name\t \n"; # reset $begin = $contig_array[$lcv][1]; $end = $contig_array[$lcv][2]; } # else } # else } # if else { # not part of a duplicated area if($begin != -1) { # a duplicated area need to be recorded $dup_counter++; my $dup_name = "Dup$dup_counter"; # print DUPS_FILE "$mlg\t$begin\t$end\t$dup_name\t \n"; # reset $begin = -1; $end = -1; } # if } # else } # for } # process_group() # for spreading contigs sub push_left() { # return FALSE; my $current_contig = shift; my $maximum_displacement = shift; my $right_end_limit_exceeded = shift; my $ok = 0; my $proceed = 0; $proceed = 1 if($right_end_limit_exceeded); # my $current_contig = $_[0]; # my $maximum_displacement = $_[1]; my $current_displacement = 0; $current_displacement = abs($contig_array[$current_contig][1] - # the start $contig_array[$current_contig][5]); # the original start my $intended_displacement = 0; $intended_displacement = abs(($contig_array[$current_contig][1] - OFFSET) - $contig_array[$current_contig][5]); # $current_displacement = 0 if(!defined($current_displacement)); # $intended_displacement = 0 if(!defined($intended_displacement)); # exit if(!$current_displacement); if($intended_displacement < $current_displacement) { $proceed = 1; $maximum_displacement = $current_displacement if($current_displacement > $maximum_displacement); } # if elsif ($current_displacement < $maximum_displacement) { # it is going further away # but is still less than the max $proceed = 1; } # else return $ok if(!$proceed); if($current_contig == 1) { if($contig_array[$current_contig][1] > OFFSET * 2) { # it is the leftmost contig and there is room to move left $ok = 1; } # if } # if elsif($current_contig > 1 && $contig_array[$current_contig][1] > $contig_array[$current_contig - 1][2] + OFFSET) { # there is room to move without bumping into another contig $ok = 1; } # elsif elsif(&push_left($current_contig - 1, $maximum_displacement, $right_end_limit_exceeded)) { # another contig can be bumped to the left $ok = 1; } # elsif if($ok) { $contig_array[$current_contig][1] = sprintf("%08.0f", $contig_array[$current_contig][1] - OFFSET); $contig_array[$current_contig][2] = sprintf("%08.0f", $contig_array[$current_contig][2] - OFFSET); $contig_array[$current_contig][6] = 1; # set to true } # if $ok; } # push_left() # for determining the midpoint of a clone in a ctg sub clone_mid (){ my $anchor = shift; my $clone_start = shift; my $ctg_start = shift; my $clone_length = shift; my $project = shift; my $clone_mid = ($clone_start - $ctg_start) + ($clone_length / 2); $clone_mid = int($clone_mid * $project->{band_factor}); $clone_mid; } # clone_mid # get the displacement for ctgs with multiple valid anchors sub get_displacement() { my $ctg_start = shift; my $ctg_length = shift; my $num_anchors = shift; my $ctg = shift; my $flip_ref = shift; my $flipped = FALSE; # Get the smallest and largest anchor locations my $smallest = $anchor_array[1][1]; my $largest = $anchor_array[1][1]; for(my $lcv = 2; $lcv <= $num_anchors; $lcv++) { $smallest = $anchor_array[$lcv][1] if($anchor_array[$lcv][1] < $smallest); $largest = $anchor_array[$lcv][1] if($anchor_array[$lcv][1] > $largest); } # for # where to start the placement of the ctg my $starting_point = $smallest - $ctg_length; # get the minimum average distance (based on $total) my $first = TRUE; my $total = -1; my $winner = -1; for(my $current_location = $starting_point; $current_location <= $largest; $current_location++) { my $current_total = 0; for(my $lcv2 = 1; $lcv2 <= $num_anchors; $lcv2++) { $current_total += abs(($current_location + $anchor_array[$lcv2][0]) - $anchor_array[$lcv2][1]); } # for if($first) { $total = $current_total; $winner = $current_location; $first = FALSE; } # if else { if($current_total < $total) { $total = $current_total; $winner = $current_location; } # if } # else } # for if($smallest != $largest) { # This contig may be optimized in reverse # See if a reverse sequence would be better # flip the clone_mid values (index 0) for(my $lcv3 = 1; $lcv3 <= $num_anchors; $lcv3++) { $anchor_array[$lcv3][0] = $ctg_length - $anchor_array[$lcv3][0]; } # for # Find the best solution in reverse $flipped = FALSE; for(my $current_location = $starting_point; $current_location <= $largest; $current_location++) { my $current_total = 0; for(my $lcv4 = 1; $lcv4 <= $num_anchors; $lcv4++) { $current_total += abs(($current_location + $anchor_array[$lcv4][0]) - $anchor_array[$lcv4][1]); } # for if($current_total < $total) { $total = $current_total; $winner = $current_location; $flipped = TRUE; } # if } # for if($flipped) { # print "Flipping $ctg"; ${$flip_ref} = 'yes'; } # if } # if $winner; # the starting point of the best location for the ctg } # get_displacement 1