package GDxBase::GeneModel::Specification::Consensus; =head1 NAME GDxBase::GeneModel::Specification::Consensus =head1 SYNOPSIS A class that is used by GDxBase::GeneModel::Collection. =head1 DESCRIPTION A subclass of GDxBase::GeneModel::Specification, to gather features from multiple other gene models, and display them as consensus tracks. This is not a trivial process, but rather than going into detail here, comments are wantonly strewn throughout the code at appropriate moments. Briefly, two transcripts are considered to be equivalent if both sets of subfeatures are within one base of each other, in either direction. The extreme end points of the transcripts are disregarded, to allow for varying lengths of UTR, within certain bounds; these bounds are determined by the length of the gene. Last Revision: 9-Aug-06 =head1 See Also GDxBase::GeneModel::Specification GDxBase::GeneModel::Collection =head1 Author James Allen Email: james.allen@cimr.cam.ac.uk =head1 Copyright Copyright 2006 James Allen, DIL =cut use warnings; use strict; use Carp; use Class::AutoClass; use GDxBase::GeneModel::Specification; our @ISA = ("Class::AutoClass", "GDxBase::GeneModel::Specification"); our (@AUTO_ATTRIBUTES, %DEFAULTS); @AUTO_ATTRIBUTES = qw(); %DEFAULTS = (); Class::AutoClass::declare(__PACKAGE__); ################################################################################ =head1 METHODS =cut =head2 _init_self Title: _init_self Function: Create new GDxBase::GeneModel::Specification::Consensus object Usage: $gm = GDxBase::GeneModel::Specification::Consensus->new($source) Args: source - mandatory, e.g. Consensus Returns: A GDxBase::GeneModel::Specification::Consensus object =cut sub _init_self { my $self = shift; # Nobody here but us chickens. } ################################################################################ =head2 get_features Title: get_features Function: Return a set of features for the gene features stored in a GDxBase::GeneModel::Collection object Usage: ($features, $start, $stop) = GDxBase::GeneModel::Specification::Consensus->get_features($gmc, $config_values) Args: gmc - mandatory, a GDxBase::GeneModel::Collection object config_values - optional, a hash of config-ini settings Returns: An array of Bio::DB::GFF::Feature objects The start and stop values for the gene model as integer values =cut sub get_features { my $self = shift; my ($gmc, $config_values) = @_; my (%existing_features, %matched_transcripts, @consensus_transcripts, $all_consensus_transcripts, %features, %all_features); # The order of the elements in @include is crucial; when displaying a # consensus transcript we have to decide how exactly to display it: we either # pick one of the transcripts, or we merge them all somehow. The latter is a # lot of hassle, and doesn't really do anything for us. So the order determines # which transcript to display as the consensus - if the first element happens # to be part of a consensus, then that will be used; if not, then the second # element will be used (as long as it's in a consensus), and so on. my @include; if ($$config_values{"include"} && $$config_values{"include"} ne "") { @include = @{$$config_values{"include"}}; } else { return(); } foreach my $source (@include) { if (exists ${$gmc->features}{$source}{$gmc->chromosome}) { $existing_features{$source}{$gmc->chromosome} = ${$gmc->features}{$source}{$gmc->chromosome}; } } # We want to compare every transcript of one gene model with every transcript of # the other gene models. This boils down to two features having the same # subfeatures. When we get a match, we store the feature in an array, as a # three-element array, where the first element is the feature itself, the second # element is an array of the names of the gene models, and the third is an array # of the corresponding transcript IDs. # We have the gene models in the order defined by @include, and we compare the features # of each gene model with the gene models that appear after it in the array. When # we find a hit with a gene model, we move on to the next, as no model can # contribute more than one transcript. We need to keep track of whether the # previous iterations have produced a consensus transcript, so that we know # whether to add a matched transcript to our array, or to just add # the new gene model's name/id to those arrays in the second and third # array elements. We effectively merge UTR and CDS sections at either end, then # disregard the extreme start and end points. To prevent false positives, we # only merge if the resulting exon is sufficiently short. We allow a difference # of one base in either direction when deciding whether two subfeatures are the same. # Once we're all done comparing, we add any transcripts which are of high # quality that haven't already been dealt with; eg manually curated transcripts # are very reliable, so a single one can be considered as an honourary consensus # transcript. for (my $i=0; $i < scalar(@include); $i++) { next unless exists $existing_features{$include[$i]}; foreach my $current_feature (sort {$a->display_name cmp $b->display_name} @{$existing_features{$include[$i]}{$gmc->chromosome}}) { # Bail if we've previously taken care of this transcript. next if exists($matched_transcripts{$current_feature->display_name}); my $matched = 0; my %current_feature; my $reset_hash = 1; GENE_MODEL: for (my $j=$i+1; $j < scalar(@include); $j++) { next GENE_MODEL unless exists $existing_features{$include[$j]}; FEATURE: foreach my $feature (sort {$a->display_name cmp $b->display_name} @{$existing_features{$include[$j]}{$gmc->chromosome}}) { # Bail if we've previously taken care of this transcript. next if exists($matched_transcripts{$feature->display_name}); # Create a hash with the start and end points of the subfeatures, # with all values set to zero. When we go through the subfeatures # we'll increment the values, one by one, bailing if we have a # start/stop value that isn't a key; if we get all of the way through, # then we check that we have all 1s as values; if not, that ain't # a complete match. We potentially need to reset this hash for # each feature, otherwise several features could cumulatively match; # the problem is, resetting is time-consuming and we won't need to do # it very often, so we use a flag. my @current_sub_features; if (ref($current_feature) eq "Bio::SeqFeature::Generic") { @current_sub_features = $current_feature->get_SeqFeatures; } elsif (ref($feature) eq "Bio::SeqFeature::Transcript") { @current_sub_features = $current_feature->features; } else { warn "Unrecognised feature type ", ref($feature), "\n"; } if ($reset_hash) { my $previous_start = 0; my $previous_end = 0; foreach my $current_sub_feature (sort {$a->start <=> $b->start} @current_sub_features) { # We don't rely on the 'strand' attribute, since we have # heterogeneous features which use it in different ways. my ($current_sub_feature_start, $current_sub_feature_end); if ($current_sub_feature->start < $current_sub_feature->end) { $current_sub_feature_start = $current_sub_feature->start; $current_sub_feature_end = $current_sub_feature->end; } else { # On the other strand, in opposite-land, where # crooks chase cops and cats have puppies... $current_sub_feature_start = $current_sub_feature->end; $current_sub_feature_end = $current_sub_feature->start; } # We have a mixture of data sources; some are exons, # some are UTRs and CDS, some are just CDS. To get a # level playing field we merge adjacent subfeatures, # effectively turning UTR/CDS regions into exons, so # that we can then compare apples to apples. Of course # it would be be nicer to split the exons up, but # the data isn't necessarily there for us to do that. # However, we need to be careful with transcripts which # have a small number of (large) exons - since we're # disregarding the extreme start point we could easily # make an erroneous match. To counter this we only # merge UTR and CDS when the length of the resulting # segment is less than an arbitrary figure. This figure # was chosen, partly by logic and partly by trial-and-error # to be 5% of the gene length. Doing this means that # we may miss a few transcripts that should be included # in the consensus, but ensures that the number of false # positives that we get is very small, if there are any at all. if ($current_sub_feature_start == $previous_end+1 && ($current_sub_feature_end - $previous_start) < (($gmc->gene_stop - $gmc->gene_start)/20) ) { delete $current_feature{"end"}{$previous_end}; $current_feature{"end"}{$current_sub_feature_end} = 0; } else { $current_feature{"start"}{$current_sub_feature_start} = 0; $current_feature{"end"}{$current_sub_feature_end} = 0; } $previous_start = $current_sub_feature_start; $previous_end = $current_sub_feature_end; } # We want to be a bit liberal about the start and end points, # since UTRs might be off, when what we're really interested # in are CDS regions. So we remove the outermost points, unless # they are the only points (which means we have a single-exon CDS). my @starts = sort keys(%{$current_feature{"start"}}); if (scalar(@starts) > 1) { delete $current_feature{"start"}{$starts[0]}; } my @ends = sort keys(%{$current_feature{"end"}}); if (scalar(@ends) > 1) { delete $current_feature{"end"}{$ends[-1]}; } $reset_hash = 0; } my %feature; my @sub_features; if (ref($feature) eq "Bio::SeqFeature::Generic") { @sub_features = $feature->get_SeqFeatures; } elsif (ref($feature) eq "Bio::SeqFeature::Transcript") { @sub_features = $feature->features; } else { warn "Unrecognised feature type ", ref($feature), "\n"; } my $previous_start = 0; my $previous_end = 0; foreach my $sub_feature (sort {$a->start <=> $b->start} @sub_features) { my ($sub_feature_start, $sub_feature_end); if ($sub_feature->start < $sub_feature->end) { $sub_feature_start = $sub_feature->start; $sub_feature_end = $sub_feature->end; } else { # On the other strand... $sub_feature_start = $sub_feature->end; $sub_feature_end = $sub_feature->start; } # Remember all that messing around we had to do earlier, # merging UTR/CDS regions, so that we could mix our metaphors? # Yep, we've gotta take account of that here as well. if ($sub_feature_start == $previous_end+1 && ($sub_feature_end - $previous_start) < (($gmc->gene_stop - $gmc->gene_start)/20) ) { delete $feature{"end"}{$previous_end}; $feature{"end"}{$sub_feature_end} = 0; } else { $feature{"start"}{$sub_feature_start} = 0; $feature{"end"}{$sub_feature_end} = 0; } $previous_start = $sub_feature_start; $previous_end = $sub_feature_end; } my @starts = sort keys(%{$feature{"start"}}); if (scalar(@starts) > 1) { delete $feature{"start"}{$starts[0]}; } my @ends = sort keys(%{$feature{"end"}}); if (scalar(@ends) > 1) { delete $feature{"end"}{$ends[-1]}; } # The two hashes we've now got, %current_feature and %feature, # should have the same keys, give or take a base - to save # wasted effort we first check if they're the same size; if not, # then we've got no hope of matching them. if (scalar(keys(%current_feature)) != scalar(keys(%feature))) { next FEATURE; } foreach my $sub_feature_start (keys %{$feature{"start"}}) { # We don't mind if we're one base out... if (exists($current_feature{"start"}{$sub_feature_start})) { $current_feature{"start"}{$sub_feature_start} = 1; $reset_hash = 1; } elsif (exists($current_feature{"start"}{($sub_feature_start)+1})) { $current_feature{"start"}{($sub_feature_start)+1} = 1; $reset_hash = 1; } elsif (exists($current_feature{"start"}{($sub_feature_start)-1})) { $current_feature{"start"}{($sub_feature_start)-1} = 1; $reset_hash = 1; } else { # As soon as we've got one miss, we bail - our criteria # for similarity permits no dissimilarity between subfeatures, # other than UTR differences which have already been taken # into account. next FEATURE; } } foreach my $sub_feature_end (keys %{$feature{"end"}}) { if (exists($current_feature{"end"}{$sub_feature_end})) { $current_feature{"end"}{$sub_feature_end} = 1; $reset_hash = 1; } elsif (exists($current_feature{"end"}{($sub_feature_end)+1})) { $current_feature{"end"}{($sub_feature_end)+1} = 1; $reset_hash = 1; } elsif (exists($current_feature{"end"}{($sub_feature_end)-1})) { $current_feature{"end"}{($sub_feature_end)-1} = 1; $reset_hash = 1; } else { next FEATURE; } } @starts = sort values(%{$current_feature{"start"}}); @ends = sort values(%{$current_feature{"end"}}); if ($starts[0] and $ends[0]) { # Beezer, we got a match! if ($matched) { # Add name of gene model to existing list; the last # two elements of the array are the ones we want, and # those elements are arrays themselves, of course... push @{$consensus_transcripts[-1][-2]}, $include[$j]; push @{$consensus_transcripts[-1][-1]}, $feature->display_name; } else { # First match so far. push @consensus_transcripts, [ $current_feature, [$include[$i], $include[$j]], [$current_feature->display_name, $feature->display_name] ]; $matched = 1; } # To prevent counting this matched transcript again, we # have to ignore it - removing elements from an array within # an iteration is gonna cause trouble, so instead we just # keep a record, which we can check against later. $matched_transcripts{$current_feature->display_name} = 1; $matched_transcripts{$feature->display_name} = 1; # No other features from this gene model can match now, so # we bail to save time. next GENE_MODEL; } } } } } # Some transcripts are considered to be a consensus on their own, due # to the high quality of the source data. my @high_quality; if ($$config_values{"high_quality"} && $$config_values{"high_quality"} ne "") { if (ref($$config_values{"high_quality"}) eq 'ARRAY') { @high_quality = @{$$config_values{"high_quality"}}; } else { @high_quality = ($$config_values{"high_quality"}); } foreach my $gene_model (@high_quality) { next unless exists $existing_features{$gene_model}{$gmc->chromosome}; foreach my $feature (@{$existing_features{$gene_model}{$gmc->chromosome}}) { # Bail if we've previously taken care of this transcript. next if exists($matched_transcripts{$feature->display_name}); push @consensus_transcripts, [ $feature, ["single $gene_model transcript"], [$feature->display_name] ]; } } } # We render each feature as a regular processed transcript, with shades-of-blue # colour-coding, and a mouseover text which indicates the gene models which have # contributed to that consensus transcript. if (scalar(@consensus_transcripts)) { foreach my $consensus_transcript (sort {scalar(@{$$b[1]}) <=> scalar(@{$$a[1]}) || join("", @{$$a[1]}) cmp join("", @{$$b[1]}) || join("", @{$$a[2]}) cmp join("", @{$$b[2]}) } @consensus_transcripts) { my $bgcolor; # We go from light to dark blue. if (scalar(@{$$consensus_transcript[1]}) == 1) { $bgcolor = '#00CCFF'; } elsif (scalar(@{$$consensus_transcript[1]}) == 2) { $bgcolor = '#0077FF'; } elsif (scalar(@{$$consensus_transcript[1]}) == 3) { $bgcolor = '#0044FF'; } elsif (scalar(@{$$consensus_transcript[1]}) == 4) { $bgcolor = '#0033BB'; } else { $bgcolor = '#0000AA'; } # Create a new feature, and add the sub features to it anew. We # enter the number of contributing transcripts as the score, so # that it's easy to sort the consensus transcripts in the panel. my $consensus_feature = Bio::SeqFeature::Generic->new( -display_name => "Consensus from ".join(", ", @{$$consensus_transcript[1]}), -score => scalar(@{$$consensus_transcript[1]}), -tag => { desc => " (".join(", ", @{$$consensus_transcript[2]}).")" } ); my @sub_features; if (ref($$consensus_transcript[0]) eq "Bio::SeqFeature::Generic") { @sub_features = $$consensus_transcript[0]->get_SeqFeatures; } elsif (ref($$consensus_transcript[0]) eq "Bio::SeqFeature::Transcript") { @sub_features = $$consensus_transcript[0]->features; } else { warn "Unrecognised feature type ", ref($$consensus_transcript[0]), "\n"; } foreach my $sub_feature (@sub_features) { my $consensus_sub_feature = Bio::SeqFeature::Generic->new( -start => $sub_feature->start, -end => $sub_feature->end, -strand => $sub_feature->strand, -tag => { bgcolor => $bgcolor }); $consensus_feature->add_SeqFeature($consensus_sub_feature, "EXPAND"); } push @{$all_features{$gmc->chromosome}}, $consensus_feature; } } return (\%all_features, undef, undef); } ################################################################################ 1;