package GDxBase::GeneModel::Specification::GFF;

=head1 NAME

GDxBase::GeneModel::Specification::GFF

=head1 SYNOPSIS

A class that is used by GDxBase::GeneModel::Collection.

=head1 DESCRIPTION

A subclass of GDxBase::GeneModel::Specification, for the manipulation
and display of GFF data.

Last Revision: 9-Aug-06

=head1 See Also

GDxBase::GeneModel::Specification
GDxBase::GeneModel::Collection

=head1 Author

James Allen
Email: james.allen@cimr.cam.ac.uk

=head1 Copyright

Copyright 2006 James Allen, DIL

=cut

use warnings;
use strict;
use Carp;
use Data::Dumper;
use Scalar::Util qw(reftype);
use Bio::DB::GFF;
use Bio::SeqFeature::Generic;
use Class::AutoClass;
use GDxBase::GeneModel::Specification;

our @ISA = ("Class::AutoClass", "GDxBase::GeneModel::Specification");
our (@AUTO_ATTRIBUTES, %DEFAULTS);
@AUTO_ATTRIBUTES = qw(types);
%DEFAULTS = (types => ["CDS", "UTR"]);
Class::AutoClass::declare(__PACKAGE__);

# How far (in bases) a "chr"-referenced feature may lie outside the gene
# collection's start/stop before it is considered unrelated and dropped.
my $LOCATION_TOLERANCE = 500_000;

# Tags copied from the first sub-feature that carries them up to the
# per-transcript parent feature.
my @INHERITED_TAGS = qw(desc bgcolor fgcolor link);

################################################################################

=head1 METHODS

=cut

=head2 _init_self

 Title:    _init_self
 Function: Create new GDxBase::GeneModel::Specification::GFF object
 Usage:    $gm = GDxBase::GeneModel::Specification::GFF->new($source, [$types])
 Args:     source - mandatory, e.g. EnsEMBL, Vega, CCDS
           types  - mandatory, arrayref, ['CDS', 'UTR'] by default
 Returns:  A GDxBase::GeneModel::Specification::GFF object
 Throws:   Confesses if 'types' is missing or is not an array reference

=cut

sub _init_self {
    my $self = shift;

    # get_features() dereferences 'types' as an array, so reject anything
    # else here, where the error message is meaningful to the caller.
    my $types = $self->types;
    confess "An array of types (e.g. ['CDS', 'UTR']) must be supplied..."
        unless $types && (reftype($types) || '') eq 'ARRAY';
}

################################################################################

=head2 get_features

 Title:    get_features
 Function: Return a set of features for the gene (and possibly build)
           specified in a GDxBase::GeneModel::Collection object
 Usage:    ($features, $start, $stop) =
             $gff_spec->get_features($gmc, $config_values)
 Args:     gmc           - mandatory, a GDxBase::GeneModel::Collection object
           config_values - hashref of config-ini settings. Keys read here:
                           dsn (or host and db_name), db_user, db_pass,
                           aggregator, gene_id_att_name, limit_att_name,
                           limit_att_value, desc_att_name, alt_att_name,
                           alt_att_value, alt_bgcolor, alt_fgcolor,
                           att_to_keep (colon-separated attribute names),
                           link_att_name, link_name_find, link_name_replace
 Returns:  A three element list:
           - a hashref keyed by reference sequence name; each value is an
             arrayref of Bio::SeqFeature::Generic objects, one per
             transcript (group), holding that transcript's sub-features
           - the lowest start of the features kept
           - the highest end of the features kept
           When nothing matches, ({}, undef, undef) is returned.

=cut

sub get_features {
    my $self = shift;
    my ($gmc, $config_values) = @_;

    # Tolerate a missing settings hash; every lookup below then sees undef.
    $config_values ||= {};

    my $source = $self->source;
    my $db     = _gff_connect($config_values);

    my @extended_types = map { $_ . ":" . $source } @{ $self->types };

    my $attributes = { $$config_values{"gene_id_att_name"} => $gmc->gene_id };
    if (exists $$config_values{"limit_att_name"}) {
        $$attributes{ $$config_values{"limit_att_name"} } =
            $$config_values{"limit_att_value"};
    }

    # The features returned are Bio::DB::GFF::Feature objects.
    my @feature_set = $db->features(
        -types      => \@extended_types,
        -attributes => $attributes
    );

    # If we have no data, then bail out.
    return ({}, undef, undef) unless @feature_set;

    my $ignore_build = $gmc->ignore_build;
    my $wanted_build = $ignore_build ? undef : $gmc->build;

    # We might have more than one group in our dataset, and if so, we want
    # each on a separate track; %transcripts maps
    # reference sequence => transcript name => [display features].
    my (%transcripts, $model_start, $model_stop);

    FEATURE:
    foreach my $feature (@feature_set) {
        # When the build is ignored we still need one pass through the loop
        # below, so a single dummy value stands in for the build list.
        my @builds = $ignore_build ? ("ignore") : $feature->attributes('Build');

        foreach my $build (@builds) {
            next unless $ignore_build || $build eq $wanted_build;

            # If the location of the feature is wildly different from the
            # gene, then don't show it. The answer does not depend on the
            # build, so skip the whole feature.
            next FEATURE unless _gff_near_gene($feature, $gmc);

            my ($new_feature, $transcript_name) =
                _gff_display_feature($feature, $config_values);
            my $fref = $feature->sourceseq;

            push @{ $transcripts{$fref}{$transcript_name} }, $new_feature;

            # Track the overall extent; test definedness so that a
            # coordinate of 0 is not mistaken for "not yet set".
            if (!defined $model_start || $feature->start < $model_start) {
                $model_start = $feature->start;
            }
            if (!defined $model_stop || $feature->end > $model_stop) {
                $model_stop = $feature->end;
            }
        }
    }

    # If we have no data, then bail out.
    return ({}, undef, undef) unless keys %transcripts;

    my %all_features;
    foreach my $fref (keys %transcripts) {
        # Sorted so that the track order is stable from run to run.
        foreach my $transcript (sort keys %{ $transcripts{$fref} }) {
            push @{ $all_features{$fref} },
                _gff_transcript_feature($transcript, $source,
                                        $transcripts{$fref}{$transcript});
        }
    }

    return (\%all_features, $model_start, $model_stop);
}

################################################################################
# Private helpers. These are plain functions (not methods) with a distinctive
# prefix so they cannot shadow anything inherited from the parent classes.
################################################################################

# Open the Bio::DB::GFF database described by the config settings and switch
# it to absolute coordinates.
sub _gff_connect {
    my ($config_values) = @_;

    my $dsn = exists $$config_values{"dsn"}
        ? $$config_values{"dsn"}
        : "driver=mysql;host=" . $$config_values{"host"}
          . ";dbname=" . $$config_values{"db_name"};

    my $db = Bio::DB::GFF->new(
        -dsn        => $dsn,
        -user       => $$config_values{"db_user"},
        -pass       => $$config_values{"db_pass"},
        -aggregator => $$config_values{"aggregator"}
    );
    $db->absolute(1);

    return $db;
}

# True when the feature should be kept: either its reference does not start
# with "chr" (no check is made), or it is on the collection's chromosome and
# within $LOCATION_TOLERANCE bases of the collection's start and stop.
sub _gff_near_gene {
    my ($feature, $gmc) = @_;

    return 1 unless $feature->ref =~ /^chr/;

    return $feature->ref eq $gmc->chromosome
        && ($feature->start - $gmc->collection_start) > -$LOCATION_TOLERANCE
        && ($gmc->collection_stop - $feature->stop)   > -$LOCATION_TOLERANCE;
}

# Add the configured "alternate" background/foreground colours to a feature.
sub _gff_add_alt_colours {
    my ($new_feature, $config_values) = @_;

    $new_feature->add_tag_value('bgcolor', $$config_values{"alt_bgcolor"});
    $new_feature->add_tag_value('fgcolor', $$config_values{"alt_fgcolor"});
    return;
}

# Copy one Bio::DB::GFF::Feature into a fresh Bio::SeqFeature::Generic, add
# the display tags requested by the config, and return
# ($new_feature, $transcript_name).
sub _gff_display_feature {
    my ($feature, $config_values) = @_;

    # The GFF module returns subfeatures as Bio::DB::GFF::Features, which is
    # no good for us, as later we might want to add tags which control the
    # display of the data, so we create a new feature.
    my $new_feature = Bio::SeqFeature::Generic->new(
        -start       => $feature->start,
        -end         => $feature->stop,
        -strand      => $feature->strand,
        -score       => $feature->score,
        -primary_tag => $feature->method
    );

    my $transcript_name = $feature->group;

    my $variant = $feature->attributes('PositionVariant');
    if ($variant) {
        $transcript_name .= " (Variant " . $variant . ")";
    }

    if (exists $$config_values{"desc_att_name"}) {
        my $desc = $feature->attributes($$config_values{"desc_att_name"});
        $new_feature->add_tag_value('desc', $desc) if $desc;
    }

    if (exists $$config_values{"alt_att_name"}) {
        my $alt_value = $feature->attributes($$config_values{"alt_att_name"});

        if (defined $alt_value) {
            # The feature carries the attribute: colour it when the value
            # matches the configured one.
            if ($alt_value eq $$config_values{"alt_att_value"}) {
                _gff_add_alt_colours($new_feature, $config_values);
            }
        }
        elsif ($$config_values{"alt_att_name"} eq 'alternate') {
            # No such attribute: fall back to colouring every even-numbered
            # transcript, the number being whatever follows alt_att_value
            # in the transcript name.
            my $prefix = $$config_values{"alt_att_value"};
            if ($transcript_name =~ /${prefix}(\d+)/) {
                # NOTE(review): the transcript name is replaced by the bare
                # number here, so the track is labelled with the number
                # only - confirm that this is intended.
                $transcript_name = $1;
                if ($transcript_name % 2 == 0) {
                    _gff_add_alt_colours($new_feature, $config_values);
                }
            }
        }
    }

    if (exists $$config_values{"att_to_keep"}) {
        foreach my $att (split(':', $$config_values{"att_to_keep"})) {
            if (defined $feature->attributes($att)) {
                # List context: every value of the attribute is carried over.
                $new_feature->add_tag_value($att, $feature->attributes($att));
            }
        }
    }

    if (exists $$config_values{"link_att_name"}) {
        my $link = $feature->attributes($$config_values{"link_att_name"});
        $new_feature->add_tag_value('link', $link) if $link;
    }

    if (exists $$config_values{"link_name_find"}) {
        my $find    = $$config_values{"link_name_find"};
        my $replace = $$config_values{"link_name_replace"};
        my $link    = $feature->name;
        $link =~ s/$find/$replace/;
        $new_feature->add_tag_value('link', $link) if $link;
    }

    return ($new_feature, $transcript_name);
}

# Wrap the display features of one transcript in a parent feature, which
# takes each inherited tag from the first sub-feature that has it.
sub _gff_transcript_feature {
    my ($transcript, $source, $sub_features) = @_;

    my $parent = Bio::SeqFeature::Generic->new(
        -display_name => $transcript,
        -source_tag   => $source
    );

    foreach my $feature (@{$sub_features}) {
        $parent->add_SeqFeature($feature, "EXPAND");

        foreach my $tag (@INHERITED_TAGS) {
            next unless $feature->has_tag($tag) && !$parent->has_tag($tag);
            $parent->add_tag_value($tag,
                                   join("", $feature->get_tag_values($tag)));
        }
    }

    return $parent;
}

################################################################################

1;