#!/usr/bin/perl use warnings; use strict; use Getopt::Long; ################################################################################ # Arguments - Start my ($help, $gff_dir, %source, $tax_id); GetOptions( 'help' => \$help, 'gff_dir=s' => \$gff_dir, 'source=s' => \%source, 'tax_id:i' => \$tax_id ); sub usage { print qq~ Converts the tab-delimited knownGene.txt file from the UCSC Golden Path into a gff file. This script is (almost) exactly the same as ensembl.pl. Last updated: 09-Apr-08 Author: James Allen (james.allen\@cimr.cam.ac.uk) Please provide --gff_dir --source [--tax_id] [--help] ~; print "\n"; exit; } usage if defined $help; usage unless $gff_dir && scalar(keys(%source)); my $script_dir = $0; $script_dir =~ s!/[^/]+$!!; my $name = $source{"name"}; my $build = $source{"build"}; my $data_file = "$gff_dir/rawdata/$name\_$build/".$source{"data_file"}; my $info_file = "$gff_dir/rawdata/$name\_$build/".$source{"info_file"}; my $refseq_file = "$gff_dir/RefSeq_$build.gff"; my $gff_file = "$gff_dir/$name\_$build.gff"; # Arguments - End ################################################################################ ################################################################################ # Resolve Ambiguities in Info File - Start my $script = "$script_dir/resolve_id_ambiguities.pl"; my $params = "--name='$name'". " --data_file='$data_file'". " --info_file='$info_file'". " --refseq_file='$refseq_file'". " --source='ucsc'"; if (-e $script) { my $error = system("perl $script $params"); if ($error) { warn "Error executing $script"; exit 1; } } else { warn "Script $script does not exist"; exit 1; } # Resolve Ambiguities in Info File - End ################################################################################ ################################################################################ # Process Info File - Start my %gene_ids; open(INFO_FILE, "$info_file") || die "Cannot open file $info_file."; # The first line has column names, so we throw that away, once # we've worked out the order of the columns. my $line = ; my $native_id_first = 1; if ($line =~ /^EntrezGene ID/) { $native_id_first = 0; } while (my $line = ) { $line =~ /^([^\t]*)\t([^\t^\n]*)$/; if ($1 and $2) { # We use a hash of hashes so that we get unique values. if ($native_id_first) { $gene_ids{$1}{$2} = 1; } else { $gene_ids{$2}{$1} = 1; } } } close(INFO_FILE); # Process Info File - End ################################################################################ ################################################################################ # Process Data File - Start my $output; my %native_ids; open(DATA_FILE, "$data_file") || die "Cannot open file $data_file."; open(GFF_FILE, ">$gff_file") || die "Cannot open file $gff_file."; while (my $line = ) { # Throughout the file, start values are minus-oned, # presumably to make length calculataions easier. my ($native_id, $chromosome, $strand, $transcript_start, $transcript_end, $cds_start, $cds_end, $exon_count, $exon_starts, $exon_ends) = split(/\t/, $line); $native_id =~ s/\.\d+//; $cds_start = $cds_start + 1; chomp $exon_ends; my @exon_starts = sort split(",", $exon_starts); my @exon_ends = sort split(",", $exon_ends); my $score = "."; my $frame = "."; my $attributes = ""; foreach my $gene_id (keys %{$gene_ids{$native_id}}) { $attributes .= ";EntrezGene $gene_id"; } # Check if this is one of several positional variants; if it's the second # variant we've seen, then we need to go back and edit the first as well... if (exists $native_ids{$native_id}) { if ($native_ids{$native_id} == 1) { $output =~ s/Native_id $native_id;Build $build$attributes/Native_id $native_id;Build $build$attributes;PositionVariant 1/g; } $native_ids{$native_id}++; $attributes .= ";PositionVariant ".$native_ids{$native_id}; } else { $native_ids{$native_id} = 1; } # Now it gets a little hairy, so hold on to your hat; using the 'implied_utrs' # option of the 'processed_transcript' glyph is a PITA, so we explicitly create # UTRs. As with the CDS regions, however, these may span multiple exons, and/or # terminate within an exon, so we need a fairly complicated bunch of logic to # sort all this out. for (my $i=0; $i < scalar(@exon_starts); $i++) { # Take account of start points being minus-oned. my $exon_start = $exon_starts[$i] + 1; my $exon_end = $exon_ends[$i]; # If the exon end is less than the CDS start, the whole exon must be UTR. # Similarly, if the exon start is greater than the CDS end, the whole exon # must be UTR. if (($exon_end < $cds_start) or ($exon_start > $cds_end)) { $output .= "$chromosome\t$name\tUTR\t$exon_start\t$exon_end\t"; $output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n"; # If an exon is entirely contained within the CDS bounds, it must be CDS. } elsif (($exon_start >= $cds_start) and ($exon_end <= $cds_end)) { $output .= "$chromosome\t$name\tCDS\t$exon_start\t$exon_end\t"; $output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n"; # If a CDS is entirely contained within the exon bounds, then we have # a UTR-CDS-UTR patterned exon. } elsif (($exon_start < $cds_start) and ($exon_end > $cds_end)) { $output .= "$chromosome\t$name\tUTR\t$exon_start\t".($cds_start-1)."\t"; $output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n"; $output .= "$chromosome\t$name\tCDS\t$cds_start\t$cds_end\t"; $output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n"; $output .= "$chromosome\t$name\tUTR\t".($cds_end+1)."\t$exon_end\t"; $output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n"; # The only thing left to deal with now is when an exon is part-UTR and # part-CDS; first up is the case where the 'left' section of the exon is UTR... } elsif (($exon_start < $cds_start) and ($exon_end >= $cds_start)) { $output .= "$chromosome\t$name\tUTR\t$exon_start\t".($cds_start-1)."\t"; $output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n"; $output .= "$chromosome\t$name\tCDS\t$cds_start\t$exon_end\t"; $output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n"; # ...then the case where the 'right' section of the exon is UTR. } elsif (($exon_start <= $cds_end) and ($exon_end > $cds_end)) { $output .= "$chromosome\t$name\tCDS\t$exon_start\t$cds_end\t"; $output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n"; $output .= "$chromosome\t$name\tUTR\t".($cds_end+1)."\t$exon_end\t"; $output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n"; # If we get here then we're in trouble, since we should have exhausted # all of the possibilities... } else { die "oops-a-daisy, $native_id is causing consternation."; } } } print GFF_FILE $output; close(DATA_FILE); close(GFF_FILE); # Process Data File - End ################################################################################