#!/usr/bin/perl use warnings; use strict; use Getopt::Long; ################################################################################ # Arguments - Start my ($help, $gff_dir, %source, $tax_id); GetOptions( 'help' => \$help, 'gff_dir=s' => \$gff_dir, 'source=s' => \%source, 'tax_id=i' => \$tax_id ); sub usage { print qq~ Converts the tab-delimited knownGene.txt file from the UCSC Golden Path into a gff file. This script is very similar to ensembl.pl and ucsc.pl. Last updated: 10-Apr-08 Author: James Allen (james.allen\@cimr.cam.ac.uk) Please provide --gff_dir --source --tax_id [--help] ~; print "\n"; exit; } usage if defined $help; usage unless $gff_dir && scalar(keys(%source)) && $tax_id; my $name = $source{"name"}; my $build = $source{"build"}; my $data_file = "$gff_dir/rawdata/$name\_$build/".$source{"data_file"}; my $gff_file = "$gff_dir/$name\_$build.gff"; my $reference_name; $reference_name = "reference" if $tax_id == 9606 && $build =~ /^NCBI36/; # Won't pick up the patches that applied between major builds with # the next line; too fiddly to be worth it. $reference_name = "GRCh37.p2-Primary" if $tax_id == 9606 && $build =~ /^NCBI37/; $reference_name = "C57BL/6J" if $tax_id == 10090; $reference_name = "RGSC_v3.4" if $tax_id == 10116; # Arguments - End ################################################################################ ################################################################################ # Process Data File - Start my $output; open(DATA_FILE, "$data_file") || die "Cannot open file $data_file."; open(GFF_FILE, ">$gff_file") || die "Cannot open file $gff_file."; # Discard header row; my $line = ; while (my $line = ) { # The columns are, in order: # tax_id, chr, chr_start, chr_stop, chr_orient, contig, ctg_start, ctg_stop, ctg_orient, # feature_name, feature_id (ie "GeneID:XXX"), feature_type (eg UTR, CDS), # group_label (eg reference, Celera), transcript (eg NM_, XM_; with version tacked on), # evidence_code. # Grabbing everything is a bit excessive, but hopefully makes things a bit clearer. my ($tax_id, $chr, $chr_start, $chr_stop, $chr_orient, $contig, $ctg_start, $ctg_stop, $ctg_orient, $feature_name, $feature_id, $feature_type, $group_label, $transcript, $evidence_code) = split(/\t/, $line); # Only proceed if we have a UTR or a CDS, for the reference sequence. if ($chr !~ /\|/ && ($feature_type eq "UTR" || $feature_type eq "CDS") && $group_label =~ /^$reference_name/ && $transcript =~ /^[NX][MR]/) { # We want to lop off the trailing version number from the transcript id. $transcript =~ s/\.[\d]+//; # Other gff columns are constant values. my $score = "."; my $frame = "."; $feature_id =~ s/GeneID://; $feature_id =~ s/LocusID://; my $attributes = ";Build $build;EntrezGene $feature_id"; # Each UTR/CDS is a feature, and the hierarchical grouping of gff ties them together. $output .= "chr$chr\t$name\t$feature_type\t$chr_start\t$chr_stop\t"; $output .= "$score\t$chr_orient\t$frame\tNative_id $transcript$attributes\n"; } } print GFF_FILE $output; close(DATA_FILE); close(GFF_FILE); # Process Data File - End ################################################################################