#!/usr/bin/perl

use warnings;
use strict;

use Getopt::Long;

################################################################################
# Arguments - Start
my ($help, $gff_dir, %source, $tax_id);

GetOptions(
	'help' => \$help,
	'gff_dir=s' => \$gff_dir,
	'source=s' => \%source,
	'tax_id=i' => \$tax_id
);

sub usage {
	print qq~
	Converts the tab-delimited knownGene.txt file from
	the UCSC Golden Path into a gff file.
	This script is very similar to ensembl.pl and ucsc.pl.

	Last updated: 10-Apr-08
	Author: James Allen (james.allen\@cimr.cam.ac.uk)

	Please provide
	--gff_dir
	--source
	--tax_id

	[--help]
	~;
	print "\n";
	exit;
}

usage if defined $help;
usage unless $gff_dir && scalar(keys(%source)) && $tax_id;

my $name = $source{"name"};
my $build = $source{"build"};
my $data_file = "$gff_dir/rawdata/$name\_$build/".$source{"data_file"};
my $gff_file = "$gff_dir/$name\_$build.gff";

my $reference_name;
$reference_name = "reference" if $tax_id == 9606 && $build =~ /^NCBI36/;
# Won't pick up the patches that applied between major builds with
# the next line; too fiddly to be worth it.
$reference_name = "GRCh37.p2-Primary" if $tax_id == 9606 && $build =~ /^NCBI37/;
$reference_name = "C57BL/6J" if $tax_id == 10090;
$reference_name = "RGSC_v3.4" if $tax_id == 10116;
# Arguments - End
################################################################################

################################################################################
# Process Data File - Start
my $output;

open(DATA_FILE, "$data_file") || die "Cannot open file $data_file.";
open(GFF_FILE, ">$gff_file") || die "Cannot open file $gff_file.";

# Discard header row;
my $line = <DATA_FILE>;

while (my $line = <DATA_FILE>) {
	# The columns are, in order:
	# tax_id, chr, chr_start, chr_stop, chr_orient, contig, ctg_start, ctg_stop, ctg_orient,
	# feature_name, feature_id (ie "GeneID:XXX"), feature_type (eg UTR, CDS),
	# group_label (eg reference, Celera), transcript (eg NM_, XM_; with version tacked on),
	# evidence_code.

	# Grabbing everything is a bit excessive, but hopefully makes things a bit clearer.
	my ($tax_id, $chr, $chr_start, $chr_stop, $chr_orient, $contig, $ctg_start,
		$ctg_stop, $ctg_orient, $feature_name, $feature_id, $feature_type, $group_label,
		$transcript, $evidence_code) = split(/\t/, $line);

	# Only proceed if we have a UTR or a CDS, for the reference sequence.
	if ($chr !~ /\|/
		&& ($feature_type eq "UTR" || $feature_type eq "CDS")
		&& $group_label =~ /^$reference_name/
		&& $transcript =~ /^[NX][MR]/)
	{

		# We want to lop off the trailing version number from the transcript id.
		$transcript =~ s/\.[\d]+//;

		# Other gff columns are constant values.
		my $score = ".";
		my $frame = ".";

		$feature_id =~ s/GeneID://;
		$feature_id =~ s/LocusID://;
		my $attributes = ";Build $build;EntrezGene $feature_id";

		# Each UTR/CDS is a feature, and the hierarchical grouping of gff ties them together.
		$output .= "chr$chr\t$name\t$feature_type\t$chr_start\t$chr_stop\t";
		$output .= "$score\t$chr_orient\t$frame\tNative_id $transcript$attributes\n";
	}
}
print GFF_FILE $output;

close(DATA_FILE);
close(GFF_FILE);
# Process Data File - End
################################################################################