#!/usr/bin/perl

use warnings;
use strict;

use Getopt::Long;

################################################################################
# Arguments - Start
my ($help, $gff_dir, %source, $tax_id);

GetOptions(
	'help' => \$help,
	'gff_dir=s' => \$gff_dir,
	'source=s' => \%source,
	'tax_id:i' => \$tax_id
);

sub usage {
	print qq~
	Converts the tab-delimited knownGene.txt file from
	the UCSC Golden Path into a gff file.
	This script is (almost) exactly the same as ensembl.pl.

	Last updated: 09-Apr-08
	Author: James Allen (james.allen\@cimr.cam.ac.uk)

	Please provide
	--gff_dir
	--source
	[--tax_id]

	[--help]
	~;
	print "\n";
	exit;
}

usage if defined $help;
usage unless $gff_dir && scalar(keys(%source));

my $script_dir = $0;
$script_dir =~ s!/[^/]+$!!;

my $name = $source{"name"};
my $build = $source{"build"};
my $data_file = "$gff_dir/rawdata/$name\_$build/".$source{"data_file"};
my $info_file = "$gff_dir/rawdata/$name\_$build/".$source{"info_file"};
my $refseq_file = "$gff_dir/RefSeq_$build.gff";
my $gff_file = "$gff_dir/$name\_$build.gff";
# Arguments - End
################################################################################

################################################################################
# Resolve Ambiguities in Info File - Start
my $script = "$script_dir/resolve_id_ambiguities.pl";
my $params = "--name='$name'".
			" --data_file='$data_file'".
			" --info_file='$info_file'".
			" --refseq_file='$refseq_file'".
			" --source='ucsc'";

if (-e $script) {
	my $error = system("perl $script $params");
	if ($error) {
		warn "Error executing $script";
		exit 1;
	}
} else {
	warn "Script $script does not exist";
	exit 1;
}
# Resolve Ambiguities in Info File - End
################################################################################

################################################################################
# Process Info File - Start
my %gene_ids;

open(INFO_FILE, "$info_file") || die "Cannot open file $info_file.";

# The first line has column names, so we throw that away, once
# we've worked out the order of the columns.
my $line = <INFO_FILE>;
my $native_id_first = 1;
if ($line =~ /^EntrezGene ID/) {
	$native_id_first = 0;
}

while (my $line = <INFO_FILE>) {
	$line =~ /^([^\t]*)\t([^\t^\n]*)$/;
	if ($1 and $2) {
		# We use a hash of hashes so that we get unique values.
		if ($native_id_first) {
			$gene_ids{$1}{$2} = 1;
		} else {
			$gene_ids{$2}{$1} = 1;
		}
	}
}

close(INFO_FILE);
# Process Info File - End
################################################################################

################################################################################
# Process Data File - Start
my $output;
my %native_ids;

open(DATA_FILE, "$data_file") || die "Cannot open file $data_file.";
open(GFF_FILE, ">$gff_file") || die "Cannot open file $gff_file.";
while (my $line = <DATA_FILE>) {
	# Throughout the file, start values are minus-oned,
	# presumably to make length calculataions easier.
	my ($native_id, $chromosome, $strand,
		$transcript_start, $transcript_end,
		$cds_start, $cds_end,
		$exon_count, $exon_starts, $exon_ends) = split(/\t/, $line);

	$native_id =~ s/\.\d+//;
	$cds_start = $cds_start + 1;
	chomp $exon_ends;
	my @exon_starts = sort split(",", $exon_starts);
	my @exon_ends = sort split(",", $exon_ends);

	my $score = ".";
	my $frame = ".";

	my $attributes = "";
	foreach my $gene_id (keys %{$gene_ids{$native_id}}) {
		$attributes .= ";EntrezGene $gene_id";
	}

	# Check if this is one of several positional variants; if it's the second
	# variant we've seen, then we need to go back and edit the first as well...
	if (exists $native_ids{$native_id}) {
		if ($native_ids{$native_id} == 1) {
			$output =~ s/Native_id $native_id;Build $build$attributes/Native_id $native_id;Build $build$attributes;PositionVariant 1/g;
		}
		$native_ids{$native_id}++;
		$attributes .= ";PositionVariant ".$native_ids{$native_id};
	} else {
		$native_ids{$native_id} = 1;
	}

	# Now it gets a little hairy, so hold on to your hat; using the 'implied_utrs'
	# option of the 'processed_transcript' glyph is a PITA, so we explicitly create
	# UTRs. As with the CDS regions, however, these may span multiple exons, and/or
	# terminate within an exon, so we need a fairly complicated bunch of logic to
	# sort all this out.
	for (my $i=0; $i < scalar(@exon_starts); $i++) {
		# Take account of start points being minus-oned.
		my $exon_start = $exon_starts[$i] + 1;
		my $exon_end = $exon_ends[$i];

		# If the exon end is less than the CDS start, the whole exon must be UTR.
		# Similarly, if the exon start is greater than the CDS end, the whole exon
		# must be UTR.
		if (($exon_end < $cds_start) or ($exon_start > $cds_end)) {
			$output .= "$chromosome\t$name\tUTR\t$exon_start\t$exon_end\t";
			$output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n";
		# If an exon is entirely contained within the CDS bounds, it must be CDS.
		} elsif (($exon_start >= $cds_start) and ($exon_end <= $cds_end)) {
			$output .= "$chromosome\t$name\tCDS\t$exon_start\t$exon_end\t";
			$output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n";
		# If a CDS is entirely contained within the exon bounds, then we have
		# a UTR-CDS-UTR patterned exon.
		} elsif (($exon_start < $cds_start) and ($exon_end > $cds_end)) {
			$output .= "$chromosome\t$name\tUTR\t$exon_start\t".($cds_start-1)."\t";
			$output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n";
			$output .= "$chromosome\t$name\tCDS\t$cds_start\t$cds_end\t";
			$output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n";
			$output .= "$chromosome\t$name\tUTR\t".($cds_end+1)."\t$exon_end\t";
			$output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n";
		# The only thing left to deal with now is when an exon is part-UTR and
		# part-CDS; first up is the case where the 'left' section of the exon is UTR...
		} elsif (($exon_start < $cds_start) and ($exon_end >= $cds_start)) {
			$output .= "$chromosome\t$name\tUTR\t$exon_start\t".($cds_start-1)."\t";
			$output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n";
			$output .= "$chromosome\t$name\tCDS\t$cds_start\t$exon_end\t";
			$output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n";
		# ...then the case where the 'right' section of the exon is UTR.
		} elsif (($exon_start <= $cds_end) and ($exon_end > $cds_end)) {
			$output .= "$chromosome\t$name\tCDS\t$exon_start\t$cds_end\t";
			$output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n";
			$output .= "$chromosome\t$name\tUTR\t".($cds_end+1)."\t$exon_end\t";
			$output .= "$score\t$strand\t$frame\tNative_id $native_id;Build $build$attributes\n";
		# If we get here then we're in trouble, since we should have exhausted
		# all of the possibilities...
		} else {
			die "oops-a-daisy, $native_id is causing consternation.";
		}
	}
}
print GFF_FILE $output;

close(DATA_FILE);
close(GFF_FILE);
# Process Data File - End
################################################################################