#!/usr/bin/perl

use warnings;
use strict;

use Getopt::Long;

################################################################################
# Arguments - Start
# Command-line state filled in by GetOptions below:
#   $help    - set when --help is given
#   $gff_dir - string value of --gff_dir
#   %source  - key=value pairs collected from repeated --source options
#              (this script reads the keys "name", "build" and "data_file")
#   $tax_id  - optional integer from --tax_id; it is parsed here but not
#              read anywhere else in this script
my ($help, $gff_dir, %source, $tax_id);

# GetOptions returns false when it meets an unknown or malformed option (it
# has already warned on STDERR by then). Show the usage text and stop rather
# than carrying on with a half-parsed command line.
GetOptions(
	'help' => \$help,
	'gff_dir=s' => \$gff_dir,
	'source=s' => \%source,
	'tax_id:i' => \$tax_id
) or usage();

# Print the help text for this script on STDOUT and terminate the program.
# Takes no arguments and never returns to the caller; the exit status is 0.
sub usage {
	my $help_text = qq~
	Converts a tab-delimited file from the CCDS website into a gff file.

	Last updated: 09-Apr-08
	Author: James Allen (james.allen\@cimr.cam.ac.uk)

	Please provide
	--gff_dir
	--source
	[--tax_id]

	[--help]
	~;
	# The literal stops on an indented line, so terminate it with a newline.
	print "$help_text\n";
	exit 0;
}

# Explicit request for help: print the usage text and exit.
usage() if defined $help;

# Every path below is built from these three --source keys, so refuse to
# continue when any of them is absent or empty instead of interpolating undef
# into a file name.
my @missing_keys = grep { !defined $source{$_} || !length $source{$_} }
	qw(name build data_file);
if (!$gff_dir || @missing_keys) {
	warn "Missing required --source key(s): @missing_keys (use --source key=value)\n"
		if @missing_keys;
	usage();
}

my $name = $source{"name"};
my $build = $source{"build"};
# Input file:  <gff_dir>/rawdata/<name>_<build>/<data_file>
my $data_file = "$gff_dir/rawdata/${name}_$build/".$source{"data_file"};
# Output file: <gff_dir>/<name>_<build>.gff
my $gff_file = "$gff_dir/${name}_$build.gff";
# Arguments - End
################################################################################

################################################################################
# Process Data File - Start
# Convert one line of the data file into GFF lines.
#
# Arguments:
#   $line  - raw line as read from the data file (may still end in a newline)
#   $name  - source name, written to the second GFF column
#   $build - build label, written to the "Build" attribute
#
# Returns a list holding one newline-terminated GFF "CDS" line for each
# start-stop range found in the tenth column. The list is empty for blank
# lines, lines starting with '#', lines with fewer than ten tab-separated
# columns, and records whose status column starts with "Withdrawn".
sub _ccds_line_to_gff {
	my ($line, $name, $build) = @_;

	# Skip blank lines and '#' comment/header lines.
	return if $line !~ /\S/ || $line =~ /^#/;

	# Columns, in order: chromosome, accession, gene, gene id, CCDS id,
	# status, strand, cds_from, cds_to, cds_locations. Only six are needed.
	my @fields = split(/\t/, $line);
	return if @fields < 10;
	my ($chromosome, $gene_id, $ccds_id, $ccds_status, $cds_strand,
		$cds_locations) = @fields[0, 3, 4, 5, 6, 9];

	return if $ccds_status =~ /^Withdrawn/;

	# Remove the first ".<digits>" from the id (presumably its version suffix).
	$ccds_id =~ s/\.\d+//;
	# "[a-b, c-d]\n" becomes "a-b, c-d".
	$cds_locations =~ s/[\[\]\n]//g;

	my @gff_lines;
	foreach my $cds_location (split(/, /, $cds_locations)) {
		# Ignore anything that is not a start-stop pair (e.g. a "-"
		# placeholder) rather than emitting a record with made-up coordinates.
		next unless $cds_location =~ /(\d+)-(\d+)/;

		# Both ends are shifted up by one; presumably the input is 0-based
		# while GFF is 1-based - confirm against the CCDS file format notes.
		my ($start, $stop) = ($1 + 1, $2 + 1);

		push @gff_lines, join("\t",
			"chr$chromosome", $name, "CDS", $start, $stop, ".", $cds_strand,
			".", "Native_id $ccds_id;Build $build;EntrezGene $gene_id") . "\n";
	}
	return @gff_lines;
}

# The configured path may contain shell-style wildcards; glob() expands them
# and only the first match is read. A wildcard that matches nothing yields an
# empty list, which is reported here instead of being passed on to open().
my ($data_path) = glob($data_file);
die "Cannot open file $data_file: no matching file." unless defined $data_path;

# Three-argument open with lexical handles: the mode can no longer be
# influenced by characters in the file name, and $! says why an open failed.
open(my $data_fh, '<', $data_path) or die "Cannot open file $data_file: $!";
open(my $gff_fh, '>', $gff_file) or die "Cannot open file $gff_file: $!";

while (my $line = <$data_fh>) {
	print {$gff_fh} _ccds_line_to_gff($line, $name, $build);
}

close($data_fh);
# Buffered write errors (e.g. a full disk) only surface when the handle closes.
close($gff_fh) or die "Cannot close file $gff_file: $!";
# Process Data File - End
################################################################################