#!/usr/bin/perl
# generate_gene_models.pl
#
# Download gene model data for human, mouse and rat from several public
# sources (RefSeq, CCDS, Ensembl, UCSC, Vega), convert each source to GFF via
# a per-source helper script, bulk-load the GFF into a MySQL database named
# "gene_models_<version>", then build summary / ignore / undecided meta data
# recording how consistent the different sources are for each gene.
#
# NOTE(review): this file was recovered from a copy in which angle-bracketed
# text had been stripped (the <DB_INFO>/<SUMMARY> filehandle reads and the
# embedded BioMart XML were missing).  Those pieces are reconstructed below;
# the BioMart attribute names in particular should be confirmed against the
# mart service before relying on the info files.
use warnings;
use strict;
use Getopt::Long;
use DBI;

################################################################################
# Arguments - Start
my ($help, $dir, $version, $no_download, $gff_exists, $gff_loaded, $verbose);
GetOptions(
    'help'        => \$help,
    'dir=s'       => \$dir,
    'version=s'   => \$version,
    'no_download' => \$no_download,
    'gff_exists'  => \$gff_exists,
    'gff_loaded'  => \$gff_loaded,
    'verbose'     => \$verbose,
);

# Print usage information and exit.
sub usage {
    print qq~
Get latest data from the web, save it in --dir, then process into gff files
which are loaded into a database, the name of which has --version as a suffix.

Example:
./generate_gene_models.pl --dir \~/scratch/gene_models/3_4 --version 3_4_0 --verbose

Last updated: 22-Apr-08
Author: James Allen (james.allen\@cimr.cam.ac.uk)

Please provide
--dir
--version
[--no_download] (default 0)
[--gff_exists] (default 0)
[--gff_loaded] (default 0)
[--verbose] (default 0)
[--help]
~;
    print "\n";
    exit;
}

usage if defined $help;
usage unless $dir && $version;
die "Cannot find directory $dir" unless -e $dir;

# Boolean flags default to off.
$no_download ||= 0;
$gff_exists  ||= 0;
$gff_loaded  ||= 0;
$verbose     ||= 0;

# Directory containing this script, its helper scripts and SQL files.
# NOTE(review): assumes $0 contains at least one '/'; if the script is
# invoked by bare name via PATH, $script_dir is left as the script name.
my $script_dir = $0;
$script_dir =~ s!/[^/]+$!!;

my ($error, %error_sources);

# Generate XML that's used by BioMart to retrieve ID mappings.  DATASET and
# NATIVE_ID are placeholders, substituted per species/source below.
# NOTE(review): the original query XML was lost (angle-bracket stripping);
# this is a standard martservice query asking for the native transcript ID
# plus the Entrez Gene ID - confirm the attribute names against the mart.
my $template_xml =
    '<?xml version="1.0" encoding="UTF-8"?>'.
    '<!DOCTYPE Query>'.
    '<Query virtualSchemaName="default" formatter="TSV" header="0" '.
    'uniqueRows="0" count="" datasetConfigVersion="0.6">'.
    '<Dataset name="DATASET" interface="default">'.
    '<Attribute name="NATIVE_ID"/>'.
    '<Attribute name="entrezgene"/>'.
    '</Dataset></Query>';

my $human_ensembl_xml = $template_xml;
$human_ensembl_xml =~ s/DATASET/hsapiens_gene_ensembl/;
$human_ensembl_xml =~ s/NATIVE_ID/ensembl_transcript_id/;

my $human_vega_xml = $template_xml;
$human_vega_xml =~ s/DATASET/hsapiens_gene_vega/;
$human_vega_xml =~ s/NATIVE_ID/vega_transcript_id/;

my $mouse_ensembl_xml = $template_xml;
$mouse_ensembl_xml =~ s/DATASET/mmusculus_gene_ensembl/;
$mouse_ensembl_xml =~ s/NATIVE_ID/ensembl_transcript_id/;

my $mouse_vega_xml = $template_xml;
$mouse_vega_xml =~ s/DATASET/mmusculus_gene_vega/;
$mouse_vega_xml =~ s/NATIVE_ID/vega_transcript_id/;

my $rat_ensembl_xml = $template_xml;
$rat_ensembl_xml =~ s/DATASET/rnorvegicus_gene_ensembl/;
$rat_ensembl_xml =~ s/NATIVE_ID/ensembl_transcript_id/;

# Build a data structure that will determine what is downloaded, from where.
# Note that it is very important that RefSeq is done first; that data is used
# to resolve ambiguities for other data sources.
my $gene_models = {
    human => {
        tax_id  => 9606,
        sources => [
            {name          => "RefSeq",
             data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview",
             data_file     => "seq_gene.md.gz",
             info_base_url => "",
             info_file     => "",
             build         => "NCBI36.3"},
            {name          => "RefSeq",
             data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview",
             data_file     => "seq_gene.md.gz",
             info_base_url => "",
             info_file     => "",
             build         => "NCBI37.1"},
            {name          => "CCDS",
             data_base_url => "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/archive/Hs36.3/",
             data_file     => "CCDS.20090327.txt",
             info_base_url => "",
             info_file     => "",
             build         => "NCBI36.3"},
            {name          => "CCDS",
             data_base_url => "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human",
             data_file     => "CCDS.current.txt",
             info_base_url => "",
             info_file     => "",
             build         => "NCBI37.1"},
            {name          => "Ensembl",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database",
             data_file     => "ensGene.txt.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_ensembl_xml",
             info_file     => "EnsemblInfo.txt",
             build         => "NCBI36.3"},
            {name          => "Ensembl",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database",
             data_file     => "ensGene.txt.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_ensembl_xml",
             info_file     => "EnsemblInfo.txt",
             build         => "NCBI37.1"},
            {name          => "UCSC",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database",
             data_file     => "knownGene.txt.gz",
             info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database",
             info_file     => "knownToLocusLink.txt.gz",
             build         => "NCBI36.3"},
            {name          => "UCSC",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database",
             data_file     => "knownGene.txt.gz",
             info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database",
             info_file     => "knownToLocusLink.txt.gz",
             build         => "NCBI37.1"},
            {name          => "Vega",
             data_base_url => "ftp://ftp.ensembl.org/pub/release-54/mysql/homo_sapiens_vega_54_36p",
             data_file     => "*.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_vega_xml",
             info_file     => "VegaInfo.txt",
             build         => "NCBI36.3"},
            {name          => "Vega",
             data_base_url => "ftp://ftp.ensembl.org/pub/release-60/mysql/homo_sapiens_vega_60_37e",
             data_file     => "*.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_vega_xml",
             info_file     => "VegaInfo.txt",
             build         => "NCBI37.1"},
        ],
    },
    mouse => {
        tax_id  => 10090,
        sources => [
            {name          => "RefSeq",
             data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview",
             data_file     => "seq_gene.md.gz",
             info_base_url => "",
             info_file     => "",
             build         => "NCBIM37"},
            {name          => "CCDS",
             data_base_url => "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_mouse",
             data_file     => "CCDS.current.txt",
             info_base_url => "",
             info_file     => "",
             build         => "NCBIM37"},
            {name          => "Ensembl",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/database",
             data_file     => "ensGene.txt.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$mouse_ensembl_xml",
             info_file     => "EnsemblInfo.txt",
             build         => "NCBIM37"},
            {name          => "UCSC",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/database",
             data_file     => "knownGene.txt.gz",
             info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/database",
             info_file     => "knownToLocusLink.txt.gz",
             build         => "NCBIM37"},
            {name          => "Vega",
             data_base_url => "ftp://ftp.ensembl.org/pub/release-50/mysql/mus_musculus_vega_50_37c",
             data_file     => "*.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$mouse_vega_xml",
             info_file     => "VegaInfo.txt",
             build         => "NCBIM37"},
        ],
    },
    rat => {
        tax_id  => 10116,
        sources => [
            {name          => "RefSeq",
             data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/ARCHIVE/BUILD.4.1/mapview",
             data_file     => "seq_gene.md.gz",
             info_base_url => "",
             info_file     => "",
             build         => "RGSC3.4"},
            {name          => "Ensembl",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/rn4/database",
             data_file     => "ensGene.txt.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$rat_ensembl_xml",
             info_file     => "EnsemblInfo.txt",
             build         => "RGSC3.4"},
            {name          => "UCSC",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/rn4/database",
             data_file     => "knownGene.txt.gz",
             info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/rn4/database",
             info_file     => "knownToLocusLink.txt.gz",
             build         => "RGSC3.4"},
        ],
    },
};
# Arguments - End
################################################################################

################################################################################
# Setup Directory Structure - Start
# One "<species>/rawdata/<source>_<build>" directory per data source.
print "Setup Directory Structure - Start\n" if $verbose;
foreach my $species (sort keys %{$gene_models}) {
    mkdir "$dir/$species"         unless -e "$dir/$species";
    mkdir "$dir/$species/rawdata" unless -e "$dir/$species/rawdata";
    foreach my $source (@{$$gene_models{$species}{"sources"}}) {
        my $source_name = $$source{"name"};
        my $build       = $$source{"build"};
        mkdir "$dir/$species/rawdata/$source_name\_$build"
            unless -e "$dir/$species/rawdata/$source_name\_$build";
    }
}
print "Setup Directory Structure - End\n" if $verbose;
# Setup Directory Structure - End
################################################################################

################################################################################
# Download Data - Start
unless ($no_download) {
    print "Download Data - Start\n" if $verbose;
    foreach my $species (sort keys %{$gene_models}) {
        foreach my $source (@{$$gene_models{$species}{"sources"}}) {
            my $source_name   = $$source{"name"};
            my $build         = $$source{"build"};
            my $dest_dir      = "$dir/$species/rawdata/$source_name\_$build";
            my $data_base_url = $$source{"data_base_url"};
            my $info_base_url = $$source{"info_base_url"};
            my $data_file     = $$source{"data_file"};
            my $info_file     = $$source{"info_file"};
            print "  $species $build $source_name - Start\n" if $verbose;
            if (-e "$dest_dir/$data_file") {
                print "    Using existing data\n" if $verbose;
            } else {
                print "    Retrieving new data\n" if $verbose;
                if ($data_base_url) {
                    # Run wget with time-stamping (-N), and quietly (-q).
                    # Try to use wildcard first; if that doesn't work, get
                    # directory contents.
                    $error = system("wget -q -P $dest_dir '$data_base_url/$data_file'");
                    if ($error) {
                        $error = system("wget -Nqr -nd -P $dest_dir '$data_base_url'");
                        if ($error) {
                            warn "Error retrieving $source_name data file, url: $data_base_url/$data_file";
                            $error_sources{$species}{$source_name} = 1;
                        }
                    }
                    unless ($error) {
                        if ($data_file =~ /\.gz$/) {
                            # Gunzip in place, but keep the original .gz (via
                            # /tmp) so the '-e' check above succeeds next run.
                            $error  = system("cp $dest_dir/$data_file /tmp/.");
                            $error += system("gunzip -f $dest_dir/$data_file");
                            $error += system("mv /tmp/$data_file $dest_dir/.");
                            if ($error) {
                                warn "Error unzipping $source_name data file, url: $dest_dir/$data_file";
                                $error_sources{$species}{$source_name} = 1;
                            }
                        }
                    }
                }
                if ($info_base_url) {
                    if ($info_base_url =~ /biomart/) {
                        # Want to save with a meaningful name, as the url
                        # includes the full xml query, which doesn't make for
                        # a good filename.
                        $error = system("wget -q -O $dest_dir/$info_file '$info_base_url'");
                        if ($error) {
                            warn "Error retrieving $source_name info file, url: $info_base_url";
                            $error_sources{$species}{$source_name} = 1;
                        }
                    } else {
                        $error = system("wget -q -P $dest_dir '$info_base_url/$info_file'");
                        if ($error) {
                            $error = system("wget -Nqr -nd -P $dest_dir '$info_base_url'");
                            if ($error) {
                                warn "Error retrieving $source_name info file, url: $info_base_url/$info_file";
                                $error_sources{$species}{$source_name} = 1;
                            }
                        } else {
                            if ($info_file =~ /\.gz$/) {
                                $error  = system("cp $dest_dir/$info_file /tmp/.");
                                $error += system("gunzip -f $dest_dir/$info_file");
                                $error += system("mv /tmp/$info_file $dest_dir/.");
                                if ($error) {
                                    warn "Error unzipping $source_name info file, url: $dest_dir/$info_file";
                                    $error_sources{$species}{$source_name} = 1;
                                }
                            }
                        }
                    }
                }
            }
            print "  $species $build $source_name - End\n" if $verbose;
        }
    }
    print "Download Data - End\n" if $verbose;
}
# Download Data - End
################################################################################

################################################################################
# Convert Source Data into GFF - Start
unless ($gff_exists) {
    print "Convert Source Data into GFF - Start\n" if $verbose;
    foreach my $species (sort keys %{$gene_models}) {
        # Manually curated data sits in the top-level dir, and has a filename
        # that starts with the species name, eg 'human_T1DBase_Curated.gff'.
        # We copy this into the species sub-directory(s) so that it gets
        # processed with everything else.
        opendir(my $top_dh, $dir) || die "Cannot open $dir";
        foreach my $gff_file (sort grep { /^$species\_(.*\.gff)$/ } readdir($top_dh)) {
            print "  Copying $dir/$gff_file\n" if $verbose;
            my $gff_file_copy;
            ($gff_file_copy = $gff_file) =~ s/^$species\_//;
            $error = system("cp $dir/$gff_file $dir/$species/$gff_file_copy");
            die "Copying failed" if $error;
        }
        closedir($top_dh);
        foreach my $source (@{$$gene_models{$species}{"sources"}}) {
            my $source_name = $$source{"name"};
            my $build       = $$source{"build"};
            if (exists $error_sources{$species}{$source_name}) {
                warn "Skipping conversion to GFF for $species $source_name due to previous errors";
                next;
            }
            print "  $species $build $source_name - Start\n" if $verbose;
            next if -e "$dir/$species/$source_name\_$build.gff";
            # The raw files were gunzipped during download, so strip the
            # suffix before passing the names to the conversion script.
            $$source{"data_file"} =~ s/\.gz$//;
            $$source{"info_file"} =~ s/\.gz$//;
            my $tax_id = $$gene_models{$species}{"tax_id"};
            # Conversion is delegated to "<source>.pl" alongside this script.
            my $script = "$script_dir/".lc($source_name).".pl";
            my $params = "--gff_dir='$dir/$species' --tax_id $tax_id ";
            foreach my $key (keys %{$source}) {
                $params .= "--source $key='".$$source{$key}."' ";
            }
            if (-e $script) {
                my $cmd = "perl $script $params";
                print "Executing $cmd\n" if $verbose;
                $error = system($cmd);
                if ($error) {
                    warn "Error executing $script";
                    $error_sources{$species}{$source_name} = 1;
                }
            } else {
                warn "Script $script does not exist";
                $error_sources{$species}{$source_name} = 1;
            }
            print "  $species $build $source_name - End\n" if $verbose;
        }
    }
    print "Convert Source Data into GFF - End\n" if $verbose;
}
# Convert Source Data into GFF - End
################################################################################

################################################################################
# Create Gene Models Database and Load GFF - Start
my $database = "gene_models_$version";

# database_login.txt: a header line, then one tab-separated line of
# user<TAB>password<TAB>host (host may be empty for a local server).
open(my $db_info_fh, '<', "$script_dir/database_login.txt")
    || die "Cannot open file $script_dir/database_login.txt";
my $header = <$db_info_fh>;    # skip the column-header line
my ($user, $pass, $host) = <$db_info_fh> =~ /^([^\t]+)\t([^\t]*)\t([^\t\n]*)$/;
close($db_info_fh);

unless ($gff_loaded) {
    # (Re)create the target database from scratch.
    my $dbh;
    if ($host) {
        $dbh = DBI->connect("DBI:mysql:mysql;host=$host", $user, $pass);
    } else {
        $dbh = DBI->connect("DBI:mysql:mysql", $user, $pass);
    }
    $dbh->do("DROP DATABASE IF EXISTS $database;") || die $dbh->errstr;
    $dbh->do("CREATE DATABASE $database;")         || die $dbh->errstr;
    $dbh->disconnect;

    my $dbi = "dbi:mysql:database=$database";
    $dbi .= ";host=$host" if $host;

    # Historical note: a local copy of bp_fast_load_gff.pl used to be needed,
    # since the version in the standard BioPerl checkout had a bug which
    # caused data to be lost; bp_bulk_load_gff.pl is used instead now.
    my $cmd = "bp_bulk_load_gff.pl -u $user -p $pass -d '$dbi' --local ";
    foreach my $species (sort keys %{$gene_models}) {
        opendir(my $gff_dh, "$dir/$species") || die "Cannot open $dir/$species";
        foreach my $gff_file (sort grep { /\.gff$/ } readdir($gff_dh)) {
            print "Loading $dir/$species/$gff_file\n" if $verbose;
            $cmd .= "$dir/$species/$gff_file ";
        }
        closedir($gff_dh);
    }
    print "Executing $cmd\n" if $verbose;
    $error = system($cmd);
    die "Loading failed" if $error;
}
# Create Gene Models Database and Load GFF - End
################################################################################

################################################################################
# Create Meta Data - Start
print "Loading source_builds data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_source_builds.sql");
die "Error executing $script_dir/load_source_builds.sql" if $error;

print "Loading xy_gene data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_xy_gene.sql");
die "Error executing $script_dir/load_xy_gene.sql" if $error;

# This is the slightly messy bit, where we decide how much consistency there
# is across the different sources, and make a record of models that look ropey.
my $summary_file   = "$dir/summary.txt";
my $ignore_file    = "$dir/ignore_list.txt";
my $undecided_file = "$dir/undecided_list.txt";

print "Creating summary data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/create_summary.sql > $summary_file");
die "Error executing $script_dir/create_summary.sql" if $error;

print "Analysing summary data\n" if $verbose;
open(my $summary_fh,   '<', $summary_file)   || die "Cannot open $summary_file";
open(my $ignore_fh,    '>', $ignore_file)    || die "Cannot open $ignore_file";
open(my $undecided_fh, '>', $undecided_file) || die "Cannot open $undecided_file";

my (@seen, %source_ids, %location, %source, @ignore, @undecided);

# Return the rows from @rows whose leading chr<TAB>strand<TAB>start does NOT
# match the given combination; these are the rows to ignore.
sub non_matching_rows {
    my ($chr, $strand, $start, @rows) = @_;
    # Plus and minus are special characters in regexes, so sort out the
    # pattern in a variable first (the strand character is escaped).
    my $match = $chr.'\t\\'.$strand.'\t'.$start;
    return grep { $_ !~ /^$match/ } @rows;
}

# Examine one gene's accumulated rows.  When the sources disagree on
# chromosome/strand/start, either drop the minority rows onto the ignore list
# (when one combination is clearly favoured, with CCDS/Vega given extra
# weight) or mark the whole group as undecided.
sub analyse_group {
    my ($seen, $location, $source, $ignore, $undecided) = @_;

    # Only act when there is a difference in chromosome and/or strand and/or
    # start position.
    return unless scalar(keys(%{$location})) > 1;

    # If there's CCDS, that's the most likely to be correct...
    my $tentative;
    if (exists $$source{"CCDS"} && scalar(keys(%{$$source{"CCDS"}})) == 1) {
        #... as long as it doesn't conflict with Vega transcripts.
        if (exists $$source{"Vega"} && scalar(keys(%{$$source{"Vega"}})) == 1) {
            if (join("", keys(%{$$source{"CCDS"}})) eq join("", keys(%{$$source{"Vega"}}))) {
                $tentative = join("", keys(%{$$source{"CCDS"}}));
            }
        } else {
            $tentative = join("", keys(%{$$source{"CCDS"}}));
        }
    } elsif (exists $$source{"Vega"} && scalar(keys(%{$$source{"Vega"}})) == 1) {
        $tentative = join("", keys(%{$$source{"Vega"}}));
    }

    # For all of the sources, do a straight count, and if one combo of
    # chromosome/strand/start is 'significantly' favoured, ignore the others.
    # What counts as significant is debatable, but 2:1 in favour of one combo
    # gets it right a lot of the time, and hardly ever wrong.  A minimum
    # transcript count acts as a threshold before we contemplate any combo.
    my %location_total;
    foreach my $src (keys %{$source}) {
        foreach my $loc (keys %{$$source{$src}}) {
            $location_total{$loc} += $$source{$src}{$loc};
        }
    }

    # Sort so that the first value is the largest, and consider it the most
    # probable.
    my ($rank_one, $rank_one_count, $remainder_count);
    foreach my $loc (sort { $location_total{$b} <=> $location_total{$a} }
                     keys %location_total) {
        if (not defined $rank_one) {
            $rank_one       = $loc;
            $rank_one_count = $location_total{$loc};
        } else {
            $remainder_count += $location_total{$loc};
        }
    }
    # NOTE(review): greedy match relies on the strand sign being the last
    # '+'/'-' in the key, which holds for "<chr><strand><start>" keys.
    $rank_one =~ /^(.+)([\-\+])(.+)$/;
    my ($rank_one_chr, $rank_one_strand, $rank_one_start) = ($1, $2, $3);

    if (defined $tentative) {
        # With CCDS/Vega backing we can relax the definition of
        # 'significant'.  Two combos may share the top count, in which case
        # one is picked arbitrarily - but equal counts are undecidable anyway.
        if ($tentative eq $rank_one
            && $rank_one_count > $remainder_count
            && $rank_one_count >= 2) {
            push(@{$ignore},
                 non_matching_rows($rank_one_chr, $rank_one_strand,
                                   $rank_one_start, @{$seen}));
        # If the tentative combo doesn't match the most common, we really
        # want to be sure that we're doing the right thing...
        } elsif ($rank_one_count >= ($remainder_count * 3)) {
            push(@{$ignore},
                 non_matching_rows($rank_one_chr, $rank_one_strand,
                                   $rank_one_start, @{$seen}));
        } else {
            push(@{$undecided}, @{$seen});
        }
    } else {
        if ($rank_one_count >= ($remainder_count * 2) && $rank_one_count >= 2) {
            push(@{$ignore},
                 non_matching_rows($rank_one_chr, $rank_one_strand,
                                   $rank_one_start, @{$seen}));
        } else {
            push(@{$undecided}, @{$seen});
        }
    }
    return;
}

# Each summary row is chr, strand, start, gene_id, source, source_id,
# position_variant, tab-separated.  The start value is the minimum value for
# the transcript, rounded up to the nearest million - there might be cases
# where two transcripts have distinct positions, millions of bases apart, on
# the same strand of the same chromosome, so we need to check for them.
my $row_regex = qr/^([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)$/;

# Initialise variables with the first row of data (guarding against an empty
# summary file, which the original code would have warned/failed on).
my $first_line = <$summary_fh>;
if (defined $first_line) {
    my ($chr, $strand, $start, $current_gene_id, $src, $source_id,
        $position_variant) = $first_line =~ $row_regex;
    push(@seen, $first_line);
    $source_ids{"$source_id$chr$strand$start"} = 1;
    # We track two complementary hashes of hashes to save messing around later.
    $location{"$chr$strand$start"}{$src}++;
    $source{$src}{"$chr$strand$start"}++;

    while (my $line = <$summary_fh>) {
        my ($chr, $strand, $start, $gene_id, $src, $source_id,
            $position_variant) = $line =~ $row_regex;
        if ($gene_id && $current_gene_id eq $gene_id) {
            push(@seen, $line);
            # Make sure that we don't count the same ID (from different
            # sources) twice.
            unless (exists $source_ids{"$source_id$chr$strand$start"}) {
                $source_ids{"$source_id$chr$strand$start"} = 1;
                $location{"$chr$strand$start"}{$src}++;
                $source{$src}{"$chr$strand$start"}++;
            }
        } else {
            analyse_group(\@seen, \%location, \%source, \@ignore, \@undecided);
            # Reset all the variables for the next gene id.
            undef @seen;
            undef %source_ids;
            undef %location;
            undef %source;
            if ($gene_id) {
                push(@seen, $line);
                $source_ids{"$source_id$chr$strand$start"} = 1;
                $location{"$chr$strand$start"}{$src}++;
                $source{$src}{"$chr$strand$start"}++;
                $current_gene_id = $gene_id;
            }
        }
    }
    # The original only analysed a group when the gene id changed, which
    # silently dropped the final gene in the file; analyse it as well.
    analyse_group(\@seen, \%location, \%source, \@ignore, \@undecided);
}

foreach my $row (@ignore)    { print {$ignore_fh} "$row"; }
foreach my $row (@undecided) { print {$undecided_fh} "$row"; }
close($ignore_fh);
close($undecided_fh);
close($summary_fh);

print "Loading ignore_list data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_ignore.sql");
die "Error executing $script_dir/load_ignore.sql" if $error;
$error = system("mysqlimport -u $user -p$pass -h $host --local $database $ignore_file");
die "Error importing $ignore_file" if $error;

print "Loading undecided_list data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_undecided.sql");
die "Error executing $script_dir/load_undecided.sql" if $error;
$error = system("mysqlimport -u $user -p$pass -h $host --local $database $undecided_file");
die "Error importing $undecided_file" if $error;

print "Gene Models database '$database' is complete\n";
# Create Meta Data - End
################################################################################