#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Long;
use DBI;
################################################################################
# Arguments - Start
# Command-line options. --dir and --version are mandatory (validated below);
# the boolean flags all default to off.
my ($help, $dir, $version, $no_download, $gff_exists, $gff_loaded, $verbose);
GetOptions(
    'help'        => \$help,        # print usage text and exit
    'dir=s'       => \$dir,         # working directory for downloads and GFF files
    'version=s'   => \$version,     # suffix for the gene_models database name
    'no_download' => \$no_download, # reuse previously downloaded raw data
    'gff_exists'  => \$gff_exists,  # skip conversion of raw data to GFF
    'gff_loaded'  => \$gff_loaded,  # skip database creation and GFF loading
    'verbose'     => \$verbose,     # progress messages on STDOUT
);
# Print the usage/help text and terminate the script. Called for --help and
# when the mandatory --dir/--version arguments are missing.
sub usage {
    print <<'END_USAGE';

Get latest data from the web, save it in --dir, then process into
gff files which are loaded into a database, the name of which has
--version as a suffix.
Example:
./generate_gene_models.pl --dir ~/scratch/gene_models/3_4 --version 3_4_0 --verbose
Last updated: 22-Apr-08
Author: James Allen (james.allen@cimr.cam.ac.uk)
Please provide
--dir
--version
[--no_download] (default 0)
[--gff_exists] (default 0)
[--gff_loaded] (default 0)
[--verbose] (default 0)
[--help]

END_USAGE
    exit;
}
# Validate the mandatory arguments and normalise the optional flags to 0.
usage() if defined $help;
usage() unless $dir && $version;
die "Cannot find directory $dir" unless -e $dir;
$no_download ||= 0;
$gff_exists  ||= 0;
$gff_loaded  ||= 0;
$verbose     ||= 0;
# Directory containing this script (and its helper scripts / SQL files):
# strip the trailing "/filename" component from $0.
my $script_dir = $0;
$script_dir =~ s{/[^/]+$}{};
# $error holds the status of the most recent system() call; %error_sources
# records species/source pairs that failed, so later stages can skip them.
my ($error, %error_sources);
# Generate XML that's used by BioMart to retrieve ID mappings.
# NOTE(review): the original query text was lost when this file was mangled
# (angle-bracketed markup was stripped, leaving only empty string literals).
# The template below is reconstructed from the standard BioMart XML query
# format; confirm the attribute names against the live BioMart schema before
# trusting the downloaded ID mappings.
# DATASET and NATIVE_ID are placeholders, substituted per species/source below.
my $template_xml =
    '<?xml version="1.0" encoding="UTF-8"?>'.
    '<!DOCTYPE Query>'.
    '<Query virtualSchemaName="default" formatter="TSV" header="0" '.
    'uniqueRows="1" count="" datasetConfigVersion="0.6">'.
    '<Dataset name="DATASET" interface="default">'.
    '<Attribute name="NATIVE_ID"/>'.
    '<Attribute name="entrezgene"/>'.
    '</Dataset></Query>';
# One query per species/source pair: fill in the BioMart dataset name and the
# source's native transcript-ID attribute. s/// edits the copy, not the template.
my $human_ensembl_xml = $template_xml;
$human_ensembl_xml =~ s/DATASET/hsapiens_gene_ensembl/;
$human_ensembl_xml =~ s/NATIVE_ID/ensembl_transcript_id/;
my $human_vega_xml = $template_xml;
$human_vega_xml =~ s/DATASET/hsapiens_gene_vega/;
$human_vega_xml =~ s/NATIVE_ID/vega_transcript_id/;
my $mouse_ensembl_xml = $template_xml;
$mouse_ensembl_xml =~ s/DATASET/mmusculus_gene_ensembl/;
$mouse_ensembl_xml =~ s/NATIVE_ID/ensembl_transcript_id/;
my $mouse_vega_xml = $template_xml;
$mouse_vega_xml =~ s/DATASET/mmusculus_gene_vega/;
$mouse_vega_xml =~ s/NATIVE_ID/vega_transcript_id/;
my $rat_ensembl_xml = $template_xml;
$rat_ensembl_xml =~ s/DATASET/rnorvegicus_gene_ensembl/;
$rat_ensembl_xml =~ s/NATIVE_ID/ensembl_transcript_id/;
# Build a data structure that will determine what is downloaded, from where.
# Note that it is very important that RefSeq is done first; that data is used
# to resolve ambiguities for other data sources.
# Configuration for the whole pipeline: one entry per species, each with its
# NCBI taxonomy id and an ordered list of gene-model sources. Per source:
#   name          - source label; also selects the converter script <name>.pl
#   data_base_url - where the raw data file lives ("" means nothing to fetch)
#   data_file     - file to download; "*.gz" mirrors the whole directory
#   info_base_url - optional second URL for ID-mapping info (BioMart or UCSC)
#   info_file     - local filename for the info download ("" means none)
#   build         - genome build label, used in directory and GFF file names
my $gene_models =
{human =>
{tax_id => 9606,
sources =>
[
# RefSeq must stay first in each list (see comment above): its data is
# used to resolve ambiguities for the other sources.
{name => "RefSeq",
data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview",
data_file => "seq_gene.md.gz",
info_base_url => "",
info_file => "",
build => "NCBI36.3"
},
{name => "RefSeq",
data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview",
data_file => "seq_gene.md.gz",
info_base_url => "",
info_file => "",
build => "NCBI37.1"
},
{name => "CCDS",
data_base_url => "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/archive/Hs36.3/",
data_file => "CCDS.20090327.txt",
info_base_url => "",
info_file => "",
build => "NCBI36.3"
},
{name => "CCDS",
data_base_url => "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human",
data_file => "CCDS.current.txt",
info_base_url => "",
info_file => "",
build => "NCBI37.1"
},
# Ensembl/UCSC gene sets come from the UCSC goldenPath mirrors; the
# Ensembl/Vega ID-to-gene mappings come from BioMart (query XML above).
{name => "Ensembl",
data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database",
data_file => "ensGene.txt.gz",
info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_ensembl_xml",
info_file => "EnsemblInfo.txt",
build => "NCBI36.3"
},
{name => "Ensembl",
data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database",
data_file => "ensGene.txt.gz",
info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_ensembl_xml",
info_file => "EnsemblInfo.txt",
build => "NCBI37.1"
},
{name => "UCSC",
data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database",
data_file => "knownGene.txt.gz",
info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database",
info_file => "knownToLocusLink.txt.gz",
build => "NCBI36.3"
},
{name => "UCSC",
data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database",
data_file => "knownGene.txt.gz",
info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database",
info_file => "knownToLocusLink.txt.gz",
build => "NCBI37.1"
},
# Vega ships as a full MySQL dump, hence the "*.gz" directory mirror.
{name => "Vega",
data_base_url => "ftp://ftp.ensembl.org/pub/release-54/mysql/homo_sapiens_vega_54_36p",
data_file => "*.gz",
info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_vega_xml",
info_file => "VegaInfo.txt",
build => "NCBI36.3"
},
{name => "Vega",
data_base_url => "ftp://ftp.ensembl.org/pub/release-60/mysql/homo_sapiens_vega_60_37e",
data_file => "*.gz",
info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_vega_xml",
info_file => "VegaInfo.txt",
build => "NCBI37.1"
}
]
},
# Mouse: single current build (NCBIM37) per source.
mouse =>
{tax_id => 10090,
sources =>
[
{name => "RefSeq",
data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview",
data_file => "seq_gene.md.gz",
info_base_url => "",
info_file => "",
build => "NCBIM37"
},
{name => "CCDS",
data_base_url => "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_mouse",
data_file => "CCDS.current.txt",
info_base_url => "",
info_file => "",
build => "NCBIM37"
},
{name => "Ensembl",
data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/database",
data_file => "ensGene.txt.gz",
info_base_url => "http://www.biomart.org/biomart/martservice?query=$mouse_ensembl_xml",
info_file => "EnsemblInfo.txt",
build => "NCBIM37"
},
{name => "UCSC",
data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/database",
data_file => "knownGene.txt.gz",
info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/database",
info_file => "knownToLocusLink.txt.gz",
build => "NCBIM37"
},
{name => "Vega",
data_base_url => "ftp://ftp.ensembl.org/pub/release-50/mysql/mus_musculus_vega_50_37c",
data_file => "*.gz",
info_base_url => "http://www.biomart.org/biomart/martservice?query=$mouse_vega_xml",
info_file => "VegaInfo.txt",
build => "NCBIM37"
}
]
},
# Rat: no CCDS or Vega data available; build RGSC3.4 throughout.
rat =>
{tax_id => 10116,
sources =>
[
{name => "RefSeq",
data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/ARCHIVE/BUILD.4.1/mapview",
data_file => "seq_gene.md.gz",
info_base_url => "",
info_file => "",
build => "RGSC3.4"
},
{name => "Ensembl",
data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/rn4/database",
data_file => "ensGene.txt.gz",
info_base_url => "http://www.biomart.org/biomart/martservice?query=$rat_ensembl_xml",
info_file => "EnsemblInfo.txt",
build => "RGSC3.4"
},
{name => "UCSC",
data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/rn4/database",
data_file => "knownGene.txt.gz",
info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/rn4/database",
info_file => "knownToLocusLink.txt.gz",
build => "RGSC3.4"
}
]
}
}
;
# Arguments - End
################################################################################
################################################################################
# Setup Directory Structure - Start
# Create <dir>/<species>/rawdata/<Source>_<Build> for every configured
# source. Existing directories are left untouched; mkdir failures are
# silently ignored, as in the rest of the script (wget/cp will fail loudly
# later if a directory really is missing).
print "Setup Directory Structure - Start\n" if $verbose;
foreach my $species (sort keys %{$gene_models}) {
    my $species_dir = "$dir/$species";
    for my $path ($species_dir, "$species_dir/rawdata") {
        mkdir $path unless -e $path;
    }
    for my $source (@{ $gene_models->{$species}{'sources'} }) {
        my $source_dir =
            "$species_dir/rawdata/" . $source->{'name'} . '_' . $source->{'build'};
        mkdir $source_dir unless -e $source_dir;
    }
}
print "Setup Directory Structure - End\n" if $verbose;
# Setup Directory Structure - End
################################################################################
################################################################################
# Download Data - Start
# Download the raw data (and optional info/mapping file) for every source,
# via wget, into the per-source rawdata directory. Skipped entirely with
# --no_download. A source that fails any step is recorded in %error_sources
# so the GFF-conversion stage can skip it.
unless ($no_download) {
print "Download Data - Start\n" if $verbose;
foreach my $species (sort keys %{$gene_models}) {
foreach my $source (@{$$gene_models{$species}{"sources"}}) {
my $source_name = $$source{"name"};
my $build = $$source{"build"};
my $dest_dir = "$dir/$species/rawdata/$source_name\_$build";
my $data_base_url = $$source{"data_base_url"};
my $info_base_url = $$source{"info_base_url"};
my $data_file = $$source{"data_file"};
my $info_file = $$source{"info_file"};
print " $species $build $source_name - Start\n" if $verbose;
# If the data file is already present (e.g. from an earlier run), keep it.
if (-e "$dest_dir/$data_file") {
print " Using existing data\n" if $verbose;
} else {
print " Retrieving new data\n" if $verbose;
if ($data_base_url) {
# Run wget with time-stamping (-N), and quietly (-q).
# Try to use wildcard first; if that doesn't work, get directory contents.
$error = system("wget -q -P $dest_dir '$data_base_url/$data_file'");
if ($error) {
# Fallback: mirror the directory non-recursively into $dest_dir.
$error = system("wget -Nqr -nd -P $dest_dir '$data_base_url'");
if ($error) {
warn "Error retrieving $source_name data file, url: $data_base_url/$data_file";
$error_sources{$species}{$source_name} = 1;
}
}
unless ($error) {
# Unzip in place, but first stash a copy of the .gz in /tmp and move it
# back afterwards, so the "-e" check above still works on the next run.
if ($data_file =~ /\.gz$/) {
$error = system("cp $dest_dir/$data_file /tmp/.");
$error += system("gunzip -f $dest_dir/$data_file");
$error += system("mv /tmp/$data_file $dest_dir/.");
if ($error) {
warn "Error unzipping $source_name data file, url: $dest_dir/$data_file";
$error_sources{$species}{$source_name} = 1;
}
}
}
}
# The info file (ID mappings) is fetched every time, even when the data
# file already existed.
if ($info_base_url) {
if ($info_base_url =~ /biomart/) {
# Want to save with a meaningful name, as the url includes
# the full xml query, which doesn't make for a good filename.
$error = system("wget -q -O $dest_dir/$info_file '$info_base_url'");
if ($error) {
# NOTE(review): the trailing "if $error" is redundant inside this branch.
warn "Error retrieving $source_name info file, url: $info_base_url" if $error;
$error_sources{$species}{$source_name} = 1;
}
} else {
# Same wget strategy as for the data file: direct fetch, then
# directory mirror as a fallback; unzip on success.
$error = system("wget -q -P $dest_dir '$info_base_url/$info_file'");
if ($error) {
$error = system("wget -Nqr -nd -P $dest_dir '$info_base_url'");
if ($error) {
warn "Error retrieving $source_name info file, url: $info_base_url/$info_file";
$error_sources{$species}{$source_name} = 1;
}
} else {
if ($info_file =~ /\.gz$/) {
$error = system("cp $dest_dir/$info_file /tmp/.");
$error += system("gunzip -f $dest_dir/$info_file");
$error += system("mv /tmp/$info_file $dest_dir/.");
if ($error) {
warn "Error unzipping $source_name info file, url: $dest_dir/$info_file";
$error_sources{$species}{$source_name} = 1;
}
}
}
}
}
}
print " $species $build $source_name - End\n" if $verbose;
}
}
print "Download Data - End\n" if $verbose;
}
# Download Data - End
################################################################################
################################################################################
# Convert Source Data into GFF - Start
# Convert each source's raw data into GFF by shelling out to a per-source
# helper script (<script_dir>/<lowercased source name>.pl), passing every
# key of the source hash as a --source key='value' parameter. Skipped with
# --gff_exists. Sources that failed to download are skipped here too.
unless ($gff_exists) {
print "Convert Source Data into GFF - Start\n" if $verbose;
foreach my $species (sort keys %{$gene_models}) {
# Manually curated data sits in the top-level dir, and has a filename
# that starts with the species name, eg 'human_T1DBase_Curated.gff'. We
# copy this into the species sub-directory(s) so that it gets processed
# with everything else.
opendir(DIR, "$dir") || die "Cannot open $dir";
foreach my $gff_file (sort grep { /^$species\_(.*\.gff)$/ } readdir(DIR)) {
print " Copying $dir/$gff_file\n" if $verbose;
my $gff_file_copy;
# Strip the leading "<species>_" prefix for the copy's name.
($gff_file_copy = $gff_file) =~ s/^$species\_//;
$error = system("cp $dir/$gff_file $dir/$species/$gff_file_copy");
die "Copying failed" if $error;
}
closedir(DIR);
foreach my $source (@{$$gene_models{$species}{"sources"}}) {
my $source_name = $$source{"name"};
my $build = $$source{"build"};
if (exists $error_sources{$species}{$source_name}) {
warn "Skipping conversion to GFF for $species $source_name due to previous errors";
next;
}
print " $species $build $source_name - Start\n" if $verbose;
# Already converted on a previous run - nothing to do.
next if -e "$dir/$species/$source_name\_$build.gff";
# The download stage gunzipped the files, so drop the .gz suffix from
# the configured names before passing them to the converter.
$$source{"data_file"} =~ s/\.gz$//;
$$source{"info_file"} =~ s/\.gz$//;
my $tax_id = $$gene_models{$species}{"tax_id"};
my $script = "$script_dir/".lc($source_name).".pl";
my $params = "--gff_dir='$dir/$species' --tax_id $tax_id ";
foreach my $key (keys %{$source}) {
$params .= "--source $key='".$$source{$key}."' ";
}
if (-e $script) {
my $cmd = "perl $script $params";
print "Executing $cmd\n" if $verbose;
$error = system($cmd);
if ($error) {
warn "Error executing $script";
$error_sources{$species}{$source_name} = 1;
}
} else {
warn "Script $script does not exist";
$error_sources{$species}{$source_name} = 1;
}
print " $species $build $source_name - End\n" if $verbose;
}
}
print "Convert Source Data into GFF - End\n" if $verbose;
}
# Convert Source Data into GFF - End
################################################################################
################################################################################
# Create Gene Models Database and Load GFF - Start
# Target database name is derived from the --version argument.
my $database = "gene_models_$version";
# Read MySQL credentials from a tab-separated file alongside this script:
# one header line, then a single "user<TAB>password<TAB>host" line (host may
# be empty for a local server).
# NOTE(review): the original readline targets were lost when this file was
# mangled (angle-bracketed text stripped); restored to read from the login
# file handle. Also upgraded to a 3-arg open with a lexical handle.
# NOTE(review): credentials are stored in plain text and later interpolated
# onto mysql command lines - acceptable only on a trusted host.
open(my $db_info, '<', "$script_dir/database_login.txt")
    || die "Cannot open file $script_dir/database_login.txt";
my $header = <$db_info>;
my ($user, $pass, $host) = <$db_info> =~ /^([^\t]+)\t([^\t]*)\t([^\t\n]*)$/;
close($db_info);
# (Re)create the gene_models database and bulk-load every per-species GFF
# file in a single bp_bulk_load_gff.pl invocation. Skipped with --gff_loaded.
unless ($gff_loaded) {
my $dbh;
# Connect to the server (with or without an explicit host) using the
# administrative "mysql" schema, since the target database may not exist yet.
if ($host) {
$dbh = DBI->connect("DBI:mysql:mysql;host=$host", $user, $pass);
} else {
$dbh = DBI->connect("DBI:mysql:mysql", $user, $pass);
}
# Start from a clean slate: any previous database with this version suffix
# is destroyed.
$dbh->do("DROP DATABASE IF EXISTS $database;") || die $dbh->errstr;
$dbh->do("CREATE DATABASE $database;") || die $dbh->errstr;
$dbh->disconnect;
my $dbi = "dbi:mysql:database=$database";
$dbi .= ";host=$host" if $host;
#$dbi .= ";mysql_read_default_file=$script_dir/my.cnf";
# We use a local copy of bp_fast_load_gff.pl, since the version in
# the standard BioPerl checkout has a bug which causes data to be lost.
#my $create = "-c ";
# Accumulate all GFF file paths onto one bulk-load command line.
my $cmd = "bp_bulk_load_gff.pl -u $user -p $pass -d '$dbi' --local ";
foreach my $species (sort keys %{$gene_models}) {
opendir(GFF_DIR, "$dir/$species") || die "Cannot open $dir/$species";
foreach my $gff_file (sort grep { /\.gff$/ } readdir(GFF_DIR)) {
print "Loading $dir/$species/$gff_file\n" if $verbose;
#my $cmd = "$script_dir/bp_fast_load_gff.pl -u $user -p $pass -h $host -d '$dbi' ";
#$cmd .= "$create $dir/$species/$gff_file";
$cmd .= "$dir/$species/$gff_file ";
#$create = "" if $create;
}
closedir(GFF_DIR);
}
print "Executing $cmd\n" if $verbose;
$error = system($cmd);
die "Loading failed" if $error;
}
# Create Gene Models Database and Load GFF - End
################################################################################
################################################################################
# Create Meta Data - Start
# Load the static meta-data tables, then dump a per-transcript summary of the
# loaded models to $summary_file for the consistency analysis below.
# NOTE(review): -p$pass puts the password on the command line, visible in the
# process list - acceptable only on a trusted host.
print "Loading source_builds data\n" if $verbose;# --local-infile
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_source_builds.sql");
die "Error executing $script_dir/load_source_builds.sql" if $error;
print "Loading xy_gene data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_xy_gene.sql");
die "Error executing $script_dir/load_xy_gene.sql" if $error;
# This is the slightly messy bit, where we decide how much consistency there is
# across the different sources, and make a record of models that look ropey
my $summary_file = "$dir/summary.txt";
my $ignore_file = "$dir/ignore_list.txt";
my $undecided_file = "$dir/undecided_list.txt";
print "Creating summary data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/create_summary.sql > $summary_file");
die "Error executing $script_dir/create_summary.sql" if $error;
print "Analysing summary data\n" if $verbose;
# These three global handles are consumed by the analysis section that
# follows: SUMMARY is read row by row; IGNORE/UNDECIDED collect the verdicts.
open(SUMMARY, $summary_file) || die "Cannot open $summary_file";
open(IGNORE, ">$ignore_file") || die "Cannot open $ignore_file";
open(UNDECIDED, ">$undecided_file") || die "Cannot open $undecided_file";
# Accumulators for the per-gene analysis: @seen holds the raw rows of the
# current gene; %source_ids de-duplicates transcript IDs reported by several
# sources; @ignore and @undecided collect the final verdicts.
my (@seen, %source_ids, %location, %source, @ignore, @undecided);
# The start value that we retrieve is the minimum value for the transcript,
# rounded up to the nearest million - there might be cases where two
# transcripts have distinct positions, millions of bases apart, on
# the same strand of the same chromosome, so we need to check for them.
# Initialise variables with the first row of data.
# NOTE(review): the readline target was lost when this file was mangled
# (angle-bracketed text stripped); restored to read from the SUMMARY handle
# opened above.
my $first_line = <SUMMARY>;
# Summary rows are 7 tab-separated columns:
# chr, strand, start, gene_id, source, source_id, position_variant.
my ($first_chr, $first_strand, $first_start, $current_gene_id,
    $first_source, $first_source_id, $first_position_variant) =
    $first_line =~ /^([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)$/;
push(@seen, $first_line);
$source_ids{"$first_source_id$first_chr$first_strand$first_start"} = 1;
# We track two complementary hashes of hashes to save messing around later.
$location{"$first_chr$first_strand$first_start"}{"$first_source"}++;
$source{"$first_source"}{"$first_chr$first_strand$first_start"}++;
# Walk the remaining summary rows (sorted by gene id). Rows for the same gene
# accumulate in @seen; when the gene id changes, the completed group is
# analysed: if its transcripts disagree on chromosome/strand/start we either
# pick a winning location (pushing the rest onto @ignore) or, when no
# location is convincingly favoured, push the whole group onto @undecided.
# NOTE(review): the readline target was lost when this file was mangled
# (angle-bracketed text stripped); restored to read from the SUMMARY handle.
# NOTE(review): the final gene's group is never analysed after EOF unless the
# summary ends with a sentinel/blank row - confirm against create_summary.sql.
while (my $line = <SUMMARY>) {
    my ($chr, $strand, $start, $gene_id, $source, $source_id, $position_variant) =
        $line =~ /^([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)$/;
    if ($gene_id && $current_gene_id eq $gene_id) {
        # Same gene as the previous row: keep accumulating.
        push(@seen, $line);
        # Make sure that we don't count the same ID (from different sources) twice.
        unless (exists $source_ids{"$source_id$chr$strand$start"}) {
            $source_ids{"$source_id$chr$strand$start"} = 1;
            $location{"$chr$strand$start"}{"$source"}++;
            $source{"$source"}{"$chr$strand$start"}++;
        }
    } else {
        # Gene id changed (or is empty): analyse the completed group.
        # Check if we have a difference in chromosome and/or strand and/or start position.
        if (scalar(keys(%location)) > 1) {
            my $tentative;
            # If there's CCDS, that's the most likely to be correct...
            if (exists $source{"CCDS"} && scalar(keys(%{$source{"CCDS"}})) == 1) {
                #... as long as it doesn't conflict with Vega transcripts.
                if (exists $source{"Vega"} && scalar(keys(%{$source{"Vega"}})) == 1) {
                    if (join("", keys(%{$source{"CCDS"}})) eq join("", keys(%{$source{"Vega"}}))) {
                        $tentative = join("", keys(%{$source{"CCDS"}}));
                    }
                } else {
                    $tentative = join("", keys(%{$source{"CCDS"}}));
                }
            } elsif (exists $source{"Vega"} && scalar(keys(%{$source{"Vega"}})) == 1) {
                $tentative = join("", keys(%{$source{"Vega"}}));
            }
            # For all of the sources, do a straight count, and if one
            # combo of chromosome and strand and start position is
            # 'significantly' favoured, then ignore the others. Exactly what
            # is significant in this sense is debatable, but a difference of
            # 2:1 in favour of one combo seems to get it right a lot of the
            # time, and hardly ever wrong. Also, we have a minimum number of
            # transcripts acting as a threshold, before we even contemplate
            # what combo is favoured.
            my %location_total;
            foreach my $source (keys %source) {
                foreach my $location (keys %{$source{$source}}) {
                    $location_total{$location} += $source{$source}{$location};
                }
            }
            # Sort so that the first value is the largest, and consider it the most probable.
            my ($rank_one, $rank_one_count, $remainder_count);
            foreach my $location (
                sort {$location_total{$b} <=> $location_total{$a}}
                keys %location_total) {
                if (not defined $rank_one) {
                    $rank_one = $location;
                    $rank_one_count = $location_total{$location};
                } else {
                    $remainder_count += $location_total{$location};
                }
            }
            # NOTE(review): assumes every location key matches
            # "<chr><strand><start>" with a +/- strand character; if it did
            # not, $1-$3 would silently hold stale captures.
            $rank_one =~ /^(.+)([\-\+])(.+)$/;
            my ($rank_one_chr, $rank_one_strand, $rank_one_start) = ($1, $2, $3);
            # Plus and minus are special characters in regexes, so build the
            # row-match pattern once (it is loop-invariant), escaping the
            # strand character.
            my $rank_one_match = $rank_one_chr.'\t\\'.$rank_one_strand.'\t'.$rank_one_start;
            # If $tentative is defined, then we have at least one CCDS or Vega transcript
            # and we consider that to be the most likely answer - and hence we can
            # relax our definition of 'significant' from the previous comment.
            if (defined $tentative) {
                # It's possible for two chr/strand/start combos to have the same count value,
                # and since one is picked at random, the following condition might not always
                # make complete sense - but if two combos are present in equal numbers, we're
                # not gonna be able to do anything with the data anyway.
                if ($tentative eq $rank_one &&
                    $rank_one_count > $remainder_count &&
                    $rank_one_count >= 2)
                {
                    foreach my $row (@seen) {
                        push(@ignore, $row) unless $row =~ /^$rank_one_match/;
                    }
                # If the tentative combo doesn't match the most common, we really
                # want to be sure that we're doing the right thing...
                } elsif ($rank_one_count >= ($remainder_count*3)) {
                    foreach my $row (@seen) {
                        push(@ignore, $row) unless $row =~ /^$rank_one_match/;
                    }
                } else {
                    push(@undecided, @seen);
                }
            } else {
                # No CCDS/Vega hint: require a clear 2:1 majority (and at
                # least two transcripts) before ignoring any rows.
                if ($rank_one_count >= ($remainder_count*2) &&
                    $rank_one_count >= 2)
                {
                    foreach my $row (@seen) {
                        push(@ignore, $row) unless $row =~ /^$rank_one_match/;
                    }
                } else {
                    push(@undecided, @seen);
                }
            }
        }
        # Reset all the variables for the next gene id.
        undef @seen;
        undef %source_ids;
        undef %location;
        undef %source;
        if ($gene_id) {
            push(@seen, $line);
            $source_ids{"$source_id$chr$strand$start"} = 1;
            $location{"$chr$strand$start"}{"$source"}++;
            $source{"$source"}{"$chr$strand$start"}++;
            $current_gene_id = $gene_id;
        }
    }
}
# Persist the two verdict lists (rows already carry their trailing newlines),
# close all summary-related handles, then create and bulk-load the
# corresponding database tables via mysqlimport.
print IGNORE $_ for @ignore;
print UNDECIDED $_ for @undecided;
close(SUMMARY);
close(IGNORE);
close(UNDECIDED);
for my $load (['ignore', $ignore_file], ['undecided', $undecided_file]) {
    my ($list_name, $list_file) = @$load;
    print "Loading ${list_name}_list data\n" if $verbose;
    # Create the table...
    $error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_$list_name.sql");
    die "Error executing $script_dir/load_$list_name.sql" if $error;
    # ...then bulk-load the file into it (table name is taken from the filename).
    $error = system("mysqlimport -u $user -p$pass -h $host --local $database $list_file");
    die "Error importing $list_file" if $error;
}
print "Gene Models database '$database' is complete\n";
# Create Meta Data - End
################################################################################