#!/usr/bin/perl
# generate_gene_models.pl
#
# Download gene model data for human, mouse and rat from several public
# sources (RefSeq, CCDS, Ensembl, UCSC, Vega), convert each source to GFF via
# a per-source helper script, bulk-load the GFF into a MySQL database named
# "gene_models_<version>", then build summary / ignore / undecided meta data
# recording how consistent the different sources are for each gene.
#
# NOTE(review): this file was recovered from a copy in which angle-bracketed
# text had been stripped (the <DB_INFO>/<SUMMARY> filehandle reads and the
# embedded BioMart XML were missing).  Those pieces are reconstructed below;
# the BioMart attribute names in particular should be confirmed against the
# mart service before relying on the info files.
use warnings;
use strict;
use Getopt::Long;
use DBI;

################################################################################
# Arguments - Start
my ($help, $dir, $version, $no_download, $gff_exists, $gff_loaded, $verbose);
GetOptions(
    'help'        => \$help,
    'dir=s'       => \$dir,
    'version=s'   => \$version,
    'no_download' => \$no_download,
    'gff_exists'  => \$gff_exists,
    'gff_loaded'  => \$gff_loaded,
    'verbose'     => \$verbose,
);

# Print usage information and exit.
sub usage {
    print qq~
Get latest data from the web, save it in --dir, then process into gff files
which are loaded into a database, the name of which has --version as a suffix.

Example:
./generate_gene_models.pl --dir \~/scratch/gene_models/3_4 --version 3_4_0 --verbose

Last updated: 22-Apr-08
Author: James Allen (james.allen\@cimr.cam.ac.uk)

Please provide
--dir
--version
[--no_download] (default 0)
[--gff_exists] (default 0)
[--gff_loaded] (default 0)
[--verbose] (default 0)
[--help]
~;
    print "\n";
    exit;
}

usage if defined $help;
usage unless $dir && $version;
die "Cannot find directory $dir" unless -e $dir;

# Boolean flags default to off.
$no_download ||= 0;
$gff_exists  ||= 0;
$gff_loaded  ||= 0;
$verbose     ||= 0;

# Directory containing this script, its helper scripts and SQL files.
# NOTE(review): assumes $0 contains at least one '/'; if the script is
# invoked by bare name via PATH, $script_dir is left as the script name.
my $script_dir = $0;
$script_dir =~ s!/[^/]+$!!;

my ($error, %error_sources);

# Generate XML that's used by BioMart to retrieve ID mappings.  DATASET and
# NATIVE_ID are placeholders, substituted per species/source below.
# NOTE(review): the original query XML was lost (angle-bracket stripping);
# this is a standard martservice query asking for the native transcript ID
# plus the Entrez Gene ID - confirm the attribute names against the mart.
my $template_xml =
    '<?xml version="1.0" encoding="UTF-8"?>'.
    '<!DOCTYPE Query>'.
    '<Query virtualSchemaName="default" formatter="TSV" header="0" '.
    'uniqueRows="0" count="" datasetConfigVersion="0.6">'.
    '<Dataset name="DATASET" interface="default">'.
    '<Attribute name="NATIVE_ID"/>'.
    '<Attribute name="entrezgene"/>'.
    '</Dataset></Query>';

my $human_ensembl_xml = $template_xml;
$human_ensembl_xml =~ s/DATASET/hsapiens_gene_ensembl/;
$human_ensembl_xml =~ s/NATIVE_ID/ensembl_transcript_id/;

my $human_vega_xml = $template_xml;
$human_vega_xml =~ s/DATASET/hsapiens_gene_vega/;
$human_vega_xml =~ s/NATIVE_ID/vega_transcript_id/;

my $mouse_ensembl_xml = $template_xml;
$mouse_ensembl_xml =~ s/DATASET/mmusculus_gene_ensembl/;
$mouse_ensembl_xml =~ s/NATIVE_ID/ensembl_transcript_id/;

my $mouse_vega_xml = $template_xml;
$mouse_vega_xml =~ s/DATASET/mmusculus_gene_vega/;
$mouse_vega_xml =~ s/NATIVE_ID/vega_transcript_id/;

my $rat_ensembl_xml = $template_xml;
$rat_ensembl_xml =~ s/DATASET/rnorvegicus_gene_ensembl/;
$rat_ensembl_xml =~ s/NATIVE_ID/ensembl_transcript_id/;

# Build a data structure that will determine what is downloaded, from where.
# Note that it is very important that RefSeq is done first; that data is used
# to resolve ambiguities for other data sources.
my $gene_models = {
    human => {
        tax_id  => 9606,
        sources => [
            {name          => "RefSeq",
             data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview",
             data_file     => "seq_gene.md.gz",
             info_base_url => "",
             info_file     => "",
             build         => "NCBI36.3"},
            {name          => "RefSeq",
             data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview",
             data_file     => "seq_gene.md.gz",
             info_base_url => "",
             info_file     => "",
             build         => "NCBI37.1"},
            {name          => "CCDS",
             data_base_url => "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/archive/Hs36.3/",
             data_file     => "CCDS.20090327.txt",
             info_base_url => "",
             info_file     => "",
             build         => "NCBI36.3"},
            {name          => "CCDS",
             data_base_url => "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human",
             data_file     => "CCDS.current.txt",
             info_base_url => "",
             info_file     => "",
             build         => "NCBI37.1"},
            {name          => "Ensembl",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database",
             data_file     => "ensGene.txt.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_ensembl_xml",
             info_file     => "EnsemblInfo.txt",
             build         => "NCBI36.3"},
            {name          => "Ensembl",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database",
             data_file     => "ensGene.txt.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_ensembl_xml",
             info_file     => "EnsemblInfo.txt",
             build         => "NCBI37.1"},
            {name          => "UCSC",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database",
             data_file     => "knownGene.txt.gz",
             info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database",
             info_file     => "knownToLocusLink.txt.gz",
             build         => "NCBI36.3"},
            {name          => "UCSC",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database",
             data_file     => "knownGene.txt.gz",
             info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database",
             info_file     => "knownToLocusLink.txt.gz",
             build         => "NCBI37.1"},
            {name          => "Vega",
             data_base_url => "ftp://ftp.ensembl.org/pub/release-54/mysql/homo_sapiens_vega_54_36p",
             data_file     => "*.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_vega_xml",
             info_file     => "VegaInfo.txt",
             build         => "NCBI36.3"},
            {name          => "Vega",
             data_base_url => "ftp://ftp.ensembl.org/pub/release-60/mysql/homo_sapiens_vega_60_37e",
             data_file     => "*.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$human_vega_xml",
             info_file     => "VegaInfo.txt",
             build         => "NCBI37.1"},
        ],
    },
    mouse => {
        tax_id  => 10090,
        sources => [
            {name          => "RefSeq",
             data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview",
             data_file     => "seq_gene.md.gz",
             info_base_url => "",
             info_file     => "",
             build         => "NCBIM37"},
            {name          => "CCDS",
             data_base_url => "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_mouse",
             data_file     => "CCDS.current.txt",
             info_base_url => "",
             info_file     => "",
             build         => "NCBIM37"},
            {name          => "Ensembl",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/database",
             data_file     => "ensGene.txt.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$mouse_ensembl_xml",
             info_file     => "EnsemblInfo.txt",
             build         => "NCBIM37"},
            {name          => "UCSC",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/database",
             data_file     => "knownGene.txt.gz",
             info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/database",
             info_file     => "knownToLocusLink.txt.gz",
             build         => "NCBIM37"},
            {name          => "Vega",
             data_base_url => "ftp://ftp.ensembl.org/pub/release-50/mysql/mus_musculus_vega_50_37c",
             data_file     => "*.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$mouse_vega_xml",
             info_file     => "VegaInfo.txt",
             build         => "NCBIM37"},
        ],
    },
    rat => {
        tax_id  => 10116,
        sources => [
            {name          => "RefSeq",
             data_base_url => "ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/ARCHIVE/BUILD.4.1/mapview",
             data_file     => "seq_gene.md.gz",
             info_base_url => "",
             info_file     => "",
             build         => "RGSC3.4"},
            {name          => "Ensembl",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/rn4/database",
             data_file     => "ensGene.txt.gz",
             info_base_url => "http://www.biomart.org/biomart/martservice?query=$rat_ensembl_xml",
             info_file     => "EnsemblInfo.txt",
             build         => "RGSC3.4"},
            {name          => "UCSC",
             data_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/rn4/database",
             data_file     => "knownGene.txt.gz",
             info_base_url => "ftp://hgdownload.cse.ucsc.edu/goldenPath/rn4/database",
             info_file     => "knownToLocusLink.txt.gz",
             build         => "RGSC3.4"},
        ],
    },
};
# Arguments - End
################################################################################

################################################################################
# Setup Directory Structure - Start
# One "<species>/rawdata/<source>_<build>" directory per data source.
print "Setup Directory Structure - Start\n" if $verbose;
foreach my $species (sort keys %{$gene_models}) {
    mkdir "$dir/$species"         unless -e "$dir/$species";
    mkdir "$dir/$species/rawdata" unless -e "$dir/$species/rawdata";
    foreach my $source (@{$$gene_models{$species}{"sources"}}) {
        my $source_name = $$source{"name"};
        my $build       = $$source{"build"};
        mkdir "$dir/$species/rawdata/$source_name\_$build"
            unless -e "$dir/$species/rawdata/$source_name\_$build";
    }
}
print "Setup Directory Structure - End\n" if $verbose;
# Setup Directory Structure - End
################################################################################

################################################################################
# Download Data - Start
unless ($no_download) {
    print "Download Data - Start\n" if $verbose;
    foreach my $species (sort keys %{$gene_models}) {
        foreach my $source (@{$$gene_models{$species}{"sources"}}) {
            my $source_name   = $$source{"name"};
            my $build         = $$source{"build"};
            my $dest_dir      = "$dir/$species/rawdata/$source_name\_$build";
            my $data_base_url = $$source{"data_base_url"};
            my $info_base_url = $$source{"info_base_url"};
            my $data_file     = $$source{"data_file"};
            my $info_file     = $$source{"info_file"};
            print "  $species $build $source_name - Start\n" if $verbose;
            if (-e "$dest_dir/$data_file") {
                print "    Using existing data\n" if $verbose;
            } else {
                print "    Retrieving new data\n" if $verbose;
                if ($data_base_url) {
                    # Run wget with time-stamping (-N), and quietly (-q).
                    # Try to use wildcard first; if that doesn't work, get
                    # directory contents.
                    $error = system("wget -q -P $dest_dir '$data_base_url/$data_file'");
                    if ($error) {
                        $error = system("wget -Nqr -nd -P $dest_dir '$data_base_url'");
                        if ($error) {
                            warn "Error retrieving $source_name data file, url: $data_base_url/$data_file";
                            $error_sources{$species}{$source_name} = 1;
                        }
                    }
                    unless ($error) {
                        if ($data_file =~ /\.gz$/) {
                            # Gunzip in place, but keep the original .gz (via
                            # /tmp) so the '-e' check above succeeds next run.
                            $error  = system("cp $dest_dir/$data_file /tmp/.");
                            $error += system("gunzip -f $dest_dir/$data_file");
                            $error += system("mv /tmp/$data_file $dest_dir/.");
                            if ($error) {
                                warn "Error unzipping $source_name data file, url: $dest_dir/$data_file";
                                $error_sources{$species}{$source_name} = 1;
                            }
                        }
                    }
                }
                if ($info_base_url) {
                    if ($info_base_url =~ /biomart/) {
                        # Want to save with a meaningful name, as the url
                        # includes the full xml query, which doesn't make for
                        # a good filename.
                        $error = system("wget -q -O $dest_dir/$info_file '$info_base_url'");
                        if ($error) {
                            warn "Error retrieving $source_name info file, url: $info_base_url";
                            $error_sources{$species}{$source_name} = 1;
                        }
                    } else {
                        $error = system("wget -q -P $dest_dir '$info_base_url/$info_file'");
                        if ($error) {
                            $error = system("wget -Nqr -nd -P $dest_dir '$info_base_url'");
                            if ($error) {
                                warn "Error retrieving $source_name info file, url: $info_base_url/$info_file";
                                $error_sources{$species}{$source_name} = 1;
                            }
                        } else {
                            if ($info_file =~ /\.gz$/) {
                                $error  = system("cp $dest_dir/$info_file /tmp/.");
                                $error += system("gunzip -f $dest_dir/$info_file");
                                $error += system("mv /tmp/$info_file $dest_dir/.");
                                if ($error) {
                                    warn "Error unzipping $source_name info file, url: $dest_dir/$info_file";
                                    $error_sources{$species}{$source_name} = 1;
                                }
                            }
                        }
                    }
                }
            }
            print "  $species $build $source_name - End\n" if $verbose;
        }
    }
    print "Download Data - End\n" if $verbose;
}
# Download Data - End
################################################################################

################################################################################
# Convert Source Data into GFF - Start
unless ($gff_exists) {
    print "Convert Source Data into GFF - Start\n" if $verbose;
    foreach my $species (sort keys %{$gene_models}) {
        # Manually curated data sits in the top-level dir, and has a filename
        # that starts with the species name, eg 'human_T1DBase_Curated.gff'.
        # We copy this into the species sub-directory(s) so that it gets
        # processed with everything else.
        opendir(my $top_dh, $dir) || die "Cannot open $dir";
        foreach my $gff_file (sort grep { /^$species\_(.*\.gff)$/ } readdir($top_dh)) {
            print "  Copying $dir/$gff_file\n" if $verbose;
            my $gff_file_copy;
            ($gff_file_copy = $gff_file) =~ s/^$species\_//;
            $error = system("cp $dir/$gff_file $dir/$species/$gff_file_copy");
            die "Copying failed" if $error;
        }
        closedir($top_dh);
        foreach my $source (@{$$gene_models{$species}{"sources"}}) {
            my $source_name = $$source{"name"};
            my $build       = $$source{"build"};
            if (exists $error_sources{$species}{$source_name}) {
                warn "Skipping conversion to GFF for $species $source_name due to previous errors";
                next;
            }
            print "  $species $build $source_name - Start\n" if $verbose;
            next if -e "$dir/$species/$source_name\_$build.gff";
            # The raw files were gunzipped during download, so strip the
            # suffix before passing the names to the conversion script.
            $$source{"data_file"} =~ s/\.gz$//;
            $$source{"info_file"} =~ s/\.gz$//;
            my $tax_id = $$gene_models{$species}{"tax_id"};
            # Conversion is delegated to "<source>.pl" alongside this script.
            my $script = "$script_dir/".lc($source_name).".pl";
            my $params = "--gff_dir='$dir/$species' --tax_id $tax_id ";
            foreach my $key (keys %{$source}) {
                $params .= "--source $key='".$$source{$key}."' ";
            }
            if (-e $script) {
                my $cmd = "perl $script $params";
                print "Executing $cmd\n" if $verbose;
                $error = system($cmd);
                if ($error) {
                    warn "Error executing $script";
                    $error_sources{$species}{$source_name} = 1;
                }
            } else {
                warn "Script $script does not exist";
                $error_sources{$species}{$source_name} = 1;
            }
            print "  $species $build $source_name - End\n" if $verbose;
        }
    }
    print "Convert Source Data into GFF - End\n" if $verbose;
}
# Convert Source Data into GFF - End
################################################################################

################################################################################
# Create Gene Models Database and Load GFF - Start
my $database = "gene_models_$version";

# database_login.txt: a header line, then one tab-separated line of
# user<TAB>password<TAB>host (host may be empty for a local server).
open(my $db_info_fh, '<', "$script_dir/database_login.txt")
    || die "Cannot open file $script_dir/database_login.txt";
my $header = <$db_info_fh>;    # skip the column-header line
my ($user, $pass, $host) = <$db_info_fh> =~ /^([^\t]+)\t([^\t]*)\t([^\t\n]*)$/;
close($db_info_fh);

unless ($gff_loaded) {
    # (Re)create the target database from scratch.
    my $dbh;
    if ($host) {
        $dbh = DBI->connect("DBI:mysql:mysql;host=$host", $user, $pass);
    } else {
        $dbh = DBI->connect("DBI:mysql:mysql", $user, $pass);
    }
    $dbh->do("DROP DATABASE IF EXISTS $database;") || die $dbh->errstr;
    $dbh->do("CREATE DATABASE $database;")         || die $dbh->errstr;
    $dbh->disconnect;

    my $dbi = "dbi:mysql:database=$database";
    $dbi .= ";host=$host" if $host;

    # Historical note: a local copy of bp_fast_load_gff.pl used to be needed,
    # since the version in the standard BioPerl checkout had a bug which
    # caused data to be lost; bp_bulk_load_gff.pl is used instead now.
    my $cmd = "bp_bulk_load_gff.pl -u $user -p $pass -d '$dbi' --local ";
    foreach my $species (sort keys %{$gene_models}) {
        opendir(my $gff_dh, "$dir/$species") || die "Cannot open $dir/$species";
        foreach my $gff_file (sort grep { /\.gff$/ } readdir($gff_dh)) {
            print "Loading $dir/$species/$gff_file\n" if $verbose;
            $cmd .= "$dir/$species/$gff_file ";
        }
        closedir($gff_dh);
    }
    print "Executing $cmd\n" if $verbose;
    $error = system($cmd);
    die "Loading failed" if $error;
}
# Create Gene Models Database and Load GFF - End
################################################################################

################################################################################
# Create Meta Data - Start
print "Loading source_builds data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_source_builds.sql");
die "Error executing $script_dir/load_source_builds.sql" if $error;

print "Loading xy_gene data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_xy_gene.sql");
die "Error executing $script_dir/load_xy_gene.sql" if $error;

# This is the slightly messy bit, where we decide how much consistency there
# is across the different sources, and make a record of models that look ropey.
my $summary_file   = "$dir/summary.txt";
my $ignore_file    = "$dir/ignore_list.txt";
my $undecided_file = "$dir/undecided_list.txt";

print "Creating summary data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/create_summary.sql > $summary_file");
die "Error executing $script_dir/create_summary.sql" if $error;

print "Analysing summary data\n" if $verbose;
open(my $summary_fh,   '<', $summary_file)   || die "Cannot open $summary_file";
open(my $ignore_fh,    '>', $ignore_file)    || die "Cannot open $ignore_file";
open(my $undecided_fh, '>', $undecided_file) || die "Cannot open $undecided_file";

my (@seen, %source_ids, %location, %source, @ignore, @undecided);

# Return the rows from @rows whose leading chr<TAB>strand<TAB>start does NOT
# match the given combination; these are the rows to ignore.
sub non_matching_rows {
    my ($chr, $strand, $start, @rows) = @_;
    # Plus and minus are special characters in regexes, so sort out the
    # pattern in a variable first (the strand character is escaped).
    my $match = $chr.'\t\\'.$strand.'\t'.$start;
    return grep { $_ !~ /^$match/ } @rows;
}

# Examine one gene's accumulated rows.  When the sources disagree on
# chromosome/strand/start, either drop the minority rows onto the ignore list
# (when one combination is clearly favoured, with CCDS/Vega given extra
# weight) or mark the whole group as undecided.
sub analyse_group {
    my ($seen, $location, $source, $ignore, $undecided) = @_;

    # Only act when there is a difference in chromosome and/or strand and/or
    # start position.
    return unless scalar(keys(%{$location})) > 1;

    # If there's CCDS, that's the most likely to be correct...
    my $tentative;
    if (exists $$source{"CCDS"} && scalar(keys(%{$$source{"CCDS"}})) == 1) {
        #... as long as it doesn't conflict with Vega transcripts.
        if (exists $$source{"Vega"} && scalar(keys(%{$$source{"Vega"}})) == 1) {
            if (join("", keys(%{$$source{"CCDS"}})) eq join("", keys(%{$$source{"Vega"}}))) {
                $tentative = join("", keys(%{$$source{"CCDS"}}));
            }
        } else {
            $tentative = join("", keys(%{$$source{"CCDS"}}));
        }
    } elsif (exists $$source{"Vega"} && scalar(keys(%{$$source{"Vega"}})) == 1) {
        $tentative = join("", keys(%{$$source{"Vega"}}));
    }

    # For all of the sources, do a straight count, and if one combo of
    # chromosome/strand/start is 'significantly' favoured, ignore the others.
    # What counts as significant is debatable, but 2:1 in favour of one combo
    # gets it right a lot of the time, and hardly ever wrong.  A minimum
    # transcript count acts as a threshold before we contemplate any combo.
    my %location_total;
    foreach my $src (keys %{$source}) {
        foreach my $loc (keys %{$$source{$src}}) {
            $location_total{$loc} += $$source{$src}{$loc};
        }
    }

    # Sort so that the first value is the largest, and consider it the most
    # probable.
    my ($rank_one, $rank_one_count, $remainder_count);
    foreach my $loc (sort { $location_total{$b} <=> $location_total{$a} }
                     keys %location_total) {
        if (not defined $rank_one) {
            $rank_one       = $loc;
            $rank_one_count = $location_total{$loc};
        } else {
            $remainder_count += $location_total{$loc};
        }
    }
    # NOTE(review): greedy match relies on the strand sign being the last
    # '+'/'-' in the key, which holds for "<chr><strand><start>" keys.
    $rank_one =~ /^(.+)([\-\+])(.+)$/;
    my ($rank_one_chr, $rank_one_strand, $rank_one_start) = ($1, $2, $3);

    if (defined $tentative) {
        # With CCDS/Vega backing we can relax the definition of
        # 'significant'.  Two combos may share the top count, in which case
        # one is picked arbitrarily - but equal counts are undecidable anyway.
        if ($tentative eq $rank_one
            && $rank_one_count > $remainder_count
            && $rank_one_count >= 2) {
            push(@{$ignore},
                 non_matching_rows($rank_one_chr, $rank_one_strand,
                                   $rank_one_start, @{$seen}));
        # If the tentative combo doesn't match the most common, we really
        # want to be sure that we're doing the right thing...
        } elsif ($rank_one_count >= ($remainder_count * 3)) {
            push(@{$ignore},
                 non_matching_rows($rank_one_chr, $rank_one_strand,
                                   $rank_one_start, @{$seen}));
        } else {
            push(@{$undecided}, @{$seen});
        }
    } else {
        if ($rank_one_count >= ($remainder_count * 2) && $rank_one_count >= 2) {
            push(@{$ignore},
                 non_matching_rows($rank_one_chr, $rank_one_strand,
                                   $rank_one_start, @{$seen}));
        } else {
            push(@{$undecided}, @{$seen});
        }
    }
    return;
}

# Each summary row is chr, strand, start, gene_id, source, source_id,
# position_variant, tab-separated.  The start value is the minimum value for
# the transcript, rounded up to the nearest million - there might be cases
# where two transcripts have distinct positions, millions of bases apart, on
# the same strand of the same chromosome, so we need to check for them.
my $row_regex = qr/^([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)$/;

# Initialise variables with the first row of data (guarding against an empty
# summary file, which the original code would have warned/failed on).
my $first_line = <$summary_fh>;
if (defined $first_line) {
    my ($chr, $strand, $start, $current_gene_id, $src, $source_id,
        $position_variant) = $first_line =~ $row_regex;
    push(@seen, $first_line);
    $source_ids{"$source_id$chr$strand$start"} = 1;
    # We track two complementary hashes of hashes to save messing around later.
    $location{"$chr$strand$start"}{$src}++;
    $source{$src}{"$chr$strand$start"}++;

    while (my $line = <$summary_fh>) {
        my ($chr, $strand, $start, $gene_id, $src, $source_id,
            $position_variant) = $line =~ $row_regex;
        if ($gene_id && $current_gene_id eq $gene_id) {
            push(@seen, $line);
            # Make sure that we don't count the same ID (from different
            # sources) twice.
            unless (exists $source_ids{"$source_id$chr$strand$start"}) {
                $source_ids{"$source_id$chr$strand$start"} = 1;
                $location{"$chr$strand$start"}{$src}++;
                $source{$src}{"$chr$strand$start"}++;
            }
        } else {
            analyse_group(\@seen, \%location, \%source, \@ignore, \@undecided);
            # Reset all the variables for the next gene id.
            undef @seen;
            undef %source_ids;
            undef %location;
            undef %source;
            if ($gene_id) {
                push(@seen, $line);
                $source_ids{"$source_id$chr$strand$start"} = 1;
                $location{"$chr$strand$start"}{$src}++;
                $source{$src}{"$chr$strand$start"}++;
                $current_gene_id = $gene_id;
            }
        }
    }
    # The original only analysed a group when the gene id changed, which
    # silently dropped the final gene in the file; analyse it as well.
    analyse_group(\@seen, \%location, \%source, \@ignore, \@undecided);
}

foreach my $row (@ignore)    { print {$ignore_fh} "$row"; }
foreach my $row (@undecided) { print {$undecided_fh} "$row"; }
close($ignore_fh);
close($undecided_fh);
close($summary_fh);

print "Loading ignore_list data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_ignore.sql");
die "Error executing $script_dir/load_ignore.sql" if $error;
$error = system("mysqlimport -u $user -p$pass -h $host --local $database $ignore_file");
die "Error importing $ignore_file" if $error;

print "Loading undecided_list data\n" if $verbose;
$error = system("mysql -u $user -p$pass -h $host $database < $script_dir/load_undecided.sql");
die "Error executing $script_dir/load_undecided.sql" if $error;
$error = system("mysqlimport -u $user -p$pass -h $host --local $database $undecided_file");
die "Error importing $undecided_file" if $error;

print "Gene Models database '$database' is complete\n";
# Create Meta Data - End
################################################################################