#!/usr/bin/perl -w

=head1 NAME

getData.pl - retrieves databases from the Internet

=cut

# This script shall help maintaining sets of frequently changing databases
# of various sorts. It is motivated by demands in bioinformatics and
# astronomy.

# Copyright (c) 2008 Steffen Moeller
# Copyright (c) 2008 Charles Plessy
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Or else go to GNU Web pages http://www.gnu.org and follow the white rabbit.
#

use strict;
use warnings;
use Getopt::Long;
use Cwd;

# Destination root for all mirrors. Precedence (lowest to highest):
# built-in default, "mirrordir" entry of the config file, --mirrordir option.
my $mirrordir;
my $configfile = "/etc/getData.conf";

# The config file is optional. Recognised lines have the form
#     key = "value"
# Lines starting with '#' are comments; anything else is silently ignored.
if ( -r $configfile ) {
    open( my $config_fh, '<', $configfile )
        or die "Could not open config data at '$configfile'.\n";
    while ( my $line = <$config_fh> ) {
        next if $line =~ /^\s*#/;
        next unless $line =~ /\s*(\S.*\S)\s*=\s*"([^"]*)"\s*/;

        # Copy the captures right away; a later match would clobber them.
        my ( $key, $value ) = ( $1, $2 );

        #print STDERR "Read: $key\n";
        if ( $key eq "mirrordir" ) {
            $mirrordir = $value;
            print STDERR "Set mirrordir to '$mirrordir'.\n";
        }
    }
    close $config_fh;
}
$mirrordir = "/var/lib/mirrored" unless defined($mirrordir);

=head1 SYNOPSIS

 getData.pl [ --mirrordir <dir> ] [ --verbose ] <database> [ <database> ... ]
 getData.pl --list
 getData.pl --help | --man

=head1 DESCRIPTION

Bioinformatics has the intrinsic problem to bring the biological data to
the end user. Astronomers have the equivalent problem and particle
physicists, well, they have come up with (first) the web and (second) the
computational grids to access their problems. Debian helps with the
programs but will not provide such huge datasets that are even frequently
updated. Not even in volatile.debian.org.

Most bioinformatics researchers will not need too many of such databases.
And even more so will gladly continue in using public services remotely.
For those who need a set of databases on a regular basis, this script
shall be a start to automate the burden to download the data and update
indices and the like.

The world has seen such magic before with the Lion Biosciences Prisma tool
(http://bib.oxfordjournals.org/cgi/reprint/3/4/389.pdf) but how about
something simpler (as a start) that at least gets close to what we desire
and is Free.

The aim must be to address the needs of all (most) communities, not only
of the bioinformatics world. The seed was hence made with databases from
astronomy.

=head1 OPTIONS

=over 4

=item --help

this help

=item --man

Present a more detailed description in form of a man page.

=item --verbose

Say one or two words more than required.

=item --mirrordir I<dir>

Specifies destination directory. The data will be mirrored to
$mirrordir/$dbname/

=item --list

Lists all databases that may be requested to be installed.

=item I<database> ...

Only those databases that are explicitly requested to be downloaded will
be downloaded. Such databases may require considerable bandwidth, so
please make sure you know you are doing the right thing.

=back

=head1 EXAMPLES

 ./getData.pl --list | head -n 4

=head1 TODO

We now need a mechanism with which packages can specify hooks that shall
be called upon an update of a database. But we cannot assume that every
indexing that can be performed because of the installation of some
package is also desired by the user. How to configure this properly is
left to be decided.

=head1 SEE ALSO

http://debian-med.alioth.debian.org, http://wiki.debian.org/DebianMed,
/etc/getData.conf

=head1 AUTHORS

Steffen Moeller, Charles Plessy ... and ?, from the Debian-Med packaging
initiative.

=cut

# Catalogue of known databases, keyed by the name given on the command line.
# Each entry is a hash with:
#   name            human readable title (shown by --list)
#   source          shell command that downloads the data; it is run from
#                   inside $mirrordir/<key>/
#   "post-download" optional shell command run in the same directory after a
#                   successful download
my %toBeMirrored = (

    # A S T R O N O M Y
#   "tycho2" => {
#       name   => "Tycho2 Star Coordinates",
#       source => "wget --mirror ftp://cdsarc.u-strasbg.fr/pub/cats/I/259/tyc2.dat*",
#       "post-download" => "[ -r tyc2.dat -a -z \"`find . -cnewer tyc2.dat "
#                        . "-a ! -name .listing`\" ] "
#                        . "&& echo \"No mirrored file newer than previously created index.\" "
#                        . "|| zcat cdsarc.u-strasbg.fr/pub/cats/I/259/tyc2.dat* > tycho2.dat"
#   },
#
#   "astorb" => {
#       name   => "asteroid orbits",
#       source => "wget --mirror ftp://ftp.lowell.edu/pub/elgb/astorb.dat.gz",
#       "post-download" => "[ -r astorb.dat "
#                        . "-a ftp.lowell.edu/pub/elgb/astorb.dat.gz -nt astorb.dat ] "
#                        . "|| zcat ftp.lowell.edu/pub/elgb/astorb.dat.gz > astorb.dat"
#   },
#   "DE405" => {
#       name   => "DE405",
#       source => "wget --mirror ftp://ssd.jpl.nasa.gov/pub/eph/export/unix/unxp2[01]*.405",
##      "debian-depends" => "jpl-eph-tools",
#       "post-download" => "ln -s ssd.jpl.nasa.gov/pub/eph/export/unix/unxp*.405 ."
#   },
#

    # B I O I N F O R M A T I C S
    "gene.ontology.rdf" => {
        name   => "GeneOntology - RDF-formatted, terms only.",
        source => "wget --mirror ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest/go_*-termdb.rdf-xml.gz"
    },
    "intact.psimitab" => {
        name   => "IntACT Protein Interaction Database",
        source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip",
        "post-download" => "unzip `find ftp.ebi.ac.uk -name '*.zip'`"
    },
    "kegg.brite" => {
        name   => "Kegg Brite Database",
        source => "wget --mirror ftp://ftp.genome.jp/pub/kegg/release/current/brite.tar.gz"
    },
    "kegg.pathway" => {
        name   => "Kegg Pathway Database",
        source => "wget --mirror ftp://ftp.genome.jp/pub/kegg/release/current/pathway.tar.gz"
    },
    "kegg.pathway.hsa" => {
        name   => "Kegg Pathway Database (HSA)",
        source => "wget --mirror ftp://ftp.genome.jp/pub/kegg/pathway/organisms/hsa"
    },
    "kegg.pathway.hsa.xml" => {
        name   => "XML representation of pathways in KEGG",
        source => "wget --mirror ftp://ftp.genome.jp/pub/kegg/xml/README ftp://ftp.genome.jp/pub/kegg/xml//KGML.dtd ftp://ftp.genome.jp/pub/kegg/xml/ko ftp://ftp.genome.jp/pub/kegg/xml/map ftp://ftp.genome.jp/pub/kegg/xml/organisms/hsa"
    },
    "reactome.tab.human" => {
        name   => "Reactome Pathway Database - Human interactions as tab delimited",
        source => "wget --mirror http://www.reactome.org/download/interactions.README.txt http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"
    },
#   "swiss.dat" => {
#       name   => "UniProt - SwissProt in EMBL format",
#       source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/swissprot/release_compressed/uniprot_sprot.dat.gz ftp://ftp.ebi.ac.uk/pub/databases/swissprot/updates_compressed/*.dat.gz"
#   },
#
    "swiss.fasta" => {
        name   => "UniProt - SWISS-PROT in FASTA format",
        source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
    },
    "trembl.fasta" => {
        name   => "UniProt - TrEMBL in FASTA format",
        source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
    },
#   "jaspar.sites" => {
#       name   => "Sites subfolder of JASPAR database",
#       source => "wget --mirror http://jaspar.genereg.net/html/DOWNLOAD/SITES/*/*.sites"
#   },
#
#   "jaspar.MatrixDir" => {
#       name   => "MatrixDir subfolder of JASPAR database",
#       source => "wget --mirror http://jaspar.genereg.net/html/DOWNLOAD/MatrixDir/*/*.sites"
#   }
);

# Command line flags; all default to "off".
my ( $list, $help, $man, $verbose ) = ( 0, 0, 0, 0 );
my %options = (
    "mirrordir:s" => \$mirrordir,
    "list"        => \$list,
    "help"        => \$help,
    "man"         => \$man,
    "verbose"     => \$verbose,
);

# Prints usage information taken from this file's POD and exits.
#   --man   : full manual page, exit status 0
#   --help  : synopsis plus options, exit status 0
#   neither : (invalid command line) synopsis only, exit status 2
# Pod::Usage is loaded on demand so that normal runs do not pay for it.
# Never returns: pod2usage() terminates the program.
sub myhelp {
    require Pod::Usage;

    # $man is initialised to 0, so it must be tested for truth;
    # a defined() test would always succeed.
    Pod::Usage::pod2usage( -verbose => 2, -exitval => 0 ) if $man;
    Pod::Usage::pod2usage( -verbose => 1, -exitval => 0 ) if $help;
    Pod::Usage::pod2usage();
}

my $options_ok = GetOptions(%options);
myhelp() if !$options_ok or $help or $man;

if ($list) {
    foreach my $db ( sort keys %toBeMirrored ) {
        printf "%-10s", $db;

        # more information like the expected size should be printed here.
        print "\t" . $toBeMirrored{$db}{"name"};
        print "\n";
    }
    exit 0;
}

die "Cannot write to root destination directory at '$mirrordir'.\n"
    unless ( -w "$mirrordir" );

#print join(", ",@ARGV)."\n";

# Remember where we started: $mirrordir may be a relative path, so every
# database has to be resolved from the same working directory.
my $start_dir = getcwd();

DATABASE:
foreach my $db (@ARGV) {
    unless ( exists( $toBeMirrored{$db} ) ) {
        print STDERR "Unknown database: '$db'\n";
        next DATABASE;
    }
    my $entry = $toBeMirrored{$db};

    print STDERR "\"$db\" -> \"$mirrordir\"\n";
    print "Mirroring " . $entry->{"name"} . " ($db)\n";

    # Validate the entry before creating or entering any directory, so that
    # skipping it cannot leave the process in the wrong working directory.
    my $download_cmd = $entry->{"source"};
    if ( !defined($download_cmd) or "" eq $download_cmd ) {
        print STDERR "$db: download instructions not specified - skipping.\n";
        next DATABASE;
    }

    my $target_dir = "$mirrordir/$db";
    unless ( -d $target_dir ) {
        print " creating directory $target_dir\n";
        mkdir($target_dir)
            or die "Could not create directory \"$target_dir\"\n";
    }
    chdir($target_dir)
        or die "Could not change directory to \"$target_dir\"\n";

    # system() returns non-zero on failure, hence "and die".
    print STDERR "$download_cmd\n";
    system($download_cmd) and die "Experienced problem.";

    my $post_cmd = $entry->{"post-download"};
    if ( defined($post_cmd) and "" ne $post_cmd ) {
        print STDERR "$post_cmd\n";
        system($post_cmd) and die "Experienced problem.";
    }
    else {
        print STDERR "$db: No post-download command defined.\n" if $verbose;
    }

    chdir($start_dir) or die "Could not change back to dir '$start_dir'.\n";
}