| 1 |
#!/usr/bin/perl -w
|
| 2 |
|
| 3 |
use strict;
|
| 4 |
|
| 5 |
=head1 NAME
|
| 6 |
|
| 7 |
getData - retrieves databases from the Internet
|
| 8 |
|
| 9 |
=cut
|
| 10 |
|
| 11 |
# This script shall help maintaining sets of frequently changing databases
|
| 12 |
# of various sorts. It is motivated by demands in bioinformatics and
|
| 13 |
# astronomy.
|
| 14 |
|
| 15 |
# Copyright (c) 2008 Steffen Moeller <moeller@debian.org>
|
| 16 |
# Copyright (c) 2008 Charles Plessy <debian-no-spam@plessy.org>
|
| 17 |
#
|
| 18 |
# This program is free software; you can redistribute it and/or
|
| 19 |
# modify it under the terms of the GNU General Public License
|
| 20 |
# as published by the Free Software Foundation; either version 2
|
| 21 |
# of the License, or (at your option) any later version.
|
| 22 |
#
|
| 23 |
# This program is distributed in the hope that it will be useful,
|
| 24 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 25 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 26 |
# GNU General Public License for more details.
|
| 27 |
#
|
| 28 |
# You should have received a copy of the GNU General Public License
|
| 29 |
# along with this program; if not, write to the Free Software
|
| 30 |
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
| 31 |
# Or else go to GNU Web pages http://www.gnu.org and follow the white rabbit.
|
| 32 |
#
|
| 33 |
|
| 34 |
# Root directory below which every database gets its own sub-directory.
# It is set from, in increasing order of precedence: the built-in default
# assigned at the end of this section, the system-wide configuration file,
# and (later in the script) the --mirrordir command line option.
my $mirrordir;

my $configfile = "/etc/getData.conf";
if ( -r $configfile ) {
    # Three-argument open with a lexical handle: the file name can never be
    # mistaken for an open mode and no global bareword handle is left behind.
    open(my $config_fh, '<', $configfile)
        or die "Could not open config data at '$configfile': $!\n";
    while (my $line = <$config_fh>) {
        # Lines starting with '#' are comments.
        next if $line =~ /^\s*#/;
        # Expected form:  key = "value"
        # The key is anchored at the start of the line and may contain
        # neither blanks nor '=', so trailing text such as a comment that
        # itself contains an assignment cannot be swallowed into the key.
        next unless $line =~ /^\s*([^\s=]+)\s*=\s*"([^"]*)"/;
        my ($key, $value) = ($1, $2);
        #print STDERR "Read: $key\n";
        if ($key eq "mirrordir") {
            $mirrordir = $value;
            print STDERR "Set mirrordir to '$mirrordir'.\n";
        }
    }
    close $config_fh;
}

# Fall back to the built-in default when the configuration did not set it.
$mirrordir = "/var/lib/mirrored" unless defined($mirrordir);
|
| 52 |
|
| 53 |
=head1 SYNOPSIS
|
| 54 |
|
| 55 |
getData [ --mirrordir <path> ] <list of db names>
|
| 56 |
|
| 57 |
getData --list
|
| 58 |
|
| 59 |
=head1 DESCRIPTION
|
| 60 |
|
| 61 |
Bioinformatics has the intrinsic problem to bring the biological data
|
| 62 |
to the end user. Astronomers have the equivalent problem and particle
|
| 63 |
physicists, well, they have come up with (first) the web and (second)
|
| 64 |
the computational grids to access their problems. Debian helps with the
|
| 65 |
programs but will not provide such huge datasets that are even frequently
|
| 66 |
updated. Not even in volatile.debian.org. Most bioinformatics researchers
|
| 67 |
will not need too many of such databases. And even more so will gladly
|
| 68 |
continue in using public services remotely.
|
| 69 |
|
| 70 |
For those who need a set of databases on a regular basis, this script
|
| 71 |
shall be a start to automate the burden to download the data and update
|
| 72 |
indices and the like. The world has seen such magic before with the
|
| 73 |
Lion Biosciences Prisma tool (http://bib.oxfordjournals.org/cgi/reprint/3/4/389.pdf)
|
| 74 |
but how about something simpler (as a start) that at least gets close
|
| 75 |
to what we desire and is Free. The aim must be to address the needs of
|
| 76 |
all (most) communities, not only of the bioinformatics world. The seed was
|
| 77 |
hence made with databases from astronomy.
|
| 78 |
|
| 79 |
=head1 OPTIONS
|
| 80 |
|
| 81 |
=over 4
|
| 82 |
|
| 83 |
=item --help
|
| 84 |
|
| 85 |
this help
|
| 86 |
|
| 87 |
=item --man
|
| 88 |
|
| 89 |
Present a more detailed description in form of a man page.
|
| 90 |
|
| 91 |
=item --verbose
|
| 92 |
|
| 93 |
Say one or two words more than required.
|
| 94 |
|
| 95 |
|
| 96 |
=item --mirrordir <path>
|
| 97 |
|
| 98 |
Specifies destination directory. The data will be mirrored to the folder $mirrordir/$dbname/.
|
| 99 |
Please be aware that this mirrordir is nowhere stored. The directory can consequently be moved
|
| 100 |
to arbitrary locations at any time, if the users of the data are only informed about that
|
| 101 |
moving.
|
| 102 |
|
| 103 |
=item --list
|
| 104 |
|
| 105 |
Lists all databases that may be requested to be installed.
|
| 106 |
|
| 107 |
=item <list of db names>
|
| 108 |
|
| 109 |
Only those databases that are explicitly requested to be downloaded will be downloaded. Such databases may require considerable bandwidth, so please make sure you know you are doing the right thing.
|
| 110 |
|
| 111 |
=item --post
|
| 112 |
|
| 113 |
Perform only the unpacking/indexing, but do not retrieve/update the databases. This option is considered useful when adding a new database management system to the system, e.g. after installing EMBOSS.
|
| 114 |
|
| 115 |
=item --source
|
| 116 |
|
| 117 |
Perform only the retrieval/update of the databases, but do not unpack/index them. This option may be beneficial when the site administrator is aware of current analyses that should not be disturbed by the indexing process but the downloading from the net can already be started.
|
| 118 |
|
| 119 |
=item --config <system>
|
| 120 |
|
| 121 |
Preparation of the configuration file that would be required for a particular system that deals with the database. The configuration is printed to stdout and is expected to be copied manually to the proper file or folder. One could imagine this process to be automated, though this is not yet implemented. Currently implemented systems are: emboss and dre. "dre" stands for "dynamic runtime environment", which is a concept of the ARC grid middleware of which more can be learned on http://www.nordugrid.org.
|
| 122 |
|
| 123 |
=back
|
| 124 |
|
| 125 |
=head1 EXAMPLES
|
| 126 |
|
| 127 |
./getData --mirrordir=/local/databases/mirrored --list | head -n 4
|
| 128 |
|
| 129 |
=head1 TODO
|
| 130 |
|
| 131 |
We now need a mechanism with which packages can specify hooks that
|
| 132 |
shall be called upon an update of a database. But we cannot assume that
|
| 133 |
every indexing that can be performed because of the installation of some
|
| 134 |
package is also desired by the user. How to configure this properly is
|
| 135 |
left to be decided.
|
| 136 |
|
| 137 |
=head1 SEE ALSO
|
| 138 |
|
| 139 |
http://debian-med.alioth.debian.org, http://wiki.debian.org/DebianMed, /etc/getData.conf
|
| 140 |
|
| 141 |
=head1 AUTHORS
|
| 142 |
|
| 143 |
Steffen Moeller <moeller@debian.org>, Charles Plessy <debian-no-spam@plessy.org> ... and ?, from the Debian-Med packaging initiative.
|
| 144 |
|
| 145 |
=cut
|
| 146 |
|
| 147 |
|
| 148 |
use strict;
|
| 149 |
|
| 150 |
use Getopt::Long;
|
| 151 |
use Cwd;
|
| 152 |
|
| 153 |
# Catalogue of the databases this script knows how to mirror, keyed by the
# name that is given on the command line. Fields of an entry:
#
#   name            human readable title, shown by --list
#   source          shell command that downloads/updates the raw files; it
#                   is executed inside the directory $mirrordir/<key>
#   post-download   optional shell command executed in that same directory
#                   after the download (unpacking, indexing). A non-zero
#                   exit status aborts the script, so these commands must
#                   succeed when re-run and when optional tools are absent.
#   debian-depends  name of a related Debian package (not evaluated by the
#                   code visible in this file)
#   test            sample command for checking the installation (not
#                   evaluated by the code visible in this file)
my %toBeMirrored = (

    #
    #    A S T R O N O M Y
    #

    "tycho2" => {
        name => "Tycho2 Star Coordinates",
        source => "wget --mirror ftp://cdsarc.u-strasbg.fr/pub/cats/I/259/tyc2.dat*",
        # Rebuild tycho2.dat unless it exists and no mirrored file is newer
        # than it. The freshness test has to look at the file that is
        # actually written (tycho2.dat); only regular files are considered,
        # so a changed directory timestamp alone does not force a rebuild.
        "post-download" => "[ -r tycho2.dat -a -z \"`find . -type f -cnewer tycho2.dat "
            . "-a ! -name .listing`\" ] "
            . "&& echo \"No mirrored file newer than previously created index.\" "
            . "|| zcat cdsarc.u-strasbg.fr/pub/cats/I/259/tyc2.dat* > tycho2.dat"
    },

    "astorb" => {
        name => "asteroid orbits",
        source => "wget --mirror ftp://ftp.lowell.edu/pub/elgb/astorb.dat.gz",
        # Skip the decompression only when astorb.dat exists AND the
        # mirrored archive is NOT newer than it; in every other case
        # (missing file or fresher archive) the file is regenerated.
        "post-download" => "[ -r astorb.dat ] "
            . "&& [ ! ftp.lowell.edu/pub/elgb/astorb.dat.gz -nt astorb.dat ] "
            . "|| zcat ftp.lowell.edu/pub/elgb/astorb.dat.gz > astorb.dat"
    },

    "DE405" => {
        name => "DE405",
        source => "wget --mirror ftp://ssd.jpl.nasa.gov/pub/eph/export/unix/unxp2[01]*.405",
        "debian-depends" => "jpl-eph-tools",
        # -f replaces links left by an earlier run instead of failing with
        # "File exists".
        "post-download" => "ln -sf ssd.jpl.nasa.gov/pub/eph/export/unix/unxp*.405 ."
    },


    #
    #    B I O I N F O R M A T I C S
    #

    "gene.ontology.rdf" => {
        name => "GeneOntology - RDF-formatted, terms only.",
        source => "wget --mirror ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest/go_*-termdb.rdf-xml.gz",
        "post-download" => "gzip -dc ftp.geneontology.org/pub/go/godatabase/archive/latest/go_*-termdb.rdf-xml.gz > go-termdb.rdf-xml"
    },

    "intact.psimitab" => {
        name => "IntACT Protein Interaction Database",
        source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip",
        # -o overwrites files extracted by an earlier run without asking,
        # so that updates work unattended.
        "post-download" => "unzip -o `find ftp.ebi.ac.uk -name '*.zip'`"
    },

    "kegg.brite" => {
        name => "Kegg Brite Database",
        source => "wget --mirror ftp://ftp.genome.jp/pub/kegg/release/current/brite.tar.gz"
    },

    "kegg.pathway" => {
        name => "Kegg Pathway Database",
        source => "wget --mirror ftp://ftp.genome.jp/pub/kegg/release/current/pathway.tar.gz"
    },

    "kegg.pathway.hsa" => {
        name => "Kegg Pathway Database (HSA)",
        source => "wget --mirror ftp://ftp.genome.jp/pub/kegg/pathway/organisms/hsa"
    },

    "kegg.pathway.hsa.xml" => {
        name => "XML representation of pathways in KEGG",
        source => "wget --mirror ftp://ftp.genome.jp/pub/kegg/xml/README ftp://ftp.genome.jp/pub/kegg/xml//KGML.dtd ftp://ftp.genome.jp/pub/kegg/xml/ko ftp://ftp.genome.jp/pub/kegg/xml/map ftp://ftp.genome.jp/pub/kegg/xml/organisms/hsa"
    },

    "reactome.tab.human" => {
        name => "Reactome Pathway Database - Human interactions as tab delimited",
        source => "wget --mirror http://www.reactome.org/download/interactions.README.txt http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"
    },

    "trembl.dat" => {
        name => "UniProt - TrEMBL in EMBL format",
        source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/swissprot/release_compressed/uniprot_trembl.dat.gz",
        # The indexing step is wrapped in if/fi: when dbxflat is not
        # installed the command still exits with status 0 instead of
        # making the whole run fail.
        "post-download" => "d=uncompressed; if [ ! -d \$d ]; then mkdir \$d; fi; "
            ."rm -rf \$d/trembl.dat; "
            ."(find ftp.ebi.ac.uk -name '*.dat.gz' | xargs -r zcat ) > \$d/trembl.dat; "
            ."if [ -x /usr/bin/dbxflat ]; then cd \$d && dbxflat -dbresource embl -dbname trembllocal -idformat swiss -filenames=trembl.dat -fields id,acc -auto; fi",
        "test" => "seqret trembllocal:Q9YZN7"
    },

    "swiss.dat" => {
        name => "UniProt - SwissProt in EMBL format",
        source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/swissprot/release_compressed/uniprot_sprot.dat.gz ftp://ftp.ebi.ac.uk/pub/databases/swissprot/updates_compressed/*.dat.gz",
        # Same if/fi guard as for trembl.dat: a missing dbiflat must not
        # be reported as a failure.
        "post-download" => "d=uncompressed; if [ ! -d \$d ]; then mkdir \$d; fi; "
            ."rm -rf \$d/swissprot.dat; "
            ."(find ftp.ebi.ac.uk -name '*.dat.gz' | xargs -r zcat ) > \$d/swissprot.dat; "
            ."if [ -x /usr/bin/dbiflat ]; then cd \$d && dbiflat -dbname swisslocal -fields acc,des -idformat swiss -auto; fi",
        #	source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/swissprot/release_compressed/uniprot_sprot.dat.gz"
        "test" => "seqret -feature swisslocal:p12345 -osf swiss -stdout -auto"
    },

    "swiss.fasta" => {
        name => "UniProt - SWISS-PROT in FASTA format",
        source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
    },

    "trembl.fasta" => {
        name => "UniProt - TrEMBL in FASTA format",
        source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
    },

#    "jaspar.sites" => {
#        name => "Sites subfolder of JASPAR database",
#        source => "wget --mirror http://jaspar.genereg.net/html/DOWNLOAD/SITES/*/*.sites"
#    },
#
#    "jaspar.MatrixDir" => {
#        name => "MatrixDir subfolder of JASPAR database",
#        source => "wget --mirror http://jaspar.genereg.net/html/DOWNLOAD/MatrixDir/*/*.sites"
#    }
);
|
| 266 |
|
| 267 |
# Values of the command line switches. The three mode switches start out
# undefined, the plain flags start out as 0.
my $post;
my $source;
my $config;     # dereferenced as a list of system names by the main loop
my $list    = 0;
my $help    = 0;
my $man     = 0;
my $verbose = 0;

# Option specification handed to GetOptions: option name (plus type
# suffix) mapped to the variable that receives the value.
my %options = (
    "mirrordir:s" => \$mirrordir,
    "list"        => \$list,
    "help"        => \$help,
    "man"         => \$man,
    "verbose"     => \$verbose,
    "post"        => \$post,
    "config:s@"   => \$config,
    "source"      => \$source,
);


# Phases to execute; by default the databases are downloaded and then
# post-processed, and no configuration is printed.
my $do_source = 1;
my $do_post   = 1;
my $do_config = 0;
|
| 279 |
|
| 280 |
|
| 281 |
# Prints the documentation embedded in this file as POD, using Pod::Usage.
#
#   --help : the brief usage message produced by pod2usage() without
#            arguments
#   --man  : the complete manual (pod2usage with -verbose => 2)
#
# Reads the file-level flag $man; takes no arguments. Pod::Usage is loaded
# only here, at run time, so ordinary runs do not load the module.
sub myhelp {
    require Pod::Usage;
    Pod::Usage->import();
    # $man is initialised to 0 and is therefore always defined; its truth
    # value, not its definedness, tells whether --man was requested.
    # Testing definedness made --help print the full manual as well.
    pod2usage() unless $man;
    pod2usage( -verbose => 2 );
}
|
| 287 |
|
| 288 |
|
| 289 |
# Parse the command line; the remaining elements of @ARGV are the names of
# the databases to work on.
GetOptions(%options) or die "Could not parse arguments.\n";

# --post and --source each restrict the run to one phase, so asking for
# both is contradictory. The message names the options as they are
# actually spelled on the command line.
die "Cannot set both --post and --source options.\n" if $post and $source;

if ($help or $man) {
    myhelp();
}
|
| 296 |
|
| 297 |
# $post and $source stay undefined unless given on the command line;
# substitute an empty string so that the verbose output does not raise
# "uninitialized value" warnings under -w.
if ($verbose) {
    printf "post: %s, source: %s\n",
        (defined($post)   ? $post   : ""),
        (defined($source) ? $source : "");
}

# Translate the mode switches into the phases to execute. The checks are
# applied in this order, so a later switch overrides an earlier one:
# --config wins over both --post and --source.
if ($post) {
    print "disabling retrieval of database updates.\n";
    $do_source = 0;
    $do_post   = 1;
    $do_config = 0;
}
if ($source) {
    print "disabling unpacking and indexing.\n";
    $do_post   = 0;
    $do_source = 1;
    $do_config = 0;
}
if ($config) {
    print "disabling retrieval of updates and their unpacking/indexing.\n";
    $do_post   = 0;
    $do_source = 0;
    $do_config = 1;
}
|
| 317 |
|
| 318 |
# In verbose mode show the value every option ended up with. Keys are
# sorted for a reproducible listing. Options that were not given are shown
# with an empty value (without an "uninitialized" warning), and list
# valued options are shown as their comma separated elements rather than
# as a reference address.
if ($verbose) {
    foreach my $o (sort keys %options) {
        my $value = ${ $options{$o} };
        if (!defined($value)) {
            $value = "";
        }
        elsif (ref($value) eq 'ARRAY') {
            $value = join(",", @$value);
        }
        print "$o => $value\n";
    }
}
|
| 323 |
|
| 324 |
# --list: print one line per known database (key padded to ten columns,
# a tab, then the descriptive name), sorted by key, and stop.
if ($list) {
    # more information like the expected size should be printed here.
    printf("%-10s\t%s\n", $_, $toBeMirrored{$_}{"name"})
        for sort keys %toBeMirrored;
    exit 0;
}
|
| 333 |
|
| 334 |
# Every mode except the printing of configurations needs a writable
# mirror root.
if (not $do_config and not -w $mirrordir) {
    die "Cannot write to root destination directory at '$mirrordir'.\n";
}

#print join(", ",@ARGV)."\n";

# Starting directory; the main loop changes back to it after each database.
my $d = getcwd();
|
| 339 |
|
| 340 |
# EMBOSS access method announced for each database for which an EMBOSS
# configuration stanza can be printed.
my %emboss_method = (
    "swiss.dat"  => "emblcd",
    "trembl.dat" => "emboss",
);

# Work through the databases named on the command line. Unknown names are
# reported and skipped. In configuration mode only the requested
# configuration snippets are printed; otherwise the database directory is
# entered and the enabled phases (download, post-processing) are executed
# as shell commands. A failing command aborts the script.
foreach my $db (@ARGV) {

    unless (exists($toBeMirrored{$db})) {
        print STDERR "Unknown database: '$db'\n";
        next;
    }

    if ($do_config) {

        # --config takes an optional value, so the list may consist of
        # empty strings only. (length() of an array is the length of its
        # element count and hence never 0, which made the former check a
        # no-op.)
        die "Please specify a system to configure.\n"
            unless grep { defined($_) and length($_) } @$config;

        foreach my $system (@$config) {
            if ("emboss" eq $system) {
                if (exists $emboss_method{$db}) {
                    # Database name as known to EMBOSS: the key without
                    # its ".dat" suffix, followed by "local".
                    (my $n = $db) =~ s/\.dat$//;
                    print "\n";
                    print "########### " . $db . " ##############\n";
                    print "\n";
                    print "DB ${n}local [\n";
                    print " type: P\n";
                    print " format: swiss\n";
                    print " method: $emboss_method{$db}\n";
                    print " directory: $mirrordir/$db/uncompressed\n";
                    print "]\n";
                    print "\n";
                    print "####################################\n";
                }
            }
            elsif ("dre" eq $system) {
                if ( -d "$mirrordir/$db" ) {
                    print "MIRRORDIR=\"$mirrordir\"\n";
                }
                else {
                    print STDERR "$db: not installed, configuration not printed.\n";
                }
            }
            else {
                print "Unknown system $system\n";
            }
        }

        # Configuration mode neither downloads nor post-processes. Stop
        # here so that the progress messages below do not end up in the
        # configuration printed to stdout and a database that is not
        # installed does not abort the run.
        next;
    }

    print STDERR "\"$db\" -> \"$mirrordir\"\n";
    print "Mirroring ".$toBeMirrored{$db}{"name"}." ($db)\n";

    unless ( -d "$mirrordir/$db" ) {
        if ($do_source) {
            print " creating directory $mirrordir/$db\n";
            mkdir("$mirrordir/$db") or die "Could not create directory \"$mirrordir/$db\"\n";
        }
        else {
            die "Directory '$mirrordir/$db' is not existing, no data to treat post-download,"
                ." the download itself was disabled via the command line.\n";
        }
    }
    # All commands of an entry are written relative to the database's own
    # directory.
    chdir("$mirrordir/$db") or die "Could not change directory to \"$mirrordir/$db\"\n";

    if ($do_source) {
        my $cmd = $toBeMirrored{$db}{"source"};
        if (!defined($cmd) or "" eq $cmd) {
            print STDERR "$db: download instructions not specified - skipping.\n";
            # Return to the starting directory before skipping, otherwise
            # a relative mirror directory could not be resolved for the
            # next database.
            chdir($d) or die "Could not change back to dir '$d'.\n";
            next;
        }
        print STDERR "$cmd\n";
        # system() returns the exit status, i.e. a true value on failure.
        system($cmd) and die "Experienced problem.";
    }

    if ($do_post) {
        my $cmd = $toBeMirrored{$db}{"post-download"};
        if ( defined($cmd) and "" ne $cmd) {
            print STDERR "$cmd\n";
            system($cmd) and die "Experienced problem.";
        }
        else {
            print STDERR "$db: No post-download command defined.\n" if $verbose;
        }
    }
    chdir($d) or die "Could not change back to dir '$d'.\n";
}
|