#!/usr/bin/perl # geo_downloader downloads expression files from GEO (NCBI) # and is released under the terms of the GNU GPL version 3, or any # later version, at your option. See the file README and COPYING for # more information. # Copyright 2013 by Don Armstrong . use warnings; use strict; use Getopt::Long; use Pod::Usage; =head1 NAME geo_downloader - downloads expression files from GEO (NCBI) =head1 SYNOPSIS geo_downloader [options] [GSE...] Options: --debug, -d debugging level (Default 0) --help, -h display this help --man, -m display manual =head1 OPTIONS =over =item B<--debug, -d> Debug verbosity. (Default 0) =item B<--help, -h> Display brief usage information. =item B<--man, -m> Display this manual. =back =head1 EXAMPLES geo_downloader GSE20400 =cut use vars qw($DEBUG); use Net::FTP; use Cwd; use Data::Printer; my %options = (debug => 0, help => 0, man => 0, host => 'ftp.ncbi.nlm.nih.gov', ); GetOptions(\%options, 'debug|d+','help|h|?','man|m'); pod2usage() if $options{help}; pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; my @USAGE_ERRORS; if (not @ARGV) { push @USAGE_ERRORS,"You must give at least one GEO accession"; } if (@ARGV != grep {/^(gpl|gse|gsm|gds)\d+$/i} @ARGV) { push @USAGE_ERRORS,"Invalid GEO accession(s): ". join(',',grep {$_ !~ /^(gpl|gse|gsm)\d+$/} @ARGV); } pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; my $ftp = Net::FTP->new($options{host},Debug=>$DEBUG,Passive=>1) or die "Unable to create new Net::FTP object"; print STDERR "Connected\n" if $DEBUG; $ftp->login('anonymous') or die "Unable to login"; print STDERR "logged in\n" if $DEBUG; $ftp->binary(); print STDERR "binary\n" if $DEBUG; use Data::Printer; for my $geo_acc (@ARGV) { my $geo_directory = geo_directory($geo_acc); my $listing = recursive_file_listing($ftp,$geo_directory); p($listing) if $DEBUG; my $orig_dir = getcwd; mkdir($geo_acc) unless -d $geo_acc; chdir($geo_acc); for my $dir (qw(matrix miniml suppl)) { # we want all of the raw files, the xml file, and the matrix file if (exists $listing->{$dir} and ref($listing->{$dir})) { for my $file (keys %{$listing->{$dir}}) { next if ref($listing->{$dir}{$file}); $ftp->get($listing->{$dir}{$file}); } } } chdir($orig_dir); } sub recursive_file_listing { my ($ftp,$dir) = @_; my $listing; my $orig_dir = $ftp->pwd(); eval { $ftp->cwd($dir) or die "Not a directory $dir"; $listing = {}; my @files = $ftp->ls(); p @files if $DEBUG; for my $file (@files) { print STDERR "file: $file\n"; my $subdirs = recursive_file_listing($ftp,$file); if (defined $subdirs) { print STDERR "subdirs is :"; print STDERR p($subdirs); $listing->{$file} = $subdirs; } else { print STDERR "No subdirectory\n"; $listing->{$file} = $orig_dir.'/'.$dir.'/'.$file; } } }; $ftp->cwd($orig_dir); return $listing; } sub geo_directory { my $geo_acc = shift; $geo_acc = uc($geo_acc); my $geo_acc_dir = $geo_acc; $geo_acc_dir =~ s/\d{3}$/nnn/; my $geo_type_dir = undef; if ($geo_acc =~ /^GSE/) { $geo_type_dir = 'series'; } elsif ($geo_acc =~ /^GDS/) { $geo_type_dir = 'datasets'; } elsif ($geo_acc =~ /^GPL/) { $geo_type_dir = 'platforms'; } elsif ($geo_acc =~ /^GSM/) { $geo_type_dir = 'samples'; } return "/geo/".$geo_type_dir.'/'.$geo_acc_dir.'/'.$geo_acc; } __END__