From: Don Armstrong
Date: Mon, 9 Jan 2017 23:14:37 +0000 (-0800)
Subject: switch load_packages to use gzip -dc
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=b69bd70a344a2a26bcde07dc44397af0206a2aa2;p=debbugs.git

switch load_packages to use gzip -dc

 - Update with new schema
---
Review notes (the added lines below differ from the commit named above):
 - load_packages: the stale-association queries were built but never
   executed; they now end in ->delete.
 - load_packages: %arch_cache was declared but unused; Arch rows are now
   looked up once per architecture instead of once per binary package.
 - load_packages: the maintainer cache key is normalised with // '' so a
   stanza without a Maintainer field does not use undef as a hash key.
 - open_compressed_file: returns nothing when open() fails, so that the
   callers' "or die" checks can actually trigger.

diff --git a/Debbugs/DB/Load.pm b/Debbugs/DB/Load.pm
index 4697e62f..665431db 100644
--- a/Debbugs/DB/Load.pm
+++ b/Debbugs/DB/Load.pm
@@ -384,64 +384,129 @@ sub load_debinfo {

 =cut

-sub load_package {
-    my ($schema,$suite,$component,$arch,$pkg) = @_;
-    if ($arch eq 'source') {
-        my $sp = $schema->resultset('SrcPkg')->find_or_create({pkg => $pkg->{Package}});
-        my $suite = $schema->resultset('Suite')->find_or_create({suite_name => $suite});
-        my $sv = $schema->resultset('SrcVer')->find_or_create({src_pkg =>$sp->id,
-                                                               ver => $pkg->{Version}});
-        my @addrs = getparsedaddrs($pkg->{Maintainer} // '');
-        if (@addrs) {
-            my $mc = $schema->resultset('Correspondent')->
-                find_or_create({addr => lc($addrs[0]->address())});
-            my $full_name = $addrs[0]->phrase();
-            $full_name =~ s/^\"|\"$//g;
-            $full_name =~ s/^\s+|\s+$//g;
-            $sv->discard_changes;
-            $sv->find_or_create_related('maintainer',
-                                        {name => $full_name,
+sub load_packages {
+    my ($schema,$suite,$pkgs,$p) = @_;
+    # $pkgs is an arrayref of [$arch,$component,\%pkg] tuples; $p, when
+    # supplied, has its update() method called once per tuple.
+    my $suite_id = $schema->resultset('Suite')->
+        find_or_create({codename => $suite})->id;
+    my %maint_cache;
+    my %arch_cache;
+    my %source_cache;
+    # remember the newest association timestamps from before this load so
+    # that older rows can be purged once loading has finished
+    my $src_max_last_modified = $schema->resultset('SrcAssociation')->
+        search_rs({suite => $suite_id},
+                  {order_by => {-desc => ['me.modified']},
+                   rows => 1,
+                   page => 1
+                  }
+                 )->single();
+    my $bin_max_last_modified = $schema->resultset('BinAssociation')->
+        search_rs({suite => $suite_id},
+                  {order_by => {-desc => ['me.modified']},
+                   rows => 1,
+                   page => 1
+                  }
+                 )->single();
+    print STDERR time." handling packages\n";
+    for my $pkg_tuple (@{$pkgs}) {
+        my ($arch,$component,$pkg) = @{$pkg_tuple};
+        $p->update() if $p;
+        if ($arch eq 'source') {
+            my $source = $pkg->{Package};
+            my $source_ver = $pkg->{Version};
+            my $maint_key = $pkg->{Maintainer} // '';
+            if (not exists $maint_cache{$maint_key}) {
+                my @addrs = getparsedaddrs($maint_key);
+                if (@addrs) {
+                    my $mc = $schema->resultset('Correspondent')->
+                        find_or_create({addr => lc($addrs[0]->address())},
+                                   {key => 'correspondent_addr_idx'}
+                                  );
+                    my $full_name = $addrs[0]->phrase();
+                    $full_name =~ s/^\"|\"$//g;
+                    $full_name =~ s/^\s+|\s+$//g;
+                    # $sv->discard_changes;
+                    my $maint = $schema->resultset('Maintainer')->
+                        find_or_create({name => $pkg->{Maintainer},
                                         correspondent => $mc->id},
-                                       );
-            $mc->update_or_create_related('correspondent_full_names',
-                                          {full_name=>$full_name,
-                                           last_seen => 'NOW()'});
+                                   {key => 'maintainer_name_idx'},
+                                  );
+                    $mc->find_or_create_related('correspondent_full_names',
+                                               {full_name => $full_name},
+                                               {key => 'correspondent_full_name_correspondent_full_name_idx'}
+                                              );
+                    $mc->update;
+                    $maint_cache{$maint_key} = $maint;
+                }
+            }
+            if (not exists $source_cache{$source}{$source_ver}) {
+                my $sp = $schema->resultset('SrcPkg')->
+                    find_or_create({pkg => $source});
+                my $sv = $sp->find_or_create_related('src_vers',
+                                                     {ver => $source_ver});
+                $source_cache{$source}{$source_ver} = $sv;
+                if (exists $maint_cache{$maint_key}) {
+                    $source_cache{$source}{$source_ver}->
+                        set_from_related('maintainer',
+                                         $maint_cache{$maint_key}
+                                        );
+                    $source_cache{$source}{$source_ver}->update;
+                }
+            }
+            $schema->resultset('SrcAssociation')->
+                update_or_create({suite => $suite_id,
+                                  source => $source_cache{$source}{$source_ver}->id,
+                                  modified => 'NOW()',
+                                 },
+                                 {key => 'src_associations_source_suite'}
+                                );
+        } else {
+            my $ar = $arch_cache{$arch} //= $schema->resultset('Arch')->
+                find_or_create({arch => $arch});
+            my $bp = $schema->resultset('BinPkg')->
+                find_or_create({pkg => $pkg->{Package}});
+            my $source = $pkg->{Source} // $pkg->{Package};
+            my $source_ver = $pkg->{Version};
+            if ($source =~ /^\s*(\S+) \(([^\)]+)\)\s*$/) {
+                ($source,$source_ver) = ($1,$2);
+            }
+            if (not exists $source_cache{$source}{$source_ver}) {
+                my $sp = $schema->resultset('SrcPkg')->
+                    find_or_create({pkg => $source});
+                my $sv = $sp->find_or_create_related('src_vers',
+                                                     {ver => $source_ver});
+                $source_cache{$source}{$source_ver} = $sv;
+            }
+            my $bv = $bp->find_or_create_related('bin_vers',
+                                                 {ver => $pkg->{Version},
+                                                  src_ver => $source_cache{$source}{$source_ver}->id,
+                                                  arch => $ar->id,
+                                                 });
+            $schema->resultset('BinAssociation')->
+                update_or_create({suite => $suite_id,
+                                  bin => $bv->id,
+                                  modified => 'NOW()',
+                                 },
+                                 {key => 'bin_associations_bin_suite'}
+                                );
         }
-        # update the link for this source package
-        $schema->
-            txndo(sub {
-                      # delete associations for this source package in this
-                      # suite
-                      $schema->resultset('SrcAssociations')->
-                          search_rs({suite => $suite->id,})->
-                              search_related_rs('src_pkg',
-                                                {src_pkg => $sp->id})->delete;
-                      $schema->resultset('SrcAssociations')->
-                          create({suite => $suite->id,
-                                  source => $sv->id,
-                                 });
-                  });
-    } else {
-        my $bp = $schema->resultset('BinPkg')->find_or_create({pkg => $pkg->{Package}});
-        my $suite = $schema->resultset('Suite')->find_or_create({suite_name => $suite});
-        my ($bv) = $bp->search_related('bin_vers',{ver => $pkg->{Version}});
-        # if there isn't already a binary version for this package, we don't
-        # know what source it belongs to, so we can't associate it with a
-        # release
-        return if (not defined $bv);
-        $schema->
-            txndo(sub {
-                      $schema->resultset('BinAssociations')->
-                          search_rs({suite => $suite->id,})->
-                              search_related_rs('bin_pkg',
-                                                {bin_pkg_id => $bp->id}
-                                               )->delete;
-                      $schema->resultset('BinAssociations')->
-                          create({suite => $suite->id,
-                                  bin => $bv->id
-                                 });
-                  });
     }
+    print STDERR time." deleting associations\n";
+    # delete old binary associations in this suite which have not recently been
+    # modified
+    $schema->resultset('BinAssociation')->
+        search_rs({suite => $suite_id,
+                   modified => {'<',$bin_max_last_modified->modified()},
+                  })->delete if defined
+        $bin_max_last_modified;
+    # likewise for old source associations
+    $schema->resultset('SrcAssociation')->
+        search_rs({suite => $suite_id,
+                   modified => {'<',$src_max_last_modified->modified()},
+                  })->delete if defined
+        $src_max_last_modified;
 }

 =back
diff --git a/bin/debbugs-loadsql b/bin/debbugs-loadsql
index ae267fe8..e9b62ef9 100755
--- a/bin/debbugs-loadsql
+++ b/bin/debbugs-loadsql
@@ -98,7 +98,8 @@ Display this manual.

 use vars qw($DEBUG);

-use Debbugs::Common qw(checkpid lockpid get_hashname getparsedaddrs getbugcomponent make_list getsourcemaintainers);
+use Debbugs::Common (qw(checkpid lockpid get_hashname getparsedaddrs getbugcomponent make_list getsourcemaintainers),
+                     qw(hash_slice));
 use Debbugs::Config qw(:config);
 use Debbugs::Status qw(read_bug split_status_fields);
 use Debbugs::Log;
@@ -106,8 +107,12 @@ use Debbugs::DB;
 use Debbugs::DB::Load qw(load_bug handle_load_bug_queue :load_package :load_suite);
 use DateTime;
 use File::stat;
+use File::Basename;
+use File::Spec;
 use IO::Dir;
+use IO::File;
 use IO::Uncompress::AnyUncompress;
+use Encode qw(decode_utf8);

 my %options =
     (debug => 0,
@@ -157,6 +162,7 @@ my %subcommands =
                     },
      'packages' => {function => \&add_packages,
                     arguments => {'ftpdists=s' => 1,
+                                  'suites=s@' => 0,
                                  },
                    },
      'help' => {function => sub {pod2usage({verbose => 2});}}
@@ -202,6 +208,9 @@ if (not defined $subcommand) {
     pod2usage();
 }

+binmode(STDOUT,':encoding(UTF-8)');
+binmode(STDERR,':encoding(UTF-8)');
+
 my $opts =
     handle_subcommand_arguments(\@ARGV,$subcommands{$subcommand}{arguments});
 $subcommands{$subcommand}{function}->(\%options,$opts,$prog_bar,\%config,\@ARGV);
@@ -507,8 +516,6 @@ sub add_logs {

 sub add_packages {
     my ($options,$opts,$p,$config,$argv) = @_;
-    my $s = db_connect($options);
-
     my $dist_dir = IO::Dir->new($opts->{ftpdists});
     my @dist_names =
         grep { $_ !~ /^\./ and
@@ -516,69 +523,67 @@ sub add_packages {
                not -l $opts->{ftpdists}.'/'.$_
            } $dist_dir->read;
     my %s_p;
-    my %s_info;
     while (my $dist = shift @dist_names) {
         my $dist_dir = $opts->{ftpdists}.'/'.$dist;
-        # parse release
-        my $rfh = IO::Uncompress::AnyUncompress->new($dist_dir.'/Release');
-        my %dist_info;
-        my $in_sha1;
-        my %p_f;
-        while (<$rfh>) {
-            chomp;
-            if (s/^(\S+):\s*//) {
-                if ($1 eq 'SHA1'or $1 eq 'SHA256') {
-                    $in_sha1 = 1;
-                    next;
-                }
-                $dist_info{$1} = $_;
-            } elsif ($in_sha1) {
-                s/^\s//;
-                my ($sha,$size,$file) = split /\s+/,$_;
-                next unless $file =~ /(?:Packages|Sources)(?:\.gz|\.xz)$/;
-                next unless $file =~ m{^([^/]+)/([^/]+)/([^/]+)$};
-                my ($component,$arch,$package_source) = ($1,$2,$3);
-                $arch =~ s/binary-//;
-                next if exists $p_f{$component}{$arch};
-                $p_f{$component}{$arch} = $dist_dir.'/'.$file;
-            }
+        my ($dist_info,$package_files) =
+            read_release_file($dist_dir.'/Release');
+        $s_p{$dist_info->{Codename}} = $package_files;
+    }
+    my $tot = 0;
+    for my $suite (keys %s_p) {
+        for my $component (keys %{$s_p{$suite}}) {
+            $tot += scalar keys %{$s_p{$suite}{$component}};
         }
-        $s_p{$dist_info{Suite}} = \%p_f;
-        $s_info{$dist_info{Suite}} = \%s_info;
     }
+    $p->target($tot) if $p;
+    my $i = 0;
+    my $avg_pkgs = 0;
+    my $tot_suites = scalar keys %s_p;
+    my $done_suites=0;
+    my $completed_pkgs=0;
     # parse packages files
     for my $suite (keys %s_p) {
+        print STDERR "working on $suite\n";
+        my @pkgs;
         for my $component (keys %{$s_p{$suite}}) {
-            for my $arch (keys %{$s_p{$suite}{$component}}) {
-                my $pfh = IO::Uncompress::AnyUncompress->new($s_p{$suite}{$component}{$arch}) or
+            my @archs = keys %{$s_p{$suite}{$component}};
+            if (grep {$_ eq 'source'} @archs) {
+                @archs = ('source',grep {$_ ne 'source'} @archs);
+            }
+            for my $arch (@archs) {
+                my $pfh = open_compressed_file($s_p{$suite}{$component}{$arch}) or
                     die "Unable to open $s_p{$suite}{$component}{$arch} for reading: $!";
-                my $lastkey;
-                my %pkg;
+                local $_;
+                local $/ = ''; # paragraph mode
                 while (<$pfh>) {
-                    if (/^$/) {
-                        load_package($s,$suite,$component,$arch,\%pkg);
-                        %pkg = ();
-                        next;
-                    }
-                    if (my ($key, $value) = m/^(\S+): (.*)/) {
-                        $pkg{$key} = $value;
-                        $lastkey=$key;
-                    }
-                    else {
-                        s/ //;
-                        s/^\.$//;
-                        chomp;
-                        $pkg{$lastkey} .= "\n" . $_;
+                    my %pkg;
+                    for my $field (qw(Package Maintainer Version Source)) {
+                        /^\Q$field\E: (.*)/m;
+                        $pkg{$field} = $1;
                     }
-                }
-                if (keys %pkg) {
-                    load_package($s,$suite,$component,$arch,\%pkg);
+                    next unless defined $pkg{Package} and
+                        defined $pkg{Version};
+                    push @pkgs,[$arch,$component,\%pkg];
                 }
             }
         }
+        my $s = db_connect($options);
+        if ($avg_pkgs==0) {
+            $avg_pkgs = @pkgs;
+        }
+        $p->target($avg_pkgs*($tot_suites-$done_suites-1)+
+                   $completed_pkgs+@pkgs) if $p;
+        $s->txn_do(sub {
+                       Debbugs::DB::Load::load_packages($s,
+                                                        $suite,
+                                                        \@pkgs,
+                                                        $p)
+                   });
+        $avg_pkgs=($avg_pkgs*$done_suites + @pkgs)/($done_suites+1);
+        $completed_pkgs += @pkgs;
+        $done_suites++;
     }
-    use Data::Printer;
-    p %s_p;
+    $p->remove() if $p;
 }

 sub handle_subcommand_arguments {
@@ -622,6 +627,64 @@ sub db_connect {
     die "Unable to connect to database: ";
 }

+# Open $file for reading as UTF-8 text, piping it through gzip, xz or
+# bzip2 (-dc) when the name ends in .gz, .xz or .bz2. Returns the
+# filehandle, or nothing if open() fails (leaving $! for the caller).
+sub open_compressed_file {
+    my ($file) = @_;
+    my $fh;
+    my $mode = '<:encoding(UTF-8)';
+    my @opts;
+    if ($file =~ /\.gz$/) {
+        $mode = '-|:encoding(UTF-8)';
+        push @opts,'gzip','-dc';
+    }
+    if ($file =~ /\.xz$/) {
+        $mode = '-|:encoding(UTF-8)';
+        push @opts,'xz','-dc';
+    }
+    if ($file =~ /\.bz2$/) {
+        $mode = '-|:encoding(UTF-8)';
+        push @opts,'bzip2','-dc';
+    }
+    open($fh,$mode,@opts,$file) or return;
+    return $fh;
+}
+
+# Parse the Release file $file. Returns (\%dist_info,\%p_f): the
+# "Field: value" headers, and a component => arch => path map holding the
+# first .gz/.xz Packages or Sources file listed for each pair after a
+# SHA1 or SHA256 header ("binary-" is stripped from the arch name).
+sub read_release_file {
+    my ($file) = @_;
+    # parse release
+    my $rfh = open_compressed_file($file) or
+        die "Unable to open $file for reading: $!";
+    my %dist_info;
+    my $in_sha1;
+    my %p_f;
+    while (<$rfh>) {
+        chomp;
+        if (s/^(\S+):\s*//) {
+            if ($1 eq 'SHA1'or $1 eq 'SHA256') {
+                $in_sha1 = 1;
+                next;
+            }
+            $dist_info{$1} = $_;
+        } elsif ($in_sha1) {
+            s/^\s//;
+            my ($sha,$size,$f) = split /\s+/,$_;
+            next unless $f =~ /(?:Packages|Sources)(?:\.gz|\.xz)$/;
+            next unless $f =~ m{^([^/]+)/([^/]+)/([^/]+)$};
+            my ($component,$arch,$package_source) = ($1,$2,$3);
+            $arch =~ s/binary-//;
+            next if exists $p_f{$component}{$arch};
+            $p_f{$component}{$arch} = File::Spec->catfile(dirname($file),$f);
+        }
+    }
+    return (\%dist_info,\%p_f);
+}
+
 sub walk_bugs {
     my ($dirs,$p,$what,$verbose,$sub) = @_;
     my @dirs = @{$dirs};