From 4b9e2a852096626f61b281f89f57fa9a49968f84 Mon Sep 17 00:00:00 2001 From: Felix Lechner Date: Wed, 11 Aug 2021 12:43:51 -0700 Subject: [PATCH 1/2] Concatenate results from Contents-${arch}.gz and Contents-all.gz when the latter is present. (Closes: #977006, #977743, #980888) Since commit 81824d23 in daklib, the archive provides separate Contents files with packages from Arch:all for some suites. [1] (All suites post-buster appear to be affected.) Those file lists were not scanned for the web pages powering packages.d.o, and the file lists were unavailable online. This commit attempts to resolve the issue by parsing the file for Arch:all immediately afterwards, if it is present. The results are effectively concatenated. A better long-term solution might be to produce separate transfer files for Arch:all, but that may not work until buster is dropped from the archive. [1] https://salsa.debian.org/ftp-team/dak/-/commit/81824d2326f5cc50fdcb95c81f9f26864aebaa15 --- bin/parse-contents | 99 +++++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 41 deletions(-) diff --git a/bin/parse-contents b/bin/parse-contents index 4efdaea..5bcde5b 100755 --- a/bin/parse-contents +++ b/bin/parse-contents @@ -76,7 +76,7 @@ for my $suite (@suites) { my $extra = ""; $extra = "|sort" if $SORT_REVERSE_CONCURRENTLY; - open REVERSED, "$extra>$DBDIR/reverse.tmp" + open my $REVERSED, "$extra>$DBDIR/reverse.tmp" or die "Failed to open output reverse file: $!"; my $changed = 0; @@ -102,7 +102,12 @@ for my $suite (@suites) { for my $archive (@archives) { for my $section (@sections) { - my $filename = "$TOPDIR/archive/$archive/$suite/$section/Contents-$arch.gz"; +# since commit 81824d23 in daklib, the archive provides separate Contents +# with Arch:all for some suites; see also merged bugs #977006 and #977743 +# https://salsa.debian.org/ftp-team/dak/-/commit/81824d2326f5cc50fdcb95c81f9f26864aebaa15 + my $folder = "$TOPDIR/archive/$archive/$suite"; + my $filename = 
"$folder/$section/Contents-$arch.gz"; + my $filename_all = "$folder/$section/Contents-all.gz"; next unless -f $filename; if (-l $filename) { @@ -112,45 +117,12 @@ for my $suite (@suites) { print "Reading $archive/$suite/$section/$arch...\n"; - # Note: a possible $what parameter isn't taken into account here: - my $uncompressed_size = (split /\s+/, `gzip --quiet -l $filename`)[2]; - - open CONT, "zcat $filename|$what" - or die $!; - while (<CONT>) { last if /^FILE/mo; } - if (eof(CONT)) { # no header found - close CONT; # explicit close to reset $. - open CONT, "zcat $filename|$what"; - } - while (<CONT>) { - my $data = ""; - my %data = (); - chomp; - display_progress(tell(CONT), $uncompressed_size) - if $NR % 250000 == 0; - /^(.+?)\s+(\S+)$/o; - my ($file, $value) = ($1, $2); - $value =~ s#[^,/]+/##og; - my @packages = split m/,/, $value; - for (@packages) { - $packages_contents_nr{$_}++; - my $lw = $packages_contents_lastword{$_} || "\0"; - my $i=0; - while (substr($file,$i,1) eq substr($lw,$i++,1)) {} - $i--; - $i = 255 if $i > 255; - $packages_contents{$_} .= pack "CC/a*", ($i, substr($file, $i)); - $packages_contents_lastword{$_} = "$file\0"; - } - # Searches are case-insensitive - (my $nocase = $file) =~ tr [A-Z] [a-z]; - my $case = ($nocase eq $file) ? '-' : $file; - - print REVERSED (reverse $nocase)."\0".$case."\0". 
- (join ":$arch\0", @packages).":$arch\n"; - } - close CONT; + read_contents($filename, $arch, $REVERSED, \%packages_contents, + \%packages_contents_nr, \%packages_contents_lastword); + read_contents($filename_all, $arch, $REVERSED, \%packages_contents, + \%packages_contents_nr, \%packages_contents_lastword) + if -e $filename_all; } print "Sorting reverse list if needed\n"; @@ -180,7 +152,7 @@ for my $suite (@suites) { } } } - close REVERSED; + close $REVERSED; } my $go = 0; @@ -262,4 +234,49 @@ for my $suite (@suites) { activate("$DBDIR/reverse_$suite.db"); } +sub read_contents { + my ($filename, $arch, $reversed_fh, $packages_contents, $packages_contents_nr, + $packages_contents_lastword) = @_; + + # Note: a possible $what parameter isn't taken into account here: + my $uncompressed_size = (split /\s+/, `gzip --quiet -l $filename`)[2]; + + open CONT, "zcat $filename|$what" + or die $!; + while (<CONT>) { last if /^FILE/mo; } + if (eof(CONT)) { # no header found + close CONT; # explicit close to reset $. + open CONT, "zcat $filename|$what"; + } + while (<CONT>) { + chomp; + display_progress(tell(CONT), $uncompressed_size) + if $NR % 250000 == 0; + /^(.+?)\s+(\S+)$/o; + my ($file, $value) = ($1, $2); + $value =~ s#[^,/]+/##og; + my @packages = split m/,/, $value; + for (@packages) { + $packages_contents_nr->{$_}++; + my $lw = $packages_contents_lastword->{$_} || "\0"; + my $i=0; + while (substr($file,$i,1) eq substr($lw,$i++,1)) {} + $i--; + $i = 255 if $i > 255; + $packages_contents->{$_} .= pack "CC/a*", ($i, substr($file, $i)); + $packages_contents_lastword->{$_} = "$file\0"; + } + # Searches are case-insensitive + (my $nocase = $file) =~ tr [A-Z] [a-z]; + my $case = ($nocase eq $file) ? '-' : $file; + + print $reversed_fh (reverse $nocase)."\0".$case."\0". 
+ (join ":$arch\0", @packages).":$arch\n"; + } + close CONT; + + return; +} + + # vim: set ts=4 -- GitLab From 9186dd45f3ea111b309c580b11674689dd5f2c4d Mon Sep 17 00:00:00 2001 From: Felix Lechner Date: Wed, 11 Aug 2021 13:52:00 -0700 Subject: [PATCH 2/2] Test for link on suite level when avoiding duplicates; restore service. (Closes: #923974) This commit changes the logic of existing code. The author was not sure if the existing code performed as it should. The '-l' file test on the Contents.gz data files will not capture symbolic links for suite aliases on the suite level. This commit looks at the suite level instead. --- bin/parse-contents | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bin/parse-contents b/bin/parse-contents index 5bcde5b..a1bfc35 100755 --- a/bin/parse-contents +++ b/bin/parse-contents @@ -83,11 +83,12 @@ for my $suite (@suites) { for my $archive (@archives) { for my $section (@sections) { - my $filename = "$TOPDIR/archive/$archive/$suite/$section/Contents-$arch.gz"; + my $folder = "$TOPDIR/archive/$archive/$suite"; + my $filename = "$folder/$section/Contents-$arch.gz"; next unless -f $filename; - if (-l $filename) { - print "Skipping link $archive/$suite/$section/Contents-$arch.gz\n"; + if (-l $folder) { + print "Skipping linked suite $archive/$suite for $section/$arch\n"; next ; # do not process symlinks, or we will have double data } @@ -110,8 +111,8 @@ for my $suite (@suites) { my $filename_all = "$folder/$section/Contents-all.gz"; next unless -f $filename; - if (-l $filename) { - print "Skipping link $archive/$suite/$section/Contents-$arch.gz\n"; + if (-l $folder) { + print "Skipping linked suite $archive/$suite for $section/$arch\n"; next ; # do not process symlinks, or we will have double data } -- GitLab