gzip.pm 4.07 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
#
# Copyright 2014 Andrew Ayer
#
# This file is part of strip-nondeterminism.
#
# strip-nondeterminism is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# strip-nondeterminism is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with strip-nondeterminism.  If not, see <http://www.gnu.org/licenses/>.
#
19
package File::StripNondeterminism::handlers::gzip;
20 21 22 23

use strict;
use warnings;

24
use File::StripNondeterminism::Common qw(copy_data);
25
use File::Temp;
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
use File::Basename;

use constant {
	FTEXT    => 1 << 0,
	FHCRC    => 1 << 1,
	FEXTRA   => 1 << 2,
	FNAME    => 1 << 3,
	FCOMMENT => 1 << 4,
};

sub normalize {
	my ($filename) = @_;

	open(my $fh, '<', $filename) or die "Unable to open $filename for reading: $!";

	# See RFC 1952

	# 0   1   2   3   4   5   6   7   8   9   10
	# +---+---+---+---+---+---+---+---+---+---+
	# |ID1|ID2|CM |FLG|     MTIME     |XFL|OS |
	# +---+---+---+---+---+---+---+---+---+---+

	# Read the current header
	my $hdr;
	my $bytes_read = read($fh, $hdr, 10);
51
	return 0 unless $bytes_read == 10;
52
	my ($id1, $id2, $cm, $flg, $mtime, $xfl, $os) = unpack('CCCCl<CC', $hdr);
53
	return 0 unless $id1 == 31 and $id2 == 139;
54 55 56 57

	my $new_flg = $flg;
	$new_flg &= ~FNAME;	# Don't include filename
	$new_flg &= ~FHCRC;	# Don't include header CRC (not all implementations support it)
58
	unless ($mtime == 0) {	# Don't set a deterministic timestamp if there wasn't already a timestamp
59 60 61 62 63 64 65
		if (defined $File::StripNondeterminism::canonical_time) {
			if (!$File::StripNondeterminism::clamp_time || $mtime > $File::StripNondeterminism::canonical_time) {
				$mtime = $File::StripNondeterminism::canonical_time;
			}
		} else {
			$mtime = 0; # gzip treats 0 as "no timestamp"
		}
66
	}
67 68
	# TODO: question: normalize some of the other fields, such as OS?

69 70
	my $tempfile = File::Temp->new(DIR => dirname($filename));

71
	# Write a new header
72
	print $tempfile pack('CCCCl<CC', $id1, $id2, $cm, $new_flg, $mtime, $xfl, $os);
73 74 75 76 77 78 79 80 81 82

	if ($flg & FEXTRA) {	# Copy through
		# 0   1   2
		# +---+---+=================================+
		# | XLEN  |...XLEN bytes of "extra field"...|
		# +---+---+=================================+
		my $buf;
		read($fh, $buf, 2) == 2 or die "$filename: Malformed gzip file";
		my ($xlen) = unpack('v', $buf);
		read($fh, $buf, $xlen) == $xlen or die "$filename: Malformed gzip file";
83
		print $tempfile pack('vA*', $xlen, $buf);
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
	}
	if ($flg & FNAME) {	# Read but do not copy through
		# 0
		# +=========================================+
		# |...original file name, zero-terminated...|
		# +=========================================+
		while (1) {
			my $buf;
			read($fh, $buf, 1) == 1 or die "$filename: Malformed gzip file";
			last if ord($buf) == 0;
		}
	}
	if ($flg & FCOMMENT) {	# Copy through
		# 0
		# +===================================+
		# |...file comment, zero-terminated...|
		# +===================================+
		while (1) {
			my $buf;
			read($fh, $buf, 1) == 1 or die "$filename: Malformed gzip file";
104
			print $tempfile $buf;
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
			last if ord($buf) == 0;
		}
	}
	if ($flg & FHCRC) {	# Read but do not copy through
		# 0   1   2
		# +---+---+
		# | CRC16 |
		# +---+---+
		my $buf;
		read($fh, $buf, 2) == 2 or die "$filename: Malformed gzip file";
	}

	# Copy through the rest of the file.
	# TODO: also normalize concatenated gzip files.  This will require reading and understanding
	# each DEFLATE block (see RFC 1951), since gzip doesn't include lengths anywhere.
	while (1) {
		my $buf;
		my $bytes_read = read($fh, $buf, 4096);
		defined($bytes_read) or die "$filename: read failed: $!";
124
		print $tempfile $buf;
125 126 127
		last if $bytes_read == 0;
	}

128 129 130
	$tempfile->close;
	copy_data($tempfile->filename, $filename)
		or die "$filename: unable to overwrite: copy_data: $!";
131 132

	return 1;
133 134 135
}

1;