Unverified Commit aca62e60 authored by Seth Michael Larson's avatar Seth Michael Larson
Browse files

Add support for comparing the 'eXtensible ARchive' (.XAR/.PKG) file format

parent 40947e9d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -122,6 +122,7 @@ class ComparatorManager:
        ("zst.ZstFile",),
        ("vmlinuz.VmlinuzFile",),
        ("arsc.ArscFile",),
        ("xar.XarFile",),
    )

    _singleton = {}
+161 −0
Original line number Diff line number Diff line
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2024 Seth Michael Larson <seth@python.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
import hashlib
import re
import logging
import struct
import xml.etree.ElementTree as ET
import zlib
import os

from .utils.file import File
from .utils.archive import Archive
from diffoscope.difference import Difference

logger = logging.getLogger(__name__)


class XarContainer(Archive):
    def get_member_names(self):
        toc_xml = self.parse_toc_xml()
        for file_tag in toc_xml.iter("file"):
            yield file_tag.get("id")  # Use IDs instead of names for members.

    def open_archive(self):
        pass

    def close_archive(self):
        pass

    def extract(self, member_name, dest_dir):
        toc_xml = self.parse_toc_xml()

        # Find the file name and data heap offset.
        for file_tag in toc_xml.iter("file"):
            if file_tag.get("id") == member_name:
                file_name = file_tag.find(".//name").text
                data_offset = int(file_tag.find(".//data/offset").text)
                data_length = int(file_tag.find(".//data/length").text)
                break
        else:
            raise KeyError(member_name)

        # Write data from the heap into the temporary directory.
        # We automatically handle gzipped data thanks to the header.
        dest_path = os.path.join(dest_dir, file_name)
        with open(dest_path, mode="wb") as fw, open(self._source.path, mode="rb") as fr:
            fr.seek(self._heap_offset + data_offset, 0)
            fw.write(fr.read(data_length))

        return dest_path

    def parse_toc_xml(self):
        if getattr(self, "_toc_xml", None) is None:
            with open(self._source.path, mode="rb") as f:
                # Skip the magic and format, we're looking for
                # header and TOC compressed lengths.
                header_length, toc_compressed_length, = struct.unpack(">xxxxHxxQ", f.read(16))

                # Read, decompress, and parse the TOC as XML. Save heap offset for later.
                f.seek(header_length, 0)
                toc_bytes = f.read(toc_compressed_length)
                toc_as_text = zlib.decompress(toc_bytes).decode("utf-8")
                self._toc_xml = ET.XML(toc_as_text)
                self._heap_offset = header_length + toc_compressed_length

        return self._toc_xml

class XarFile(File):
    DESCRIPTION = "eXtensible ARchive files"
    CONTAINER_CLASSES = [XarContainer]
    FILE_TYPE_RE = re.compile(r"\bpkg\b")
    FALLBACK_FILE_EXTENSION_SUFFIX = {".xar", ".pkg"}  # NOTE: Facebook's Executable Archive format also uses '.xar'.
    FALLBACK_FILE_TYPE_HEADER_PREFIX = b"xar!"

    def compare_details(self, other, source=None):
        self_xar_header, self_xar_toc = describe_xar(self.path)
        other_xar_header, other_xar_toc = describe_xar(other.path)
        return [
            Difference.from_text(
                self_xar_header,
                other_xar_header,
                self.path,
                other.path,
                source="XAR Header",
            ),
            Difference.from_text(
                self_xar_toc,
                other_xar_toc,
                self.path,
                other.path,
                source="XAR Table of Contents",
            ),
        ]


def describe_xar(path):
    with open(path, mode="rb") as f:
        magic = f.read(4)

        # Read the fixed portion of the XAR header
        # Padding length is calculated using header length.
        (
            header_length,
            format_version,
            toc_compressed_length,
            toc_uncompressed_length,
            checksum_alg
        ) = struct.unpack(">HHQQI", f.read(24))

        known_checksum_algs = {
            0: "NONE",
            1: "SHA1",
            2: "MD5",
            3: "SHA-256",
            4: "SHA-512"
        }

        header_lines = [
            "magic:                   {}".format(magic),
            "format version:          {}".format(format_version),
            "TOC compressed length:   {}".format(toc_compressed_length),
            "TOC uncompressed length: {}".format(toc_uncompressed_length),
            "checksum:                {} ({})".format(checksum_alg, known_checksum_algs.get(checksum_alg, "???")),
        ]

        # Note that this 'header length' includes the 4 bytes of magic, hence 28.
        padding_length = header_length - 28
        if padding_length > 0:  # Padding is optional.
            padding = f.read(padding_length)
            header_lines.append("padding:                 {}".format(padding))

        # Read the TOC which is always DEFLATE compressed.
        toc_bytes = f.read(toc_compressed_length)
        toc_as_text = zlib.decompress(toc_bytes).decode("utf-8")

        # Read the entire heap and add properties that allow detecting
        # "invisible" differences in the heap, for example if data is inserted
        # but isn't referenced in the TOC. This shouldn't happen in a normal XAR file.
        heap_bytes = f.read()
        header_lines.extend([
            "heap length:             {}".format(len(heap_bytes)),
            "heap checksum:           {}".format(hashlib.sha256(heap_bytes).hexdigest()),
        ])

        header_as_text = "\n".join(header_lines)
        return header_as_text, toc_as_text