Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
diffoscope
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Reproducible Builds
diffoscope
Commits
241c92af
Commit
241c92af
authored
1 year ago
by
Seth Michael Larson
Committed by
Chris Lamb
1 year ago
Browse files
Options
Downloads
Patches
Plain Diff
Add support for comparing the 'eXtensible ARchive' (.XAR/.PKG) file format
parent
f1822463
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
diffoscope/comparators/__init__.py
+1
-0
1 addition, 0 deletions
diffoscope/comparators/__init__.py
diffoscope/comparators/xar.py
+161
-0
161 additions, 0 deletions
diffoscope/comparators/xar.py
with
162 additions
and
0 deletions
diffoscope/comparators/__init__.py
+
1
−
0
View file @
241c92af
...
...
@@ -123,6 +123,7 @@ class ComparatorManager:
(
"
zst.ZstFile
"
,),
(
"
vmlinuz.VmlinuzFile
"
,),
(
"
arsc.ArscFile
"
,),
(
"
xar.XarFile
"
,),
)
_singleton
=
{}
...
...
This diff is collapsed.
Click to expand it.
diffoscope/comparators/xar.py
0 → 100644
+
161
−
0
View file @
241c92af
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2024 Seth Michael Larson <seth@python.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import
hashlib
import
re
import
logging
import
struct
import
xml.etree.ElementTree
as
ET
import
zlib
import
os
from
.utils.file
import
File
from
.utils.archive
import
Archive
from
diffoscope.difference
import
Difference
logger
=
logging
.
getLogger
(
__name__
)
class XarContainer(Archive):
    """Expose the files stored in a XAR archive's heap as container members.

    Members are identified by the ``id`` attribute of their ``<file>``
    element in the table of contents (TOC) rather than by their name.
    """

    def get_member_names(self):
        """Yield the ``id`` of every TOC ``<file>`` entry that owns heap data.

        Entries without a direct ``<data>`` child (for example directories,
        whose ``<file>`` children are nested inside them) have nothing to
        extract and are skipped. Their metadata is still part of the TOC text.
        """
        toc_xml = self.parse_toc_xml()
        for file_tag in toc_xml.iter("file"):
            # Use IDs instead of names for members.
            if file_tag.find("data") is not None:
                yield file_tag.get("id")

    def open_archive(self):
        """No-op: the source file is reopened on demand for every read."""
        pass

    def close_archive(self):
        """No-op counterpart of :meth:`open_archive`."""
        pass

    def extract(self, member_name, dest_dir):
        """Copy the heap data of member *member_name* into *dest_dir*.

        :param member_name: ``id`` attribute of the ``<file>`` TOC entry.
        :param dest_dir: existing directory that receives the extracted file.
        :returns: path of the file written inside *dest_dir*.
        :raises KeyError: if no ``<file>`` entry has that id, or the entry
            has no data of its own to extract.
        """
        toc_xml = self.parse_toc_xml()

        # Find the TOC entry for this member.
        for file_tag in toc_xml.iter("file"):
            if file_tag.get("id") == member_name:
                break
        else:
            raise KeyError(member_name)

        # Only look at *direct* children: "<file>" elements nest (directories
        # contain their files) and other descendants may carry their own
        # "<name>"/"<offset>"/"<length>" elements, so a descendant search
        # could silently pick up values belonging to a different entry.
        data_tag = file_tag.find("data")
        if data_tag is None:
            raise KeyError(member_name)
        data_offset = int(data_tag.find("offset").text)
        data_length = int(data_tag.find("length").text)

        # The name comes from the (untrusted) archive. Keep only its final
        # component so that absolute paths or "../" sequences cannot make us
        # write outside of dest_dir.
        file_name = os.path.basename(file_tag.findtext("name") or "")
        if file_name in ("", os.curdir, os.pardir):
            file_name = "xar-member"

        # Write data from the heap into the temporary directory. The bytes are
        # copied exactly as stored in the heap; no decompression happens here.
        # NOTE(review): the original comment says gzipped data is handled
        # "thanks to the header" -- presumably by later file type detection on
        # the extracted file; confirm.
        dest_path = os.path.join(dest_dir, file_name)
        with open(dest_path, mode="wb") as fw, open(
            self._source.path, mode="rb"
        ) as fr:
            fr.seek(self._heap_offset + data_offset, 0)
            # Copy in bounded chunks so large members are never held in
            # memory all at once.
            remaining = data_length
            while remaining > 0:
                chunk = fr.read(min(remaining, 1024 * 1024))
                if not chunk:
                    # The heap is shorter than the TOC claims.
                    break
                fw.write(chunk)
                remaining -= len(chunk)

        return dest_path

    def parse_toc_xml(self):
        """Return the parsed TOC, reading it from the source file on first use.

        The parsed tree is cached in ``self._toc_xml``. As a side effect the
        first call also stores ``self._heap_offset``, the absolute file offset
        at which the heap starts (header length + compressed TOC length).
        """
        if getattr(self, "_toc_xml", None) is None:
            with open(self._source.path, mode="rb") as f:
                # Skip the magic and format, we're looking for
                # header and TOC compressed lengths.
                header_length, toc_compressed_length = struct.unpack(
                    ">xxxxHxxQ", f.read(16)
                )

                # Read, decompress, and parse the TOC as XML. Save heap
                # offset for later.
                f.seek(header_length, 0)
                toc_bytes = f.read(toc_compressed_length)
                toc_as_text = zlib.decompress(toc_bytes).decode("utf-8")
                # NOTE(review): the TOC is untrusted input parsed with the
                # standard library XML parser; consider whether hardening
                # against malicious XML is needed here.
                self._toc_xml = ET.XML(toc_as_text)
                self._heap_offset = header_length + toc_compressed_length
        return self._toc_xml
class XarFile(File):
    """Comparator for 'eXtensible ARchive' (.xar/.pkg) files.

    Besides the recursive member comparison provided through
    ``XarContainer``, the textual header description and the decompressed
    table of contents of both files are compared.
    """

    DESCRIPTION = "eXtensible ARchive files"
    CONTAINER_CLASSES = [XarContainer]
    FILE_TYPE_RE = re.compile(r"\bpkg\b")
    # NOTE: Facebook's Executable Archive format also uses '.xar'.
    FALLBACK_FILE_EXTENSION_SUFFIX = {".xar", ".pkg"}
    FALLBACK_FILE_TYPE_HEADER_PREFIX = b"xar!"

    def compare_details(self, other, source=None):
        """Return the header and TOC differences between *self* and *other*.

        Both files are described with ``describe_xar``; the result is a
        two-element list of ``Difference.from_text`` results, header first
        and table of contents second. The *source* argument is not used.
        """
        own_header, own_toc = describe_xar(self.path)
        their_header, their_toc = describe_xar(other.path)

        # (label, text from self, text from other) for every compared section.
        sections = (
            ("XAR Header", own_header, their_header),
            ("XAR Table of Contents", own_toc, their_toc),
        )
        return [
            Difference.from_text(
                own_text, their_text, self.path, other.path, source=label
            )
            for label, own_text, their_text in sections
        ]
def describe_xar(path):
    """Describe the XAR file at *path* as human-readable text.

    :param path: filesystem path of the XAR file to read.
    :returns: a ``(header_as_text, toc_as_text)`` tuple. ``header_as_text``
        has one ``key: value`` line per header field, optional padding, and
        the length and SHA-256 digest of the heap. ``toc_as_text`` is the
        decompressed table of contents decoded as UTF-8.
    :raises struct.error: if the file is too short to hold the fixed header.
    :raises zlib.error: if the TOC bytes cannot be decompressed.
    """
    with open(path, mode="rb") as f:
        magic = f.read(4)

        # Read the fixed portion of the XAR header
        # Padding length is calculated using header length.
        (
            header_length,
            format_version,
            toc_compressed_length,
            toc_uncompressed_length,
            checksum_alg,
        ) = struct.unpack(">HHQQI", f.read(24))

        known_checksum_algs = {
            0: "NONE",
            1: "SHA1",
            2: "MD5",
            3: "SHA-256",
            4: "SHA-512",
        }
        header_lines = [
            "magic: {}".format(magic),
            "format version: {}".format(format_version),
            "TOC compressed length: {}".format(toc_compressed_length),
            "TOC uncompressed length: {}".format(toc_uncompressed_length),
            "checksum: {} ({})".format(
                checksum_alg, known_checksum_algs.get(checksum_alg, "???")
            ),
        ]

        # Note that this 'header length' includes the 4 bytes of magic,
        # hence 28.
        padding_length = header_length - 28
        if padding_length > 0:
            # Padding is optional.
            padding = f.read(padding_length)
            header_lines.append("padding: {}".format(padding))

        # Read the TOC which is always DEFLATE compressed.
        toc_bytes = f.read(toc_compressed_length)
        toc_as_text = zlib.decompress(toc_bytes).decode("utf-8")

        # Walk the entire heap and add properties that allow detecting
        # "invisible" differences in the heap, for example if data is inserted
        # but isn't referenced in the TOC. This shouldn't happen in a normal
        # XAR file. The heap is hashed incrementally in fixed-size chunks so
        # that arbitrarily large archives never have to fit into memory.
        heap_length = 0
        heap_digest = hashlib.sha256()
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            heap_length += len(chunk)
            heap_digest.update(chunk)

    header_lines.extend(
        [
            "heap length: {}".format(heap_length),
            "heap checksum: {}".format(heap_digest.hexdigest()),
        ]
    )

    header_as_text = "\n".join(header_lines)
    return header_as_text, toc_as_text
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment