buildcache create: reproducible tarballs (#35623)

Currently `spack buildcache create` creates compressed tarballs that
differ between each invocation, thanks to:

1. The gzip header containing mtime set to time.time()
2. The generated buildinfo file which has a different mtime every time.

To avoid this, you have to explicitly construct GZipFile yourself, since
the Python API doesn't expose the mtime arg, and we have to manually
create the tarinfo object for the buildinfo metadata file.

Normalize mode: regular files & hardlinks executable by user, dirs, symlinks: set 0o755 permissions in tarfile; other files use 0o644
This commit is contained in:
Harmen Stoppels 2023-03-08 16:51:55 +01:00 committed by GitHub
parent 2777ca83eb
commit 22d4e79037
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 177 additions and 50 deletions

View file

@ -6,6 +6,8 @@
import codecs
import collections
import hashlib
import io
import itertools
import json
import multiprocessing.pool
import os
@ -20,7 +22,8 @@
import urllib.parse
import urllib.request
import warnings
from contextlib import closing
from contextlib import closing, contextmanager
from gzip import GzipFile
from urllib.error import HTTPError, URLError
import ruamel.yaml as yaml
@ -739,34 +742,31 @@ def get_buildfile_manifest(spec):
return data
def write_buildinfo_file(spec, workdir, rel=False):
"""
Create a cache file containing information
required for the relocation
"""
def prefixes_to_hashes(spec):
return {
str(s.prefix): s.dag_hash()
for s in itertools.chain(
spec.traverse(root=True, deptype="link"), spec.dependencies(deptype="run")
)
}
def get_buildinfo_dict(spec, rel=False):
"""Create metadata for a tarball"""
manifest = get_buildfile_manifest(spec)
prefix_to_hash = dict()
prefix_to_hash[str(spec.package.prefix)] = spec.dag_hash()
deps = spack.build_environment.get_rpath_deps(spec.package)
for d in deps + spec.dependencies(deptype="run"):
prefix_to_hash[str(d.prefix)] = d.dag_hash()
# Create buildinfo data and write it to disk
buildinfo = {}
buildinfo["sbang_install_path"] = spack.hooks.sbang.sbang_install_path()
buildinfo["relative_rpaths"] = rel
buildinfo["buildpath"] = spack.store.layout.root
buildinfo["spackprefix"] = spack.paths.prefix
buildinfo["relative_prefix"] = os.path.relpath(spec.prefix, spack.store.layout.root)
buildinfo["relocate_textfiles"] = manifest["text_to_relocate"]
buildinfo["relocate_binaries"] = manifest["binary_to_relocate"]
buildinfo["relocate_links"] = manifest["link_to_relocate"]
buildinfo["hardlinks_deduped"] = manifest["hardlinks_deduped"]
buildinfo["prefix_to_hash"] = prefix_to_hash
filename = buildinfo_file_name(workdir)
with open(filename, "w") as outfile:
outfile.write(syaml.dump(buildinfo, default_flow_style=True))
return {
"sbang_install_path": spack.hooks.sbang.sbang_install_path(),
"relative_rpaths": rel,
"buildpath": spack.store.layout.root,
"spackprefix": spack.paths.prefix,
"relative_prefix": os.path.relpath(spec.prefix, spack.store.layout.root),
"relocate_textfiles": manifest["text_to_relocate"],
"relocate_binaries": manifest["binary_to_relocate"],
"relocate_links": manifest["link_to_relocate"],
"hardlinks_deduped": manifest["hardlinks_deduped"],
"prefix_to_hash": prefixes_to_hashes(spec),
}
def tarball_directory_name(spec):
@ -1139,6 +1139,68 @@ def generate_key_index(key_prefix, tmpdir=None):
shutil.rmtree(tmpdir)
@contextmanager
def gzip_compressed_tarfile(path):
"""Create a reproducible, compressed tarfile"""
# Create gzip compressed tarball of the install prefix
# 1) Use explicit empty filename and mtime 0 for gzip header reproducibility.
# If the filename="" is dropped, Python will use fileobj.name instead.
# This should effectively mimick `gzip --no-name`.
# 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
# compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
# compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
# So we follow gzip.
with open(path, "wb") as fileobj, closing(
GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=fileobj)
) as gzip_file, tarfile.TarFile(name="", mode="w", fileobj=gzip_file) as tar:
yield tar
def deterministic_tarinfo(tarinfo: tarfile.TarInfo):
# We only add files, symlinks, hardlinks, and directories
# No character devices, block devices and FIFOs should ever enter a tarball.
if tarinfo.isdev():
return None
# For distribution, it makes no sense to user/group data; since (a) they don't exist
# on other machines, and (b) they lead to surprises as `tar x` run as root will change
# ownership if it can. We want to extract as the current user. By setting owner to root,
# root will extract as root, and non-privileged user will extract as themselves.
tarinfo.uid = 0
tarinfo.gid = 0
tarinfo.uname = ""
tarinfo.gname = ""
# Reset mtime to epoch time, our prefixes are not truly immutable, so files may get
# touched; as long as the content does not change, this ensures we get stable tarballs.
tarinfo.mtime = 0
# Normalize mode
if tarinfo.isfile() or tarinfo.islnk():
# If user can execute, use 0o755; else 0o644
# This is to avoid potentially unsafe world writable & exeutable files that may get
# extracted when Python or tar is run with privileges
tarinfo.mode = 0o644 if tarinfo.mode & 0o100 == 0 else 0o755
else: # symbolic link and directories
tarinfo.mode = 0o755
return tarinfo
def tar_add_metadata(tar: tarfile.TarFile, path: str, data: dict):
# Serialize buildinfo for the tarball
bstring = syaml.dump(data, default_flow_style=True).encode("utf-8")
tarinfo = tarfile.TarInfo(name=path)
tarinfo.size = len(bstring)
tar.addfile(deterministic_tarinfo(tarinfo), io.BytesIO(bstring))
def _do_create_tarball(tarfile_path, binaries_dir, pkg_dir, buildinfo):
with gzip_compressed_tarfile(tarfile_path) as tar:
tar.add(name=binaries_dir, arcname=pkg_dir, filter=deterministic_tarinfo)
tar_add_metadata(tar, buildinfo_file_name(pkg_dir), buildinfo)
def _build_tarball(
spec,
out_url,
@ -1217,38 +1279,25 @@ def _build_tarball(
os.remove(temp_tarfile_path)
else:
binaries_dir = spec.prefix
mkdirp(os.path.join(workdir, ".spack"))
# create info for later relocation and create tar
write_buildinfo_file(spec, workdir, relative)
buildinfo = get_buildinfo_dict(spec, relative)
# optionally make the paths in the binaries relative to each other
# in the spack install tree before creating tarball
try:
if relative:
make_package_relative(workdir, spec, allow_root)
make_package_relative(workdir, spec, buildinfo, allow_root)
elif not allow_root:
ensure_package_relocatable(workdir, binaries_dir)
ensure_package_relocatable(buildinfo, binaries_dir)
except Exception as e:
shutil.rmtree(workdir)
shutil.rmtree(tarfile_dir)
shutil.rmtree(tmpdir)
tty.die(e)
# create gzip compressed tarball of the install prefix
# On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
# compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
# compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
# So we follow gzip.
with closing(tarfile.open(tarfile_path, "w:gz", compresslevel=6)) as tar:
tar.add(name=binaries_dir, arcname=pkg_dir)
if not relative:
# Add buildinfo file
buildinfo_path = buildinfo_file_name(workdir)
buildinfo_arcname = buildinfo_file_name(pkg_dir)
tar.add(name=buildinfo_path, arcname=buildinfo_arcname)
_do_create_tarball(tarfile_path, binaries_dir, pkg_dir, buildinfo)
# remove copy of install directory
if relative:
shutil.rmtree(workdir)
# get the sha256 checksum of the tarball
@ -1536,13 +1585,12 @@ def download_tarball(spec, unsigned=False, mirrors_for_spec=None):
return None
def make_package_relative(workdir, spec, allow_root):
def make_package_relative(workdir, spec, buildinfo, allow_root):
"""
Change paths in binaries to relative paths. Change absolute symlinks
to relative symlinks.
"""
prefix = spec.prefix
buildinfo = read_buildinfo_file(workdir)
old_layout_root = buildinfo["buildpath"]
orig_path_names = list()
cur_path_names = list()
@ -1566,9 +1614,8 @@ def make_package_relative(workdir, spec, allow_root):
relocate.make_link_relative(cur_path_names, orig_path_names)
def ensure_package_relocatable(workdir, binaries_dir):
def ensure_package_relocatable(buildinfo, binaries_dir):
"""Check if package binaries are relocatable."""
buildinfo = read_buildinfo_file(workdir)
binaries = [os.path.join(binaries_dir, f) for f in buildinfo["relocate_binaries"]]
relocate.ensure_binaries_are_relocatable(binaries)

View file

@ -2,11 +2,13 @@
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import filecmp
import glob
import io
import os
import platform
import sys
import tarfile
import urllib.error
import urllib.request
import urllib.response
@ -952,3 +954,81 @@ def fake_build_tarball(node, push_url, **kwargs):
bindist.push([spec], push_url, include_root=root, include_dependencies=deps)
assert packages_to_push == expected
def test_reproducible_tarball_is_reproducible(tmpdir):
p = tmpdir.mkdir("prefix")
p.mkdir("bin")
p.mkdir(".spack")
app = p.join("bin", "app")
tarball_1 = str(tmpdir.join("prefix-1.tar.gz"))
tarball_2 = str(tmpdir.join("prefix-2.tar.gz"))
with open(app, "w") as f:
f.write("hello world")
buildinfo = {"metadata": "yes please"}
# Create a tarball with a certain mtime of bin/app
os.utime(app, times=(0, 0))
bindist._do_create_tarball(tarball_1, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo)
# Do it another time with different mtime of bin/app
os.utime(app, times=(10, 10))
bindist._do_create_tarball(tarball_2, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo)
# They should be bitwise identical:
assert filecmp.cmp(tarball_1, tarball_2, shallow=False)
# Sanity check for contents:
with tarfile.open(tarball_1, mode="r") as f:
for m in f.getmembers():
assert m.uid == m.gid == m.mtime == 0
assert m.uname == m.gname == ""
assert set(f.getnames()) == {
"pkg",
"pkg/bin",
"pkg/bin/app",
"pkg/.spack",
"pkg/.spack/binary_distribution",
}
def test_tarball_normalized_permissions(tmpdir):
p = tmpdir.mkdir("prefix")
p.mkdir("bin")
p.mkdir("share")
p.mkdir(".spack")
app = p.join("bin", "app")
data = p.join("share", "file")
tarball = str(tmpdir.join("prefix.tar.gz"))
# Everyone can write & execute. This should turn into 0o755 when the tarball is
# extracted (on a different system).
with open(app, "w", opener=lambda path, flags: os.open(path, flags, 0o777)) as f:
f.write("hello world")
# User doesn't have execute permissions, but group/world have; this should also
# turn into 0o644 (user read/write, group&world only read).
with open(data, "w", opener=lambda path, flags: os.open(path, flags, 0o477)) as f:
f.write("hello world")
bindist._do_create_tarball(tarball, binaries_dir=p, pkg_dir="pkg", buildinfo={})
with tarfile.open(tarball) as tar:
path_to_member = {member.name: member for member in tar.getmembers()}
# directories should have 0o755
assert path_to_member["pkg"].mode == 0o755
assert path_to_member["pkg/bin"].mode == 0o755
assert path_to_member["pkg/.spack"].mode == 0o755
# executable-by-user files should be 0o755
assert path_to_member["pkg/bin/app"].mode == 0o755
# not-executable-by-user files should be 0o644
assert path_to_member["pkg/share/file"].mode == 0o644