buildcache create: reproducible tarballs (#35623)
Currently `spack buildcache create` produces compressed tarballs that differ between invocations, for two reasons: 1. the gzip header contains an mtime set to `time.time()`; 2. the generated buildinfo file has a different mtime every time. To avoid this, you have to construct `GzipFile` explicitly, since the convenience Python API does not expose the `mtime` argument, and the tarinfo object for the buildinfo metadata file must be created manually. Modes are also normalized: regular files and hardlinks that are executable by the owner, as well as directories and symlinks, get 0o755 permissions in the tarfile; all other files get 0o644.
This commit is contained in:
parent
2777ca83eb
commit
22d4e79037
2 changed files with 177 additions and 50 deletions
|
@ -6,6 +6,8 @@
|
||||||
import codecs
|
import codecs
|
||||||
import collections
|
import collections
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import io
|
||||||
|
import itertools
|
||||||
import json
|
import json
|
||||||
import multiprocessing.pool
|
import multiprocessing.pool
|
||||||
import os
|
import os
|
||||||
|
@ -20,7 +22,8 @@
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import warnings
|
import warnings
|
||||||
from contextlib import closing
|
from contextlib import closing, contextmanager
|
||||||
|
from gzip import GzipFile
|
||||||
from urllib.error import HTTPError, URLError
|
from urllib.error import HTTPError, URLError
|
||||||
|
|
||||||
import ruamel.yaml as yaml
|
import ruamel.yaml as yaml
|
||||||
|
@ -739,34 +742,31 @@ def get_buildfile_manifest(spec):
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def prefixes_to_hashes(spec):
    """Map install prefixes to DAG hashes.

    Covers the spec itself, its transitive link-type dependencies, and its
    direct run-type dependencies.
    """
    relevant_specs = itertools.chain(
        spec.traverse(root=True, deptype="link"), spec.dependencies(deptype="run")
    )
    return {str(s.prefix): s.dag_hash() for s in relevant_specs}
|
def get_buildinfo_dict(spec, rel=False):
    """Create metadata for a tarball.

    Collects the relocation manifest of ``spec`` together with layout and
    prefix information needed to relocate the package on install.
    """
    manifest = get_buildfile_manifest(spec)

    # NOTE: insertion order is preserved on serialization, so keep it stable.
    buildinfo = {}
    buildinfo["sbang_install_path"] = spack.hooks.sbang.sbang_install_path()
    buildinfo["relative_rpaths"] = rel
    buildinfo["buildpath"] = spack.store.layout.root
    buildinfo["spackprefix"] = spack.paths.prefix
    buildinfo["relative_prefix"] = os.path.relpath(spec.prefix, spack.store.layout.root)
    buildinfo["relocate_textfiles"] = manifest["text_to_relocate"]
    buildinfo["relocate_binaries"] = manifest["binary_to_relocate"]
    buildinfo["relocate_links"] = manifest["link_to_relocate"]
    buildinfo["hardlinks_deduped"] = manifest["hardlinks_deduped"]
    buildinfo["prefix_to_hash"] = prefixes_to_hashes(spec)
    return buildinfo
def tarball_directory_name(spec):
|
def tarball_directory_name(spec):
|
||||||
|
@ -1139,6 +1139,68 @@ def generate_key_index(key_prefix, tmpdir=None):
|
||||||
shutil.rmtree(tmpdir)
|
shutil.rmtree(tmpdir)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def gzip_compressed_tarfile(path):
    """Yield an open, reproducible, gzip-compressed tarfile writing to ``path``.

    Two measures make the output bit-for-bit reproducible:
    1. The gzip header is created with an explicit ``filename=""`` and
       ``mtime=0``. If ``filename=""`` were dropped, Python would embed
       ``fileobj.name`` in the header. This effectively mimics
       ``gzip --no-name``.
    2. ``compresslevel=6`` (the gzip default) is used instead of Python's
       default of 9: on an AMD Ryzen 3700X with an SSD, llvm compresses in
       roughly 4 minutes at level 6 versus 12 minutes at level 9, for about
       the same 2.1GB output — so we follow gzip.
    """
    with open(path, "wb") as fileobj:
        gzip_file = GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=fileobj)
        with closing(gzip_file), tarfile.TarFile(name="", mode="w", fileobj=gzip_file) as tar:
            yield tar
|
def deterministic_tarinfo(tarinfo: tarfile.TarInfo):
    """Normalize a tar member so identical content produces identical tarballs.

    Returns ``None`` for device files and FIFOs (which are dropped from the
    archive); otherwise returns the normalized ``tarinfo``.
    """
    # Only regular files, symlinks, hardlinks and directories are archived;
    # character devices, block devices and FIFOs should never enter a tarball.
    if tarinfo.isdev():
        return None

    # Strip ownership for distribution: (a) the user/group don't exist on
    # other machines, and (b) they cause surprises, since `tar x` run as root
    # will chown if it can. We want extraction as the current user: with root
    # ownership, root extracts as root and a non-privileged user extracts as
    # themselves.
    tarinfo.uid = tarinfo.gid = 0
    tarinfo.uname = tarinfo.gname = ""

    # Zero the mtime: prefixes are not truly immutable, so files may get
    # touched; as long as content is unchanged, this keeps tarballs stable.
    tarinfo.mtime = 0

    # Normalize permissions. Regular files and hardlinks become 0o755 when
    # executable by the owner and 0o644 otherwise, avoiding potentially unsafe
    # world-writable & executable files that could be extracted when Python or
    # tar runs with privileges. Symlinks and directories are always 0o755.
    if tarinfo.isfile() or tarinfo.islnk():
        tarinfo.mode = 0o755 if tarinfo.mode & 0o100 else 0o644
    else:
        tarinfo.mode = 0o755

    return tarinfo
|
def tar_add_metadata(tar: tarfile.TarFile, path: str, data: dict):
    """Add ``data`` to ``tar`` at ``path`` as an in-memory YAML file.

    Writing from memory (rather than from a temp file on disk) avoids a
    buildinfo file whose mtime varies between invocations.
    """
    # Serialize buildinfo for the tarball
    serialized = syaml.dump(data, default_flow_style=True).encode("utf-8")
    member = tarfile.TarInfo(name=path)
    member.size = len(serialized)
    tar.addfile(deterministic_tarinfo(member), io.BytesIO(serialized))
|
def _do_create_tarball(tarfile_path, binaries_dir, pkg_dir, buildinfo):
    """Create a reproducible gzip-compressed tarball of ``binaries_dir``.

    The install prefix is archived under the ``pkg_dir`` arcname with every
    member normalized by ``deterministic_tarinfo`` (fixed ownership, zeroed
    mtime, normalized mode), and the ``buildinfo`` metadata file is appended
    from memory so no on-disk file with a varying mtime is needed.
    """
    with gzip_compressed_tarfile(tarfile_path) as tar:
        tar.add(name=binaries_dir, arcname=pkg_dir, filter=deterministic_tarinfo)
        tar_add_metadata(tar, buildinfo_file_name(pkg_dir), buildinfo)
def _build_tarball(
|
def _build_tarball(
|
||||||
spec,
|
spec,
|
||||||
out_url,
|
out_url,
|
||||||
|
@ -1217,39 +1279,26 @@ def _build_tarball(
|
||||||
os.remove(temp_tarfile_path)
|
os.remove(temp_tarfile_path)
|
||||||
else:
|
else:
|
||||||
binaries_dir = spec.prefix
|
binaries_dir = spec.prefix
|
||||||
mkdirp(os.path.join(workdir, ".spack"))
|
|
||||||
|
|
||||||
# create info for later relocation and create tar
|
# create info for later relocation and create tar
|
||||||
write_buildinfo_file(spec, workdir, relative)
|
buildinfo = get_buildinfo_dict(spec, relative)
|
||||||
|
|
||||||
# optionally make the paths in the binaries relative to each other
|
# optionally make the paths in the binaries relative to each other
|
||||||
# in the spack install tree before creating tarball
|
# in the spack install tree before creating tarball
|
||||||
try:
|
try:
|
||||||
if relative:
|
if relative:
|
||||||
make_package_relative(workdir, spec, allow_root)
|
make_package_relative(workdir, spec, buildinfo, allow_root)
|
||||||
elif not allow_root:
|
elif not allow_root:
|
||||||
ensure_package_relocatable(workdir, binaries_dir)
|
ensure_package_relocatable(buildinfo, binaries_dir)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
shutil.rmtree(workdir)
|
|
||||||
shutil.rmtree(tarfile_dir)
|
|
||||||
shutil.rmtree(tmpdir)
|
shutil.rmtree(tmpdir)
|
||||||
tty.die(e)
|
tty.die(e)
|
||||||
|
|
||||||
# create gzip compressed tarball of the install prefix
|
_do_create_tarball(tarfile_path, binaries_dir, pkg_dir, buildinfo)
|
||||||
# On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
|
|
||||||
# compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
|
|
||||||
# compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
|
|
||||||
# So we follow gzip.
|
|
||||||
with closing(tarfile.open(tarfile_path, "w:gz", compresslevel=6)) as tar:
|
|
||||||
tar.add(name=binaries_dir, arcname=pkg_dir)
|
|
||||||
if not relative:
|
|
||||||
# Add buildinfo file
|
|
||||||
buildinfo_path = buildinfo_file_name(workdir)
|
|
||||||
buildinfo_arcname = buildinfo_file_name(pkg_dir)
|
|
||||||
tar.add(name=buildinfo_path, arcname=buildinfo_arcname)
|
|
||||||
|
|
||||||
# remove copy of install directory
|
# remove copy of install directory
|
||||||
shutil.rmtree(workdir)
|
if relative:
|
||||||
|
shutil.rmtree(workdir)
|
||||||
|
|
||||||
# get the sha256 checksum of the tarball
|
# get the sha256 checksum of the tarball
|
||||||
checksum = checksum_tarball(tarfile_path)
|
checksum = checksum_tarball(tarfile_path)
|
||||||
|
@ -1536,13 +1585,12 @@ def download_tarball(spec, unsigned=False, mirrors_for_spec=None):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def make_package_relative(workdir, spec, allow_root):
|
def make_package_relative(workdir, spec, buildinfo, allow_root):
|
||||||
"""
|
"""
|
||||||
Change paths in binaries to relative paths. Change absolute symlinks
|
Change paths in binaries to relative paths. Change absolute symlinks
|
||||||
to relative symlinks.
|
to relative symlinks.
|
||||||
"""
|
"""
|
||||||
prefix = spec.prefix
|
prefix = spec.prefix
|
||||||
buildinfo = read_buildinfo_file(workdir)
|
|
||||||
old_layout_root = buildinfo["buildpath"]
|
old_layout_root = buildinfo["buildpath"]
|
||||||
orig_path_names = list()
|
orig_path_names = list()
|
||||||
cur_path_names = list()
|
cur_path_names = list()
|
||||||
|
@ -1566,9 +1614,8 @@ def make_package_relative(workdir, spec, allow_root):
|
||||||
relocate.make_link_relative(cur_path_names, orig_path_names)
|
relocate.make_link_relative(cur_path_names, orig_path_names)
|
||||||
|
|
||||||
|
|
||||||
def ensure_package_relocatable(buildinfo, binaries_dir):
    """Check if package binaries are relocatable."""
    absolute_paths = (
        os.path.join(binaries_dir, rel_path) for rel_path in buildinfo["relocate_binaries"]
    )
    relocate.ensure_binaries_are_relocatable(list(absolute_paths))
||||||
|
|
|
@ -2,11 +2,13 @@
|
||||||
# Spack Project Developers. See the top-level COPYRIGHT file for details.
|
# Spack Project Developers. See the top-level COPYRIGHT file for details.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
||||||
|
import filecmp
|
||||||
import glob
|
import glob
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
import sys
|
import sys
|
||||||
|
import tarfile
|
||||||
import urllib.error
|
import urllib.error
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import urllib.response
|
import urllib.response
|
||||||
|
@ -952,3 +954,81 @@ def fake_build_tarball(node, push_url, **kwargs):
|
||||||
bindist.push([spec], push_url, include_root=root, include_dependencies=deps)
|
bindist.push([spec], push_url, include_root=root, include_dependencies=deps)
|
||||||
|
|
||||||
assert packages_to_push == expected
|
assert packages_to_push == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_reproducible_tarball_is_reproducible(tmpdir):
    """Creating the same tarball twice with different file mtimes must yield
    bitwise-identical archives with normalized (zeroed) metadata."""
    p = tmpdir.mkdir("prefix")
    p.mkdir("bin")
    p.mkdir(".spack")

    app = p.join("bin", "app")

    tarball_1 = str(tmpdir.join("prefix-1.tar.gz"))
    tarball_2 = str(tmpdir.join("prefix-2.tar.gz"))

    with open(app, "w") as f:
        f.write("hello world")

    buildinfo = {"metadata": "yes please"}

    # Create a tarball with a certain mtime of bin/app
    os.utime(app, times=(0, 0))
    bindist._do_create_tarball(tarball_1, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo)

    # Do it another time with different mtime of bin/app
    os.utime(app, times=(10, 10))
    bindist._do_create_tarball(tarball_2, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo)

    # They should be bitwise identical:
    assert filecmp.cmp(tarball_1, tarball_2, shallow=False)

    # Sanity check for contents: ownership and mtime are scrubbed on every member
    with tarfile.open(tarball_1, mode="r") as f:
        for m in f.getmembers():
            assert m.uid == m.gid == m.mtime == 0
            assert m.uname == m.gname == ""

        # the buildinfo metadata file is injected under .spack without touching disk
        assert set(f.getnames()) == {
            "pkg",
            "pkg/bin",
            "pkg/bin/app",
            "pkg/.spack",
            "pkg/.spack/binary_distribution",
        }
||||||
|
def test_tarball_normalized_permissions(tmpdir):
    """Tarball members must get normalized modes: 0o755 for directories and
    owner-executable files, 0o644 for everything else, regardless of the
    on-disk permissions."""
    p = tmpdir.mkdir("prefix")
    p.mkdir("bin")
    p.mkdir("share")
    p.mkdir(".spack")

    app = p.join("bin", "app")
    data = p.join("share", "file")
    tarball = str(tmpdir.join("prefix.tar.gz"))

    # Everyone can write & execute. This should turn into 0o755 when the tarball is
    # extracted (on a different system).
    with open(app, "w", opener=lambda path, flags: os.open(path, flags, 0o777)) as f:
        f.write("hello world")

    # User doesn't have execute permissions, but group/world have; this should also
    # turn into 0o644 (user read/write, group&world only read).
    with open(data, "w", opener=lambda path, flags: os.open(path, flags, 0o477)) as f:
        f.write("hello world")

    bindist._do_create_tarball(tarball, binaries_dir=p, pkg_dir="pkg", buildinfo={})

    with tarfile.open(tarball) as tar:
        path_to_member = {member.name: member for member in tar.getmembers()}

    # directories should have 0o755
    assert path_to_member["pkg"].mode == 0o755
    assert path_to_member["pkg/bin"].mode == 0o755
    assert path_to_member["pkg/.spack"].mode == 0o755

    # executable-by-user files should be 0o755
    assert path_to_member["pkg/bin/app"].mode == 0o755

    # not-executable-by-user files should be 0o644
    assert path_to_member["pkg/share/file"].mode == 0o644
|
||||||
|
|
Loading…
Reference in a new issue