Allow spec.json files to be clearsigned, use transparent compression for *.json (#34490)

This commit allows (remote) spec.json files to be clearsigned and gzipped.

The idea is to reduce the number of requests and number of bytes transferred
This commit is contained in:
Harmen Stoppels 2023-01-07 12:22:40 +01:00 committed by GitHub
parent 86e346e906
commit 8a1b817978
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 252 additions and 155 deletions

View file

@ -5,7 +5,9 @@
import codecs
import collections
import gzip
import hashlib
import io
import json
import multiprocessing.pool
import os
@ -613,6 +615,28 @@ def read_buildinfo_file(prefix):
return buildinfo
def transparently_decompress_bytes(binary_stream):
"""Wrap stream in a decompress wrapper if gzip compressed"""
# Get magic bytes
if isinstance(binary_stream, io.BytesIO):
# Not peekable... Alternatively io.BufferedReader(io.BytesIO(...))
# but to add yet another wrapper just to read two bytes that are
# already in memory... sigh.
magic = binary_stream.read(2)
binary_stream.seek(0)
else:
magic = binary_stream.peek(2)
# Verify magic
if magic.startswith(b"\x1f\x8b"):
return gzip.GzipFile(fileobj=binary_stream)
return binary_stream
def transparently_decompress_bytes_to_string(binary_stream, encoding="utf-8"):
return codecs.getreader(encoding)(transparently_decompress_bytes(binary_stream))
class BuildManifestVisitor(BaseDirectoryVisitor):
"""Visitor that collects a list of files and symlinks
that can be checked for need of relocation. It knows how
@ -845,6 +869,42 @@ def sign_specfile(key, force, specfile_path):
spack.util.gpg.sign(key, specfile_path, signed_specfile_path, clearsign=True)
def _load_clearsigned_json(stream):
# Skip the PGP header
stream.readline()
stream.readline()
json = stream.read()
footer_index = json.rfind("-----BEGIN PGP SIGNATURE-----")
if footer_index == -1 or not json[footer_index - 1].isspace():
raise ValueError("Could not find PGP signature in clearsigned json file.")
return sjson.load(json[:footer_index])
def _load_possibly_clearsigned_json(stream):
if _is_clearsigned_stream(stream):
return _load_clearsigned_json(stream)
return sjson.load(stream)
def _is_clearsigned_stream(stream):
curr = stream.tell()
header = stream.read(34)
stream.seek(curr)
return header == "-----BEGIN PGP SIGNED MESSAGE-----"
def is_clearsigned_file(path):
with open(path, "r") as f:
return _is_clearsigned_stream(f)
def load_possibly_clearsigned_json(s):
"""Deserialize JSON from a string or stream s, removing any clearsign
header/footer."""
s = io.StringIO(s) if isinstance(s, str) else s
return _load_possibly_clearsigned_json(s)
def _read_specs_and_push_index(file_list, read_method, cache_prefix, db, temp_dir, concurrency):
"""Read all the specs listed in the provided list, using thread given thread parallelism,
generate the index, and push it to the mirror.
@ -867,11 +927,7 @@ def _fetch_spec_from_mirror(spec_url):
if spec_file_contents:
# Need full spec.json name or this gets confused with index.json.
if spec_url.endswith(".json.sig"):
specfile_json = Spec.extract_json_from_clearsig(spec_file_contents)
return Spec.from_dict(specfile_json)
if spec_url.endswith(".json"):
return Spec.from_json(spec_file_contents)
return Spec.from_dict(load_possibly_clearsigned_json(spec_file_contents))
tp = multiprocessing.pool.ThreadPool(processes=concurrency)
try:
@ -933,8 +989,8 @@ def _specs_from_cache_aws_cli(cache_prefix):
aws = which("aws")
def file_read_method(file_path):
with open(file_path) as fd:
return fd.read()
with open(file_path, "rb") as f:
return transparently_decompress_bytes_to_string(f).read()
tmpspecsdir = tempfile.mkdtemp()
sync_command_args = [
@ -981,7 +1037,7 @@ def url_read_method(url):
contents = None
try:
_, _, spec_file = web_util.read_from_url(url)
contents = codecs.getreader("utf-8")(spec_file).read()
contents = transparently_decompress_bytes_to_string(spec_file).read()
except (URLError, web_util.SpackWebError) as url_err:
tty.error("Error reading specfile: {0}".format(url))
tty.error(url_err)
@ -1379,7 +1435,7 @@ def try_verify(specfile_path):
return True
def try_fetch(url_to_fetch):
def try_fetch(url_to_fetch, try_decompress=False):
"""Utility function to try and fetch a file from a url, stage it
locally, and return the path to the staged file.
@ -1398,6 +1454,21 @@ def try_fetch(url_to_fetch):
stage.destroy()
return None
if not try_decompress:
return stage
# Stage has some logic for automatically expanding
# archives, but it is based on file extensions. So instead,
# we more or less repeat the logic.
try:
tmp = stage.save_filename + ".tmp"
with gzip.open(stage.save_filename, "rb") as compressed:
with open(tmp, "wb") as decompressed:
shutil.copyfileobj(compressed, decompressed)
os.rename(tmp, stage.save_filename)
except OSError:
pass
return stage
@ -1467,61 +1538,45 @@ def download_tarball(spec, unsigned=False, mirrors_for_spec=None):
}
)
tried_to_verify_sigs = []
# Assumes we care more about finding a spec file by preferred ext
# than by mirrory priority. This can be made less complicated as
# we remove support for deprecated spec formats and buildcache layouts.
verification_failure = False
for ext in ["json.sig", "json"]:
for mirror_to_try in mirrors_to_try:
specfile_url = "{0}.{1}".format(mirror_to_try["specfile"], ext)
spackfile_url = mirror_to_try["spackfile"]
local_specfile_stage = try_fetch(specfile_url)
if local_specfile_stage:
local_specfile_path = local_specfile_stage.save_filename
signature_verified = False
for mirror in mirrors_to_try:
# Try to download the specfile. For any legacy version of Spack's buildcache
# we definitely require this file.
specfile_url = "{0}.{1}".format(mirror["specfile"], ext)
specfile_stage = try_fetch(specfile_url, try_decompress=True)
if not specfile_stage:
continue
if ext.endswith(".sig") and not unsigned:
# If we found a signed specfile at the root, try to verify
# the signature immediately. We will not download the
# tarball if we could not verify the signature.
tried_to_verify_sigs.append(specfile_url)
signature_verified = try_verify(local_specfile_path)
if not signature_verified:
tty.warn("Failed to verify: {0}".format(specfile_url))
specfile_path = specfile_stage.save_filename
if unsigned or signature_verified or not ext.endswith(".sig"):
# We will download the tarball in one of three cases:
# 1. user asked for --no-check-signature
# 2. user didn't ask for --no-check-signature, but we
# found a spec.json.sig and verified the signature already
# 3. neither of the first two cases are true, but this file
# is *not* a signed json (not a spec.json.sig file). That
# means we already looked at all the mirrors and either didn't
# find any .sig files or couldn't verify any of them. But it
# is still possible to find an old style binary package where
# the signature is a detached .asc file in the outer archive
# of the tarball, and in that case, the only way to know is to
# download the tarball. This is a deprecated use case, so if
# something goes wrong during the extraction process (can't
# verify signature, checksum doesn't match) we will fail at
# that point instead of trying to download more tarballs from
# the remaining mirrors, looking for one we can use.
tarball_stage = try_fetch(spackfile_url)
if tarball_stage:
return {
"tarball_stage": tarball_stage,
"specfile_stage": local_specfile_stage,
"signature_verified": signature_verified,
}
# If it is a clearsign file, we must verify it (unless disabled)
should_verify = not unsigned and is_clearsigned_file(specfile_path)
if should_verify and not try_verify(specfile_path):
verification_failure = True
tty.warn("Failed to verify: {0}".format(specfile_url))
specfile_stage.destroy()
continue
local_specfile_stage.destroy()
# In case the spec.json is not clearsigned, it means it's a legacy
# format, where either the signature is in the tarball with binaries, or
# the package is unsigned. Verification
# is then postponed.
spackfile_url = mirror["spackfile"]
tarball_stage = try_fetch(spackfile_url)
if tarball_stage:
return {
"tarball_stage": tarball_stage,
"specfile_stage": specfile_stage,
"signature_verified": should_verify, # should_verify implies it was verified
}
specfile_stage.destroy()
# Falling through the nested loops meeans we exhaustively searched
# for all known kinds of spec files on all mirrors and did not find
# an acceptable one for which we could download a tarball.
if tried_to_verify_sigs:
if verification_failure:
raise NoVerifyException(
(
"Spack found new style signed binary packages, "
@ -1802,11 +1857,7 @@ def extract_tarball(spec, download_result, allow_root=False, unsigned=False, for
specfile_path = download_result["specfile_stage"].save_filename
with open(specfile_path, "r") as inputfile:
content = inputfile.read()
if specfile_path.endswith(".json.sig"):
spec_dict = Spec.extract_json_from_clearsig(content)
else:
spec_dict = sjson.load(content)
spec_dict = load_possibly_clearsigned_json(inputfile)
bchecksum = spec_dict["binary_cache_checksum"]
filename = download_result["tarball_stage"].save_filename
@ -1971,54 +2022,35 @@ def try_direct_fetch(spec, mirrors=None):
"""
specfile_name = tarball_name(spec, ".spec.json")
signed_specfile_name = tarball_name(spec, ".spec.json.sig")
specfile_is_signed = False
found_specs = []
for mirror in spack.mirror.MirrorCollection(mirrors=mirrors).values():
buildcache_fetch_url_json = url_util.join(
mirror.fetch_url, _build_cache_relative_path, specfile_name
)
buildcache_fetch_url_signed_json = url_util.join(
mirror.fetch_url, _build_cache_relative_path, signed_specfile_name
)
try:
_, _, fs = web_util.read_from_url(buildcache_fetch_url_signed_json)
specfile_is_signed = True
except (URLError, web_util.SpackWebError, HTTPError) as url_err:
for file in (specfile_name, signed_specfile_name):
url = url_util.join(mirror.fetch_url, _build_cache_relative_path, file)
try:
_, _, fs = web_util.read_from_url(buildcache_fetch_url_json)
except (URLError, web_util.SpackWebError, HTTPError) as url_err_x:
_, _, fs = web_util.read_from_url(url)
except (URLError, web_util.SpackWebError, HTTPError) as url_err:
tty.debug(
"Did not find {0} on {1}".format(
specfile_name, buildcache_fetch_url_signed_json
),
"Did not find {0} on {1}".format(specfile_name, url),
url_err,
level=2,
)
tty.debug(
"Did not find {0} on {1}".format(specfile_name, buildcache_fetch_url_json),
url_err_x,
level=2,
)
continue
specfile_contents = codecs.getreader("utf-8")(fs).read()
# read the spec from the build cache file. All specs in build caches
# are concrete (as they are built) so we need to mark this spec
# concrete on read-in.
if specfile_is_signed:
specfile_json = Spec.extract_json_from_clearsig(specfile_contents)
fetched_spec = Spec.from_dict(specfile_json)
else:
fetched_spec = Spec.from_json(specfile_contents)
fetched_spec._mark_concrete()
# read the spec from the build cache file. All specs in build caches
# are concrete (as they are built) so we need to mark this spec
# concrete on read-in.
stream = transparently_decompress_bytes_to_string(fs)
fetched_spec = Spec.from_dict(load_possibly_clearsigned_json(stream))
fetched_spec._mark_concrete()
found_specs.append(
{
"mirror_url": mirror.fetch_url,
"spec": fetched_spec,
}
)
found_specs.append(
{
"mirror_url": mirror.fetch_url,
"spec": fetched_spec,
}
)
break
return found_specs
@ -2097,7 +2129,7 @@ def get_keys(install=False, trust=False, force=False, mirrors=None):
try:
_, _, json_file = web_util.read_from_url(keys_index)
json_index = sjson.load(codecs.getreader("utf-8")(json_file))
json_index = sjson.load(transparently_decompress_bytes_to_string(json_file))
except (URLError, web_util.SpackWebError) as url_err:
if web_util.url_exists(keys_index):
err_msg = [
@ -2422,11 +2454,15 @@ def conditional_fetch(self):
raise FetchIndexError("Could not fetch index from {}".format(url_index), e)
try:
result = codecs.getreader("utf-8")(response).read()
binary_result = response.read()
except ValueError as e:
return FetchCacheError("Remote index {} is invalid".format(url_index), e)
computed_hash = compute_hash(result)
# The hash is computed on the raw bytes
computed_hash = compute_hash(binary_result)
# Only then decode as string, possibly decompress
result = transparently_decompress_bytes_to_string(io.BytesIO(binary_result)).read()
# We don't handle computed_hash != remote_hash here, which can happen
# when remote index.json and index.json.hash are out of sync, or if
@ -2480,15 +2516,21 @@ def conditional_fetch(self):
raise FetchIndexError("Could not fetch index {}".format(url), e) from e
try:
result = codecs.getreader("utf-8")(response).read()
binary_result = response.read()
except ValueError as e:
raise FetchIndexError("Remote index {} is invalid".format(url), e) from e
# The hash is computed on the raw bytes
computed_hash = compute_hash(binary_result)
# Only then decode as string, possibly decompress
result = transparently_decompress_bytes_to_string(io.BytesIO(binary_result)).read()
headers = response.headers
etag_header_value = headers.get("Etag", None) or headers.get("etag", None)
return FetchIndexResult(
etag=web_util.parse_etag(etag_header_value),
hash=compute_hash(result),
hash=computed_hash,
data=result,
fresh=False,
)

View file

@ -156,16 +156,6 @@
default_format += "{%compiler.name}{@compiler.version}{compiler_flags}"
default_format += "{variants}{arch=architecture}"
#: Regular expression to pull spec contents out of clearsigned signature
#: file.
CLEARSIGN_FILE_REGEX = re.compile(
(
r"^-----BEGIN PGP SIGNED MESSAGE-----"
r"\s+Hash:\s+[^\s]+\s+(.+)-----BEGIN PGP SIGNATURE-----"
),
re.MULTILINE | re.DOTALL,
)
#: specfile format version. Must increase monotonically
specfile_format_version = 3
@ -2453,27 +2443,6 @@ def from_json(stream):
except Exception as e:
raise sjson.SpackJSONError("error parsing JSON spec:", str(e)) from e
@staticmethod
def extract_json_from_clearsig(data):
m = CLEARSIGN_FILE_REGEX.search(data)
if m:
return sjson.load(m.group(1))
return sjson.load(data)
@staticmethod
def from_signed_json(stream):
"""Construct a spec from clearsigned json spec file.
Args:
stream: string or file object to read from.
"""
data = stream
if hasattr(stream, "read"):
data = stream.read()
extracted_json = Spec.extract_json_from_clearsig(data)
return Spec.from_dict(extracted_json)
@staticmethod
def from_detection(spec_str, extra_attributes=None):
"""Construct a spec from a spec string determined during external

View file

@ -3,7 +3,9 @@
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import glob
import gzip
import io
import json
import os
import platform
import sys
@ -889,3 +891,106 @@ def urlopen(request: urllib.request.Request):
with pytest.raises(bindist.FetchIndexError, match="Could not fetch index"):
fetcher.conditional_fetch()
def test_read_spec_from_signed_json():
spec_dir = os.path.join(test_path, "data", "mirrors", "signed_json")
file_name = (
"linux-ubuntu18.04-haswell-gcc-8.4.0-"
"zlib-1.2.12-g7otk5dra3hifqxej36m5qzm7uyghqgb.spec.json.sig"
)
spec_path = os.path.join(spec_dir, file_name)
def check_spec(spec_to_check):
assert spec_to_check.name == "zlib"
assert spec_to_check._hash == "g7otk5dra3hifqxej36m5qzm7uyghqgb"
with open(spec_path) as fd:
s = Spec.from_dict(bindist.load_possibly_clearsigned_json(fd))
check_spec(s)
def test_load_clearsigned_json():
obj = {"hello": "world"}
clearsigned = """\
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
{}
-----BEGIN PGP SIGNATURE-----
xyz
-----END PGP SIGNATURE-----""".format(
json.dumps(obj)
)
# Should accept strings and streams
assert bindist.load_possibly_clearsigned_json(clearsigned) == obj
assert bindist.load_possibly_clearsigned_json(io.StringIO(clearsigned)) == obj
def test_load_without_clearsigned_json():
obj = {"hello": "world"}
not_clearsigned = json.dumps(obj)
# Should accept strings and streams
assert bindist.load_possibly_clearsigned_json(not_clearsigned) == obj
assert bindist.load_possibly_clearsigned_json(io.StringIO(not_clearsigned)) == obj
def test_json_containing_clearsigned_message_is_json():
# Test that we don't interpret json with a PGP signed message as a string somewhere
# as a clearsigned message. It should just deserialize the json contents.
val = """\
"-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
{}
-----BEGIN PGP SIGNATURE-----
signature
-----END PGP SIGNATURE-----
"""
input = json.dumps({"key": val})
assert bindist.load_possibly_clearsigned_json(input)["key"] == val
def test_clearsign_signature_part_of_json_string():
# Check if we can deal with a string in json containing the string that is used
# at the start of a PGP signature.
obj = {"-----BEGIN PGP SIGNATURE-----": "-----BEGIN PGP SIGNATURE-----"}
input = """\
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
{}
-----BEGIN PGP SIGNATURE-----
signature
-----END PGP SIGNATURE-----
""".format(
json.dumps(obj)
)
assert bindist.load_possibly_clearsigned_json(input) == obj
def test_broken_clearsign_signature():
# In this test there is no PGP signature.
obj = {"-----BEGIN PGP SIGNATURE-----": "-----BEGIN PGP SIGNATURE-----"}
input = """\
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
{}
""".format(
json.dumps(obj)
)
with pytest.raises(ValueError, match="Could not find PGP signature"):
bindist.load_possibly_clearsigned_json(input)
def test_transparent_decompression():
"""Test a roundtrip of string -> gzip -> string."""
text = "this is utf8 that should be compressed"
gzipped = io.BytesIO()
with gzip.GzipFile(fileobj=gzipped, mode="w", compresslevel=1, mtime=0) as f:
f.write(text.encode("utf-8"))
gzipped.seek(0)
assert bindist.transparently_decompress_bytes_to_string(gzipped).read() == text

View file

@ -1324,7 +1324,9 @@ def test_push_mirror_contents(
if file_name.endswith(".spec.json.sig"):
spec_json_path = os.path.join(buildcache_path, file_name)
with open(spec_json_path) as json_fd:
json_object = Spec.extract_json_from_clearsig(json_fd.read())
json_object = spack.binary_distribution.load_possibly_clearsigned_json(
json_fd
)
jsonschema.validate(json_object, specfile_schema)
logs_dir = working_dir.join("logs_dir")

View file

@ -47,27 +47,6 @@ def test_simple_spec():
check_json_round_trip(spec)
def test_read_spec_from_signed_json():
spec_dir = os.path.join(spack.paths.test_path, "data", "mirrors", "signed_json")
file_name = (
"linux-ubuntu18.04-haswell-gcc-8.4.0-"
"zlib-1.2.12-g7otk5dra3hifqxej36m5qzm7uyghqgb.spec.json.sig"
)
spec_path = os.path.join(spec_dir, file_name)
def check_spec(spec_to_check):
assert spec_to_check.name == "zlib"
assert spec_to_check._hash == "g7otk5dra3hifqxej36m5qzm7uyghqgb"
with open(spec_path) as fd:
s = Spec.from_signed_json(fd)
check_spec(s)
with open(spec_path) as fd:
s = Spec.from_signed_json(fd.read())
check_spec(s)
def test_normal_spec(mock_packages):
spec = Spec("mpileaks+debug~opt")
spec.normalize()