diff --git a/lib/spack/spack/cmd/buildcache.py b/lib/spack/spack/cmd/buildcache.py
index 543570cf28..dae2acdeb9 100644
--- a/lib/spack/spack/cmd/buildcache.py
+++ b/lib/spack/spack/cmd/buildcache.py
@@ -13,7 +13,6 @@
 import shutil
 import sys
 import tempfile
-import urllib.request
 from typing import Dict, List, Optional, Tuple, Union
 
 import llnl.util.tty as tty
@@ -54,6 +53,7 @@
 from spack.oci.oci import (
     copy_missing_layers_with_retry,
     get_manifest_and_config_with_retry,
+    list_tags,
     upload_blob_with_retry,
     upload_manifest_with_retry,
 )
@@ -856,10 +856,7 @@ def _config_from_tag(image_ref: ImageReference, tag: str) -> Optional[dict]:
 
 
 def _update_index_oci(image_ref: ImageReference, tmpdir: str, pool: MaybePool) -> None:
-    request = urllib.request.Request(url=image_ref.tags_url())
-    response = spack.oci.opener.urlopen(request)
-    spack.oci.opener.ensure_status(request, response, 200)
-    tags = json.load(response)["tags"]
+    tags = list_tags(image_ref)
 
     # Fetch all image config files in parallel
     spec_dicts = pool.starmap(
diff --git a/lib/spack/spack/oci/oci.py b/lib/spack/spack/oci/oci.py
index fefc66674e..09e79df347 100644
--- a/lib/spack/spack/oci/oci.py
+++ b/lib/spack/spack/oci/oci.py
@@ -11,7 +11,7 @@
 import urllib.parse
 import urllib.request
 from http.client import HTTPResponse
-from typing import NamedTuple, Tuple
+from typing import List, NamedTuple, Tuple
 from urllib.request import Request
 
 import llnl.util.tty as tty
@@ -27,6 +27,7 @@
 import spack.stage
 import spack.traverse
 import spack.util.crypto
+import spack.util.url
 
 from .image import Digest, ImageReference
 
@@ -69,6 +70,42 @@ def with_query_param(url: str, param: str, value: str) -> str:
     )
 
 
+def list_tags(ref: ImageReference, _urlopen: spack.oci.opener.MaybeOpen = None) -> List[str]:
+    """Retrieves the list of tags associated with an image, handling pagination."""
+    _urlopen = _urlopen or spack.oci.opener.urlopen
+    tags = set()
+    fetch_url = ref.tags_url()
+
+    while True:
+        # Fetch tags
+        request = Request(url=fetch_url)
+        response = _urlopen(request)
+        spack.oci.opener.ensure_status(request, response, 200)
+        tags.update(json.load(response)["tags"])
+
+        # Check for pagination
+        link_header = response.headers["Link"]
+
+        if link_header is None:
+            break
+
+        tty.debug(f"OCI tag pagination: {link_header}")
+
+        rel_next_value = spack.util.url.parse_link_rel_next(link_header)
+
+        if rel_next_value is None:
+            break
+
+        rel_next = urllib.parse.urlparse(rel_next_value)
+
+        if rel_next.scheme not in ("https", ""):
+            break
+
+        fetch_url = ref.endpoint(rel_next_value)
+
+    return sorted(tags)
+
+
 def upload_blob(
     ref: ImageReference,
     file: str,
diff --git a/lib/spack/spack/test/oci/mock_registry.py b/lib/spack/spack/test/oci/mock_registry.py
index cc39904f3c..288598089d 100644
--- a/lib/spack/spack/test/oci/mock_registry.py
+++ b/lib/spack/spack/test/oci/mock_registry.py
@@ -151,7 +151,9 @@ class InMemoryOCIRegistry(DummyServer):
     A third option is to use the chunked upload, but this is not implemented here, because
     it's typically a major performance hit in upload speed, so we're not using it in Spack."""
 
-    def __init__(self, domain: str, allow_single_post: bool = True) -> None:
+    def __init__(
+        self, domain: str, allow_single_post: bool = True, tags_per_page: int = 100
+    ) -> None:
         super().__init__(domain)
         self.router.register("GET", r"/v2/", self.index)
         self.router.register("HEAD", r"/v2/(?P<name>.+)/blobs/(?P<digest>.+)", self.head_blob)
@@ -165,6 +167,9 @@ def __init__(self, domain: str, allow_single_post: bool = True) -> None:
         # If True, allow single POST upload, not all registries support this
         self.allow_single_post = allow_single_post
 
+        # How many tags are returned in a single request
+        self.tags_per_page = tags_per_page
+
         # Used for POST + PUT upload. This is a map from session ID to image name
         self.sessions: Dict[str, str] = {}
 
@@ -280,10 +285,34 @@ def handle_upload(self, req: Request, name: str, digest: Digest):
         return MockHTTPResponse(201, "Created", headers={"Location": f"/v2/{name}/blobs/{digest}"})
 
     def list_tags(self, req: Request, name: str):
+        # Paginate using Link headers; this was added to the spec in the following commit:
+        # https://github.com/opencontainers/distribution-spec/commit/2ed79d930ecec11dd755dc8190409a3b10f01ca9
+
         # List all tags, exclude digests.
-        tags = [_tag for _name, _tag in self.manifests.keys() if _name == name and ":" not in _tag]
-        tags.sort()
-        return MockHTTPResponse.with_json(200, "OK", body={"tags": tags})
+        all_tags = sorted(
+            _tag for _name, _tag in self.manifests.keys() if _name == name and ":" not in _tag
+        )
+
+        query = urllib.parse.parse_qs(urllib.parse.urlparse(req.full_url).query)
+
+        n = int(query["n"][0]) if "n" in query else self.tags_per_page
+
+        if "last" in query:
+            try:
+                offset = all_tags.index(query["last"][0]) + 1
+            except ValueError:
+                return MockHTTPResponse(404, "Not found")
+        else:
+            offset = 0
+
+        tags = all_tags[offset : offset + n]
+
+        if offset + n < len(all_tags):
+            headers = {"Link": f'</v2/{name}/tags/list?last={tags[-1]}&n={n}>; rel="next"'}
+        else:
+            headers = None
+
+        return MockHTTPResponse.with_json(200, "OK", headers=headers, body={"tags": tags})
 
 
 class DummyServerUrllibHandler(urllib.request.BaseHandler):
diff --git a/lib/spack/spack/test/oci/urlopen.py b/lib/spack/spack/test/oci/urlopen.py
index 78d713f7e8..efc3f3c2b0 100644
--- a/lib/spack/spack/test/oci/urlopen.py
+++ b/lib/spack/spack/test/oci/urlopen.py
@@ -6,6 +6,7 @@
 
 import hashlib
 import json
+import random
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -19,6 +20,7 @@
     copy_missing_layers,
     get_manifest_and_config,
     image_from_mirror,
+    list_tags,
     upload_blob,
     upload_manifest,
 )
@@ -670,3 +672,31 @@ def test_retry(url, max_retries, expect_failure, expect_requests):
 
     assert len(server.requests) == expect_requests
     assert sleep_time == [2**i for i in range(expect_requests - 1)]
+
+
+def test_list_tags():
+    # Follows a relatively new rewording of the OCI distribution spec, which is not yet tagged.
+    # https://github.com/opencontainers/distribution-spec/commit/2ed79d930ecec11dd755dc8190409a3b10f01ca9
+    N = 20
+    urlopen = create_opener(InMemoryOCIRegistry("example.com", tags_per_page=5)).open
+    image = ImageReference.from_string("example.com/image")
+    to_tag = lambda i: f"tag-{i:02}"
+
+    # Create N tags in arbitrary order
+    _tags_to_create = [to_tag(i) for i in range(N)]
+    random.shuffle(_tags_to_create)
+    for tag in _tags_to_create:
+        upload_manifest(image.with_tag(tag), default_manifest(), tag=True, _urlopen=urlopen)
+
+    # list_tags should return all tags from all pages in order
+    tags = list_tags(image, urlopen)
+    assert len(tags) == N
+    assert [to_tag(i) for i in range(N)] == tags
+
+    # Test a single request, which should give the first 5 tags
+    assert json.loads(urlopen(image.tags_url()).read())["tags"] == [to_tag(i) for i in range(5)]
+
+    # Test response at an offset, which should exclude the `last` tag.
+    assert json.loads(urlopen(image.tags_url() + f"?last={to_tag(N - 3)}").read())["tags"] == [
+        to_tag(i) for i in range(N - 2, N)
+    ]
diff --git a/lib/spack/spack/test/util/util_url.py b/lib/spack/spack/test/util/util_url.py
index e2b403f82e..befaaef1cd 100644
--- a/lib/spack/spack/test/util/util_url.py
+++ b/lib/spack/spack/test/util/util_url.py
@@ -207,3 +207,29 @@ def test_default_download_name_dot_dot():
     assert url_util.default_download_filename("https://example.com/.") == "_"
     assert url_util.default_download_filename("https://example.com/..") == "_."
     assert url_util.default_download_filename("https://example.com/.abcdef") == "_abcdef"
+
+
+def test_parse_link_rel_next():
+    parse = url_util.parse_link_rel_next
+    assert parse(r'</abc>; rel="next"') == "/abc"
+    assert parse(r'</abc>; x=y; rel="next", </def>; x=y; rel="prev"') == "/abc"
+    assert parse(r'</abc>; rel="prev"; x=y, </def>; x=y; rel="next"') == "/def"
+
+    # example from RFC5988
+    assert (
+        parse(
+            r"""</TheBook/chapter2>; title*=UTF-8'de'letztes%20Kapitel; rel="previous","""
+            r"""</TheBook/chapter4>; title*=UTF-8'de'n%c3%a4chstes%20Kapitel; rel="next" """
+        )
+        == "/TheBook/chapter4"
+    )
+
+    assert (
+        parse(r"""<https://example.com/example>; key="<ignored>;a=b, <ignored>; e=f"; rel="next" """)
+        == "https://example.com/example"
+    )
+
+    assert parse("https://example.com/example") is None
+    assert parse("<https://example.com/example") is None
+    assert parse("<https://example.com/example>") is None
+    assert parse('<https://example.com/example>; rel="prev"') is None
diff --git a/lib/spack/spack/util/url.py b/lib/spack/spack/util/url.py
--- a/lib/spack/spack/util/url.py
+++ b/lib/spack/spack/util/url.py
@@ -667,3 +667,43 @@ def default_download_filename(url: str) -> str:
         valid_name = "_" + valid_name[1:]
 
     return valid_name
+
+
+def parse_link_rel_next(link_value: str) -> Optional[str]:
+    """Return the next link from a Link header value, if any."""
+
+    # Relaxed version of RFC5988
+    uri = re.compile(r"\s*<([^>]+)>\s*")
+    param_key = r"[^;=\s]+"
+    quoted_string = r"\"([^\"]+)\""
+    unquoted_param_value = r"([^;,\s]+)"
+    param = re.compile(rf";\s*({param_key})\s*=\s*(?:{quoted_string}|{unquoted_param_value})\s*")
+
+    data = link_value
+
+    # Parse a list of <url>; key=value; key=value, <url>; key=value; key=value, ... links.
+    while True:
+        uri_match = re.match(uri, data)
+        if not uri_match:
+            break
+        uri_reference = uri_match.group(1)
+        data = data[uri_match.end() :]
+
+        # Parse parameter list
+        while True:
+            param_match = re.match(param, data)
+            if not param_match:
+                break
+            key, quoted_value, unquoted_value = param_match.groups()
+            value = quoted_value or unquoted_value
+            data = data[param_match.end() :]
+
+            if key == "rel" and value == "next":
+                return uri_reference
+
+        if not data.startswith(","):
+            break
+
+        data = data[1:]
+
+    return None
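
Note (not part of the patch): a minimal sketch of how the new Link-header parsing drives the pagination loop in list_tags, assuming this patch is applied in a Spack checkout. The header value below is made up, but it follows the same format the mock registry above emits.

    import spack.util.url as url_util

    # A paginated response carries a Link header whose rel="next" entry points at the
    # next page; parse_link_rel_next extracts that URI reference.
    link = '</v2/image/tags/list?last=tag-04&n=5>; rel="next"'
    assert url_util.parse_link_rel_next(link) == "/v2/image/tags/list?last=tag-04&n=5"

    # Without a rel="next" entry there is no next page, so the loop terminates.
    assert url_util.parse_link_rel_next('</v2/image/tags/list>; rel="prev"') is None

list_tags then only follows the link when its scheme is https or empty (a relative reference), resolving the latter against the registry endpoint via ref.endpoint(), so a registry cannot redirect the client to an arbitrary scheme.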