oci buildcache: handle pagination of tags (#43136)

This fixes an issue where ghcr.io, GitLab, and possibly other container registries paginate the list of tags by default. Pagination violates the OCI distribution spec v1.0, but it is common practice (the spec itself was broken). After this commit, you can create build cache indices of more than 100 specs on ghcr.

Co-authored-by: Harmen Stoppels <me@harmenstoppels.nl>
Authored by Scott Wittenburg (2024-05-18 03:57:53 -06:00), committed by Harmen Stoppels
parent 97369776f0
commit 02d62cf40f
6 changed files with 171 additions and 10 deletions
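
For context on the mechanism being handled here: when a registry paginates, `GET /v2/<name>/tags/list` returns only a first batch of tags plus a `Link: <...>; rel="next"` header pointing at the next page. A minimal stdlib-only sketch of a client that follows such links (registry and image name are placeholders, auth is omitted, and the naive regex stands in for the stricter `parse_link_rel_next` added below):

import json
import re
import urllib.parse
import urllib.request

def fetch_all_tags(registry: str, name: str) -> list:
    """Collect all tags for an image, following rel="next" Link headers."""
    url = f"https://{registry}/v2/{name}/tags/list"
    tags = set()
    while url:
        with urllib.request.urlopen(url) as response:
            tags.update(json.load(response)["tags"])
            link = response.headers.get("Link") or ""
        # A paginated response carries a header such as:
        #   Link: </v2/<name>/tags/list?last=tag-099&n=100>; rel="next"
        match = re.search(r'<([^>]+)>\s*;\s*rel="next"', link)
        # Resolve the (usually relative) next URL against the current one.
        url = urllib.parse.urljoin(url, match.group(1)) if match else None
    return sorted(tags)

The real `list_tags` added in this commit additionally rejects absolute next links that are not https and resolves relative ones against the registry endpoint.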


@@ -13,7 +13,6 @@
 import shutil
 import sys
 import tempfile
-import urllib.request
 from typing import Dict, List, Optional, Tuple, Union

 import llnl.util.tty as tty
@@ -54,6 +53,7 @@
 from spack.oci.oci import (
     copy_missing_layers_with_retry,
     get_manifest_and_config_with_retry,
+    list_tags,
     upload_blob_with_retry,
     upload_manifest_with_retry,
 )
@@ -856,10 +856,7 @@ def _config_from_tag(image_ref: ImageReference, tag: str) -> Optional[dict]:
 def _update_index_oci(image_ref: ImageReference, tmpdir: str, pool: MaybePool) -> None:
-    request = urllib.request.Request(url=image_ref.tags_url())
-    response = spack.oci.opener.urlopen(request)
-    spack.oci.opener.ensure_status(request, response, 200)
-    tags = json.load(response)["tags"]
+    tags = list_tags(image_ref)

     # Fetch all image config files in parallel
     spec_dicts = pool.starmap(


@@ -11,7 +11,7 @@
 import urllib.parse
 import urllib.request
 from http.client import HTTPResponse
-from typing import NamedTuple, Tuple
+from typing import List, NamedTuple, Tuple
 from urllib.request import Request

 import llnl.util.tty as tty
@@ -27,6 +27,7 @@
 import spack.stage
 import spack.traverse
 import spack.util.crypto
+import spack.util.url

 from .image import Digest, ImageReference
@@ -69,6 +70,42 @@ def with_query_param(url: str, param: str, value: str) -> str:
     )


+def list_tags(ref: ImageReference, _urlopen: spack.oci.opener.MaybeOpen = None) -> List[str]:
+    """Retrieves the list of tags associated with an image, handling pagination."""
+    _urlopen = _urlopen or spack.oci.opener.urlopen
+    tags = set()
+    fetch_url = ref.tags_url()
+
+    while True:
+        # Fetch tags
+        request = Request(url=fetch_url)
+        response = _urlopen(request)
+        spack.oci.opener.ensure_status(request, response, 200)
+        tags.update(json.load(response)["tags"])
+
+        # Check for pagination
+        link_header = response.headers["Link"]
+
+        if link_header is None:
+            break
+
+        tty.debug(f"OCI tag pagination: {link_header}")
+
+        rel_next_value = spack.util.url.parse_link_rel_next(link_header)
+
+        if rel_next_value is None:
+            break
+
+        rel_next = urllib.parse.urlparse(rel_next_value)
+
+        if rel_next.scheme not in ("https", ""):
+            break
+
+        fetch_url = ref.endpoint(rel_next_value)
+
+    return sorted(tags)
+
+
 def upload_blob(
     ref: ImageReference,
     file: str,


@@ -151,7 +151,9 @@ class InMemoryOCIRegistry(DummyServer):
     A third option is to use the chunked upload, but this is not implemented here, because
     it's typically a major performance hit in upload speed, so we're not using it in Spack."""

-    def __init__(self, domain: str, allow_single_post: bool = True) -> None:
+    def __init__(
+        self, domain: str, allow_single_post: bool = True, tags_per_page: int = 100
+    ) -> None:
         super().__init__(domain)
         self.router.register("GET", r"/v2/", self.index)
         self.router.register("HEAD", r"/v2/(?P<name>.+)/blobs/(?P<digest>.+)", self.head_blob)
@@ -165,6 +167,9 @@ def __init__(self, domain: str, allow_single_post: bool = True) -> None:
         # If True, allow single POST upload, not all registries support this
         self.allow_single_post = allow_single_post

+        # How many tags are returned in a single request
+        self.tags_per_page = tags_per_page
+
         # Used for POST + PUT upload. This is a map from session ID to image name
         self.sessions: Dict[str, str] = {}
@@ -280,10 +285,34 @@ def handle_upload(self, req: Request, name: str, digest: Digest):
         return MockHTTPResponse(201, "Created", headers={"Location": f"/v2/{name}/blobs/{digest}"})

     def list_tags(self, req: Request, name: str):
+        # Paginate using Link headers, this was added to the spec in the following commit:
+        # https://github.com/opencontainers/distribution-spec/commit/2ed79d930ecec11dd755dc8190409a3b10f01ca9
+
         # List all tags, exclude digests.
-        tags = [_tag for _name, _tag in self.manifests.keys() if _name == name and ":" not in _tag]
-        tags.sort()
-        return MockHTTPResponse.with_json(200, "OK", body={"tags": tags})
+        all_tags = sorted(
+            _tag for _name, _tag in self.manifests.keys() if _name == name and ":" not in _tag
+        )
+
+        query = urllib.parse.parse_qs(urllib.parse.urlparse(req.full_url).query)
+
+        n = int(query["n"][0]) if "n" in query else self.tags_per_page
+
+        if "last" in query:
+            try:
+                offset = all_tags.index(query["last"][0]) + 1
+            except ValueError:
+                return MockHTTPResponse(404, "Not found")
+        else:
+            offset = 0
+
+        tags = all_tags[offset : offset + n]
+
+        if offset + n < len(all_tags):
+            headers = {"Link": f'</v2/{name}/tags/list?last={tags[-1]}&n={n}>; rel="next"'}
+        else:
+            headers = None
+
+        return MockHTTPResponse.with_json(200, "OK", headers=headers, body={"tags": tags})


 class DummyServerUrllibHandler(urllib.request.BaseHandler):
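
The mock's paging contract, for reference: `n` caps the page size, `last` is exclusive (the page starts at the tag after it), an unknown `last` yields a 404, and the `Link` header is omitted on the final page. A hypothetical session, assuming the suite's `create_opener` helper and tags `tag-00` through `tag-19` already uploaded (as in the test further below):

# Hypothetical session against the mock above, tags_per_page=5,
# with tags tag-00 ... tag-19 already pushed via upload_manifest.
urlopen = create_opener(InMemoryOCIRegistry("example.com", tags_per_page=5)).open

# First page: five tags plus a pointer to the next page.
response = urlopen("https://example.com/v2/image/tags/list")
print(json.loads(response.read())["tags"])  # ['tag-00', ..., 'tag-04']
print(response.headers["Link"])  # </v2/image/tags/list?last=tag-04&n=5>; rel="next"

# `last` is exclusive: the page starts at the tag after it.
response = urlopen("https://example.com/v2/image/tags/list?last=tag-14&n=5")
print(json.loads(response.read())["tags"])  # ['tag-15', ..., 'tag-19'], no Link header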


@@ -6,6 +6,7 @@
 import hashlib
 import json
+import random
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -19,6 +20,7 @@
     copy_missing_layers,
     get_manifest_and_config,
     image_from_mirror,
+    list_tags,
     upload_blob,
     upload_manifest,
 )
@@ -670,3 +672,31 @@ def test_retry(url, max_retries, expect_failure, expect_requests):
     assert len(server.requests) == expect_requests
     assert sleep_time == [2**i for i in range(expect_requests - 1)]
+
+
+def test_list_tags():
+    # Follows a relatively new rewording of the OCI distribution spec, which is not yet tagged.
+    # https://github.com/opencontainers/distribution-spec/commit/2ed79d930ecec11dd755dc8190409a3b10f01ca9
+    N = 20
+    urlopen = create_opener(InMemoryOCIRegistry("example.com", tags_per_page=5)).open
+    image = ImageReference.from_string("example.com/image")
+
+    to_tag = lambda i: f"tag-{i:02}"
+
+    # Create N tags in arbitrary order
+    _tags_to_create = [to_tag(i) for i in range(N)]
+    random.shuffle(_tags_to_create)
+
+    for tag in _tags_to_create:
+        upload_manifest(image.with_tag(tag), default_manifest(), tag=True, _urlopen=urlopen)
+
+    # list_tags should return all tags from all pages in order
+    tags = list_tags(image, urlopen)
+    assert len(tags) == N
+    assert [to_tag(i) for i in range(N)] == tags
+
+    # Test a single request, which should give the first 5 tags
+    assert json.loads(urlopen(image.tags_url()).read())["tags"] == [to_tag(i) for i in range(5)]
+
+    # Test response at an offset, which should exclude the `last` tag.
+    assert json.loads(urlopen(image.tags_url() + f"?last={to_tag(N - 3)}").read())["tags"] == [
+        to_tag(i) for i in range(N - 2, N)
+    ]


@@ -207,3 +207,29 @@ def test_default_download_name_dot_dot():
     assert url_util.default_download_filename("https://example.com/.") == "_"
     assert url_util.default_download_filename("https://example.com/..") == "_."
     assert url_util.default_download_filename("https://example.com/.abcdef") == "_abcdef"
+
+
+def test_parse_link_rel_next():
+    parse = url_util.parse_link_rel_next
+    assert parse(r'</abc>; rel="next"') == "/abc"
+    assert parse(r'</abc>; x=y; rel="next", </def>; x=y; rel="prev"') == "/abc"
+    assert parse(r'</abc>; rel="prev"; x=y, </def>; x=y; rel="next"') == "/def"
+
+    # example from RFC5988
+    assert (
+        parse(
+            r"""</TheBook/chapter2>; title*=UTF-8'de'letztes%20Kapitel; rel="previous","""
+            r"""</TheBook/chapter4>; title*=UTF-8'de'n%c3%a4chstes%20Kapitel; rel="next" """
+        )
+        == "/TheBook/chapter4"
+    )
+
+    assert (
+        parse(r"""<https://example.com/example>; key=";a=b, </c/d>; e=f"; rel="next" """)
+        == "https://example.com/example"
+    )
+
+    assert parse("https://example.com/example") is None
+    assert parse("<https://example.com/example; broken=broken") is None
+    assert parse("https://example.com/example; rel=prev") is None
+    assert parse("https://example.com/example; a=b; c=d; g=h") is None


@@ -10,9 +10,11 @@
 import itertools
 import os
 import posixpath
+import re
 import sys
 import urllib.parse
 import urllib.request
+from typing import Optional

 from llnl.path import convert_to_posix_path
@@ -261,3 +263,43 @@ def default_download_filename(url: str) -> str:
         valid_name = "_" + valid_name[1:]

     return valid_name
+
+
+def parse_link_rel_next(link_value: str) -> Optional[str]:
+    """Return the next link from a Link header value, if any."""
+
+    # Relaxed version of RFC5988
+    uri = re.compile(r"\s*<([^>]+)>\s*")
+    param_key = r"[^;=\s]+"
+    quoted_string = r"\"([^\"]+)\""
+    unquoted_param_value = r"([^;,\s]+)"
+    param = re.compile(rf";\s*({param_key})\s*=\s*(?:{quoted_string}|{unquoted_param_value})\s*")
+
+    data = link_value
+
+    # Parse a list of <url>; key=value; key=value, <url>; key=value; key=value, ... links.
+    while True:
+        uri_match = re.match(uri, data)
+        if not uri_match:
+            break
+        uri_reference = uri_match.group(1)
+        data = data[uri_match.end() :]
+
+        # Parse parameter list
+        while True:
+            param_match = re.match(param, data)
+            if not param_match:
+                break
+            key, quoted_value, unquoted_value = param_match.groups()
+            value = quoted_value or unquoted_value
+            data = data[param_match.end() :]
+
+            if key == "rel" and value == "next":
+                return uri_reference
+
+        if not data.startswith(","):
+            break
+
+        data = data[1:]
+
+    return None
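
Why a small parser instead of naive string splitting: quoted parameter values may legally contain `,` and `;`, as one of the test cases above exercises. A quick illustration of how splitting goes wrong:

# Naive comma-splitting breaks on quoted parameter values (header from the test above):
header = '<https://example.com/example>; key=";a=b, </c/d>; e=f"; rel="next" '
print(header.split(","))
# ['<https://example.com/example>; key=";a=b', ' </c/d>; e=f"; rel="next" ']
# The quoted value is torn apart, and the fake "</c/d>" would be misread as a link.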