From 532d844f2660a1201d71f9e38acae16ff9e0971c Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Thu, 19 Sep 2024 12:29:56 +0200 Subject: [PATCH] spack.util.url: fix join breakage in python 3.12.6 (#46453) --- lib/spack/spack/test/util/util_url.py | 194 +++++++------------------- lib/spack/spack/util/url.py | 193 ++++--------------------- 2 files changed, 77 insertions(+), 310 deletions(-) diff --git a/lib/spack/spack/test/util/util_url.py b/lib/spack/spack/test/util/util_url.py index ec8f60627a..0929f98f71 100644 --- a/lib/spack/spack/test/util/util_url.py +++ b/lib/spack/spack/test/util/util_url.py @@ -8,6 +8,8 @@ import os.path import urllib.parse +import pytest + import spack.util.path import spack.util.url as url_util @@ -45,155 +47,63 @@ def test_relative_path_to_file_url(tmpdir): assert os.path.samefile(roundtrip, path) -def test_url_join_local_paths(): - # Resolve local link against page URL +@pytest.mark.parametrize("resolve_href", [True, False]) +@pytest.mark.parametrize("scheme", ["http", "s3", "gs", "file"]) +def test_url_join_absolute(scheme, resolve_href): + """Test that joining a URL with an absolute path works the same for schemes we care about, and + whether we work in web browser mode or not.""" + netloc = "" if scheme == "file" else "example.com" + a1 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d/e/f", resolve_href=resolve_href) + a2 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d", "e", "f", resolve_href=resolve_href) + assert a1 == a2 == f"{scheme}://{netloc}/d/e/f" - # wrong: - assert ( - url_util.join("s3://bucket/index.html", "../other-bucket/document.txt") - == "s3://bucket/other-bucket/document.txt" - ) - - # correct - need to specify resolve_href=True: - assert ( - url_util.join("s3://bucket/index.html", "../other-bucket/document.txt", resolve_href=True) - == "s3://other-bucket/document.txt" - ) - - # same as above: make sure several components are joined together correctly - assert ( - url_util.join( - # with resolve_href=True, first arg is the base url; can not be - # broken up - "s3://bucket/index.html", - # with resolve_href=True, remaining arguments are the components of - # the local href that needs to be resolved - "..", - "other-bucket", - "document.txt", - resolve_href=True, - ) - == "s3://other-bucket/document.txt" - ) - - # Append local path components to prefix URL - - # wrong: - assert ( - url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=True) - == "https://mirror.spack.io/my-package" - ) - - # correct - Need to specify resolve_href=False: - assert ( - url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=False) - == "https://mirror.spack.io/build_cache/my-package" - ) - - # same as above; make sure resolve_href=False is default - assert ( - url_util.join("https://mirror.spack.io/build_cache", "my-package") - == "https://mirror.spack.io/build_cache/my-package" - ) - - # same as above: make sure several components are joined together correctly - assert ( - url_util.join( - # with resolve_href=False, first arg is just a prefix. No - # resolution is done. So, there should be no difference between - # join('/a/b/c', 'd/e'), - # join('/a/b', 'c', 'd/e'), - # join('/a', 'b/c', 'd', 'e'), etc. - "https://mirror.spack.io", - "build_cache", - "my-package", - ) - == "https://mirror.spack.io/build_cache/my-package" - ) - - # For s3:// URLs, the "netloc" (bucket) is considered part of the path. - # Make sure join() can cross bucket boundaries in this case. - args = ["s3://bucket/a/b", "new-bucket", "c"] - assert url_util.join(*args) == "s3://bucket/a/b/new-bucket/c" - - args.insert(1, "..") - assert url_util.join(*args) == "s3://bucket/a/new-bucket/c" - - args.insert(1, "..") - assert url_util.join(*args) == "s3://bucket/new-bucket/c" - - # new-bucket is now the "netloc" (bucket name) - args.insert(1, "..") - assert url_util.join(*args) == "s3://new-bucket/c" + b1 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com/b", resolve_href=resolve_href) + b2 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com", "b", resolve_href=resolve_href) + assert b1 == b2 == "https://b.com/b" -def test_url_join_absolute_paths(): - # Handling absolute path components is a little tricky. To this end, we - # distinguish "absolute path components", from the more-familiar concept of - # "absolute paths" as they are understood for local filesystem paths. - # - # - All absolute paths are absolute path components. Joining a URL with - # these components has the effect of completely replacing the path of the - # URL with the absolute path. These components do not specify a URL - # scheme, so the scheme of the URL procuced when joining them depend on - # those provided by components that came before it (file:// assumed if no - # such scheme is provided). +@pytest.mark.parametrize("scheme", ["http", "s3", "gs"]) +def test_url_join_up(scheme): + """Test that the netloc component is preserved when going .. up in the path.""" + a1 = url_util.join(f"{scheme}://netloc/a/b.html", "c", resolve_href=True) + assert a1 == f"{scheme}://netloc/a/c" + b1 = url_util.join(f"{scheme}://netloc/a/b.html", "../c", resolve_href=True) + b2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "c", resolve_href=True) + assert b1 == b2 == f"{scheme}://netloc/c" + c1 = url_util.join(f"{scheme}://netloc/a/b.html", "../../c", resolve_href=True) + c2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "..", "c", resolve_href=True) + assert c1 == c2 == f"{scheme}://netloc/c" - # For eaxmple: - p = "/path/to/resource" - # ...is an absolute path + d1 = url_util.join(f"{scheme}://netloc/a/b", "c", resolve_href=False) + assert d1 == f"{scheme}://netloc/a/b/c" + d2 = url_util.join(f"{scheme}://netloc/a/b", "../c", resolve_href=False) + d3 = url_util.join(f"{scheme}://netloc/a/b", "..", "c", resolve_href=False) + assert d2 == d3 == f"{scheme}://netloc/a/c" + e1 = url_util.join(f"{scheme}://netloc/a/b", "../../c", resolve_href=False) + e2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "c", resolve_href=False) + assert e1 == e2 == f"{scheme}://netloc/c" + f1 = url_util.join(f"{scheme}://netloc/a/b", "../../../c", resolve_href=False) + f2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "..", "c", resolve_href=False) + assert f1 == f2 == f"{scheme}://netloc/c" - # http:// URL - assert url_util.join("http://example.com/a/b/c", p) == "http://example.com/path/to/resource" - # s3:// URL - # also notice how the netloc is treated as part of the path for s3:// URLs - assert url_util.join("s3://example.com/a/b/c", p) == "s3://path/to/resource" +@pytest.mark.parametrize("scheme", ["http", "https", "ftp", "s3", "gs", "file"]) +def test_url_join_resolve_href(scheme): + """test that `resolve_href=True` behaves like a web browser at the base page, and + `resolve_href=False` behaves like joining paths in a file system at the base directory.""" + # these are equivalent because of the trailing / + netloc = "" if scheme == "file" else "netloc" + a1 = url_util.join(f"{scheme}://{netloc}/my/path/", "other/path", resolve_href=True) + a2 = url_util.join(f"{scheme}://{netloc}/my/path/", "other", "path", resolve_href=True) + assert a1 == a2 == f"{scheme}://{netloc}/my/path/other/path" + b1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=False) + b2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=False) + assert b1 == b2 == f"{scheme}://{netloc}/my/path/other/path" - # - URL components that specify a scheme are always absolute path - # components. Joining a base URL with these components effectively - # discards the base URL and "resets" the joining logic starting at the - # component in question and using it as the new base URL. - - # For eaxmple: - p = "http://example.com/path/to" - # ...is an http:// URL - - join_result = url_util.join(p, "resource") - assert join_result == "http://example.com/path/to/resource" - - # works as if everything before the http:// URL was left out - assert url_util.join("literally", "does", "not", "matter", p, "resource") == join_result - - assert url_util.join("file:///a/b/c", "./d") == "file:///a/b/c/d" - - # Finally, resolve_href should have no effect for how absolute path - # components are handled because local hrefs can not be absolute path - # components. - args = [ - "s3://does", - "not", - "matter", - "http://example.com", - "also", - "does", - "not", - "matter", - "/path", - ] - - expected = "http://example.com/path" - assert url_util.join(*args, resolve_href=True) == expected - assert url_util.join(*args, resolve_href=False) == expected - - # resolve_href only matters for the local path components at the end of the - # argument list. - args[-1] = "/path/to/page" - args.extend(("..", "..", "resource")) - - assert url_util.join(*args, resolve_href=True) == "http://example.com/resource" - - assert url_util.join(*args, resolve_href=False) == "http://example.com/path/resource" + # this is like a web browser: relative to /my. + c1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=True) + c2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=True) + assert c1 == c2 == f"{scheme}://{netloc}/my/other/path" def test_default_download_name(): diff --git a/lib/spack/spack/util/url.py b/lib/spack/spack/util/url.py index 0644a17ada..9337b103c8 100644 --- a/lib/spack/spack/util/url.py +++ b/lib/spack/spack/util/url.py @@ -7,15 +7,12 @@ Utility functions for parsing, formatting, and manipulating URLs. """ -import itertools import os import posixpath import sys import urllib.parse import urllib.request -from llnl.path import convert_to_posix_path - from spack.util.path import sanitize_filename @@ -27,26 +24,6 @@ def validate_scheme(scheme): return scheme in ("file", "http", "https", "ftp", "s3", "gs", "ssh", "git") -def _split_all(path): - """Split path into its atomic components. - - Returns the shortest list, L, of strings such that posixpath.join(*L) == - path and posixpath.split(element) == ('', element) for every element in L - except possibly the first. This first element may possibly have the value - of '/'. - """ - result = [] - a = path - old_a = None - while a != old_a: - (old_a, (a, b)) = a, posixpath.split(a) - - if a or b: - result.insert(0, b or "/") - - return result - - def local_file_path(url): """Get a local file path from a url. @@ -97,151 +74,31 @@ def format(parsed_url): return parsed_url.geturl() -def join(base_url, path, *extra, **kwargs): - """Joins a base URL with one or more local URL path components - - If resolve_href is True, treat the base URL as though it where the locator - of a web page, and the remaining URL path components as though they formed - a relative URL to be resolved against it (i.e.: as in posixpath.join(...)). - The result is an absolute URL to the resource to which a user's browser - would navigate if they clicked on a link with an "href" attribute equal to - the relative URL. - - If resolve_href is False (default), then the URL path components are joined - as in posixpath.join(). - - Note: file:// URL path components are not canonicalized as part of this - operation. To canonicalize, pass the joined url to format(). - - Examples: - base_url = 's3://bucket/index.html' - body = fetch_body(prefix) - link = get_href(body) # link == '../other-bucket/document.txt' - - # wrong - link is a local URL that needs to be resolved against base_url - spack.util.url.join(base_url, link) - 's3://bucket/other_bucket/document.txt' - - # correct - resolve local URL against base_url - spack.util.url.join(base_url, link, resolve_href=True) - 's3://other_bucket/document.txt' - - prefix = 'https://mirror.spack.io/build_cache' - - # wrong - prefix is just a URL prefix - spack.util.url.join(prefix, 'my-package', resolve_href=True) - 'https://mirror.spack.io/my-package' - - # correct - simply append additional URL path components - spack.util.url.join(prefix, 'my-package', resolve_href=False) # default - 'https://mirror.spack.io/build_cache/my-package' - - # For canonicalizing file:// URLs, take care to explicitly differentiate - # between absolute and relative join components. - """ - paths = [ - (x) if isinstance(x, str) else x.geturl() for x in itertools.chain((base_url, path), extra) - ] - - paths = [convert_to_posix_path(x) for x in paths] - n = len(paths) - last_abs_component = None - scheme = "" - for i in range(n - 1, -1, -1): - obj = urllib.parse.urlparse(paths[i], scheme="", allow_fragments=False) - - scheme = obj.scheme - - # in either case the component is absolute - if scheme or obj.path.startswith("/"): - if not scheme: - # Without a scheme, we have to go back looking for the - # next-last component that specifies a scheme. - for j in range(i - 1, -1, -1): - obj = urllib.parse.urlparse(paths[j], scheme="", allow_fragments=False) - - if obj.scheme: - paths[i] = "{SM}://{NL}{PATH}".format( - SM=obj.scheme, - NL=((obj.netloc + "/") if obj.scheme != "s3" else ""), - PATH=paths[i][1:], - ) - break - - last_abs_component = i - break - - if last_abs_component is not None: - paths = paths[last_abs_component:] - if len(paths) == 1: - result = urllib.parse.urlparse(paths[0], scheme="file", allow_fragments=False) - - # another subtlety: If the last argument to join() is an absolute - # file:// URL component with a relative path, the relative path - # needs to be resolved. - if result.scheme == "file" and result.netloc: - result = urllib.parse.ParseResult( - scheme=result.scheme, - netloc="", - path=posixpath.abspath(result.netloc + result.path), - params=result.params, - query=result.query, - fragment=None, - ) - - return result.geturl() - - return _join(*paths, **kwargs) - - -def _join(base_url, path, *extra, **kwargs): - base_url = urllib.parse.urlparse(base_url) - resolve_href = kwargs.get("resolve_href", False) - - (scheme, netloc, base_path, params, query, _) = base_url - scheme = scheme.lower() - - path_tokens = [ - part - for part in itertools.chain( - _split_all(path), - itertools.chain.from_iterable(_split_all(extra_path) for extra_path in extra), - ) - if part and part != "/" - ] - - base_path_args = ["/fake-root"] - if scheme == "s3": - if netloc: - base_path_args.append(netloc) - - if base_path.startswith("/"): - base_path = base_path[1:] - - base_path_args.append(base_path) - - if resolve_href: - new_base_path, _ = posixpath.split(posixpath.join(*base_path_args)) - base_path_args = [new_base_path] - - base_path_args.extend(path_tokens) - base_path = posixpath.relpath(posixpath.join(*base_path_args), "/fake-root") - - if scheme == "s3": - path_tokens = [part for part in _split_all(base_path) if part and part != "/"] - - if path_tokens: - netloc = path_tokens.pop(0) - base_path = posixpath.join("", *path_tokens) - - if sys.platform == "win32": - base_path = convert_to_posix_path(base_path) - - return format( - urllib.parse.ParseResult( - scheme=scheme, netloc=netloc, path=base_path, params=params, query=query, fragment=None - ) - ) +def join(base: str, *components: str, resolve_href: bool = False, **kwargs) -> str: + """Convenience wrapper around ``urllib.parse.urljoin``, with a few differences: + 1. By default resolve_href=False, which makes the function like os.path.join: for example + https://example.com/a/b + c/d = https://example.com/a/b/c/d. If resolve_href=True, the + behavior is how a browser would resolve the URL: https://example.com/a/c/d. + 2. s3:// and gs:// URLs are joined like http:// URLs. + 3. It accepts multiple components for convenience. Note that components[1:] are treated as + literal path components and appended to components[0] separated by slashes.""" + # Ensure a trailing slash in the path component of the base URL to get os.path.join-like + # behavior instead of web browser behavior. + if not resolve_href: + parsed = urllib.parse.urlparse(base) + if not parsed.path.endswith("/"): + base = parsed._replace(path=f"{parsed.path}/").geturl() + uses_netloc = urllib.parse.uses_netloc + uses_relative = urllib.parse.uses_relative + try: + # NOTE: we temporarily modify urllib internals so s3 and gs schemes are treated like http. + # This is non-portable, and may be forward incompatible with future cpython versions. + urllib.parse.uses_netloc = [*uses_netloc, "s3", "gs"] + urllib.parse.uses_relative = [*uses_relative, "s3", "gs"] + return urllib.parse.urljoin(base, "/".join(components), **kwargs) + finally: + urllib.parse.uses_netloc = uses_netloc + urllib.parse.uses_relative = uses_relative def default_download_filename(url: str) -> str: