spack.util.url: fix join breakage in python 3.12.6 (#46453)

2024-09-19 12:29:56 +02:00 · 2024-09-19 12:29:56 +02:00 · 532d844f26
commit 532d844f26
parent 907238a7e8
2 changed files with 77 additions and 310 deletions
--- a/lib/spack/spack/test/util/util_url.py
+++ b/lib/spack/spack/test/util/util_url.py
@ -8,6 +8,8 @@
 import os.path
 import urllib.parse

+import pytest
+
 import spack.util.path
 import spack.util.url as url_util

@ -45,155 +47,63 @@ def test_relative_path_to_file_url(tmpdir):
        assert os.path.samefile(roundtrip, path)


-def test_url_join_local_paths():
-    # Resolve local link against page URL
+@pytest.mark.parametrize("resolve_href", [True, False])
+@pytest.mark.parametrize("scheme", ["http", "s3", "gs", "file"])
+def test_url_join_absolute(scheme, resolve_href):
+    """Test that joining a URL with an absolute path works the same for schemes we care about, and
+    whether we work in web browser mode or not."""
+    netloc = "" if scheme == "file" else "example.com"
+    a1 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d/e/f", resolve_href=resolve_href)
+    a2 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d", "e", "f", resolve_href=resolve_href)
+    assert a1 == a2 == f"{scheme}://{netloc}/d/e/f"

-    # wrong:
-    assert (
-        url_util.join("s3://bucket/index.html", "../other-bucket/document.txt")
-        == "s3://bucket/other-bucket/document.txt"
-    )
-
-    # correct - need to specify resolve_href=True:
-    assert (
-        url_util.join("s3://bucket/index.html", "../other-bucket/document.txt", resolve_href=True)
-        == "s3://other-bucket/document.txt"
-    )
-
-    # same as above: make sure several components are joined together correctly
-    assert (
-        url_util.join(
-            # with resolve_href=True, first arg is the base url; can not be
-            # broken up
-            "s3://bucket/index.html",
-            # with resolve_href=True, remaining arguments are the components of
-            # the local href that needs to be resolved
-            "..",
-            "other-bucket",
-            "document.txt",
-            resolve_href=True,
-        )
-        == "s3://other-bucket/document.txt"
-    )
-
-    # Append local path components to prefix URL
-
-    # wrong:
-    assert (
-        url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=True)
-        == "https://mirror.spack.io/my-package"
-    )
-
-    # correct - Need to specify resolve_href=False:
-    assert (
-        url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=False)
-        == "https://mirror.spack.io/build_cache/my-package"
-    )
-
-    # same as above; make sure resolve_href=False is default
-    assert (
-        url_util.join("https://mirror.spack.io/build_cache", "my-package")
-        == "https://mirror.spack.io/build_cache/my-package"
-    )
-
-    # same as above: make sure several components are joined together correctly
-    assert (
-        url_util.join(
-            # with resolve_href=False, first arg is just a prefix. No
-            # resolution is done.  So, there should be no difference between
-            # join('/a/b/c', 'd/e'),
-            # join('/a/b', 'c', 'd/e'),
-            # join('/a', 'b/c', 'd', 'e'), etc.
-            "https://mirror.spack.io",
-            "build_cache",
-            "my-package",
-        )
-        == "https://mirror.spack.io/build_cache/my-package"
-    )
-
-    # For s3:// URLs, the "netloc" (bucket) is considered part of the path.
-    # Make sure join() can cross bucket boundaries in this case.
-    args = ["s3://bucket/a/b", "new-bucket", "c"]
-    assert url_util.join(*args) == "s3://bucket/a/b/new-bucket/c"
-
-    args.insert(1, "..")
-    assert url_util.join(*args) == "s3://bucket/a/new-bucket/c"
-
-    args.insert(1, "..")
-    assert url_util.join(*args) == "s3://bucket/new-bucket/c"
-
-    # new-bucket is now the "netloc" (bucket name)
-    args.insert(1, "..")
-    assert url_util.join(*args) == "s3://new-bucket/c"
+    b1 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com/b", resolve_href=resolve_href)
+    b2 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com", "b", resolve_href=resolve_href)
+    assert b1 == b2 == "https://b.com/b"


-def test_url_join_absolute_paths():
-    # Handling absolute path components is a little tricky.  To this end, we
-    # distinguish "absolute path components", from the more-familiar concept of
-    # "absolute paths" as they are understood for local filesystem paths.
-    #
-    # - All absolute paths are absolute path components.  Joining a URL with
-    #   these components has the effect of completely replacing the path of the
-    #   URL with the absolute path.  These components do not specify a URL
-    #   scheme, so the scheme of the URL procuced when joining them depend on
-    #   those provided by components that came before it (file:// assumed if no
-    #   such scheme is provided).
+@pytest.mark.parametrize("scheme", ["http", "s3", "gs"])
+def test_url_join_up(scheme):
+    """Test that the netloc component is preserved when going .. up in the path."""
+    a1 = url_util.join(f"{scheme}://netloc/a/b.html", "c", resolve_href=True)
+    assert a1 == f"{scheme}://netloc/a/c"
+    b1 = url_util.join(f"{scheme}://netloc/a/b.html", "../c", resolve_href=True)
+    b2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "c", resolve_href=True)
+    assert b1 == b2 == f"{scheme}://netloc/c"
+    c1 = url_util.join(f"{scheme}://netloc/a/b.html", "../../c", resolve_href=True)
+    c2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "..", "c", resolve_href=True)
+    assert c1 == c2 == f"{scheme}://netloc/c"

-    # For eaxmple:
-    p = "/path/to/resource"
-    # ...is an absolute path
+    d1 = url_util.join(f"{scheme}://netloc/a/b", "c", resolve_href=False)
+    assert d1 == f"{scheme}://netloc/a/b/c"
+    d2 = url_util.join(f"{scheme}://netloc/a/b", "../c", resolve_href=False)
+    d3 = url_util.join(f"{scheme}://netloc/a/b", "..", "c", resolve_href=False)
+    assert d2 == d3 == f"{scheme}://netloc/a/c"
+    e1 = url_util.join(f"{scheme}://netloc/a/b", "../../c", resolve_href=False)
+    e2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "c", resolve_href=False)
+    assert e1 == e2 == f"{scheme}://netloc/c"
+    f1 = url_util.join(f"{scheme}://netloc/a/b", "../../../c", resolve_href=False)
+    f2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "..", "c", resolve_href=False)
+    assert f1 == f2 == f"{scheme}://netloc/c"

-    # http:// URL
-    assert url_util.join("http://example.com/a/b/c", p) == "http://example.com/path/to/resource"

-    # s3:// URL
-    # also notice how the netloc is treated as part of the path for s3:// URLs
-    assert url_util.join("s3://example.com/a/b/c", p) == "s3://path/to/resource"
+@pytest.mark.parametrize("scheme", ["http", "https", "ftp", "s3", "gs", "file"])
+def test_url_join_resolve_href(scheme):
+    """test that `resolve_href=True` behaves like a web browser at the base page, and
+    `resolve_href=False` behaves like joining paths in a file system at the base directory."""
+    # these are equivalent because of the trailing /
+    netloc = "" if scheme == "file" else "netloc"
+    a1 = url_util.join(f"{scheme}://{netloc}/my/path/", "other/path", resolve_href=True)
+    a2 = url_util.join(f"{scheme}://{netloc}/my/path/", "other", "path", resolve_href=True)
+    assert a1 == a2 == f"{scheme}://{netloc}/my/path/other/path"
+    b1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=False)
+    b2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=False)
+    assert b1 == b2 == f"{scheme}://{netloc}/my/path/other/path"

-    # - URL components that specify a scheme are always absolute path
-    #   components.  Joining a base URL with these components effectively
-    #   discards the base URL and "resets" the joining logic starting at the
-    #   component in question and using it as the new base URL.
-
-    # For eaxmple:
-    p = "http://example.com/path/to"
-    # ...is an http:// URL
-
-    join_result = url_util.join(p, "resource")
-    assert join_result == "http://example.com/path/to/resource"
-
-    # works as if everything before the http:// URL was left out
-    assert url_util.join("literally", "does", "not", "matter", p, "resource") == join_result
-
-    assert url_util.join("file:///a/b/c", "./d") == "file:///a/b/c/d"
-
-    # Finally, resolve_href should have no effect for how absolute path
-    # components are handled because local hrefs can not be absolute path
-    # components.
-    args = [
-        "s3://does",
-        "not",
-        "matter",
-        "http://example.com",
-        "also",
-        "does",
-        "not",
-        "matter",
-        "/path",
-    ]
-
-    expected = "http://example.com/path"
-    assert url_util.join(*args, resolve_href=True) == expected
-    assert url_util.join(*args, resolve_href=False) == expected
-
-    # resolve_href only matters for the local path components at the end of the
-    # argument list.
-    args[-1] = "/path/to/page"
-    args.extend(("..", "..", "resource"))
-
-    assert url_util.join(*args, resolve_href=True) == "http://example.com/resource"
-
-    assert url_util.join(*args, resolve_href=False) == "http://example.com/path/resource"
+    # this is like a web browser: relative to /my.
+    c1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=True)
+    c2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=True)
+    assert c1 == c2 == f"{scheme}://{netloc}/my/other/path"


 def test_default_download_name():
--- a/lib/spack/spack/util/url.py
+++ b/lib/spack/spack/util/url.py
@ -7,15 +7,12 @@
 Utility functions for parsing, formatting, and manipulating URLs.
 """

-import itertools
 import os
 import posixpath
 import sys
 import urllib.parse
 import urllib.request

-from llnl.path import convert_to_posix_path
-
 from spack.util.path import sanitize_filename


@ -27,26 +24,6 @@ def validate_scheme(scheme):
    return scheme in ("file", "http", "https", "ftp", "s3", "gs", "ssh", "git")


-def _split_all(path):
-    """Split path into its atomic components.
-
-    Returns the shortest list, L, of strings such that posixpath.join(*L) ==
-    path and posixpath.split(element) == ('', element) for every element in L
-    except possibly the first.  This first element may possibly have the value
-    of '/'.
-    """
-    result = []
-    a = path
-    old_a = None
-    while a != old_a:
-        (old_a, (a, b)) = a, posixpath.split(a)
-
-        if a or b:
-            result.insert(0, b or "/")
-
-    return result
-
-
 def local_file_path(url):
    """Get a local file path from a url.

@ -97,151 +74,31 @@ def format(parsed_url):
    return parsed_url.geturl()


-def join(base_url, path, *extra, **kwargs):
-    """Joins a base URL with one or more local URL path components
-
-    If resolve_href is True, treat the base URL as though it where the locator
-    of a web page, and the remaining URL path components as though they formed
-    a relative URL to be resolved against it (i.e.: as in posixpath.join(...)).
-    The result is an absolute URL to the resource to which a user's browser
-    would navigate if they clicked on a link with an "href" attribute equal to
-    the relative URL.
-
-    If resolve_href is False (default), then the URL path components are joined
-    as in posixpath.join().
-
-    Note: file:// URL path components are not canonicalized as part of this
-    operation.  To canonicalize, pass the joined url to format().
-
-    Examples:
-      base_url = 's3://bucket/index.html'
-      body = fetch_body(prefix)
-      link = get_href(body) # link == '../other-bucket/document.txt'
-
-      # wrong - link is a local URL that needs to be resolved against base_url
-      spack.util.url.join(base_url, link)
-      's3://bucket/other_bucket/document.txt'
-
-      # correct - resolve local URL against base_url
-      spack.util.url.join(base_url, link, resolve_href=True)
-      's3://other_bucket/document.txt'
-
-      prefix = 'https://mirror.spack.io/build_cache'
-
-      # wrong - prefix is just a URL prefix
-      spack.util.url.join(prefix, 'my-package', resolve_href=True)
-      'https://mirror.spack.io/my-package'
-
-      # correct - simply append additional URL path components
-      spack.util.url.join(prefix, 'my-package', resolve_href=False) # default
-      'https://mirror.spack.io/build_cache/my-package'
-
-      # For canonicalizing file:// URLs, take care to explicitly differentiate
-      # between absolute and relative join components.
-    """
-    paths = [
-        (x) if isinstance(x, str) else x.geturl() for x in itertools.chain((base_url, path), extra)
-    ]
-
-    paths = [convert_to_posix_path(x) for x in paths]
-    n = len(paths)
-    last_abs_component = None
-    scheme = ""
-    for i in range(n - 1, -1, -1):
-        obj = urllib.parse.urlparse(paths[i], scheme="", allow_fragments=False)
-
-        scheme = obj.scheme
-
-        # in either case the component is absolute
-        if scheme or obj.path.startswith("/"):
-            if not scheme:
-                # Without a scheme, we have to go back looking for the
-                # next-last component that specifies a scheme.
-                for j in range(i - 1, -1, -1):
-                    obj = urllib.parse.urlparse(paths[j], scheme="", allow_fragments=False)
-
-                    if obj.scheme:
-                        paths[i] = "{SM}://{NL}{PATH}".format(
-                            SM=obj.scheme,
-                            NL=((obj.netloc + "/") if obj.scheme != "s3" else ""),
-                            PATH=paths[i][1:],
-                        )
-                        break
-
-            last_abs_component = i
-            break
-
-    if last_abs_component is not None:
-        paths = paths[last_abs_component:]
-        if len(paths) == 1:
-            result = urllib.parse.urlparse(paths[0], scheme="file", allow_fragments=False)
-
-            # another subtlety: If the last argument to join() is an absolute
-            # file:// URL component with a relative path, the relative path
-            # needs to be resolved.
-            if result.scheme == "file" and result.netloc:
-                result = urllib.parse.ParseResult(
-                    scheme=result.scheme,
-                    netloc="",
-                    path=posixpath.abspath(result.netloc + result.path),
-                    params=result.params,
-                    query=result.query,
-                    fragment=None,
-                )
-
-            return result.geturl()
-
-    return _join(*paths, **kwargs)
-
-
-def _join(base_url, path, *extra, **kwargs):
-    base_url = urllib.parse.urlparse(base_url)
-    resolve_href = kwargs.get("resolve_href", False)
-
-    (scheme, netloc, base_path, params, query, _) = base_url
-    scheme = scheme.lower()
-
-    path_tokens = [
-        part
-        for part in itertools.chain(
-            _split_all(path),
-            itertools.chain.from_iterable(_split_all(extra_path) for extra_path in extra),
-        )
-        if part and part != "/"
-    ]
-
-    base_path_args = ["/fake-root"]
-    if scheme == "s3":
-        if netloc:
-            base_path_args.append(netloc)
-
-    if base_path.startswith("/"):
-        base_path = base_path[1:]
-
-    base_path_args.append(base_path)
-
-    if resolve_href:
-        new_base_path, _ = posixpath.split(posixpath.join(*base_path_args))
-        base_path_args = [new_base_path]
-
-    base_path_args.extend(path_tokens)
-    base_path = posixpath.relpath(posixpath.join(*base_path_args), "/fake-root")
-
-    if scheme == "s3":
-        path_tokens = [part for part in _split_all(base_path) if part and part != "/"]
-
-        if path_tokens:
-            netloc = path_tokens.pop(0)
-            base_path = posixpath.join("", *path_tokens)
-
-    if sys.platform == "win32":
-        base_path = convert_to_posix_path(base_path)
-
-    return format(
-        urllib.parse.ParseResult(
-            scheme=scheme, netloc=netloc, path=base_path, params=params, query=query, fragment=None
-        )
-    )
+def join(base: str, *components: str, resolve_href: bool = False, **kwargs) -> str:
+    """Convenience wrapper around ``urllib.parse.urljoin``, with a few differences:
+    1. By default resolve_href=False, which makes the function like os.path.join: for example
+    https://example.com/a/b + c/d = https://example.com/a/b/c/d. If resolve_href=True, the
+    behavior is how a browser would resolve the URL: https://example.com/a/c/d.
+    2. s3:// and gs:// URLs are joined like http:// URLs.
+    3. It accepts multiple components for convenience. Note that components[1:] are treated as
+    literal path components and appended to components[0] separated by slashes."""
+    # Ensure a trailing slash in the path component of the base URL to get os.path.join-like
+    # behavior instead of web browser behavior.
+    if not resolve_href:
+        parsed = urllib.parse.urlparse(base)
+        if not parsed.path.endswith("/"):
+            base = parsed._replace(path=f"{parsed.path}/").geturl()
+    uses_netloc = urllib.parse.uses_netloc
+    uses_relative = urllib.parse.uses_relative
+    try:
+        # NOTE: we temporarily modify urllib internals so s3 and gs schemes are treated like http.
+        # This is non-portable, and may be forward incompatible with future cpython versions.
+        urllib.parse.uses_netloc = [*uses_netloc, "s3", "gs"]
+        urllib.parse.uses_relative = [*uses_relative, "s3", "gs"]
+        return urllib.parse.urljoin(base, "/".join(components), **kwargs)
+    finally:
+        urllib.parse.uses_netloc = uses_netloc
+        urllib.parse.uses_relative = uses_relative


 def default_download_filename(url: str) -> str: