spack.util.url: fix join breakage in python 3.12.6 (#46453)

This commit is contained in:
Harmen Stoppels 2024-09-19 12:29:56 +02:00 committed by Harmen Stoppels
parent 907238a7e8
commit 532d844f26
2 changed files with 77 additions and 310 deletions

View file

@ -8,6 +8,8 @@
import os.path import os.path
import urllib.parse import urllib.parse
import pytest
import spack.util.path import spack.util.path
import spack.util.url as url_util import spack.util.url as url_util
@ -45,155 +47,63 @@ def test_relative_path_to_file_url(tmpdir):
assert os.path.samefile(roundtrip, path) assert os.path.samefile(roundtrip, path)
def test_url_join_local_paths(): @pytest.mark.parametrize("resolve_href", [True, False])
# Resolve local link against page URL @pytest.mark.parametrize("scheme", ["http", "s3", "gs", "file"])
def test_url_join_absolute(scheme, resolve_href):
"""Test that joining a URL with an absolute path works the same for schemes we care about, and
whether we work in web browser mode or not."""
netloc = "" if scheme == "file" else "example.com"
a1 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d/e/f", resolve_href=resolve_href)
a2 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d", "e", "f", resolve_href=resolve_href)
assert a1 == a2 == f"{scheme}://{netloc}/d/e/f"
# wrong: b1 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com/b", resolve_href=resolve_href)
assert ( b2 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com", "b", resolve_href=resolve_href)
url_util.join("s3://bucket/index.html", "../other-bucket/document.txt") assert b1 == b2 == "https://b.com/b"
== "s3://bucket/other-bucket/document.txt"
)
# correct - need to specify resolve_href=True:
assert (
url_util.join("s3://bucket/index.html", "../other-bucket/document.txt", resolve_href=True)
== "s3://other-bucket/document.txt"
)
# same as above: make sure several components are joined together correctly
assert (
url_util.join(
# with resolve_href=True, first arg is the base url; can not be
# broken up
"s3://bucket/index.html",
# with resolve_href=True, remaining arguments are the components of
# the local href that needs to be resolved
"..",
"other-bucket",
"document.txt",
resolve_href=True,
)
== "s3://other-bucket/document.txt"
)
# Append local path components to prefix URL
# wrong:
assert (
url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=True)
== "https://mirror.spack.io/my-package"
)
# correct - Need to specify resolve_href=False:
assert (
url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=False)
== "https://mirror.spack.io/build_cache/my-package"
)
# same as above; make sure resolve_href=False is default
assert (
url_util.join("https://mirror.spack.io/build_cache", "my-package")
== "https://mirror.spack.io/build_cache/my-package"
)
# same as above: make sure several components are joined together correctly
assert (
url_util.join(
# with resolve_href=False, first arg is just a prefix. No
# resolution is done. So, there should be no difference between
# join('/a/b/c', 'd/e'),
# join('/a/b', 'c', 'd/e'),
# join('/a', 'b/c', 'd', 'e'), etc.
"https://mirror.spack.io",
"build_cache",
"my-package",
)
== "https://mirror.spack.io/build_cache/my-package"
)
# For s3:// URLs, the "netloc" (bucket) is considered part of the path.
# Make sure join() can cross bucket boundaries in this case.
args = ["s3://bucket/a/b", "new-bucket", "c"]
assert url_util.join(*args) == "s3://bucket/a/b/new-bucket/c"
args.insert(1, "..")
assert url_util.join(*args) == "s3://bucket/a/new-bucket/c"
args.insert(1, "..")
assert url_util.join(*args) == "s3://bucket/new-bucket/c"
# new-bucket is now the "netloc" (bucket name)
args.insert(1, "..")
assert url_util.join(*args) == "s3://new-bucket/c"
def test_url_join_absolute_paths(): @pytest.mark.parametrize("scheme", ["http", "s3", "gs"])
# Handling absolute path components is a little tricky. To this end, we def test_url_join_up(scheme):
# distinguish "absolute path components", from the more-familiar concept of """Test that the netloc component is preserved when going .. up in the path."""
# "absolute paths" as they are understood for local filesystem paths. a1 = url_util.join(f"{scheme}://netloc/a/b.html", "c", resolve_href=True)
# assert a1 == f"{scheme}://netloc/a/c"
# - All absolute paths are absolute path components. Joining a URL with b1 = url_util.join(f"{scheme}://netloc/a/b.html", "../c", resolve_href=True)
# these components has the effect of completely replacing the path of the b2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "c", resolve_href=True)
# URL with the absolute path. These components do not specify a URL assert b1 == b2 == f"{scheme}://netloc/c"
# scheme, so the scheme of the URL produced when joining them depends on c1 = url_util.join(f"{scheme}://netloc/a/b.html", "../../c", resolve_href=True)
# those provided by components that came before it (file:// assumed if no c2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "..", "c", resolve_href=True)
# such scheme is provided). assert c1 == c2 == f"{scheme}://netloc/c"
# For example: d1 = url_util.join(f"{scheme}://netloc/a/b", "c", resolve_href=False)
p = "/path/to/resource" assert d1 == f"{scheme}://netloc/a/b/c"
# ...is an absolute path d2 = url_util.join(f"{scheme}://netloc/a/b", "../c", resolve_href=False)
d3 = url_util.join(f"{scheme}://netloc/a/b", "..", "c", resolve_href=False)
assert d2 == d3 == f"{scheme}://netloc/a/c"
e1 = url_util.join(f"{scheme}://netloc/a/b", "../../c", resolve_href=False)
e2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "c", resolve_href=False)
assert e1 == e2 == f"{scheme}://netloc/c"
f1 = url_util.join(f"{scheme}://netloc/a/b", "../../../c", resolve_href=False)
f2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "..", "c", resolve_href=False)
assert f1 == f2 == f"{scheme}://netloc/c"
# http:// URL
assert url_util.join("http://example.com/a/b/c", p) == "http://example.com/path/to/resource"
# s3:// URL @pytest.mark.parametrize("scheme", ["http", "https", "ftp", "s3", "gs", "file"])
# also notice how the netloc is treated as part of the path for s3:// URLs def test_url_join_resolve_href(scheme):
assert url_util.join("s3://example.com/a/b/c", p) == "s3://path/to/resource" """test that `resolve_href=True` behaves like a web browser at the base page, and
`resolve_href=False` behaves like joining paths in a file system at the base directory."""
# these are equivalent because of the trailing /
netloc = "" if scheme == "file" else "netloc"
a1 = url_util.join(f"{scheme}://{netloc}/my/path/", "other/path", resolve_href=True)
a2 = url_util.join(f"{scheme}://{netloc}/my/path/", "other", "path", resolve_href=True)
assert a1 == a2 == f"{scheme}://{netloc}/my/path/other/path"
b1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=False)
b2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=False)
assert b1 == b2 == f"{scheme}://{netloc}/my/path/other/path"
# - URL components that specify a scheme are always absolute path # this is like a web browser: relative to /my.
# components. Joining a base URL with these components effectively c1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=True)
# discards the base URL and "resets" the joining logic starting at the c2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=True)
# component in question and using it as the new base URL. assert c1 == c2 == f"{scheme}://{netloc}/my/other/path"
# For example:
p = "http://example.com/path/to"
# ...is an http:// URL
join_result = url_util.join(p, "resource")
assert join_result == "http://example.com/path/to/resource"
# works as if everything before the http:// URL was left out
assert url_util.join("literally", "does", "not", "matter", p, "resource") == join_result
assert url_util.join("file:///a/b/c", "./d") == "file:///a/b/c/d"
# Finally, resolve_href should have no effect for how absolute path
# components are handled because local hrefs can not be absolute path
# components.
args = [
"s3://does",
"not",
"matter",
"http://example.com",
"also",
"does",
"not",
"matter",
"/path",
]
expected = "http://example.com/path"
assert url_util.join(*args, resolve_href=True) == expected
assert url_util.join(*args, resolve_href=False) == expected
# resolve_href only matters for the local path components at the end of the
# argument list.
args[-1] = "/path/to/page"
args.extend(("..", "..", "resource"))
assert url_util.join(*args, resolve_href=True) == "http://example.com/resource"
assert url_util.join(*args, resolve_href=False) == "http://example.com/path/resource"
def test_default_download_name(): def test_default_download_name():

View file

@ -7,15 +7,12 @@
Utility functions for parsing, formatting, and manipulating URLs. Utility functions for parsing, formatting, and manipulating URLs.
""" """
import itertools
import os import os
import posixpath import posixpath
import sys import sys
import urllib.parse import urllib.parse
import urllib.request import urllib.request
from llnl.path import convert_to_posix_path
from spack.util.path import sanitize_filename from spack.util.path import sanitize_filename
@ -27,26 +24,6 @@ def validate_scheme(scheme):
return scheme in ("file", "http", "https", "ftp", "s3", "gs", "ssh", "git") return scheme in ("file", "http", "https", "ftp", "s3", "gs", "ssh", "git")
def _split_all(path):
"""Split path into its atomic components.
Returns the shortest list, L, of strings such that posixpath.join(*L) ==
path and posixpath.split(element) == ('', element) for every element in L
except possibly the first. This first element may possibly have the value
of '/'.
"""
result = []
a = path
old_a = None
while a != old_a:
(old_a, (a, b)) = a, posixpath.split(a)
if a or b:
result.insert(0, b or "/")
return result
def local_file_path(url): def local_file_path(url):
"""Get a local file path from a url. """Get a local file path from a url.
@ -97,151 +74,31 @@ def format(parsed_url):
return parsed_url.geturl() return parsed_url.geturl()
def join(base_url, path, *extra, **kwargs): def join(base: str, *components: str, resolve_href: bool = False, **kwargs) -> str:
"""Joins a base URL with one or more local URL path components """Convenience wrapper around ``urllib.parse.urljoin``, with a few differences:
1. By default resolve_href=False, which makes the function like os.path.join: for example
If resolve_href is True, treat the base URL as though it were the locator https://example.com/a/b + c/d = https://example.com/a/b/c/d. If resolve_href=True, the
of a web page, and the remaining URL path components as though they formed behavior is how a browser would resolve the URL: https://example.com/a/c/d.
a relative URL to be resolved against it (i.e.: as in posixpath.join(...)). 2. s3:// and gs:// URLs are joined like http:// URLs.
The result is an absolute URL to the resource to which a user's browser 3. It accepts multiple components for convenience. Note that components[1:] are treated as
would navigate if they clicked on a link with an "href" attribute equal to literal path components and appended to components[0] separated by slashes."""
the relative URL. # Ensure a trailing slash in the path component of the base URL to get os.path.join-like
# behavior instead of web browser behavior.
If resolve_href is False (default), then the URL path components are joined if not resolve_href:
as in posixpath.join(). parsed = urllib.parse.urlparse(base)
if not parsed.path.endswith("/"):
Note: file:// URL path components are not canonicalized as part of this base = parsed._replace(path=f"{parsed.path}/").geturl()
operation. To canonicalize, pass the joined url to format(). uses_netloc = urllib.parse.uses_netloc
uses_relative = urllib.parse.uses_relative
Examples: try:
base_url = 's3://bucket/index.html' # NOTE: we temporarily modify urllib internals so s3 and gs schemes are treated like http.
body = fetch_body(prefix) # This is non-portable, and may be forward incompatible with future cpython versions.
link = get_href(body) # link == '../other-bucket/document.txt' urllib.parse.uses_netloc = [*uses_netloc, "s3", "gs"]
urllib.parse.uses_relative = [*uses_relative, "s3", "gs"]
# wrong - link is a local URL that needs to be resolved against base_url return urllib.parse.urljoin(base, "/".join(components), **kwargs)
spack.util.url.join(base_url, link) finally:
's3://bucket/other_bucket/document.txt' urllib.parse.uses_netloc = uses_netloc
urllib.parse.uses_relative = uses_relative
# correct - resolve local URL against base_url
spack.util.url.join(base_url, link, resolve_href=True)
's3://other_bucket/document.txt'
prefix = 'https://mirror.spack.io/build_cache'
# wrong - prefix is just a URL prefix
spack.util.url.join(prefix, 'my-package', resolve_href=True)
'https://mirror.spack.io/my-package'
# correct - simply append additional URL path components
spack.util.url.join(prefix, 'my-package', resolve_href=False) # default
'https://mirror.spack.io/build_cache/my-package'
# For canonicalizing file:// URLs, take care to explicitly differentiate
# between absolute and relative join components.
"""
paths = [
(x) if isinstance(x, str) else x.geturl() for x in itertools.chain((base_url, path), extra)
]
paths = [convert_to_posix_path(x) for x in paths]
n = len(paths)
last_abs_component = None
scheme = ""
for i in range(n - 1, -1, -1):
obj = urllib.parse.urlparse(paths[i], scheme="", allow_fragments=False)
scheme = obj.scheme
# in either case the component is absolute
if scheme or obj.path.startswith("/"):
if not scheme:
# Without a scheme, we have to go back looking for the
# next-last component that specifies a scheme.
for j in range(i - 1, -1, -1):
obj = urllib.parse.urlparse(paths[j], scheme="", allow_fragments=False)
if obj.scheme:
paths[i] = "{SM}://{NL}{PATH}".format(
SM=obj.scheme,
NL=((obj.netloc + "/") if obj.scheme != "s3" else ""),
PATH=paths[i][1:],
)
break
last_abs_component = i
break
if last_abs_component is not None:
paths = paths[last_abs_component:]
if len(paths) == 1:
result = urllib.parse.urlparse(paths[0], scheme="file", allow_fragments=False)
# another subtlety: If the last argument to join() is an absolute
# file:// URL component with a relative path, the relative path
# needs to be resolved.
if result.scheme == "file" and result.netloc:
result = urllib.parse.ParseResult(
scheme=result.scheme,
netloc="",
path=posixpath.abspath(result.netloc + result.path),
params=result.params,
query=result.query,
fragment=None,
)
return result.geturl()
return _join(*paths, **kwargs)
def _join(base_url, path, *extra, **kwargs):
    """Join ``base_url`` with one or more relative path components.

    Args:
        base_url: URL string; it is parsed with ``urllib.parse.urlparse``.
        path: first path component to append.
        *extra: further path components to append after ``path``.
        **kwargs: only ``resolve_href`` is read (default ``False``). When true,
            the last component of the base path is dropped before the
            components are appended.

    Returns:
        The result of ``format()`` applied to a ``ParseResult`` built from the
        joined path; the fragment is always ``None``.
    """
    # NOTE(review): the nesting below was reconstructed from the statements'
    # semantics -- confirm against the original file.
    base_url = urllib.parse.urlparse(base_url)
    resolve_href = kwargs.get("resolve_href", False)

    # The fragment of the base URL is discarded.
    (scheme, netloc, base_path, params, query, _) = base_url
    scheme = scheme.lower()

    # Flatten ``path`` and every extra argument into single components,
    # dropping empty entries and the "/" markers produced by _split_all.
    path_tokens = [
        part
        for part in itertools.chain(
            _split_all(path),
            itertools.chain.from_iterable(_split_all(extra_path) for extra_path in extra),
        )
        if part and part != "/"
    ]

    # Work below a placeholder root; it is stripped again by relpath() below.
    base_path_args = ["/fake-root"]
    if scheme == "s3":
        # For s3 the netloc (bucket) takes part in the path arithmetic.
        if netloc:
            base_path_args.append(netloc)

    # posixpath.join() discards everything before an absolute component, so
    # the leading slash has to go before the base path is appended.
    if base_path.startswith("/"):
        base_path = base_path[1:]

    base_path_args.append(base_path)

    if resolve_href:
        # Keep only the directory part of the base path.
        new_base_path, _ = posixpath.split(posixpath.join(*base_path_args))
        base_path_args = [new_base_path]

    base_path_args.extend(path_tokens)
    # relpath() normalizes the joined path and expresses it relative to the
    # placeholder root.
    base_path = posixpath.relpath(posixpath.join(*base_path_args), "/fake-root")

    if scheme == "s3":
        # The first remaining component becomes the netloc (bucket) again.
        path_tokens = [part for part in _split_all(base_path) if part and part != "/"]

        if path_tokens:
            netloc = path_tokens.pop(0)
            base_path = posixpath.join("", *path_tokens)

    if sys.platform == "win32":
        base_path = convert_to_posix_path(base_path)

    return format(
        urllib.parse.ParseResult(
            scheme=scheme, netloc=netloc, path=base_path, params=params, query=query, fragment=None
        )
    )
def default_download_filename(url: str) -> str: def default_download_filename(url: str) -> str: