spack.util.url: fix join breakage in python 3.12.6 (#46453)
This commit is contained in: parent 907238a7e8, commit 532d844f26
2 changed files with 77 additions and 310 deletions
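As a quick orientation, here is a minimal sketch of the behavior the rewritten join() has after this commit. The calls and expected results mirror the new docstring and tests shown further down; they are illustrative and not part of the diff itself.

import spack.util.url as url_util

# Default (resolve_href=False): joins like os.path.join on the URL path.
url_util.join("https://example.com/a/b", "c", "d")
# -> "https://example.com/a/b/c/d"

# resolve_href=True: resolves like a browser following a link on the base page.
url_util.join("https://example.com/a/b.html", "c/d", resolve_href=True)
# -> "https://example.com/a/c/d"

# s3:// and gs:// are treated like http://, so ".." no longer escapes the bucket (netloc).
url_util.join("s3://bucket/a/b.html", "../../c", resolve_href=True)
# -> "s3://bucket/c"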
Changes to the tests for spack.util.url:

@@ -8,6 +8,8 @@
 import os.path

 import urllib.parse

+import pytest
+
 import spack.util.path
 import spack.util.url as url_util

@@ -45,155 +47,63 @@ def test_relative_path_to_file_url(tmpdir):
     assert os.path.samefile(roundtrip, path)


-def test_url_join_local_paths():
-    # Resolve local link against page URL
-
-    # wrong:
-    assert (
-        url_util.join("s3://bucket/index.html", "../other-bucket/document.txt")
-        == "s3://bucket/other-bucket/document.txt"
-    )
-
-    # correct - need to specify resolve_href=True:
-    assert (
-        url_util.join("s3://bucket/index.html", "../other-bucket/document.txt", resolve_href=True)
-        == "s3://other-bucket/document.txt"
-    )
-
-    # same as above: make sure several components are joined together correctly
-    assert (
-        url_util.join(
-            # with resolve_href=True, first arg is the base url; can not be
-            # broken up
-            "s3://bucket/index.html",
-            # with resolve_href=True, remaining arguments are the components of
-            # the local href that needs to be resolved
-            "..",
-            "other-bucket",
-            "document.txt",
-            resolve_href=True,
-        )
-        == "s3://other-bucket/document.txt"
-    )
-
-    # Append local path components to prefix URL
-
-    # wrong:
-    assert (
-        url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=True)
-        == "https://mirror.spack.io/my-package"
-    )
-
-    # correct - need to specify resolve_href=False:
-    assert (
-        url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=False)
-        == "https://mirror.spack.io/build_cache/my-package"
-    )
-
-    # same as above; make sure resolve_href=False is default
-    assert (
-        url_util.join("https://mirror.spack.io/build_cache", "my-package")
-        == "https://mirror.spack.io/build_cache/my-package"
-    )
-
-    # same as above: make sure several components are joined together correctly
-    assert (
-        url_util.join(
-            # with resolve_href=False, first arg is just a prefix. No
-            # resolution is done. So, there should be no difference between
-            # join('/a/b/c', 'd/e'),
-            # join('/a/b', 'c', 'd/e'),
-            # join('/a', 'b/c', 'd', 'e'), etc.
-            "https://mirror.spack.io",
-            "build_cache",
-            "my-package",
-        )
-        == "https://mirror.spack.io/build_cache/my-package"
-    )
-
-    # For s3:// URLs, the "netloc" (bucket) is considered part of the path.
-    # Make sure join() can cross bucket boundaries in this case.
-    args = ["s3://bucket/a/b", "new-bucket", "c"]
-    assert url_util.join(*args) == "s3://bucket/a/b/new-bucket/c"
-
-    args.insert(1, "..")
-    assert url_util.join(*args) == "s3://bucket/a/new-bucket/c"
-
-    args.insert(1, "..")
-    assert url_util.join(*args) == "s3://bucket/new-bucket/c"
-
-    # new-bucket is now the "netloc" (bucket name)
-    args.insert(1, "..")
-    assert url_util.join(*args) == "s3://new-bucket/c"
-
-
-def test_url_join_absolute_paths():
-    # Handling absolute path components is a little tricky. To this end, we
-    # distinguish "absolute path components" from the more familiar concept of
-    # "absolute paths" as they are understood for local filesystem paths.
-    #
-    # - All absolute paths are absolute path components. Joining a URL with
-    #   these components has the effect of completely replacing the path of the
-    #   URL with the absolute path. These components do not specify a URL
-    #   scheme, so the scheme of the URL produced when joining them depends on
-    #   those provided by components that came before it (file:// assumed if no
-    #   such scheme is provided).
-
-    # For example:
-    p = "/path/to/resource"
-    # ...is an absolute path
-
-    # http:// URL
-    assert url_util.join("http://example.com/a/b/c", p) == "http://example.com/path/to/resource"
-
-    # s3:// URL
-    # also notice how the netloc is treated as part of the path for s3:// URLs
-    assert url_util.join("s3://example.com/a/b/c", p) == "s3://path/to/resource"
-
-    # - URL components that specify a scheme are always absolute path
-    #   components. Joining a base URL with these components effectively
-    #   discards the base URL and "resets" the joining logic starting at the
-    #   component in question, using it as the new base URL.
-
-    # For example:
-    p = "http://example.com/path/to"
-    # ...is an http:// URL
-
-    join_result = url_util.join(p, "resource")
-    assert join_result == "http://example.com/path/to/resource"
-
-    # works as if everything before the http:// URL was left out
-    assert url_util.join("literally", "does", "not", "matter", p, "resource") == join_result
-
-    assert url_util.join("file:///a/b/c", "./d") == "file:///a/b/c/d"
-
-    # Finally, resolve_href should have no effect on how absolute path
-    # components are handled, because local hrefs can not be absolute path
-    # components.
-    args = [
-        "s3://does",
-        "not",
-        "matter",
-        "http://example.com",
-        "also",
-        "does",
-        "not",
-        "matter",
-        "/path",
-    ]
-
-    expected = "http://example.com/path"
-    assert url_util.join(*args, resolve_href=True) == expected
-    assert url_util.join(*args, resolve_href=False) == expected
-
-    # resolve_href only matters for the local path components at the end of the
-    # argument list.
-    args[-1] = "/path/to/page"
-    args.extend(("..", "..", "resource"))
-
-    assert url_util.join(*args, resolve_href=True) == "http://example.com/resource"
-    assert url_util.join(*args, resolve_href=False) == "http://example.com/path/resource"
+@pytest.mark.parametrize("resolve_href", [True, False])
+@pytest.mark.parametrize("scheme", ["http", "s3", "gs", "file"])
+def test_url_join_absolute(scheme, resolve_href):
+    """Test that joining a URL with an absolute path works the same for schemes we care about, and
+    whether we work in web browser mode or not."""
+    netloc = "" if scheme == "file" else "example.com"
+    a1 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d/e/f", resolve_href=resolve_href)
+    a2 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d", "e", "f", resolve_href=resolve_href)
+    assert a1 == a2 == f"{scheme}://{netloc}/d/e/f"
+
+    b1 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com/b", resolve_href=resolve_href)
+    b2 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com", "b", resolve_href=resolve_href)
+    assert b1 == b2 == "https://b.com/b"
+
+
+@pytest.mark.parametrize("scheme", ["http", "s3", "gs"])
+def test_url_join_up(scheme):
+    """Test that the netloc component is preserved when going .. up in the path."""
+    a1 = url_util.join(f"{scheme}://netloc/a/b.html", "c", resolve_href=True)
+    assert a1 == f"{scheme}://netloc/a/c"
+    b1 = url_util.join(f"{scheme}://netloc/a/b.html", "../c", resolve_href=True)
+    b2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "c", resolve_href=True)
+    assert b1 == b2 == f"{scheme}://netloc/c"
+    c1 = url_util.join(f"{scheme}://netloc/a/b.html", "../../c", resolve_href=True)
+    c2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "..", "c", resolve_href=True)
+    assert c1 == c2 == f"{scheme}://netloc/c"
+
+    d1 = url_util.join(f"{scheme}://netloc/a/b", "c", resolve_href=False)
+    assert d1 == f"{scheme}://netloc/a/b/c"
+    d2 = url_util.join(f"{scheme}://netloc/a/b", "../c", resolve_href=False)
+    d3 = url_util.join(f"{scheme}://netloc/a/b", "..", "c", resolve_href=False)
+    assert d2 == d3 == f"{scheme}://netloc/a/c"
+    e1 = url_util.join(f"{scheme}://netloc/a/b", "../../c", resolve_href=False)
+    e2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "c", resolve_href=False)
+    assert e1 == e2 == f"{scheme}://netloc/c"
+    f1 = url_util.join(f"{scheme}://netloc/a/b", "../../../c", resolve_href=False)
+    f2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "..", "c", resolve_href=False)
+    assert f1 == f2 == f"{scheme}://netloc/c"
+
+
+@pytest.mark.parametrize("scheme", ["http", "https", "ftp", "s3", "gs", "file"])
+def test_url_join_resolve_href(scheme):
+    """test that `resolve_href=True` behaves like a web browser at the base page, and
+    `resolve_href=False` behaves like joining paths in a file system at the base directory."""
+    # these are equivalent because of the trailing /
+    netloc = "" if scheme == "file" else "netloc"
+    a1 = url_util.join(f"{scheme}://{netloc}/my/path/", "other/path", resolve_href=True)
+    a2 = url_util.join(f"{scheme}://{netloc}/my/path/", "other", "path", resolve_href=True)
+    assert a1 == a2 == f"{scheme}://{netloc}/my/path/other/path"
+    b1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=False)
+    b2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=False)
+    assert b1 == b2 == f"{scheme}://{netloc}/my/path/other/path"
+
+    # this is like a web browser: relative to /my.
+    c1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=True)
+    c2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=True)
+    assert c1 == c2 == f"{scheme}://{netloc}/my/other/path"


 def test_default_download_name():
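The "equivalent because of the trailing /" comment in test_url_join_resolve_href refers to standard urllib.parse.urljoin semantics, which the new implementation builds on. A small standard-library-only illustration (not part of the diff):

import urllib.parse

# Browser-style resolution replaces the last path segment of the base...
urllib.parse.urljoin("https://example.com/my/path", "other")
# -> "https://example.com/my/other"

# ...unless the base path ends in a slash, in which case the segment is appended.
urllib.parse.urljoin("https://example.com/my/path/", "other")
# -> "https://example.com/my/path/other"

This is why the new join() appends a trailing slash to the base path when resolve_href=False: it turns browser-style resolution into os.path.join-style appending.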
Changes to spack.util.url itself:

@@ -7,15 +7,12 @@
 Utility functions for parsing, formatting, and manipulating URLs.
 """

-import itertools
 import os
 import posixpath
 import sys
 import urllib.parse
 import urllib.request

-from llnl.path import convert_to_posix_path
-
 from spack.util.path import sanitize_filename


@@ -27,26 +24,6 @@ def validate_scheme(scheme):
     return scheme in ("file", "http", "https", "ftp", "s3", "gs", "ssh", "git")


-def _split_all(path):
-    """Split path into its atomic components.
-
-    Returns the shortest list, L, of strings such that posixpath.join(*L) ==
-    path and posixpath.split(element) == ('', element) for every element in L
-    except possibly the first. This first element may possibly have the value
-    of '/'.
-    """
-    result = []
-    a = path
-    old_a = None
-    while a != old_a:
-        (old_a, (a, b)) = a, posixpath.split(a)
-        if a or b:
-            result.insert(0, b or "/")
-    return result
-
-
 def local_file_path(url):
     """Get a local file path from a url.
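For reference, the contract described in the removed helper's docstring amounts to the following. This is an illustrative check only, with example values not taken from the diff, and it assumes the _split_all definition shown above is in scope:

import posixpath

# An absolute path keeps "/" as its first atomic component...
assert _split_all("/a/b/c") == ["/", "a", "b", "c"]
assert posixpath.join(*_split_all("/a/b/c")) == "/a/b/c"

# ...while a relative path splits into plain segments.
assert _split_all("a/b/c") == ["a", "b", "c"]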
@@ -97,151 +74,31 @@ def format(parsed_url):
     return parsed_url.geturl()


-def join(base_url, path, *extra, **kwargs):
-    """Joins a base URL with one or more local URL path components
-
-    If resolve_href is True, treat the base URL as though it were the locator
-    of a web page, and the remaining URL path components as though they formed
-    a relative URL to be resolved against it (i.e.: as in posixpath.join(...)).
-    The result is an absolute URL to the resource to which a user's browser
-    would navigate if they clicked on a link with an "href" attribute equal to
-    the relative URL.
-
-    If resolve_href is False (default), then the URL path components are joined
-    as in posixpath.join().
-
-    Note: file:// URL path components are not canonicalized as part of this
-    operation. To canonicalize, pass the joined url to format().
-
-    Examples:
-      base_url = 's3://bucket/index.html'
-      body = fetch_body(prefix)
-      link = get_href(body)  # link == '../other-bucket/document.txt'
-
-      # wrong - link is a local URL that needs to be resolved against base_url
-      spack.util.url.join(base_url, link)
-      's3://bucket/other_bucket/document.txt'
-
-      # correct - resolve local URL against base_url
-      spack.util.url.join(base_url, link, resolve_href=True)
-      's3://other_bucket/document.txt'
-
-      prefix = 'https://mirror.spack.io/build_cache'
-
-      # wrong - prefix is just a URL prefix
-      spack.util.url.join(prefix, 'my-package', resolve_href=True)
-      'https://mirror.spack.io/my-package'
-
-      # correct - simply append additional URL path components
-      spack.util.url.join(prefix, 'my-package', resolve_href=False)  # default
-      'https://mirror.spack.io/build_cache/my-package'
-
-      # For canonicalizing file:// URLs, take care to explicitly differentiate
-      # between absolute and relative join components.
-    """
-    paths = [
-        (x) if isinstance(x, str) else x.geturl() for x in itertools.chain((base_url, path), extra)
-    ]
-
-    paths = [convert_to_posix_path(x) for x in paths]
-    n = len(paths)
-    last_abs_component = None
-    scheme = ""
-    for i in range(n - 1, -1, -1):
-        obj = urllib.parse.urlparse(paths[i], scheme="", allow_fragments=False)
-
-        scheme = obj.scheme
-
-        # in either case the component is absolute
-        if scheme or obj.path.startswith("/"):
-            if not scheme:
-                # Without a scheme, we have to go back looking for the
-                # next-last component that specifies a scheme.
-                for j in range(i - 1, -1, -1):
-                    obj = urllib.parse.urlparse(paths[j], scheme="", allow_fragments=False)
-
-                    if obj.scheme:
-                        paths[i] = "{SM}://{NL}{PATH}".format(
-                            SM=obj.scheme,
-                            NL=((obj.netloc + "/") if obj.scheme != "s3" else ""),
-                            PATH=paths[i][1:],
-                        )
-                        break
-
-            last_abs_component = i
-            break
-
-    if last_abs_component is not None:
-        paths = paths[last_abs_component:]
-        if len(paths) == 1:
-            result = urllib.parse.urlparse(paths[0], scheme="file", allow_fragments=False)
-
-            # another subtlety: If the last argument to join() is an absolute
-            # file:// URL component with a relative path, the relative path
-            # needs to be resolved.
-            if result.scheme == "file" and result.netloc:
-                result = urllib.parse.ParseResult(
-                    scheme=result.scheme,
-                    netloc="",
-                    path=posixpath.abspath(result.netloc + result.path),
-                    params=result.params,
-                    query=result.query,
-                    fragment=None,
-                )
-
-            return result.geturl()
-
-    return _join(*paths, **kwargs)
-
-
-def _join(base_url, path, *extra, **kwargs):
-    base_url = urllib.parse.urlparse(base_url)
-    resolve_href = kwargs.get("resolve_href", False)
-
-    (scheme, netloc, base_path, params, query, _) = base_url
-    scheme = scheme.lower()
-
-    path_tokens = [
-        part
-        for part in itertools.chain(
-            _split_all(path),
-            itertools.chain.from_iterable(_split_all(extra_path) for extra_path in extra),
-        )
-        if part and part != "/"
-    ]
-
-    base_path_args = ["/fake-root"]
-    if scheme == "s3":
-        if netloc:
-            base_path_args.append(netloc)
-
-    if base_path.startswith("/"):
-        base_path = base_path[1:]
-
-    base_path_args.append(base_path)
-
-    if resolve_href:
-        new_base_path, _ = posixpath.split(posixpath.join(*base_path_args))
-        base_path_args = [new_base_path]
-
-    base_path_args.extend(path_tokens)
-    base_path = posixpath.relpath(posixpath.join(*base_path_args), "/fake-root")
-
-    if scheme == "s3":
-        path_tokens = [part for part in _split_all(base_path) if part and part != "/"]
-
-        if path_tokens:
-            netloc = path_tokens.pop(0)
-            base_path = posixpath.join("", *path_tokens)
-
-    if sys.platform == "win32":
-        base_path = convert_to_posix_path(base_path)
-
-    return format(
-        urllib.parse.ParseResult(
-            scheme=scheme, netloc=netloc, path=base_path, params=params, query=query, fragment=None
-        )
-    )
+def join(base: str, *components: str, resolve_href: bool = False, **kwargs) -> str:
+    """Convenience wrapper around ``urllib.parse.urljoin``, with a few differences:
+    1. By default resolve_href=False, which makes the function like os.path.join: for example
+       https://example.com/a/b + c/d = https://example.com/a/b/c/d. If resolve_href=True, the
+       behavior is how a browser would resolve the URL: https://example.com/a/c/d.
+    2. s3:// and gs:// URLs are joined like http:// URLs.
+    3. It accepts multiple components for convenience. Note that components[1:] are treated as
+       literal path components and appended to components[0] separated by slashes."""
+    # Ensure a trailing slash in the path component of the base URL to get os.path.join-like
+    # behavior instead of web browser behavior.
+    if not resolve_href:
+        parsed = urllib.parse.urlparse(base)
+        if not parsed.path.endswith("/"):
+            base = parsed._replace(path=f"{parsed.path}/").geturl()
+    uses_netloc = urllib.parse.uses_netloc
+    uses_relative = urllib.parse.uses_relative
+    try:
+        # NOTE: we temporarily modify urllib internals so s3 and gs schemes are treated like http.
+        # This is non-portable, and may be forward incompatible with future cpython versions.
+        urllib.parse.uses_netloc = [*uses_netloc, "s3", "gs"]
+        urllib.parse.uses_relative = [*uses_relative, "s3", "gs"]
+        return urllib.parse.urljoin(base, "/".join(components), **kwargs)
+    finally:
+        urllib.parse.uses_netloc = uses_netloc
+        urllib.parse.uses_relative = uses_relative


 def default_download_filename(url: str) -> str:
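The NOTE in the new join() is the crux of the fix: by default, urllib.parse.urljoin does not resolve relative references for schemes it does not recognize, such as s3:// and gs://. A standard-library-only sketch of the effect (illustrative, not part of the diff):

import urllib.parse

# Unregistered scheme: urljoin refuses to resolve and returns the relative part unchanged.
urllib.parse.urljoin("s3://bucket/a/b.html", "c")
# -> "c"

# After registering the scheme, it resolves like an http:// URL.
urllib.parse.uses_relative.append("s3")
urllib.parse.uses_netloc.append("s3")
urllib.parse.urljoin("s3://bucket/a/b.html", "c")
# -> "s3://bucket/a/c"

Unlike this sketch, the committed code registers the schemes only temporarily and restores the original lists in a finally block, since mutating urllib internals is global, non-portable state.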