spack.util.url: fix join breakage in python 3.12.6 (#46453)

This commit is contained in:
Harmen Stoppels 2024-09-19 12:29:56 +02:00 committed by Harmen Stoppels
parent 68558b3dd0
commit 1c6c4b4690
2 changed files with 77 additions and 311 deletions

View file

@ -8,6 +8,8 @@
import os.path
import urllib.parse
import pytest
import spack.util.path
import spack.util.url as url_util
@ -45,155 +47,63 @@ def test_relative_path_to_file_url(tmpdir):
assert os.path.samefile(roundtrip, path)
def test_url_join_local_paths():
# Resolve local link against page URL
@pytest.mark.parametrize("resolve_href", [True, False])
@pytest.mark.parametrize("scheme", ["http", "s3", "gs", "file"])
def test_url_join_absolute(scheme, resolve_href):
"""Test that joining a URL with an absolute path works the same for schemes we care about, and
whether we work in web browser mode or not."""
netloc = "" if scheme == "file" else "example.com"
a1 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d/e/f", resolve_href=resolve_href)
a2 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d", "e", "f", resolve_href=resolve_href)
assert a1 == a2 == f"{scheme}://{netloc}/d/e/f"
# wrong:
assert (
url_util.join("s3://bucket/index.html", "../other-bucket/document.txt")
== "s3://bucket/other-bucket/document.txt"
)
# correct - need to specify resolve_href=True:
assert (
url_util.join("s3://bucket/index.html", "../other-bucket/document.txt", resolve_href=True)
== "s3://other-bucket/document.txt"
)
# same as above: make sure several components are joined together correctly
assert (
url_util.join(
# with resolve_href=True, first arg is the base url; can not be
# broken up
"s3://bucket/index.html",
# with resolve_href=True, remaining arguments are the components of
# the local href that needs to be resolved
"..",
"other-bucket",
"document.txt",
resolve_href=True,
)
== "s3://other-bucket/document.txt"
)
# Append local path components to prefix URL
# wrong:
assert (
url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=True)
== "https://mirror.spack.io/my-package"
)
# correct - Need to specify resolve_href=False:
assert (
url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=False)
== "https://mirror.spack.io/build_cache/my-package"
)
# same as above; make sure resolve_href=False is default
assert (
url_util.join("https://mirror.spack.io/build_cache", "my-package")
== "https://mirror.spack.io/build_cache/my-package"
)
# same as above: make sure several components are joined together correctly
assert (
url_util.join(
# with resolve_href=False, first arg is just a prefix. No
# resolution is done. So, there should be no difference between
# join('/a/b/c', 'd/e'),
# join('/a/b', 'c', 'd/e'),
# join('/a', 'b/c', 'd', 'e'), etc.
"https://mirror.spack.io",
"build_cache",
"my-package",
)
== "https://mirror.spack.io/build_cache/my-package"
)
# For s3:// URLs, the "netloc" (bucket) is considered part of the path.
# Make sure join() can cross bucket boundaries in this case.
args = ["s3://bucket/a/b", "new-bucket", "c"]
assert url_util.join(*args) == "s3://bucket/a/b/new-bucket/c"
args.insert(1, "..")
assert url_util.join(*args) == "s3://bucket/a/new-bucket/c"
args.insert(1, "..")
assert url_util.join(*args) == "s3://bucket/new-bucket/c"
# new-bucket is now the "netloc" (bucket name)
args.insert(1, "..")
assert url_util.join(*args) == "s3://new-bucket/c"
b1 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com/b", resolve_href=resolve_href)
b2 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com", "b", resolve_href=resolve_href)
assert b1 == b2 == "https://b.com/b"
def test_url_join_absolute_paths():
# Handling absolute path components is a little tricky. To this end, we
# distinguish "absolute path components", from the more-familiar concept of
# "absolute paths" as they are understood for local filesystem paths.
#
# - All absolute paths are absolute path components. Joining a URL with
# these components has the effect of completely replacing the path of the
# URL with the absolute path. These components do not specify a URL
# scheme, so the scheme of the URL procuced when joining them depend on
# those provided by components that came before it (file:// assumed if no
# such scheme is provided).
@pytest.mark.parametrize("scheme", ["http", "s3", "gs"])
def test_url_join_up(scheme):
"""Test that the netloc component is preserved when going .. up in the path."""
a1 = url_util.join(f"{scheme}://netloc/a/b.html", "c", resolve_href=True)
assert a1 == f"{scheme}://netloc/a/c"
b1 = url_util.join(f"{scheme}://netloc/a/b.html", "../c", resolve_href=True)
b2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "c", resolve_href=True)
assert b1 == b2 == f"{scheme}://netloc/c"
c1 = url_util.join(f"{scheme}://netloc/a/b.html", "../../c", resolve_href=True)
c2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "..", "c", resolve_href=True)
assert c1 == c2 == f"{scheme}://netloc/c"
# For eaxmple:
p = "/path/to/resource"
# ...is an absolute path
d1 = url_util.join(f"{scheme}://netloc/a/b", "c", resolve_href=False)
assert d1 == f"{scheme}://netloc/a/b/c"
d2 = url_util.join(f"{scheme}://netloc/a/b", "../c", resolve_href=False)
d3 = url_util.join(f"{scheme}://netloc/a/b", "..", "c", resolve_href=False)
assert d2 == d3 == f"{scheme}://netloc/a/c"
e1 = url_util.join(f"{scheme}://netloc/a/b", "../../c", resolve_href=False)
e2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "c", resolve_href=False)
assert e1 == e2 == f"{scheme}://netloc/c"
f1 = url_util.join(f"{scheme}://netloc/a/b", "../../../c", resolve_href=False)
f2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "..", "c", resolve_href=False)
assert f1 == f2 == f"{scheme}://netloc/c"
# http:// URL
assert url_util.join("http://example.com/a/b/c", p) == "http://example.com/path/to/resource"
# s3:// URL
# also notice how the netloc is treated as part of the path for s3:// URLs
assert url_util.join("s3://example.com/a/b/c", p) == "s3://path/to/resource"
@pytest.mark.parametrize("scheme", ["http", "https", "ftp", "s3", "gs", "file"])
def test_url_join_resolve_href(scheme):
"""test that `resolve_href=True` behaves like a web browser at the base page, and
`resolve_href=False` behaves like joining paths in a file system at the base directory."""
# these are equivalent because of the trailing /
netloc = "" if scheme == "file" else "netloc"
a1 = url_util.join(f"{scheme}://{netloc}/my/path/", "other/path", resolve_href=True)
a2 = url_util.join(f"{scheme}://{netloc}/my/path/", "other", "path", resolve_href=True)
assert a1 == a2 == f"{scheme}://{netloc}/my/path/other/path"
b1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=False)
b2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=False)
assert b1 == b2 == f"{scheme}://{netloc}/my/path/other/path"
# - URL components that specify a scheme are always absolute path
# components. Joining a base URL with these components effectively
# discards the base URL and "resets" the joining logic starting at the
# component in question and using it as the new base URL.
# For eaxmple:
p = "http://example.com/path/to"
# ...is an http:// URL
join_result = url_util.join(p, "resource")
assert join_result == "http://example.com/path/to/resource"
# works as if everything before the http:// URL was left out
assert url_util.join("literally", "does", "not", "matter", p, "resource") == join_result
assert url_util.join("file:///a/b/c", "./d") == "file:///a/b/c/d"
# Finally, resolve_href should have no effect for how absolute path
# components are handled because local hrefs can not be absolute path
# components.
args = [
"s3://does",
"not",
"matter",
"http://example.com",
"also",
"does",
"not",
"matter",
"/path",
]
expected = "http://example.com/path"
assert url_util.join(*args, resolve_href=True) == expected
assert url_util.join(*args, resolve_href=False) == expected
# resolve_href only matters for the local path components at the end of the
# argument list.
args[-1] = "/path/to/page"
args.extend(("..", "..", "resource"))
assert url_util.join(*args, resolve_href=True) == "http://example.com/resource"
assert url_util.join(*args, resolve_href=False) == "http://example.com/path/resource"
# this is like a web browser: relative to /my.
c1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=True)
c2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=True)
assert c1 == c2 == f"{scheme}://{netloc}/my/other/path"
def test_default_download_name():

View file

@ -7,17 +7,13 @@
Utility functions for parsing, formatting, and manipulating URLs.
"""
import itertools
import os
import posixpath
import re
import sys
import urllib.parse
import urllib.request
from typing import Optional
from llnl.path import convert_to_posix_path
from spack.util.path import sanitize_filename
@ -29,26 +25,6 @@ def validate_scheme(scheme):
return scheme in ("file", "http", "https", "ftp", "s3", "gs", "ssh", "git")
def _split_all(path):
"""Split path into its atomic components.
Returns the shortest list, L, of strings such that posixpath.join(*L) ==
path and posixpath.split(element) == ('', element) for every element in L
except possibly the first. This first element may possibly have the value
of '/'.
"""
result = []
a = path
old_a = None
while a != old_a:
(old_a, (a, b)) = a, posixpath.split(a)
if a or b:
result.insert(0, b or "/")
return result
def local_file_path(url):
"""Get a local file path from a url.
@ -92,151 +68,31 @@ def format(parsed_url):
return parsed_url.geturl()
def join(base_url, path, *extra, **kwargs):
"""Joins a base URL with one or more local URL path components
If resolve_href is True, treat the base URL as though it where the locator
of a web page, and the remaining URL path components as though they formed
a relative URL to be resolved against it (i.e.: as in posixpath.join(...)).
The result is an absolute URL to the resource to which a user's browser
would navigate if they clicked on a link with an "href" attribute equal to
the relative URL.
If resolve_href is False (default), then the URL path components are joined
as in posixpath.join().
Note: file:// URL path components are not canonicalized as part of this
operation. To canonicalize, pass the joined url to format().
Examples:
base_url = 's3://bucket/index.html'
body = fetch_body(prefix)
link = get_href(body) # link == '../other-bucket/document.txt'
# wrong - link is a local URL that needs to be resolved against base_url
spack.util.url.join(base_url, link)
's3://bucket/other_bucket/document.txt'
# correct - resolve local URL against base_url
spack.util.url.join(base_url, link, resolve_href=True)
's3://other_bucket/document.txt'
prefix = 'https://mirror.spack.io/build_cache'
# wrong - prefix is just a URL prefix
spack.util.url.join(prefix, 'my-package', resolve_href=True)
'https://mirror.spack.io/my-package'
# correct - simply append additional URL path components
spack.util.url.join(prefix, 'my-package', resolve_href=False) # default
'https://mirror.spack.io/build_cache/my-package'
# For canonicalizing file:// URLs, take care to explicitly differentiate
# between absolute and relative join components.
"""
paths = [
(x) if isinstance(x, str) else x.geturl() for x in itertools.chain((base_url, path), extra)
]
paths = [convert_to_posix_path(x) for x in paths]
n = len(paths)
last_abs_component = None
scheme = ""
for i in range(n - 1, -1, -1):
obj = urllib.parse.urlparse(paths[i], scheme="", allow_fragments=False)
scheme = obj.scheme
# in either case the component is absolute
if scheme or obj.path.startswith("/"):
if not scheme:
# Without a scheme, we have to go back looking for the
# next-last component that specifies a scheme.
for j in range(i - 1, -1, -1):
obj = urllib.parse.urlparse(paths[j], scheme="", allow_fragments=False)
if obj.scheme:
paths[i] = "{SM}://{NL}{PATH}".format(
SM=obj.scheme,
NL=((obj.netloc + "/") if obj.scheme != "s3" else ""),
PATH=paths[i][1:],
)
break
last_abs_component = i
break
if last_abs_component is not None:
paths = paths[last_abs_component:]
if len(paths) == 1:
result = urllib.parse.urlparse(paths[0], scheme="file", allow_fragments=False)
# another subtlety: If the last argument to join() is an absolute
# file:// URL component with a relative path, the relative path
# needs to be resolved.
if result.scheme == "file" and result.netloc:
result = urllib.parse.ParseResult(
scheme=result.scheme,
netloc="",
path=posixpath.abspath(result.netloc + result.path),
params=result.params,
query=result.query,
fragment=None,
)
return result.geturl()
return _join(*paths, **kwargs)
def _join(base_url, path, *extra, **kwargs):
base_url = urllib.parse.urlparse(base_url)
resolve_href = kwargs.get("resolve_href", False)
(scheme, netloc, base_path, params, query, _) = base_url
scheme = scheme.lower()
path_tokens = [
part
for part in itertools.chain(
_split_all(path),
itertools.chain.from_iterable(_split_all(extra_path) for extra_path in extra),
)
if part and part != "/"
]
base_path_args = ["/fake-root"]
if scheme == "s3":
if netloc:
base_path_args.append(netloc)
if base_path.startswith("/"):
base_path = base_path[1:]
base_path_args.append(base_path)
if resolve_href:
new_base_path, _ = posixpath.split(posixpath.join(*base_path_args))
base_path_args = [new_base_path]
base_path_args.extend(path_tokens)
base_path = posixpath.relpath(posixpath.join(*base_path_args), "/fake-root")
if scheme == "s3":
path_tokens = [part for part in _split_all(base_path) if part and part != "/"]
if path_tokens:
netloc = path_tokens.pop(0)
base_path = posixpath.join("", *path_tokens)
if sys.platform == "win32":
base_path = convert_to_posix_path(base_path)
return format(
urllib.parse.ParseResult(
scheme=scheme, netloc=netloc, path=base_path, params=params, query=query, fragment=None
)
)
def join(base: str, *components: str, resolve_href: bool = False, **kwargs) -> str:
"""Convenience wrapper around ``urllib.parse.urljoin``, with a few differences:
1. By default resolve_href=False, which makes the function like os.path.join: for example
https://example.com/a/b + c/d = https://example.com/a/b/c/d. If resolve_href=True, the
behavior is how a browser would resolve the URL: https://example.com/a/c/d.
2. s3:// and gs:// URLs are joined like http:// URLs.
3. It accepts multiple components for convenience. Note that components[1:] are treated as
literal path components and appended to components[0] separated by slashes."""
# Ensure a trailing slash in the path component of the base URL to get os.path.join-like
# behavior instead of web browser behavior.
if not resolve_href:
parsed = urllib.parse.urlparse(base)
if not parsed.path.endswith("/"):
base = parsed._replace(path=f"{parsed.path}/").geturl()
uses_netloc = urllib.parse.uses_netloc
uses_relative = urllib.parse.uses_relative
try:
# NOTE: we temporarily modify urllib internals so s3 and gs schemes are treated like http.
# This is non-portable, and may be forward incompatible with future cpython versions.
urllib.parse.uses_netloc = [*uses_netloc, "s3", "gs"]
urllib.parse.uses_relative = [*uses_relative, "s3", "gs"]
return urllib.parse.urljoin(base, "/".join(components), **kwargs)
finally:
urllib.parse.uses_netloc = uses_netloc
urllib.parse.uses_relative = uses_relative
def default_download_filename(url: str) -> str: