Resolve <include-fragment> tags e.g. in github release pages (#36674)

This aims to resolve #34164 by resolving the <include-fragment> tags
that GitHub has started using for their release pages, see
https://github.github.io/include-fragment-element/.

This feels a bit hacky, but it is intended as a starting point for discussion.
After reading a page during spidering, it first parses for
include-fragments, gets them all, and treats them all as separate pages.
Then it looks for href links in both the page itself and the fragments.

Co-authored-by: Alec Scott <alec@bcs.sh>
This commit is contained in:
Wouter Deconinck 2023-04-13 13:26:26 -05:00 committed by GitHub
parent d918ae0bde
commit ff319e9863
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 70 additions and 2 deletions

View file

@@ -0,0 +1 @@
<a href="foo-5.0.0.tar.gz">foo-5.0.0.tar.gz</a>

View file

@@ -0,0 +1,13 @@
<html>
<head>
This is the root page.
</head>
<body>
This is a page with an include-fragment element.
<script type="module" src="https://unpkg.com/@github/include-fragment-element@latest?module"></script>
<include-fragment src="fragment.html">
<p>Loading...</p>
</include-fragment>
</body>
</html>

View file

@@ -31,6 +31,8 @@ def _create_url(relative_url):
page_3 = _create_url("3.html")
page_4 = _create_url("4.html")
root_with_fragment = _create_url("index_with_fragment.html")
@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
@pytest.mark.parametrize(
@@ -143,6 +145,14 @@ def test_find_exotic_versions_of_archive_3():
assert ver("4.5-rc5") in versions
@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
def test_find_versions_of_archive_with_fragment():
    """A version reachable only through an <include-fragment> is still found."""
    found = spack.util.web.find_versions_of_archive(
        root_tarball, root_with_fragment, list_depth=0
    )
    assert ver("5.0.0") in found
def test_get_header():
headers = {"Content-type": "text/plain"}

View file

@@ -75,7 +75,7 @@ class LinkParser(HTMLParser):
links. Good enough for a really simple spider."""
def __init__(self):
HTMLParser.__init__(self)
super().__init__()
self.links = []
def handle_starttag(self, tag, attrs):
@@ -85,6 +85,21 @@ def handle_starttag(self, tag, attrs):
self.links.append(val)
class IncludeFragmentParser(HTMLParser):
    """Collect the ``src`` targets of ``<include-fragment>`` elements.

    These elements are used on GitHub, see
    https://github.github.io/include-fragment-element.
    """

    def __init__(self):
        super().__init__()
        # src attribute values, in document order
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Only <include-fragment> tags carry the lazily loaded content URLs.
        if tag != "include-fragment":
            return
        self.links.extend(val for attr, val in attrs if attr == "src")
def read_from_url(url, accept_content_type=None):
if isinstance(url, str):
url = urllib.parse.urlparse(url)
@@ -550,9 +565,38 @@ def _spider(url, collect_nested):
# Decode the response body as UTF-8 and record the page under its final
# (post-redirect) URL.
page = codecs.getreader("utf-8")(response).read()
pages[response_url] = page
# Parse out the <include-fragment> elements in the page; GitHub uses these
# to lazily load parts of release pages.
# https://github.github.io/include-fragment-element
include_fragment_parser = IncludeFragmentParser()
include_fragment_parser.feed(page)
fragments = set()
while include_fragment_parser.links:
    raw_link = include_fragment_parser.links.pop()
    # Fragment src values may be relative; resolve against the page URL.
    abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)
    try:
        # This seems to be text/html, though text/fragment+html is also used
        fragment_response_url, _, fragment_response = read_from_url(
            abs_link, "text/html"
        )
    except Exception as e:
        msg = f"Error reading fragment: {(type(e), str(e))}:{traceback.format_exc()}"
        tty.debug(msg)
        # NOTE(review): no `continue` here — after an exception,
        # `fragment_response_url`/`fragment_response` below may be unbound
        # (first iteration) or stale from a previous iteration; confirm
        # whether this branch should `continue`.
    if not fragment_response_url or not fragment_response:
        continue
    # Treat each fetched fragment as an additional page of the spidered site.
    fragment = codecs.getreader("utf-8")(fragment_response).read()
    fragments.add(fragment)
    pages[fragment_response_url] = fragment
# Parse out the href links in the page and in all fetched fragments.
link_parser = LinkParser()
link_parser.feed(page)
for fragment in fragments:
    link_parser.feed(fragment)
while link_parser.links:
    raw_link = link_parser.links.pop()