Resolve <include-fragment> tags, e.g. in GitHub release pages (#36674)

This aims to resolve #34164 by resolving the <include-fragment> tags that GitHub has started using for its release pages; see https://github.github.io/include-fragment-element/. This feels a bit hacky, but it is intended as a starting point for discussion. After reading a page during spidering, the spider first parses it for include-fragments, fetches them all, and treats each one as a separate page. It then looks for href links in both the page itself and the fragments.

Co-authored-by: Alec Scott <alec@bcs.sh>
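For readers unfamiliar with the element: the browser replaces each <include-fragment src="..."> with the HTML fetched from its src URL, so a spider that only reads the raw page never sees the links inside the fragment. A minimal, self-contained sketch of the flow described above, using only the Python standard library (the helper names here are illustrative, not Spack's actual API):

    import urllib.request
    from html.parser import HTMLParser
    from urllib.parse import urljoin


    class AttrCollector(HTMLParser):
        """Collect one attribute of one tag, e.g. every
        <include-fragment src=...> or every <a href=...>."""

        def __init__(self, tag, attr):
            super().__init__()
            self.tag, self.attr, self.values = tag, attr, []

        def handle_starttag(self, tag, attrs):
            if tag == self.tag:
                self.values.extend(val for attr, val in attrs if attr == self.attr)


    def fetch(url):
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf-8")


    def links_with_fragments(url):
        # Read the page itself, then fetch every fragment it includes,
        # resolving each src relative to the including page.
        page = fetch(url)
        frag_parser = AttrCollector("include-fragment", "src")
        frag_parser.feed(page)
        fragments = [fetch(urljoin(url, src.strip())) for src in frag_parser.values]

        # Look for href links in the page and in all of its fragments.
        link_parser = AttrCollector("a", "href")
        for html in [page, *fragments]:
            link_parser.feed(html)
        return link_parser.values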
Parent: d918ae0bde
Commit: ff319e9863

4 changed files with 70 additions and 2 deletions
lib/spack/spack/test/data/web/fragment.html (new file, 1 addition)

@@ -0,0 +1 @@
+<a href="foo-5.0.0.tar.gz">foo-5.0.0.tar.gz</a>
lib/spack/spack/test/data/web/index_with_fragment.html (new file, 13 additions)

@@ -0,0 +1,13 @@
+<html>
+<head>
+This is the root page.
+</head>
+<body>
+This is a page with an include-fragment element.
+
+<script type="module" src="https://unpkg.com/@github/include-fragment-element@latest?module"></script>
+<include-fragment src="fragment.html">
+<p>Loading...</p>
+</include-fragment>
+</body>
+</html>
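For this fixture to work, the fragment's src must resolve against the URL of the page that includes it, which is what url_util.join(..., resolve_href=True) does in the spider change further below. The standard-library equivalent, with a hypothetical host:

    from urllib.parse import urljoin

    # src="fragment.html" inside index_with_fragment.html resolves to a
    # sibling URL of the including page.
    url = urljoin("https://example.com/data/index_with_fragment.html", "fragment.html")
    print(url)  # https://example.com/data/fragment.html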
lib/spack/spack/test/web.py

@@ -31,6 +31,8 @@ def _create_url(relative_url):
 page_3 = _create_url("3.html")
 page_4 = _create_url("4.html")
 
+root_with_fragment = _create_url("index_with_fragment.html")
+
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
 @pytest.mark.parametrize(
@@ -143,6 +145,14 @@ def test_find_exotic_versions_of_archive_3():
     assert ver("4.5-rc5") in versions
 
 
+@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
+def test_find_versions_of_archive_with_fragment():
+    versions = spack.util.web.find_versions_of_archive(
+        root_tarball, root_with_fragment, list_depth=0
+    )
+    assert ver("5.0.0") in versions
+
+
 def test_get_header():
     headers = {"Content-type": "text/plain"}
 
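Assuming the two hunks above land in lib/spack/spack/test/web.py, the new test can be run on its own, e.g. with pytest lib/spack/spack/test/web.py -k fragment. Note that list_depth=0 disables nested spidering, so version 5.0.0 can only be found if the spider really pulls the link out of fragment.html.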
lib/spack/spack/util/web.py

@@ -75,7 +75,7 @@ class LinkParser(HTMLParser):
     links. Good enough for a really simple spider."""
 
     def __init__(self):
-        HTMLParser.__init__(self)
+        super().__init__()
         self.links = []
 
     def handle_starttag(self, tag, attrs):
@@ -85,6 +85,21 @@ def handle_starttag(self, tag, attrs):
                     self.links.append(val)
 
 
+class IncludeFragmentParser(HTMLParser):
+    """This parser takes an HTML page and selects the include-fragments,
+    used on GitHub, https://github.github.io/include-fragment-element."""
+
+    def __init__(self):
+        super().__init__()
+        self.links = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "include-fragment":
+            for attr, val in attrs:
+                if attr == "src":
+                    self.links.append(val)
+
+
 def read_from_url(url, accept_content_type=None):
     if isinstance(url, str):
         url = urllib.parse.urlparse(url)
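A quick sanity check of the new parser (assuming it is importable as spack.util.web.IncludeFragmentParser once this lands; the sample input mirrors the test fixture):

    from spack.util.web import IncludeFragmentParser

    parser = IncludeFragmentParser()
    parser.feed('<include-fragment src="fragment.html"><p>Loading...</p></include-fragment>')
    print(parser.links)  # ['fragment.html']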
@@ -550,9 +565,38 @@ def _spider(url, collect_nested):
         page = codecs.getreader("utf-8")(response).read()
         pages[response_url] = page
 
-        # Parse out the links in the page
+        # Parse out the include-fragments in the page
+        # https://github.github.io/include-fragment-element
+        include_fragment_parser = IncludeFragmentParser()
+        include_fragment_parser.feed(page)
+
+        fragments = set()
+        while include_fragment_parser.links:
+            raw_link = include_fragment_parser.links.pop()
+            abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)
+
+            try:
+                # This seems to be text/html, though text/fragment+html is also used
+                fragment_response_url, _, fragment_response = read_from_url(
+                    abs_link, "text/html"
+                )
+            except Exception as e:
+                msg = f"Error reading fragment: {(type(e), str(e))}:{traceback.format_exc()}"
+                tty.debug(msg)
+
+            if not fragment_response_url or not fragment_response:
+                continue
+
+            fragment = codecs.getreader("utf-8")(fragment_response).read()
+            fragments.add(fragment)
+
+            pages[fragment_response_url] = fragment
+
+        # Parse out the links in the page and all fragments
         link_parser = LinkParser()
         link_parser.feed(page)
+        for fragment in fragments:
+            link_parser.feed(fragment)
 
         while link_parser.links:
             raw_link = link_parser.links.pop()
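If this behaves as intended, the higher-level commands built on find_versions_of_archive, e.g. spack checksum and spack versions --remote, should now also see versions listed on GitHub release pages whose contents are loaded through include-fragment elements.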