spider: respect <base> tag (#40443)

This commit is contained in:
Harmen Stoppels 2023-10-11 17:49:50 +02:00 committed by GitHub
parent 1ab8886695
commit 65e7ec0509
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -110,19 +110,28 @@ def handle_starttag(self, tag, attrs):
self.links.append(val)
class IncludeFragmentParser(HTMLParser):
class ExtractMetadataParser(HTMLParser):
"""This parser takes an HTML page and selects the include-fragments,
used on GitHub, https://github.github.io/include-fragment-element."""
used on GitHub, https://github.github.io/include-fragment-element,
as well as a possible base url."""
def __init__(self):
super().__init__()
self.links = []
self.fragments = []
self.base_url = None
def handle_starttag(self, tag, attrs):
# <include-fragment src="..." />
if tag == "include-fragment":
for attr, val in attrs:
if attr == "src":
self.links.append(val)
self.fragments.append(val)
# <base href="..." />
elif tag == "base":
for attr, val in attrs:
if attr == "href":
self.base_url = val
def read_from_url(url, accept_content_type=None):
@ -625,12 +634,15 @@ def _spider(url: urllib.parse.ParseResult, collect_nested: bool, _visited: Set[s
# Parse out the include-fragments in the page
# https://github.github.io/include-fragment-element
include_fragment_parser = IncludeFragmentParser()
include_fragment_parser.feed(page)
metadata_parser = ExtractMetadataParser()
metadata_parser.feed(page)
# Change of base URL due to <base href="..." /> tag
response_url = metadata_parser.base_url or response_url
fragments = set()
while include_fragment_parser.links:
raw_link = include_fragment_parser.links.pop()
while metadata_parser.fragments:
raw_link = metadata_parser.fragments.pop()
abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)
try: