Fix version scraping for CRAN packages (#12021)

* Fix version scraping for CRAN packages
* Remove set literals
Adam J. Stewart 2019-07-22 15:05:55 -05:00 committed by Todd Gamblin
parent cc4094bb9b
commit 2d144316a8
2 changed files with 20 additions and 20 deletions

lib/spack/spack/url.py

@@ -43,8 +43,8 @@
 # work on paths and URLs. There's not a good word for both of these, but
 # "path" seemed like the most generic term.
 #
-def find_list_url(url):
-    r"""Finds a good list URL for the supplied URL.
+def find_list_urls(url):
+    r"""Find good list URLs for the supplied URL.

     By default, returns the dirname of the archive path.
@@ -62,7 +62,7 @@ def find_list_url(url):
         url (str): The download URL for the package

     Returns:
-        str: The list URL for the package
+        set: One or more list URLs for the package
     """

     url_types = [
@@ -93,12 +93,14 @@ def find_list_url(url):
          lambda m: m.group(1) + '/Archive/' + m.group(2)),
     ]

+    list_urls = set([os.path.dirname(url)])
+
     for pattern, fun in url_types:
         match = re.search(pattern, url)
         if match:
-            return fun(match)
-    else:
-        return os.path.dirname(url)
+            list_urls.add(fun(match))
+
+    return list_urls


 def strip_query_and_fragment(path):
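
Why the return type becomes a set: CRAN lists only the current release of a package under src/contrib/ and moves every older tarball to src/contrib/Archive/<package>/, so complete version scraping has to visit both directories. A minimal self-contained sketch of the new behavior; the single CRAN regex below is illustrative, not the full url_types table from this file:

    import os
    import re

    def find_list_urls(url):
        # Illustrative stand-in for the url_types table: map a CRAN
        # tarball URL to the package's Archive/ listing directory.
        url_types = [
            (r'(.*\.r-project\.org/src/contrib)/([^_/]+)',
             lambda m: m.group(1) + '/Archive/' + m.group(2)),
        ]

        # The dirname of the archive is always a candidate listing page.
        list_urls = set([os.path.dirname(url)])

        for pattern, fun in url_types:
            match = re.search(pattern, url)
            if match:
                list_urls.add(fun(match))

        return list_urls

    print(find_list_urls(
        'https://cloud.r-project.org/src/contrib/Rcpp_1.0.1.tar.gz'))
    # {'https://cloud.r-project.org/src/contrib',
    #  'https://cloud.r-project.org/src/contrib/Archive/Rcpp'}

Writing set([...]) instead of a {...} literal matches the "Remove set literals" bullet in the commit message.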

lib/spack/spack/util/web.py

@@ -270,20 +270,18 @@ def find_versions_of_archive(archive_urls, list_url=None, list_depth=0):
     """Scrape web pages for new versions of a tarball.

     Arguments:
-      archive_urls:
-          URL or sequence of URLs for different versions of a
-          package. Typically these are just the tarballs from the package
-          file itself.  By default, this searches the parent directories
-          of archives.
+        archive_urls (str or list or tuple): URL or sequence of URLs for
+            different versions of a package. Typically these are just the
+            tarballs from the package file itself. By default, this searches
+            the parent directories of archives.

     Keyword Arguments:
-      list_url:
-          URL for a listing of archives.  Spack wills scrape these
-          pages for download links that look like the archive URL.
+        list_url (str or None): URL for a listing of archives.
+            Spack will scrape these pages for download links that look
+            like the archive URL.

-      list_depth:
-          Max depth to follow links on list_url pages. Default 0.
+        list_depth (int): Max depth to follow links on list_url pages.
+            Defaults to 0.
     """
     if not isinstance(archive_urls, (list, tuple)):
         archive_urls = [archive_urls]
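
For orientation, a hypothetical call to the scraper after this change. The module path and the shape of the return value (a mapping from detected version to the URL it was found at) are assumptions drawn from the surrounding Spack code, not part of this diff:

    # Hypothetical usage; assumes the function lives in spack.util.web
    # and returns a dict of version -> download URL.
    import spack.util.web as web

    versions = web.find_versions_of_archive(
        'https://cloud.r-project.org/src/contrib/Rcpp_1.0.1.tar.gz')

    # With find_list_urls returning a set, the scrape also covers
    # src/contrib/Archive/Rcpp/, so releases older than 1.0.1 appear.
    for version, url in sorted(versions.items()):
        print(version, url)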
@@ -291,17 +289,17 @@ def find_versions_of_archive(archive_urls, list_url=None, list_depth=0):
     # Generate a list of list_urls based on archive urls and any
     # explicitly listed list_url in the package
     list_urls = set()
-    if list_url:
+    if list_url is not None:
         list_urls.add(list_url)
     for aurl in archive_urls:
-        list_urls.add(spack.url.find_list_url(aurl))
+        list_urls |= spack.url.find_list_urls(aurl)

     # Add '/' to the end of the URL. Some web servers require this.
     additional_list_urls = set()
     for lurl in list_urls:
         if not lurl.endswith('/'):
             additional_list_urls.add(lurl + '/')
-    list_urls.update(additional_list_urls)
+    list_urls |= additional_list_urls

     # Grab some web pages to scrape.
     pages = {}
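
The caller-side counterpart of the rename is the switch from add() to the |= union operator: find_list_urls now returns a set, and set.add() would try to insert that whole set as a single element and raise TypeError (sets are unhashable). A small sketch of the merging step, with a stub standing in for spack.url.find_list_urls:

    def find_list_urls(url):
        # Stub for spack.url.find_list_urls; returns multiple candidates.
        return set(['https://example.com/pkg',
                    'https://example.com/pkg/Archive/foo'])

    archive_urls = ['https://example.com/pkg/foo-1.0.tar.gz']
    list_url = None

    list_urls = set()
    if list_url is not None:        # explicit None check, not truthiness
        list_urls.add(list_url)
    for aurl in archive_urls:
        list_urls |= find_list_urls(aurl)   # union; add() would raise
                                            # TypeError: unhashable type

    # Mirror the trailing-slash step: some servers only answer
    # directory listings when the URL ends with '/'.
    additional_list_urls = set()
    for lurl in list_urls:
        if not lurl.endswith('/'):
            additional_list_urls.add(lurl + '/')
    list_urls |= additional_list_urls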