Better version wildcard handling, better spidering
- Allow version wildcards to match [_.-] instead of the exact separators the version was constructed with. - Handles the fact that boost versions are written both 1.55.0 and 1_55_0. - Update spidering to handle parse errors and warn that Python < 2.7.3 has less robust HTML parsing abilities.
This commit is contained in:
parent
15589754ec
commit
3bbca9bd05
5 changed files with 38 additions and 23 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,6 +1,6 @@
|
|||
/var/spack/stage
|
||||
*.pyc
|
||||
/opt/
|
||||
/var/
|
||||
*~
|
||||
.DS_Store
|
||||
.idea
|
||||
|
|
|
@ -41,5 +41,7 @@ def __init__(self, message):
|
|||
class NoNetworkConnectionError(SpackError):
|
||||
"""Raised when an operation needs an internet connection."""
|
||||
def __init__(self, message, url):
|
||||
super(NoNetworkConnectionError, self).__init__(message)
|
||||
super(NoNetworkConnectionError, self).__init__(
|
||||
"No network connection: " + str(message),
|
||||
"URL was: " + str(url))
|
||||
self.url = url
|
||||
|
|
|
@ -206,7 +206,7 @@ def wildcard_version(path):
|
|||
ver, start, end = parse_version_string_with_indices(path)
|
||||
|
||||
v = Version(ver)
|
||||
parts = list(re.escape(p) for p in path.split(str(v)))
|
||||
parts = [re.escape(p) for p in re.split(v.wildcard(), path)]
|
||||
|
||||
# Make a group for the wildcard, so it will be captured by the regex.
|
||||
version_group = '(%s)' % v.wildcard()
|
||||
|
|
|
@ -23,11 +23,12 @@
|
|||
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
##############################################################################
|
||||
import re
|
||||
import sys
|
||||
import subprocess
|
||||
import urllib2
|
||||
import urlparse
|
||||
from multiprocessing import Pool
|
||||
from HTMLParser import HTMLParser
|
||||
from HTMLParser import HTMLParser, HTMLParseError
|
||||
|
||||
import llnl.util.tty as tty
|
||||
|
||||
|
@ -67,7 +68,7 @@ def _spider(args):
|
|||
pool. Firing off all the child links at once makes the fetch MUCH
|
||||
faster for pages with lots of children.
|
||||
"""
|
||||
url, depth, max_depth = args
|
||||
url, depth, max_depth, raise_on_error = args
|
||||
|
||||
pages = {}
|
||||
try:
|
||||
|
@ -81,11 +82,12 @@ def _spider(args):
|
|||
resp = urllib2.urlopen(req, timeout=TIMEOUT)
|
||||
|
||||
if not "Content-type" in resp.headers:
|
||||
print "ignoring page " + url
|
||||
tty.warn("ignoring page " + url)
|
||||
return pages
|
||||
|
||||
if not resp.headers["Content-type"].startswith('text/html'):
|
||||
print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
|
||||
tty.warn("ignoring page " + url + " with content type " +
|
||||
resp.headers["Content-type"])
|
||||
return pages
|
||||
|
||||
# Do the real GET request when we know it's just HTML.
|
||||
|
@ -100,9 +102,9 @@ def _spider(args):
|
|||
# If we're not at max depth, parse out the links in the page
|
||||
if depth < max_depth:
|
||||
link_parser = LinkParser()
|
||||
|
||||
subcalls = []
|
||||
link_parser.feed(page)
|
||||
|
||||
while link_parser.links:
|
||||
raw_link = link_parser.links.pop()
|
||||
|
||||
|
@ -112,7 +114,7 @@ def _spider(args):
|
|||
|
||||
# Evaluate the link relative to the page it came from.
|
||||
abs_link = urlparse.urljoin(response_url, raw_link)
|
||||
subcalls.append((abs_link, depth+1, max_depth))
|
||||
subcalls.append((abs_link, depth+1, max_depth, raise_on_error))
|
||||
|
||||
if subcalls:
|
||||
pool = Pool(processes=len(subcalls))
|
||||
|
@ -121,13 +123,21 @@ def _spider(args):
|
|||
pages.update(d)
|
||||
|
||||
except urllib2.URLError, e:
|
||||
# Only report it if it's the root page. We ignore errors when spidering.
|
||||
if depth == 1:
|
||||
raise spack.error.NoNetworkConnectionError(e.reason, url)
|
||||
if raise_on_error:
|
||||
raise spack.error.NoNetworkConnectionError(str(e), url)
|
||||
|
||||
except HTMLParseError, e:
|
||||
# This error indicates that Python's HTML parser sucks.
|
||||
msg = "Got an error parsing HTML."
|
||||
|
||||
# Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
|
||||
if sys.version_info[:3] < (2,7,3):
|
||||
msg += " Use Python 2.7.3 or newer for better HTML parsing."
|
||||
|
||||
tty.warn(msg, url, "HTMLParseError: " + str(e))
|
||||
|
||||
except Exception, e:
|
||||
# Other types of errors are completely ignored.
|
||||
pass
|
||||
pass # Other types of errors are completely ignored.
|
||||
|
||||
return pages
|
||||
|
||||
|
@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
|
|||
performance over a sequential fetch.
|
||||
"""
|
||||
max_depth = kwargs.setdefault('depth', 1)
|
||||
pages = _spider((root_url, 1, max_depth))
|
||||
pages = _spider((root_url, 1, max_depth, False))
|
||||
return pages
|
||||
|
|
|
@ -152,21 +152,24 @@ def a_or_n(seg):
|
|||
return r'[a-zA-Z]+'
|
||||
|
||||
version = self.version
|
||||
separators = ('',) + self.separators
|
||||
|
||||
# Use a wildcard for separators, in case a version is written
|
||||
# two different ways (e.g., boost writes 1_55_0 and 1.55.0)
|
||||
sep_re = '[_.-]'
|
||||
separators = ('',) + (sep_re,) * len(self.separators)
|
||||
|
||||
version += (version[-1],) * 2
|
||||
separators += (separators[-1],) * 2
|
||||
separators += (sep_re,) * 2
|
||||
|
||||
sep_res = [re.escape(sep) for sep in separators]
|
||||
seg_res = [a_or_n(seg) for seg in version]
|
||||
segments = [a_or_n(seg) for seg in version]
|
||||
|
||||
wc = seg_res[0]
|
||||
for i in xrange(1, len(sep_res)):
|
||||
wc += '(?:' + sep_res[i] + seg_res[i]
|
||||
wc = segments[0]
|
||||
for i in xrange(1, len(separators)):
|
||||
wc += '(?:' + separators[i] + segments[i]
|
||||
|
||||
# Add possible alpha or beta indicator at the end of each segment
|
||||
# We treat these specially b/c they're so common.
|
||||
wc += '[ab]?)?' * (len(seg_res) - 1)
|
||||
wc += '[ab]?)?' * (len(segments) - 1)
|
||||
return wc
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue