Better version wildcard handling, better spidering

- Allow version wildcards to match the separator class [_.-] instead of the
  exact separators the version was constructed with (see the sketch after
  this list).
  - Handles the fact that boost versions are written both 1.55.0 and 1_55_0.

- Update spidering to handle parse errors and warn that Python < 2.7.3 has
  less robust HTML parsing abilities.
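
For illustration, a minimal sketch of the separator-wildcard idea. The
pattern below is hand-written for this example, not the literal output of
Version.wildcard():

    import re

    # A version like 1.55.0 turns its separators into the character
    # class [_.-], so '.', '_', and '-' all match.
    pattern = r'1[_.-]55[_.-]0'

    for name in ('boost_1_55_0.tar.bz2', 'boost-1.55.0.tar.bz2'):
        print name, bool(re.search(pattern, name))   # both print True
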
Todd Gamblin, 2014-04-25 14:41:37 -07:00
parent 15589754ec
commit 3bbca9bd05
5 changed files with 38 additions and 23 deletions

.gitignore
@@ -1,6 +1,6 @@
-/var/spack/stage
 *.pyc
 /opt/
+/var/
 *~
 .DS_Store
 .idea


@@ -41,5 +41,7 @@ def __init__(self, message):
 class NoNetworkConnectionError(SpackError):
     """Raised when an operation needs an internet connection."""
     def __init__(self, message, url):
-        super(NoNetworkConnectionError, self).__init__(message)
+        super(NoNetworkConnectionError, self).__init__(
+            "No network connection: " + str(message),
+            "URL was: " + str(url))
         self.url = url

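A minimal sketch of what the reworked constructor does. The SpackError base
class below is a stand-in that assumes the second argument is kept as a
long-form message; only the call site appears in the hunk above:

    class SpackError(Exception):
        """Stand-in base class (assumed shape, not Spack's real one)."""
        def __init__(self, message, long_message=None):
            super(SpackError, self).__init__(message)
            self.long_message = long_message

    class NoNetworkConnectionError(SpackError):
        """Raised when an operation needs an internet connection."""
        def __init__(self, message, url):
            super(NoNetworkConnectionError, self).__init__(
                "No network connection: " + str(message),
                "URL was: " + str(url))
            self.url = url

    try:
        raise NoNetworkConnectionError("connection timed out",
                                       "http://www.example.com")
    except NoNetworkConnectionError, e:
        print e               # No network connection: connection timed out
        print e.long_message  # URL was: http://www.example.com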

@@ -206,7 +206,7 @@ def wildcard_version(path):
     ver, start, end = parse_version_string_with_indices(path)
     v = Version(ver)
-    parts = list(re.escape(p) for p in path.split(str(v)))
+    parts = [re.escape(p) for p in re.split(v.wildcard(), path)]

     # Make a group for the wildcard, so it will be captured by the regex.
     version_group = '(%s)' % v.wildcard()

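The point of the re.split() change, sketched with a hand-written wildcard
pattern (in the real code the pattern comes from v.wildcard()): str.split()
only finds the exact version string, while re.split() finds any spelling of
the separators.

    import re

    path     = 'boost_1_55_0.tar.bz2'   # version spelled with underscores
    version  = '1.55.0'                 # canonical version string
    wildcard = r'1[_.-]55[_.-]0'        # separator-agnostic pattern

    print path.split(version)       # ['boost_1_55_0.tar.bz2'] -- never splits
    print re.split(wildcard, path)  # ['boost_', '.tar.bz2']   -- either spelling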

@@ -23,11 +23,12 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import sys
 import subprocess
 import urllib2
 import urlparse
 from multiprocessing import Pool
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError

 import llnl.util.tty as tty
@@ -67,7 +68,7 @@ def _spider(args):
     pool. Firing off all the child links at once makes the fetch MUCH
     faster for pages with lots of children.
     """
-    url, depth, max_depth = args
+    url, depth, max_depth, raise_on_error = args

     pages = {}
     try:
@@ -81,11 +82,12 @@
         resp = urllib2.urlopen(req, timeout=TIMEOUT)

         if not "Content-type" in resp.headers:
-            print "ignoring page " + url
+            tty.warn("ignoring page " + url)
             return pages

         if not resp.headers["Content-type"].startswith('text/html'):
-            print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
+            tty.warn("ignoring page " + url + " with content type " +
+                     resp.headers["Content-type"])
             return pages

         # Do the real GET request when we know it's just HTML.
@@ -100,9 +102,9 @@
         # If we're not at max depth, parse out the links in the page
         if depth < max_depth:
             link_parser = LinkParser()
             subcalls = []
-
             link_parser.feed(page)
+
             while link_parser.links:
                 raw_link = link_parser.links.pop()
@@ -112,7 +114,7 @@
                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth))
+                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))

             if subcalls:
                 pool = Pool(processes=len(subcalls))
@@ -121,13 +123,21 @@
                 pages.update(d)

     except urllib2.URLError, e:
-        # Only report it if it's the root page. We ignore errors when spidering.
-        if depth == 1:
-            raise spack.error.NoNetworkConnectionError(e.reason, url)
+        if raise_on_error:
+            raise spack.error.NoNetworkConnectionError(str(e), url)
+
+    except HTMLParseError, e:
+        # This error indicates that Python's HTML parser sucks.
+        msg = "Got an error parsing HTML."
+
+        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+        if sys.version_info[:3] < (2,7,3):
+            msg += " Use Python 2.7.3 or newer for better HTML parsing."
+
+        tty.warn(msg, url, "HTMLParseError: " + str(e))

     except Exception, e:
-        # Other types of errors are completely ignored.
-        pass
+        pass    # Other types of errors are completely ignored.

     return pages
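
The version guard works because sys.version_info compares element by element
as a tuple:

    import sys

    # True on, e.g., Python 2.7.2; False on 2.7.3 and newer.
    print sys.version_info[:3] < (2, 7, 3)
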
@@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
     performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, 1, max_depth))
+    pages = _spider((root_url, 1, max_depth, False))
     return pages
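
Usage is unchanged; the root call now seeds _spider with raise_on_error set
to False, so fetch problems are warned about rather than raised. A
hypothetical call (URL and depth are illustrative):

    # Spider two levels of links starting from a root page.
    pages = get_pages('http://www.example.com/downloads', depth=2)
    for url, contents in pages.iteritems():  # a dict of URL -> page text
        print url, len(contents)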


@@ -152,21 +152,24 @@ def a_or_n(seg):
                 return r'[a-zA-Z]+'

         version = self.version
-        separators = ('',) + self.separators
+        # Use a wildcard for separators, in case a version is written
+        # two different ways (e.g., boost writes 1_55_0 and 1.55.0)
+        sep_re = '[_.-]'
+        separators = ('',) + (sep_re,) * len(self.separators)

         version += (version[-1],) * 2
-        separators += (separators[-1],) * 2
+        separators += (sep_re,) * 2

-        sep_res = [re.escape(sep) for sep in separators]
-        seg_res = [a_or_n(seg) for seg in version]
+        segments = [a_or_n(seg) for seg in version]

-        wc = seg_res[0]
-        for i in xrange(1, len(sep_res)):
-            wc += '(?:' + sep_res[i] + seg_res[i]
+        wc = segments[0]
+        for i in xrange(1, len(separators)):
+            wc += '(?:' + separators[i] + segments[i]

         # Add possible alpha or beta indicator at the end of each segment.
         # We treat these specially b/c they're so common.
-        wc += '[ab]?)?' * (len(seg_res) - 1)
+        wc += '[ab]?)?' * (len(segments) - 1)
         return wc
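
To make the result concrete: for a three-part version like 1.55.0 the loop
emits one leading segment followed by nested optional groups. The expansion
below assumes a_or_n() returns r'[0-9]+' for numeric segments (only its
alphabetic branch is visible in the hunk):

    import re

    # What wildcard() would produce for 1.55.0 under that assumption:
    # a segment, then up to four optional [_.-]-separated trailing segments,
    # each with an optional 'a'/'b' (alpha/beta) suffix.
    wc = (r'[0-9]+(?:[_.-][0-9]+(?:[_.-][0-9]+'
          r'(?:[_.-][0-9]+(?:[_.-][0-9]+[ab]?)?[ab]?)?[ab]?)?[ab]?)?')

    for s in ('1.55.0', '1_55_0', '1-55-0', '1.55', '1.55.0b'):
        print s, bool(re.match(wc, s))   # all print True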