Better version wildcard handling, better spidering

- Allow version wildcards to match [_.-] instead of the exact separators
  the version was constructed with.
  - This handles the fact that boost versions are written both as 1.55.0
    and 1_55_0 (see the sketch below).

- Update spidering to handle parse errors, and warn that Python < 2.7.3 has
  less robust HTML parsing.
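
As a quick illustration of the wildcard-separator point above (the file names
and the pattern here are invented for this sketch, not taken from the diffs
below), a single pattern built with [_.-] matches both spellings of a boost
version:

import re

# Illustrative only: a separator-agnostic pattern in the spirit of this change.
pattern = r'1[_.-]55[_.-]0'
assert re.search(pattern, 'boost_1_55_0.tar.bz2')   # underscore separators
assert re.search(pattern, 'boost-1.55.0.tar.gz')    # dot separators
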
Author: Todd Gamblin, 2014-04-25 14:41:37 -07:00
commit 3bbca9bd05 (parent 15589754ec)
5 changed files with 38 additions and 23 deletions

.gitignore
@@ -1,6 +1,6 @@
-/var/spack/stage
 *.pyc
 /opt/
+/var/
 *~
 .DS_Store
 .idea

@@ -41,5 +41,7 @@ def __init__(self, message):
 class NoNetworkConnectionError(SpackError):
     """Raised when an operation needs an internet connection."""
     def __init__(self, message, url):
-        super(NoNetworkConnectionError, self).__init__(message)
+        super(NoNetworkConnectionError, self).__init__(
+            "No network connection: " + str(message),
+            "URL was: " + str(url))
         self.url = url

@@ -206,7 +206,7 @@ def wildcard_version(path):
     ver, start, end = parse_version_string_with_indices(path)
     v = Version(ver)
-    parts = list(re.escape(p) for p in path.split(str(v)))
+    parts = [re.escape(p) for p in re.split(v.wildcard(), path)]
     # Make a group for the wildcard, so it will be captured by the regex.
     version_group = '(%s)' % v.wildcard()

@@ -23,11 +23,12 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import sys
 import subprocess
 import urllib2
 import urlparse
 from multiprocessing import Pool
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
 import llnl.util.tty as tty
@@ -67,7 +68,7 @@ def _spider(args):
        pool. Firing off all the child links at once makes the fetch MUCH
        faster for pages with lots of children.
     """
-    url, depth, max_depth = args
+    url, depth, max_depth, raise_on_error = args
     pages = {}
     try:
@@ -81,11 +82,12 @@ def _spider(args):
         resp = urllib2.urlopen(req, timeout=TIMEOUT)
         if not "Content-type" in resp.headers:
-            print "ignoring page " + url
+            tty.warn("ignoring page " + url)
             return pages
         if not resp.headers["Content-type"].startswith('text/html'):
-            print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
+            tty.warn("ignoring page " + url + " with content type " +
+                     resp.headers["Content-type"])
             return pages
         # Do the real GET request when we know it's just HTML.
@@ -100,9 +102,9 @@ def _spider(args):
         # If we're not at max depth, parse out the links in the page
         if depth < max_depth:
             link_parser = LinkParser()
             subcalls = []
             link_parser.feed(page)
             while link_parser.links:
                 raw_link = link_parser.links.pop()
@@ -112,7 +114,7 @@ def _spider(args):
                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth))
+                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))
             if subcalls:
                 pool = Pool(processes=len(subcalls))
@@ -121,13 +123,21 @@ def _spider(args):
                 pages.update(d)
     except urllib2.URLError, e:
-        # Only report it if it's the root page. We ignore errors when spidering.
-        if depth == 1:
-            raise spack.error.NoNetworkConnectionError(e.reason, url)
+        if raise_on_error:
+            raise spack.error.NoNetworkConnectionError(str(e), url)
+    except HTMLParseError, e:
+        # This error indicates that Python's HTML parser sucks.
+        msg = "Got an error parsing HTML."
+        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+        if sys.version_info[:3] < (2,7,3):
+            msg += " Use Python 2.7.3 or newer for better HTML parsing."
+        tty.warn(msg, url, "HTMLParseError: " + str(e))
     except Exception, e:
-        # Other types of errors are completely ignored.
-        pass
+        pass    # Other types of errors are completely ignored.
     return pages
@@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, 1, max_depth))
+    pages = _spider((root_url, 1, max_depth, False))
     return pages
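
A hedged usage sketch of the code above (the URL and depth are illustrative):
get_pages() now seeds _spider() with raise_on_error=False, so unreachable or
unparseable child pages are warned about or skipped rather than raised as
exceptions from the top-level call.

# Sketch only: assumes get_pages returns a dict mapping page URLs to raw HTML.
pages = get_pages('http://www.example.com/downloads', depth=2)
for page_url, contents in pages.items():
    print page_url, len(contents)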

@@ -152,21 +152,24 @@ def a_or_n(seg):
                 return r'[a-zA-Z]+'
         version = self.version
-        separators = ('',) + self.separators
+        # Use a wildcard for separators, in case a version is written
+        # two different ways (e.g., boost writes 1_55_0 and 1.55.0)
+        sep_re = '[_.-]'
+        separators = ('',) + (sep_re,) * len(self.separators)
         version += (version[-1],) * 2
-        separators += (separators[-1],) * 2
+        separators += (sep_re,) * 2
-        sep_res = [re.escape(sep) for sep in separators]
-        seg_res = [a_or_n(seg) for seg in version]
+        segments = [a_or_n(seg) for seg in version]
-        wc = seg_res[0]
-        for i in xrange(1, len(sep_res)):
-            wc += '(?:' + sep_res[i] + seg_res[i]
+        wc = segments[0]
+        for i in xrange(1, len(separators)):
+            wc += '(?:' + separators[i] + segments[i]
         # Add possible alpha or beta indicator at the end of each segemnt
         # We treat these specially b/c they're so common.
-        wc += '[ab]?)?' * (len(seg_res) - 1)
+        wc += '[ab]?)?' * (len(segments) - 1)
         return wc
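
A hedged sketch of the construction above for a plain three-part numeric
version (ignoring the extra trailing segments the real method appends, and
assuming a_or_n() yields a digit pattern for numeric segments): the resulting
wildcard accepts either separator style.

import re

# Mirror the loop above for three numeric segments (sketch only).
sep_re = '[_.-]'
segments = ['[0-9]+'] * 3
wc = segments[0]
for i in xrange(1, len(segments)):
    wc += '(?:' + sep_re + segments[i]
wc += '[ab]?)?' * (len(segments) - 1)

# wc == '[0-9]+(?:[_.-][0-9]+(?:[_.-][0-9]+[ab]?)?[ab]?)?'
assert re.match(wc, '1.55.0')
assert re.match(wc, '1_55_0')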