Better version wildcard handling, better spidering
- Allow version wildcards to match [_-.] instead of the exact separators the version was constructed with.
  - Handles the fact that boost versions are written both 1.55.0 and 1_55_0.
- Update spidering to handle parse errors and warn that Python < 2.7.3 has less robust HTML parsing abilities.
parent 15589754ec
commit 3bbca9bd05
5 changed files with 38 additions and 23 deletions
.gitignore (vendored, 2 changes)

@@ -1,6 +1,6 @@
+/var/spack/stage
 *.pyc
 /opt/
-/var/
 *~
 .DS_Store
 .idea
lib/spack/spack/error.py

@@ -41,5 +41,7 @@ def __init__(self, message):
 class NoNetworkConnectionError(SpackError):
     """Raised when an operation needs an internet connection."""
     def __init__(self, message, url):
-        super(NoNetworkConnectionError, self).__init__(message)
+        super(NoNetworkConnectionError, self).__init__(
+            "No network connection: " + str(message),
+            "URL was: " + str(url))
         self.url = url
lib/spack/spack/url.py

@@ -206,7 +206,7 @@ def wildcard_version(path):
     ver, start, end = parse_version_string_with_indices(path)
 
     v = Version(ver)
-    parts = list(re.escape(p) for p in path.split(str(v)))
+    parts = [re.escape(p) for p in re.split(v.wildcard(), path)]
 
     # Make a group for the wildcard, so it will be captured by the regex.
     version_group = '(%s)' % v.wildcard()
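Note on the url.py change: splitting on v.wildcard() rather than on the exact version string means the literal pieces of the URL are found around anything that looks like the version, regardless of which separators it was written with. A rough standalone illustration; the pattern below is a hand-written separator-tolerant stand-in and the URL is made up, not what Version.wildcard() actually generates:

    import re

    # Hand-written stand-in for a separator-tolerant wildcard for "1.55.0".
    wildcard = r'[0-9]+(?:[_.-][0-9]+(?:[_.-][0-9]+)?)?'

    # Hypothetical download URL that spells the version with underscores.
    path = 'http://example.com/boost_1_55_0.tar.gz'

    # Splitting on the wildcard isolates the literal parts of the URL even
    # though the version in the path is not written as "1.55.0".
    print re.split(wildcard, path)   # ['http://example.com/boost_', '.tar.gz']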
lib/spack/spack/util/web.py

@@ -23,11 +23,12 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import sys
 import subprocess
 import urllib2
 import urlparse
 from multiprocessing import Pool
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
 
 import llnl.util.tty as tty
 
@@ -67,7 +68,7 @@ def _spider(args):
        pool. Firing off all the child links at once makes the fetch MUCH
        faster for pages with lots of children.
     """
-    url, depth, max_depth = args
+    url, depth, max_depth, raise_on_error = args
 
     pages = {}
     try:
@@ -81,11 +82,12 @@ def _spider(args):
         resp = urllib2.urlopen(req, timeout=TIMEOUT)
 
         if not "Content-type" in resp.headers:
-            print "ignoring page " + url
+            tty.warn("ignoring page " + url)
             return pages
 
         if not resp.headers["Content-type"].startswith('text/html'):
-            print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
+            tty.warn("ignoring page " + url + " with content type " +
+                     resp.headers["Content-type"])
             return pages
 
         # Do the real GET request when we know it's just HTML.
@@ -100,9 +102,9 @@ def _spider(args):
         # If we're not at max depth, parse out the links in the page
         if depth < max_depth:
             link_parser = LinkParser()
-
             subcalls = []
             link_parser.feed(page)
+
             while link_parser.links:
                 raw_link = link_parser.links.pop()
 
@@ -112,7 +114,7 @@ def _spider(args):
 
                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth))
+                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))
 
             if subcalls:
                 pool = Pool(processes=len(subcalls))
@@ -121,13 +123,21 @@ def _spider(args):
                 pages.update(d)
 
     except urllib2.URLError, e:
-        # Only report it if it's the root page. We ignore errors when spidering.
-        if depth == 1:
-            raise spack.error.NoNetworkConnectionError(e.reason, url)
+        if raise_on_error:
+            raise spack.error.NoNetworkConnectionError(str(e), url)
+
+    except HTMLParseError, e:
+        # This error indicates that Python's HTML parser sucks.
+        msg = "Got an error parsing HTML."
+
+        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+        if sys.version_info[:3] < (2,7,3):
+            msg += " Use Python 2.7.3 or newer for better HTML parsing."
+
+        tty.warn(msg, url, "HTMLParseError: " + str(e))
 
     except Exception, e:
-        # Other types of errors are completely ignored.
-        pass
+        pass    # Other types of errors are completely ignored.
 
 
     return pages
@@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, 1, max_depth))
+    pages = _spider((root_url, 1, max_depth, False))
     return pages
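The spidering hunks above boil down to one pattern: feed the page to the link parser, catch HTMLParseError instead of crashing, and add a version hint when the interpreter is older than 2.7.3. A minimal Python 2 sketch of that pattern; the LinkCounter class and the inline HTML string are toy stand-ins for spack's LinkParser and a fetched page:

    import sys
    from HTMLParser import HTMLParser, HTMLParseError

    class LinkCounter(HTMLParser):
        """Toy parser standing in for spack's LinkParser."""
        def __init__(self):
            HTMLParser.__init__(self)
            self.links = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                self.links.extend(val for name, val in attrs if name == 'href')

    page = '<html><body><a href="foo-1.0.tar.gz">foo</a></body></html>'

    parser = LinkCounter()
    try:
        parser.feed(page)
    except HTMLParseError, e:
        # Older interpreters choke on markup that 2.7.3+ handles fine.
        msg = "Got an error parsing HTML."
        if sys.version_info[:3] < (2, 7, 3):
            msg += " Use Python 2.7.3 or newer for better HTML parsing."
        print msg, "(%s)" % e

    print parser.links   # ['foo-1.0.tar.gz']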
lib/spack/spack/version.py

@@ -152,21 +152,24 @@ def a_or_n(seg):
             return r'[a-zA-Z]+'
 
         version = self.version
-        separators = ('',) + self.separators
+
+        # Use a wildcard for separators, in case a version is written
+        # two different ways (e.g., boost writes 1_55_0 and 1.55.0)
+        sep_re = '[_.-]'
+        separators = ('',) + (sep_re,) * len(self.separators)
 
         version += (version[-1],) * 2
-        separators += (separators[-1],) * 2
+        separators += (sep_re,) * 2
 
-        sep_res = [re.escape(sep) for sep in separators]
-        seg_res = [a_or_n(seg) for seg in version]
+        segments = [a_or_n(seg) for seg in version]
 
-        wc = seg_res[0]
-        for i in xrange(1, len(sep_res)):
-            wc += '(?:' + sep_res[i] + seg_res[i]
+        wc = segments[0]
+        for i in xrange(1, len(separators)):
+            wc += '(?:' + separators[i] + segments[i]
 
         # Add possible alpha or beta indicator at the end of each segemnt
         # We treat these specially b/c they're so common.
-        wc += '[ab]?)?' * (len(seg_res) - 1)
+        wc += '[ab]?)?' * (len(segments) - 1)
         return wc
 
 
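For reference, a rough standalone approximation of the wildcard construction above; the helper takes a dotted version string instead of spack's Version object, and the exact pattern Version.wildcard() emits may differ in detail:

    import re

    def wildcard(version_string, sep_re='[_.-]'):
        # Numeric segments joined by the separator class, with two extra
        # optional trailing segments and optional 'a'/'b' suffixes, as in
        # the loop above.
        segments = ['[0-9]+'] * (len(version_string.split('.')) + 2)
        wc = segments[0]
        for seg in segments[1:]:
            wc += '(?:' + sep_re + seg
        wc += '[ab]?)?' * (len(segments) - 1)
        return wc

    wc = wildcard('1.55.0')
    for form in ('1.55.0', '1_55_0', '1-55-0', '1.55.0b'):
        assert re.match('(%s)$' % wc, form)

    # The same pattern pulls the version out of a boost-style file name:
    assert re.search('(%s)' % wc, 'boost_1_55_0.tar.bz2').group(1) == '1_55_0'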