Better version wildcard handling, better spidering

- Allow version wildcards to match the separator class [_.-] instead of the
  exact separators the version was constructed with (see the sketch after
  this list).
  - Handles the fact that boost versions are written both 1.55.0 and 1_55_0.

- Update spidering to handle parse errors and warn that Python < 2.7.3 has
  less robust HTML parsing abilities.
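
For illustration, a minimal sketch of the separator-wildcard idea. The
pattern below is hand-written for this example, not the literal output of
Version.wildcard():

    import re

    # A version like 1.55.0 turns its separators into the character
    # class [_.-], so '.', '_', and '-' all match.
    pattern = r'1[_.-]55[_.-]0'

    for name in ('boost_1_55_0.tar.bz2', 'boost-1.55.0.tar.bz2'):
        print name, bool(re.search(pattern, name))   # both print True
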
Todd Gamblin, 2014-04-25 14:41:37 -07:00
parent 15589754ec
commit 3bbca9bd05
5 changed files with 38 additions and 23 deletions

.gitignore
@@ -1,6 +1,6 @@
-/var/spack/stage
 *.pyc
 /opt/
+/var/
 *~
 .DS_Store
 .idea


@@ -41,5 +41,7 @@ def __init__(self, message):
 class NoNetworkConnectionError(SpackError):
     """Raised when an operation needs an internet connection."""
     def __init__(self, message, url):
-        super(NoNetworkConnectionError, self).__init__(message)
+        super(NoNetworkConnectionError, self).__init__(
+            "No network connection: " + str(message),
+            "URL was: " + str(url))
         self.url = url

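A minimal sketch of what the reworked constructor does. The SpackError base
class below is a stand-in that assumes the second argument is kept as a
long-form message; only the call site appears in the hunk above:

    class SpackError(Exception):
        """Stand-in base class (assumed shape, not Spack's real one)."""
        def __init__(self, message, long_message=None):
            super(SpackError, self).__init__(message)
            self.long_message = long_message

    class NoNetworkConnectionError(SpackError):
        """Raised when an operation needs an internet connection."""
        def __init__(self, message, url):
            super(NoNetworkConnectionError, self).__init__(
                "No network connection: " + str(message),
                "URL was: " + str(url))
            self.url = url

    try:
        raise NoNetworkConnectionError("connection timed out",
                                       "http://www.example.com")
    except NoNetworkConnectionError, e:
        print e               # No network connection: connection timed out
        print e.long_message  # URL was: http://www.example.com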

@@ -206,7 +206,7 @@ def wildcard_version(path):
     ver, start, end = parse_version_string_with_indices(path)
     v = Version(ver)
-    parts = list(re.escape(p) for p in path.split(str(v)))
+    parts = [re.escape(p) for p in re.split(v.wildcard(), path)]

     # Make a group for the wildcard, so it will be captured by the regex.
     version_group = '(%s)' % v.wildcard()

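The point of the re.split() change, sketched with a hand-written wildcard
pattern (in the real code the pattern comes from v.wildcard()): str.split()
only finds the exact version string, while re.split() finds any spelling of
the separators.

    import re

    path     = 'boost_1_55_0.tar.bz2'   # version spelled with underscores
    version  = '1.55.0'                 # canonical version string
    wildcard = r'1[_.-]55[_.-]0'        # separator-agnostic pattern

    print path.split(version)       # ['boost_1_55_0.tar.bz2'] -- never splits
    print re.split(wildcard, path)  # ['boost_', '.tar.bz2']   -- either spelling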

@@ -23,11 +23,12 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import sys
 import subprocess
 import urllib2
 import urlparse
 from multiprocessing import Pool
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError

 import llnl.util.tty as tty
@@ -67,7 +68,7 @@ def _spider(args):
     pool. Firing off all the child links at once makes the fetch MUCH
     faster for pages with lots of children.
     """
-    url, depth, max_depth = args
+    url, depth, max_depth, raise_on_error = args

     pages = {}
     try:
@@ -81,11 +82,12 @@
         resp = urllib2.urlopen(req, timeout=TIMEOUT)

         if not "Content-type" in resp.headers:
-            print "ignoring page " + url
+            tty.warn("ignoring page " + url)
             return pages

         if not resp.headers["Content-type"].startswith('text/html'):
-            print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
+            tty.warn("ignoring page " + url + " with content type " +
+                     resp.headers["Content-type"])
             return pages

         # Do the real GET request when we know it's just HTML.
@@ -100,9 +102,9 @@
         # If we're not at max depth, parse out the links in the page
         if depth < max_depth:
             link_parser = LinkParser()
             subcalls = []
-
             link_parser.feed(page)
+
             while link_parser.links:
                 raw_link = link_parser.links.pop()
@@ -112,7 +114,7 @@
                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth))
+                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))

             if subcalls:
                 pool = Pool(processes=len(subcalls))
@@ -121,13 +123,21 @@
                 pages.update(d)

     except urllib2.URLError, e:
-        # Only report it if it's the root page. We ignore errors when spidering.
-        if depth == 1:
-            raise spack.error.NoNetworkConnectionError(e.reason, url)
+        if raise_on_error:
+            raise spack.error.NoNetworkConnectionError(str(e), url)
+
+    except HTMLParseError, e:
+        # This error indicates that Python's HTML parser sucks.
+        msg = "Got an error parsing HTML."
+
+        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+        if sys.version_info[:3] < (2,7,3):
+            msg += " Use Python 2.7.3 or newer for better HTML parsing."
+
+        tty.warn(msg, url, "HTMLParseError: " + str(e))

     except Exception, e:
-        # Other types of errors are completely ignored.
-        pass
+        pass    # Other types of errors are completely ignored.

     return pages
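
The version guard works because sys.version_info compares element by element
as a tuple:

    import sys

    # True on, e.g., Python 2.7.2; False on 2.7.3 and newer.
    print sys.version_info[:3] < (2, 7, 3)
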
@@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
     performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, 1, max_depth))
+    pages = _spider((root_url, 1, max_depth, False))
     return pages
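
Usage is unchanged; the root call now seeds _spider with raise_on_error set
to False, so fetch problems are warned about rather than raised. A
hypothetical call (URL and depth are illustrative):

    # Spider two levels of links starting from a root page.
    pages = get_pages('http://www.example.com/downloads', depth=2)
    for url, contents in pages.iteritems():  # a dict of URL -> page text
        print url, len(contents)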


@@ -152,21 +152,24 @@ def a_or_n(seg):
                 return r'[a-zA-Z]+'

         version = self.version
-        separators = ('',) + self.separators
+        # Use a wildcard for separators, in case a version is written
+        # two different ways (e.g., boost writes 1_55_0 and 1.55.0)
+        sep_re = '[_.-]'
+        separators = ('',) + (sep_re,) * len(self.separators)

         version += (version[-1],) * 2
-        separators += (separators[-1],) * 2
+        separators += (sep_re,) * 2

-        sep_res = [re.escape(sep) for sep in separators]
-        seg_res = [a_or_n(seg) for seg in version]
+        segments = [a_or_n(seg) for seg in version]

-        wc = seg_res[0]
-        for i in xrange(1, len(sep_res)):
-            wc += '(?:' + sep_res[i] + seg_res[i]
+        wc = segments[0]
+        for i in xrange(1, len(separators)):
+            wc += '(?:' + separators[i] + segments[i]

         # Add possible alpha or beta indicator at the end of each segment.
         # We treat these specially b/c they're so common.
-        wc += '[ab]?)?' * (len(seg_res) - 1)
+        wc += '[ab]?)?' * (len(segments) - 1)
         return wc
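
To make the result concrete: for a three-part version like 1.55.0 the loop
emits one leading segment followed by nested optional groups. The expansion
below assumes a_or_n() returns r'[0-9]+' for numeric segments (only its
alphabetic branch is visible in the hunk):

    import re

    # What wildcard() would produce for 1.55.0 under that assumption:
    # a segment, then up to four optional [_.-]-separated trailing segments,
    # each with an optional 'a'/'b' (alpha/beta) suffix.
    wc = (r'[0-9]+(?:[_.-][0-9]+(?:[_.-][0-9]+'
          r'(?:[_.-][0-9]+(?:[_.-][0-9]+[ab]?)?[ab]?)?[ab]?)?[ab]?)?')

    for s in ('1.55.0', '1_55_0', '1-55-0', '1.55', '1.55.0b'):
        print s, bool(re.match(wc, s))   # all print True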