Added web spider capability for listing versions.

Todd Gamblin 2013-11-23 13:04:36 -08:00
parent fe7da0dcff
commit 389fa1792d
19 changed files with 321 additions and 59 deletions

View file

@ -30,6 +30,8 @@ parser.add_argument('-v', '--verbose', action='store_true', dest='verbose',
help="print additional output during builds") help="print additional output during builds")
parser.add_argument('-d', '--debug', action='store_true', dest='debug', parser.add_argument('-d', '--debug', action='store_true', dest='debug',
help="write out debug logs during compile") help="write out debug logs during compile")
parser.add_argument('-m', '--mock', action='store_true', dest='mock',
help="Use mock packages instead of real ones.")
# each command module implements a parser() function, to which we pass its
# subparser for setup.
@ -46,6 +48,10 @@ args = parser.parse_args()
# Set up environment based on args.
spack.verbose = args.verbose
spack.debug = args.debug
if args.mock:
from spack.util.filesystem import new_path
mock_path = new_path(spack.module_path, 'test', 'mock_packages')
spack.packages_path = mock_path
# Try to load the particular command asked for and run it
command = spack.cmd.get_command(args.command)

View file

@ -0,0 +1,63 @@
import os
import re
import argparse
from pprint import pprint
from subprocess import CalledProcessError
import spack.tty as tty
import spack.packages as packages
from spack.stage import Stage
from spack.colify import colify
from spack.util.crypto import md5
from spack.version import *
group='foo'
description ="Checksum available versions of a package, print out checksums for addition to a package file."
def setup_parser(subparser):
subparser.add_argument('package', metavar='PACKAGE', help='Package to list versions for')
subparser.add_argument('versions', nargs=argparse.REMAINDER, help='Versions to generate checksums for')
subparser.add_argument('-n', '--number', dest='number', type=int,
default=10, help='Number of versions to list')
def checksum(parser, args):
# get the package we're going to generate checksums for
pkg = packages.get(args.package)
# If the user asked for specific versions, use those.
# Otherwise get the latest n, where n is from the -n/--number param
versions = [ver(v) for v in args.versions]
if not all(type(v) == Version for v in versions):
tty.die("Cannot generate checksums for version lists or " +
"version ranges. Use unambiguous versions.")
if not versions:
versions = pkg.fetch_available_versions()[:args.number]
if not versions:
tty.die("Could not fetch any available versions for %s."
% pkg.name)
versions.sort()
versions.reverse()
urls = [pkg.url_for_version(v) for v in versions]
tty.msg("Found %s versions to checksum." % len(urls))
tty.msg("Downloading...")
hashes = []
for url, version in zip(urls, versions):
stage = Stage("checksum-%s-%s" % (pkg.name, version), url)
try:
stage.fetch()
hashes.append(md5(stage.archive_file))
finally:
stage.destroy()
dict_string = ["{"]
for i, (v, h) in enumerate(zip(versions, hashes)):
comma = "" if i == len(hashes) - 1 else ","
dict_string.append(" '%s' : '%s'%s" % (str(v), str(h), comma))
dict_string.append("}")
tty.msg("Checksummed new versions of %s:" % pkg.name, *dict_string)

View file

@ -2,8 +2,10 @@
import spack.cmd
import spack.tty as tty
import spack.url as url
import spack
description = "parse specs and print them out to the command line."
def setup_parser(subparser):
@ -13,7 +15,11 @@ def spec(parser, args):
specs = spack.cmd.parse_specs(args.specs)
for spec in specs:
spec.normalize()
- print spec.tree()
+ print spec.tree(color=True)
spec.concretize()
- print spec.tree()
+ print spec.tree(color=True)
pkg = spec.package
wc = url.wildcard_version(pkg.url)
print wc

View file

@ -2,12 +2,8 @@
import re
from subprocess import CalledProcessError
- import spack
import spack.packages as packages
- import spack.url as url
- import spack.tty as tty
from spack.colify import colify
- from spack.version import ver
description ="List available versions of a package"
@ -17,4 +13,4 @@ def setup_parser(subparser):
def versions(parser, args):
pkg = packages.get(args.package)
- colify(reversed(pkg.available_versions))
+ colify(reversed(pkg.fetch_available_versions()))

View file

@ -29,6 +29,8 @@
from multi_function import platform
from stage import Stage
from spack.util.lang import memoized, list_modules
from spack.util.crypto import md5
from spack.util.web import get_pages
class Package(object):
@ -251,6 +253,9 @@ class SomePackage(Package):
"""By default a package has no dependencies.""" """By default a package has no dependencies."""
dependencies = {} dependencies = {}
"""List of specs of virtual packages provided by this package."""
provided_virtual_packages = {}
#
# These are default values for instance variables.
#
@ -310,6 +315,9 @@ def __init__(self, spec):
if not hasattr(self, 'list_url'):
self.list_url = os.path.dirname(self.url)
if not hasattr(self, 'list_depth'):
self.list_depth = 1
def add_commands_to_module(self):
"""Populate the module scope of install() with some useful functions.
@ -464,6 +472,11 @@ def url_version(self, version):
return str(version)
def url_for_version(self, version):
"""Gives a URL that you can download a new version of this package from."""
return url.substitute_version(self.url, self.url_version(version))
def remove_prefix(self):
"""Removes the prefix for a package along with any empty parent directories."""
if self.dirty:
@ -640,37 +653,42 @@ def do_clean_dist(self):
tty.msg("Successfully cleaned %s" % self.name) tty.msg("Successfully cleaned %s" % self.name)
def fetch_available_versions(self):
# If not, then try to fetch using list_url
if not self._available_versions:
self._available_versions = VersionList()
url_regex = os.path.basename(url.wildcard_version(self.url))
wildcard = self.version.wildcard()
page_map = get_pages(self.list_url, depth=self.list_depth)
for site, page in page_map.iteritems():
strings = re.findall(url_regex, page)
for s in strings:
match = re.search(wildcard, s)
if match:
v = match.group(0)
self._available_versions.add(Version(v))
if not self._available_versions:
tty.warn("Found no versions for %s" % self.name,
"Check the list_url and list_depth attribute on the "
+ self.name + " package.",
"Use them to tell Spack where to look for versions.")
return self._available_versions
@property
def available_versions(self):
# If the package overrode available_versions, then use that.
if self.versions is not None:
return self.versions
- else:
- # If not, then try to fetch using list_url
- if not self._available_versions:
- self._available_versions = ver([self.version])
- try:
- # Run curl but grab the mime type from the http headers
- listing = spack.curl('-s', '-L', self.list_url, return_output=True)
- url_regex = os.path.basename(url.wildcard_version(self.url))
- strings = re.findall(url_regex, listing)
- wildcard = self.version.wildcard()
- for s in strings:
- match = re.search(wildcard, s)
- if match:
- self._available_versions.add(Version(match.group(0)))
- if not self._available_versions:
- tty.warn("Found no versions for %s" % self.name,
- "Packate.available_versions may require adding the list_url attribute",
- "to the package to tell Spack where to look for versions.")
- except subprocess.CalledProcessError:
- tty.warn("Could not connect to %s" % self.list_url,
- "Package.available_versions requires an internet connection.",
- "Version list may be incomplete.")
- return self._available_versions
+ vlist = self.fetch_available_versions()
+ if not vlist:
+ vlist = ver([self.version])
+ return vlist
class MakeExecutable(Executable):
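To make the matching in fetch_available_versions() concrete, here is a standalone sketch of the idea (illustrative only; the wildcard regex below is a simplified stand-in for self.version.wildcard(), and the page content is fabricated):

    import re

    # Build a regex from the URL's basename by wildcarding the version.
    url_basename = "mpich-3.0.4.tar.gz"
    wildcard     = r"\d+(?:\.\d+)*"          # stand-in for Version.wildcard()
    url_regex    = re.escape(url_basename).replace(re.escape("3.0.4"), wildcard)

    # Scan a listing page for links that look like the download URL.
    page = "<a href='mpich-3.0.3.tar.gz'>a</a> <a href='mpich-3.0.4.tar.gz'>b</a>"
    found = set()
    for s in re.findall(url_regex, page):    # e.g. 'mpich-3.0.3.tar.gz'
        match = re.search(wildcard, s)       # pull the version back out
        if match:
            found.add(match.group(0))

    print found                              # set(['3.0.3', '3.0.4']), order may vary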

View file

@ -19,6 +19,7 @@
invalid_package_re = r'[_-][_-]+'
instances = {}
providers = {}
def get(pkg_name):
@ -29,6 +30,24 @@ def get(pkg_name):
return instances[pkg_name]
def get_providers(vpkg_name):
if not providers:
compute_providers()
if not vpkg_name in providers:
raise UnknownPackageError("No such virtual package: %s" % vpkg_name)
return providers[vpkg_name]
def compute_providers():
for pkg in all_packages():
for vpkg in pkg.provided_virtual_packages:
if vpkg not in providers:
providers[vpkg] = []
providers[vpkg].append(pkg)
def valid_package_name(pkg_name):
return (re.match(valid_package_re, pkg_name) and
not re.search(invalid_package_re, pkg_name))
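For illustration (not part of the commit): the providers dict built by compute_providers() is a reverse index from a virtual package name to the packages that provide it. A hypothetical virtual package name 'mpi' would be resolved like this:

    import spack.packages as packages

    # 'mpi' is a hypothetical virtual package name used only for illustration.
    for pkg in packages.get_providers('mpi'):
        print pkg.name    # each package whose provided_virtual_packages includes 'mpi'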
@ -75,6 +94,11 @@ def class_name_for_package_name(pkg_name):
return class_name
def exists(pkg_name):
"""Whether a package is concrete."""
return os.path.exists(filename_for_package_name(pkg_name))
def get_class_for_package_name(pkg_name):
file_name = filename_for_package_name(pkg_name)
@ -149,7 +173,6 @@ def quote(string):
out.write('}\n')
class InvalidPackageNameError(spack.error.SpackError):
"""Raised when we encounter a bad package name."""
def __init__(self, name):

View file

@ -4,6 +4,7 @@ class Dyninst(Package):
homepage = "https://paradyn.org" homepage = "https://paradyn.org"
url = "http://www.dyninst.org/sites/default/files/downloads/dyninst/8.1.2/DyninstAPI-8.1.2.tgz" url = "http://www.dyninst.org/sites/default/files/downloads/dyninst/8.1.2/DyninstAPI-8.1.2.tgz"
md5 = "bf03b33375afa66fe0efa46ce3f4b17a" md5 = "bf03b33375afa66fe0efa46ce3f4b17a"
list_url = "http://www.dyninst.org/downloads/dyninst-8.x"
depends_on("libelf") depends_on("libelf")
depends_on("libdwarf") depends_on("libdwarf")

View file

@ -45,16 +45,28 @@ class Mpileaks(Package):
spack install mpileaks ^mpich
"""
import sys
import inspect
import spack.spec
def _caller_locals():
"""This will return the locals of the *parent* of the caller.
This allows a function to insert variables into its caller's
scope.
"""
stack = inspect.stack()
try:
return stack[2][0].f_locals
finally:
del stack
def depends_on(*specs):
"""Adds a dependencies local variable in the locals of
the calling class, based on args.
"""
# Get the enclosing package's scope and add deps to it.
- locals = sys._getframe(1).f_locals
- dependencies = locals.setdefault("dependencies", {})
+ dependencies = _caller_locals().setdefault("dependencies", {})
for string in specs:
for spec in spack.spec.parse(string):
dependencies[spec.name] = spec
@ -66,7 +78,6 @@ def provides(*args):
can use the providing package to satisfy the dependency.
"""
# Get the enclosing package's scope and add deps to it.
- locals = sys._getframe(1).f_locals
- provides = locals.setdefault("provides", [])
+ provides = _caller_locals().setdefault("provides", [])
for name in args:
provides.append(name)
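As a standalone sketch (simplified, not part of the commit) of why _caller_locals() works: a function invoked inside a class body can reach the class body's namespace through the frame stack and plant variables there, which then become class attributes:

    import inspect

    def _caller_locals():
        # Locals of the caller's caller; inside a class body this is the
        # dict that will become the class namespace.
        stack = inspect.stack()
        try:
            return stack[2][0].f_locals
        finally:
            del stack

    def depends_on(name):
        deps = _caller_locals().setdefault("dependencies", {})
        deps[name] = name

    class Mpileaks(object):
        depends_on("mpich")             # runs during class creation
        depends_on("callpath")

    print Mpileaks.dependencies         # {'mpich': 'mpich', 'callpath': 'callpath'}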

View file

@ -321,9 +321,15 @@ def package(self):
return packages.get(self.name)
@property
def virtual(self):
return not packages.exists(self.name)
@property
def concrete(self):
- return bool(self.versions.concrete
+ return bool(not self.virtual
and self.versions.concrete
# TODO: support variants
and self.architecture
and self.compiler and self.compiler.concrete

View file

@ -5,7 +5,9 @@ class Callpath(Package):
url = "http://github.com/tgamblin/callpath-0.2.tar.gz" url = "http://github.com/tgamblin/callpath-0.2.tar.gz"
md5 = "foobarbaz" md5 = "foobarbaz"
versions = [0.8, 0.9, 1.0] versions = { 0.8 : 'bf03b33375afa66fe0efa46ce3f4b17a',
0.9 : 'bf03b33375afa66fe0efa46ce3f4b17a',
1.0 : 'bf03b33375afa66fe0efa46ce3f4b17a' }
depends_on("dyninst") depends_on("dyninst")
depends_on("mpich") depends_on("mpich")

View file

@ -5,7 +5,11 @@ class Dyninst(Package):
url = "http://www.dyninst.org/sites/default/files/downloads/dyninst/8.1.2/DyninstAPI-8.1.2.tgz" url = "http://www.dyninst.org/sites/default/files/downloads/dyninst/8.1.2/DyninstAPI-8.1.2.tgz"
md5 = "bf03b33375afa66fe0efa46ce3f4b17a" md5 = "bf03b33375afa66fe0efa46ce3f4b17a"
versions = '7.0, 7.0.1, 8.0, 8.1.1, 8.1.2' list_url = "http://www.dyninst.org/downloads/dyninst-8.x"
versions = {
'8.1.2' : 'bf03b33375afa66fe0efa46ce3f4b17a',
'8.1.1' : '1f8743e3a5662b25ce64a7edf647e77d' }
depends_on("libelf") depends_on("libelf")
depends_on("libdwarf") depends_on("libdwarf")

View file

@ -11,6 +11,8 @@ class Libdwarf(Package):
md5 = "64b42692e947d5180e162e46c689dfbf" md5 = "64b42692e947d5180e162e46c689dfbf"
versions = [20070703, 20111030, 20130207]
depends_on("libelf") depends_on("libelf")

View file

@ -5,7 +5,10 @@ class Libelf(Package):
url = "http://www.mr511.de/software/libelf-0.8.13.tar.gz" url = "http://www.mr511.de/software/libelf-0.8.13.tar.gz"
md5 = "4136d7b4c04df68b686570afa26988ac" md5 = "4136d7b4c04df68b686570afa26988ac"
versions = '0.8.10, 0.8.12, 0.8.13' versions = {
'0.8.13' : '4136d7b4c04df68b686570afa26988ac',
'0.8.12' : 'e21f8273d9f5f6d43a59878dc274fec7',
'0.8.10' : '9db4d36c283d9790d8fa7df1f4d7b4d9' }
def install(self, prefix):
configure("--prefix=%s" % prefix,

View file

@ -3,6 +3,9 @@
class Mpich(Package):
homepage = "http://www.mpich.org"
url = "http://www.mpich.org/static/downloads/3.0.4/mpich-3.0.4.tar.gz"
list_url = "http://www.mpich.org/static/downloads/"
list_depth = 2
md5 = "9c5d5d4fe1e17dd12153f40bc5b6dbc0" md5 = "9c5d5d4fe1e17dd12153f40bc5b6dbc0"
versions = '1.0.3, 1.3.2p1, 1.4.1p1, 3.0.4, 3.1b1' versions = '1.0.3, 1.3.2p1, 1.4.1p1, 3.0.4, 3.1b1'

View file

@ -5,7 +5,10 @@ class Mpileaks(Package):
url = "http://www.llnl.gov/mpileaks-1.0.tar.gz" url = "http://www.llnl.gov/mpileaks-1.0.tar.gz"
md5 = "foobarbaz" md5 = "foobarbaz"
versions = [1.0, 2.1, 2.2, 2.3] versions = { 1.0 : None,
2.1 : None,
2.2 : None,
2.3 : None }
depends_on("mpich") depends_on("mpich")
depends_on("callpath") depends_on("callpath")

View file

@ -176,6 +176,8 @@ def wildcard_version(path):
that will match this path with any version in its place.
"""
ver, start, end = parse_version_string_with_indices(path)
v = Version(ver)
- return re.escape(path[:start]) + v.wildcard() + re.escape(path[end:])
+ parts = list(re.escape(p) for p in path.split(str(v)))
+ return v.wildcard().join(parts)
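For intuition (illustrative only; the real wildcard comes from Version.wildcard()): splitting the path on the version string handles URLs in which the version appears more than once, e.g. in both a directory name and the file name:

    import re

    path     = "http://www.mpich.org/static/downloads/3.0.4/mpich-3.0.4.tar.gz"
    wildcard = r"\d+(?:\.\d+)*"          # simplified stand-in for Version.wildcard()

    parts = list(re.escape(p) for p in path.split("3.0.4"))
    regex = wildcard.join(parts)

    other = "http://www.mpich.org/static/downloads/3.1/mpich-3.1.tar.gz"
    print re.match(regex, other) is not None    # True: both version slots match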

View file

@ -0,0 +1,13 @@
import hashlib
from contextlib import closing
def md5(filename, block_size=2**20):
"""Computes the md5 hash of a file."""
md5 = hashlib.md5()
with closing(open(filename)) as file:
while True:
data = file.read(block_size)
if not data:
break
md5.update(data)
return md5.hexdigest()
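A quick usage sketch (the path is an example only). Note the helper opens the file in text mode; passing 'rb' to open() would be the safer choice for binary tarballs on platforms where the modes differ:

    from spack.util.crypto import md5

    print md5("/tmp/mpich-3.0.4.tar.gz")   # hex digest of the downloaded tarball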

View file

@ -30,7 +30,7 @@ def mkdirp(*paths):
def new_path(prefix, *args):
- path=str(prefix)
+ path = str(prefix)
for elt in args:
path = os.path.join(path, str(elt))
@ -56,16 +56,3 @@ def stem(path):
if re.search(suffix, path):
return re.sub(suffix, "", path)
return path
- def md5(filename, block_size=2**20):
- """Computes the md5 hash of a file."""
- import hashlib
- md5 = hashlib.md5()
- with closing(open(filename)) as file:
- while True:
- data = file.read(block_size)
- if not data:
- break
- md5.update(data)
- return md5.hexdigest()

lib/spack/spack/util/web.py (new file, 113 lines)
View file

@ -0,0 +1,113 @@
import re
import subprocess
import urllib2
import urlparse
from multiprocessing import Pool
from HTMLParser import HTMLParser
import spack
import spack.tty as tty
from spack.util.compression import ALLOWED_ARCHIVE_TYPES
# Timeout in seconds for web requests
TIMEOUT = 10
class LinkParser(HTMLParser):
"""This parser just takes an HTML page and strips out the hrefs on the
links. Good enough for a really simple spider. """
def __init__(self):
HTMLParser.__init__(self)
self.links = []
def handle_starttag(self, tag, attrs):
if tag == 'a':
for attr, val in attrs:
if attr == 'href':
self.links.append(val)
def _spider(args):
"""_spider(url, depth, max_depth)
Fetches URL and any pages it links to up to max_depth. depth should
initially be 1, and max_depth includes the root. This function will
print out a warning only if the root can't be fetched; it ignores
errors with pages that the root links to.
This will return a dict of the pages fetched, keyed by page URL.
Takes args as a tuple b/c it's intended to be used by a multiprocessing
pool. Firing off all the child links at once makes the fetch MUCH
faster for pages with lots of children.
"""
url, depth, max_depth = args
pages = {}
try:
# Make a HEAD request first to check the content type. This lets
# us ignore tarballs and gigantic files.
# It would be nice to do this with the HTTP Accept header to avoid
# one round-trip. However, most servers seem to ignore the header
# if you ask for a tarball with Accept: text/html.
req = urllib2.Request(url)
req.get_method = lambda: "HEAD"
resp = urllib2.urlopen(req, timeout=TIMEOUT)
if not resp.headers["Content-type"].startswith('text/html'):
print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
return pages
# Do the real GET request when we know it's just HTML.
req.get_method = lambda: "GET"
response = urllib2.urlopen(req, timeout=TIMEOUT)
response_url = response.geturl()
# Read the page and stick it in the map we'll return
page = response.read()
pages[response_url] = page
# If we're not at max depth, parse out the links in the page
if depth < max_depth:
link_parser = LinkParser()
subcalls = []
link_parser.feed(page)
while link_parser.links:
raw_link = link_parser.links.pop()
# Skip stuff that looks like an archive
if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
continue
# Evaluate the link relative to the page it came from.
abs_link = urlparse.urljoin(response_url, raw_link)
subcalls.append((abs_link, depth+1, max_depth))
if subcalls:
pool = Pool(processes=len(subcalls))
dicts = pool.map(_spider, subcalls)
for d in dicts:
pages.update(d)
except urllib2.HTTPError, e:
# Only report it if it's the root page. We ignore errors when spidering.
if depth == 1:
tty.warn("Could not connect to %s" % url, e.reason,
"Package.available_versions requires an internet connection.",
"Version list may be incomplete.")
return pages
def get_pages(root_url, **kwargs):
"""Gets web pages from a root URL.
If depth is specified (e.g., depth=2), then this will also fetch pages
linked from the root and its children up to depth.
This will spawn processes to fetch the children, for much improved
performance over a sequential fetch.
"""
max_depth = kwargs.setdefault('depth', 1)
pages = _spider((root_url, 1, max_depth))
return pages
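For illustration, this is roughly how the new Package.fetch_available_versions() drives the spider, using the list_url and list_depth values added to the mpich package in this commit:

    from spack.util.web import get_pages

    # depth=2 follows links from the root listing page one level down.
    pages = get_pages("http://www.mpich.org/static/downloads/", depth=2)
    for page_url, content in pages.iteritems():
        print page_url, len(content)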