Added web spider capability for listing versions.

This commit is contained in:
Todd Gamblin 2013-11-23 13:04:36 -08:00
parent fe7da0dcff
commit 389fa1792d
19 changed files with 321 additions and 59 deletions

View file

@ -30,6 +30,8 @@ parser.add_argument('-v', '--verbose', action='store_true', dest='verbose',
help="print additional output during builds")
parser.add_argument('-d', '--debug', action='store_true', dest='debug',
help="write out debug logs during compile")
parser.add_argument('-m', '--mock', action='store_true', dest='mock',
help="Use mock packages instead of real ones.")
# each command module implements a parser() function, to which we pass its
# subparser for setup.
@ -46,6 +48,10 @@ args = parser.parse_args()
# Set up environment based on args.
spack.verbose = args.verbose
spack.debug = args.debug
if args.mock:
from spack.util.filesystem import new_path
mock_path = new_path(spack.module_path, 'test', 'mock_packages')
spack.packages_path = mock_path
# Try to load the particular command asked for and run it
command = spack.cmd.get_command(args.command)

View file

@ -0,0 +1,63 @@
import os
import re
import argparse
from pprint import pprint
from subprocess import CalledProcessError
import spack.tty as tty
import spack.packages as packages
from spack.stage import Stage
from spack.colify import colify
from spack.util.crypto import md5
from spack.version import *
group='foo'
description ="Checksum available versions of a package, print out checksums for addition to a package file."
def setup_parser(subparser):
subparser.add_argument('package', metavar='PACKAGE', help='Package to list versions for')
subparser.add_argument('versions', nargs=argparse.REMAINDER, help='Versions to generate checksums for')
subparser.add_argument('-n', '--number', dest='number', type=int,
default=10, help='Number of versions to list')
def checksum(parser, args):
# get the package we're going to generate checksums for
pkg = packages.get(args.package)
# If the user asked for specific versions, use those.
# Otherwise get the latest n, where n is from the -n/--number param
versions = [ver(v) for v in args.versions]
if not all(type(v) == Version for v in versions):
tty.die("Cannot generate checksums for version lists or " +
"version ranges. Use unambiguous versions.")
if not versions:
versions = pkg.fetch_available_versions()[:args.number]
if not versions:
tty.die("Could not fetch any available versions for %s."
% pkg.name)
versions.sort()
versions.reverse()
urls = [pkg.url_for_version(v) for v in versions]
tty.msg("Found %s versions to checksum." % len(urls))
tty.msg("Downloading...")
hashes = []
for url, version in zip(urls, versions):
stage = Stage("checksum-%s-%s" % (pkg.name, version), url)
try:
stage.fetch()
hashes.append(md5(stage.archive_file))
finally:
stage.destroy()
dict_string = ["{"]
for i, (v, h) in enumerate(zip(versions, hashes)):
comma = "" if i == len(hashes) - 1 else ","
dict_string.append(" '%s' : '%s'%s" % (str(v), str(h), comma))
dict_string.append("}")
tty.msg("Checksummed new versions of %s:" % pkg.name, *dict_string)

View file

@ -2,8 +2,10 @@
import spack.cmd
import spack.tty as tty
import spack.url as url
import spack
description = "parse specs and print them out to the command line."
def setup_parser(subparser):
@ -13,7 +15,11 @@ def spec(parser, args):
specs = spack.cmd.parse_specs(args.specs)
for spec in specs:
spec.normalize()
print spec.tree()
print spec.tree(color=True)
spec.concretize()
print spec.tree()
print spec.tree(color=True)
pkg = spec.package
wc = url.wildcard_version(pkg.url)
print wc

View file

@ -2,12 +2,8 @@
import re
from subprocess import CalledProcessError
import spack
import spack.packages as packages
import spack.url as url
import spack.tty as tty
from spack.colify import colify
from spack.version import ver
description ="List available versions of a package"
@ -17,4 +13,4 @@ def setup_parser(subparser):
def versions(parser, args):
pkg = packages.get(args.package)
colify(reversed(pkg.available_versions))
colify(reversed(pkg.fetch_available_versions()))

View file

@ -29,6 +29,8 @@
from multi_function import platform
from stage import Stage
from spack.util.lang import memoized, list_modules
from spack.util.crypto import md5
from spack.util.web import get_pages
class Package(object):
@ -251,6 +253,9 @@ class SomePackage(Package):
"""By default a package has no dependencies."""
dependencies = {}
"""List of specs of virtual packages provided by this package."""
provided_virtual_packages = {}
#
# These are default values for instance variables.
#
@ -310,6 +315,9 @@ def __init__(self, spec):
if not hasattr(self, 'list_url'):
self.list_url = os.path.dirname(self.url)
if not hasattr(self, 'list_depth'):
self.list_depth = 1
def add_commands_to_module(self):
"""Populate the module scope of install() with some useful functions.
@ -464,6 +472,11 @@ def url_version(self, version):
return str(version)
def url_for_version(self, version):
"""Gives a URL that you can download a new version of this package from."""
return url.substitute_version(self.url, self.url_version(version))
def remove_prefix(self):
"""Removes the prefix for a package along with any empty parent directories."""
if self.dirty:
@ -640,37 +653,42 @@ def do_clean_dist(self):
tty.msg("Successfully cleaned %s" % self.name)
def fetch_available_versions(self):
# If not, then try to fetch using list_url
if not self._available_versions:
self._available_versions = VersionList()
url_regex = os.path.basename(url.wildcard_version(self.url))
wildcard = self.version.wildcard()
page_map = get_pages(self.list_url, depth=self.list_depth)
for site, page in page_map.iteritems():
strings = re.findall(url_regex, page)
for s in strings:
match = re.search(wildcard, s)
if match:
v = match.group(0)
self._available_versions.add(Version(v))
if not self._available_versions:
tty.warn("Found no versions for %s" % self.name,
"Check the list_url and list_depth attribute on the "
+ self.name + " package.",
"Use them to tell Spack where to look for versions.")
return self._available_versions
@property
def available_versions(self):
# If the package overrode available_versions, then use that.
if self.versions is not None:
return self.versions
# If not, then try to fetch using list_url
if not self._available_versions:
self._available_versions = ver([self.version])
try:
# Run curl but grab the mime type from the http headers
listing = spack.curl('-s', '-L', self.list_url, return_output=True)
url_regex = os.path.basename(url.wildcard_version(self.url))
strings = re.findall(url_regex, listing)
wildcard = self.version.wildcard()
for s in strings:
match = re.search(wildcard, s)
if match:
self._available_versions.add(Version(match.group(0)))
if not self._available_versions:
tty.warn("Found no versions for %s" % self.name,
"Packate.available_versions may require adding the list_url attribute",
"to the package to tell Spack where to look for versions.")
except subprocess.CalledProcessError:
tty.warn("Could not connect to %s" % self.list_url,
"Package.available_versions requires an internet connection.",
"Version list may be incomplete.")
return self._available_versions
else:
vlist = self.fetch_available_versions()
if not vlist:
vlist = ver([self.version])
return vlist
class MakeExecutable(Executable):
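
To make the new fetch_available_versions() flow concrete, here is the matching step in isolation. The url_regex and wildcard literals are hand-written stand-ins for what url.wildcard_version() and Version.wildcard() actually produce, and the page text is faked; in the real method, get_pages() supplies the page contents.

    import re

    # Hypothetical stand-ins for the patterns derived from the package URL.
    url_regex = r'mpich-\d+(?:\.\d+)*\.tar\.gz'   # a filename like ours, any version
    wildcard  = r'\d+(?:\.\d+)*'                  # something that looks like a version

    # Fake listing page; fetch_available_versions() gets this from get_pages().
    page = ('<a href="mpich-3.0.4.tar.gz">mpich-3.0.4.tar.gz</a>\n'
            '<a href="mpich-3.0.3.tar.gz">mpich-3.0.3.tar.gz</a>')

    found = set()
    for s in re.findall(url_regex, page):
        match = re.search(wildcard, s)
        if match:
            found.add(match.group(0))

    print sorted(found)   # ['3.0.3', '3.0.4']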

View file

@ -19,6 +19,7 @@
invalid_package_re = r'[_-][_-]+'
instances = {}
providers = {}
def get(pkg_name):
@ -29,6 +30,24 @@ def get(pkg_name):
return instances[pkg_name]
def get_providers(vpkg_name):
if not providers:
compute_providers()
if not vpkg_name in providers:
raise UnknownPackageError("No such virtual package: %s" % vpkg_name)
return providers[vpkg_name]
def compute_providers():
for pkg in all_packages():
for vpkg in pkg.provided_virtual_packages:
if vpkg not in providers:
providers[vpkg] = []
providers[vpkg].append(pkg)
def valid_package_name(pkg_name):
return (re.match(valid_package_re, pkg_name) and
not re.search(invalid_package_re, pkg_name))
@ -75,6 +94,11 @@ def class_name_for_package_name(pkg_name):
return class_name
def exists(pkg_name):
"""Whether a package is concrete."""
return os.path.exists(filename_for_package_name(pkg_name))
def get_class_for_package_name(pkg_name):
file_name = filename_for_package_name(pkg_name)
@ -149,7 +173,6 @@ def quote(string):
out.write('}\n')
class InvalidPackageNameError(spack.error.SpackError):
"""Raised when we encounter a bad package name."""
def __init__(self, name):
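
The providers map that compute_providers() builds is just a reverse index from a virtual package name to the packages whose provided_virtual_packages mention it. A generic sketch of that pattern with plain data (the package and virtual names are made up; the real loop walks all_packages()):

    # Hypothetical provider declarations, standing in for real Package classes.
    declared = {
        'mpich'   : ['mpi'],
        'openmpi' : ['mpi'],
        'dyninst' : [],
    }

    providers = {}
    for pkg_name, virtuals in declared.items():
        for vpkg in virtuals:
            providers.setdefault(vpkg, []).append(pkg_name)

    print sorted(providers['mpi'])   # ['mpich', 'openmpi']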

View file

@ -4,6 +4,7 @@ class Dyninst(Package):
homepage = "https://paradyn.org"
url = "http://www.dyninst.org/sites/default/files/downloads/dyninst/8.1.2/DyninstAPI-8.1.2.tgz"
md5 = "bf03b33375afa66fe0efa46ce3f4b17a"
list_url = "http://www.dyninst.org/downloads/dyninst-8.x"
depends_on("libelf")
depends_on("libdwarf")

View file

@ -45,16 +45,28 @@ class Mpileaks(Package):
spack install mpileaks ^mpich
"""
import sys
import inspect
import spack.spec
def _caller_locals():
"""This will return the locals of the *parent* of the caller.
This allows a function to insert variables into its caller's
scope.
"""
stack = inspect.stack()
try:
return stack[2][0].f_locals
finally:
del stack
def depends_on(*specs):
"""Adds a dependencies local variable in the locals of
the calling class, based on args.
"""
# Get the enclosing package's scope and add deps to it.
locals = sys._getframe(1).f_locals
dependencies = locals.setdefault("dependencies", {})
dependencies = _caller_locals().setdefault("dependencies", {})
for string in specs:
for spec in spack.spec.parse(string):
dependencies[spec.name] = spec
@ -66,7 +78,6 @@ def provides(*args):
can use the providing package to satisfy the dependency.
"""
# Get the enclosing package's scope and add deps to it.
locals = sys._getframe(1).f_locals
provides = locals.setdefault("provides", [])
provides = _caller_locals().setdefault("provides", [])
for name in args:
provides.append(name)
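
The frame trick in _caller_locals() is what lets depends_on() and provides() plant variables directly in the namespace of the package class that calls them. A minimal standalone sketch of the same idea, with made-up names:

    import inspect

    def _caller_locals():
        # Locals of the frame two levels up: the scope that called our caller.
        stack = inspect.stack()
        try:
            return stack[2][0].f_locals
        finally:
            del stack

    def remember(name):
        # Hypothetical mini-directive: appends to a 'notes' list in the caller's scope.
        _caller_locals().setdefault('notes', []).append(name)

    class Example(object):      # stand-in for a Package subclass
        remember('libelf')
        remember('libdwarf')

    print Example.notes         # ['libelf', 'libdwarf']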

View file

@ -321,9 +321,15 @@ def package(self):
return packages.get(self.name)
@property
def virtual(self):
return packages.exists(self.name)
@property
def concrete(self):
return bool(self.versions.concrete
return bool(not self.virtual
and self.versions.concrete
# TODO: support variants
and self.architecture
and self.compiler and self.compiler.concrete

View file

@ -5,7 +5,9 @@ class Callpath(Package):
url = "http://github.com/tgamblin/callpath-0.2.tar.gz"
md5 = "foobarbaz"
versions = [0.8, 0.9, 1.0]
versions = { 0.8 : 'bf03b33375afa66fe0efa46ce3f4b17a',
0.9 : 'bf03b33375afa66fe0efa46ce3f4b17a',
1.0 : 'bf03b33375afa66fe0efa46ce3f4b17a' }
depends_on("dyninst")
depends_on("mpich")

View file

@ -5,7 +5,11 @@ class Dyninst(Package):
url = "http://www.dyninst.org/sites/default/files/downloads/dyninst/8.1.2/DyninstAPI-8.1.2.tgz"
md5 = "bf03b33375afa66fe0efa46ce3f4b17a"
versions = '7.0, 7.0.1, 8.0, 8.1.1, 8.1.2'
list_url = "http://www.dyninst.org/downloads/dyninst-8.x"
versions = {
'8.1.2' : 'bf03b33375afa66fe0efa46ce3f4b17a',
'8.1.1' : '1f8743e3a5662b25ce64a7edf647e77d' }
depends_on("libelf")
depends_on("libdwarf")

View file

@ -11,6 +11,8 @@ class Libdwarf(Package):
md5 = "64b42692e947d5180e162e46c689dfbf"
versions = [20070703, 20111030, 20130207]
depends_on("libelf")

View file

@ -5,7 +5,10 @@ class Libelf(Package):
url = "http://www.mr511.de/software/libelf-0.8.13.tar.gz"
md5 = "4136d7b4c04df68b686570afa26988ac"
versions = '0.8.10, 0.8.12, 0.8.13'
versions = {
'0.8.13' : '4136d7b4c04df68b686570afa26988ac',
'0.8.12' : 'e21f8273d9f5f6d43a59878dc274fec7',
'0.8.10' : '9db4d36c283d9790d8fa7df1f4d7b4d9' }
def install(self, prefix):
configure("--prefix=%s" % prefix,

View file

@ -3,6 +3,9 @@
class Mpich(Package):
homepage = "http://www.mpich.org"
url = "http://www.mpich.org/static/downloads/3.0.4/mpich-3.0.4.tar.gz"
list_url = "http://www.mpich.org/static/downloads/"
list_depth = 2
md5 = "9c5d5d4fe1e17dd12153f40bc5b6dbc0"
versions = '1.0.3, 1.3.2p1, 1.4.1p1, 3.0.4, 3.1b1'

View file

@ -5,7 +5,10 @@ class Mpileaks(Package):
url = "http://www.llnl.gov/mpileaks-1.0.tar.gz"
md5 = "foobarbaz"
versions = [1.0, 2.1, 2.2, 2.3]
versions = { 1.0 : None,
2.1 : None,
2.2 : None,
2.3 : None }
depends_on("mpich")
depends_on("callpath")

View file

@ -176,6 +176,8 @@ def wildcard_version(path):
that will match this path with any version in its place.
"""
ver, start, end = parse_version_string_with_indices(path)
v = Version(ver)
return re.escape(path[:start]) + v.wildcard() + re.escape(path[end:])
v = Version(ver)
parts = list(re.escape(p) for p in path.split(str(v)))
return v.wildcard().join(parts)
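
The rewritten wildcard_version() now wildcards every occurrence of the version string in the path, where the old code only replaced the single parsed span; that matters for URLs like the Dyninst one above, where the version appears both in a directory name and in the file name. A rough standalone illustration (the wildcard pattern is a hand-written stand-in for Version.wildcard()):

    import re

    path = ("http://www.dyninst.org/sites/default/files/downloads/"
            "dyninst/8.1.2/DyninstAPI-8.1.2.tgz")
    version  = "8.1.2"
    wildcard = r'\d+(?:\.\d+)*'    # stand-in for Version('8.1.2').wildcard()

    parts   = [re.escape(p) for p in path.split(version)]
    pattern = wildcard.join(parts)

    other = ("http://www.dyninst.org/sites/default/files/downloads/"
             "dyninst/8.1.1/DyninstAPI-8.1.1.tgz")
    print bool(re.match(pattern, other))   # True: both version slots are wildcards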

View file

@ -0,0 +1,13 @@
import hashlib
from contextlib import closing
def md5(filename, block_size=2**20):
"""Computes the md5 hash of a file."""
md5 = hashlib.md5()
with closing(open(filename)) as file:
while True:
data = file.read(block_size)
if not data:
break
md5.update(data)
return md5.hexdigest()
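
For reference, a quick usage sketch of the relocated md5 helper (the temporary file is just for illustration):

    import tempfile
    from spack.util.crypto import md5

    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
    tmp.write("hello world\n")
    tmp.close()
    print md5(tmp.name)   # 32-character hex digest of the file's contents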

View file

@ -30,7 +30,7 @@ def mkdirp(*paths):
def new_path(prefix, *args):
path=str(prefix)
path = str(prefix)
for elt in args:
path = os.path.join(path, str(elt))
@ -56,16 +56,3 @@ def stem(path):
if re.search(suffix, path):
return re.sub(suffix, "", path)
return path
def md5(filename, block_size=2**20):
"""Computes the md5 hash of a file."""
import hashlib
md5 = hashlib.md5()
with closing(open(filename)) as file:
while True:
data = file.read(block_size)
if not data:
break
md5.update(data)
return md5.hexdigest()

lib/spack/spack/util/web.py (new file)
View file

@ -0,0 +1,113 @@
import re
import subprocess
import urllib2
import urlparse
from multiprocessing import Pool
from HTMLParser import HTMLParser
import spack
import spack.tty as tty
from spack.util.compression import ALLOWED_ARCHIVE_TYPES
# Timeout in seconds for web requests
TIMEOUT = 10
class LinkParser(HTMLParser):
"""This parser just takes an HTML page and strips out the hrefs on the
links. Good enough for a really simple spider. """
def __init__(self):
HTMLParser.__init__(self)
self.links = []
def handle_starttag(self, tag, attrs):
if tag == 'a':
for attr, val in attrs:
if attr == 'href':
self.links.append(val)
def _spider(args):
"""_spider(url, depth, max_depth)
Fetches URL and any pages it links to up to max_depth. depth should
initially be 1, and max_depth includes the root. This function will
print out a warning only if the root can't be fetched; it ignores
errors with pages that the root links to.
This will return a dict mapping each fetched URL to its page contents.
Takes args as a tuple because it's intended to be used by a multiprocessing
pool. Firing off all the child links at once makes the fetch MUCH
faster for pages with lots of children.
"""
url, depth, max_depth = args
pages = {}
try:
# Make a HEAD request first to check the content type. This lets
# us ignore tarballs and gigantic files.
# It would be nice to do this with the HTTP Accept header to avoid
# one round-trip. However, most servers seem to ignore the header
# if you ask for a tarball with Accept: text/html.
req = urllib2.Request(url)
req.get_method = lambda: "HEAD"
resp = urllib2.urlopen(req, timeout=TIMEOUT)
if not resp.headers["Content-type"].startswith('text/html'):
print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
return pages
# Do the real GET request when we know it's just HTML.
req.get_method = lambda: "GET"
response = urllib2.urlopen(req, timeout=TIMEOUT)
response_url = response.geturl()
# Read the page and stick it in the map we'll return
page = response.read()
pages[response_url] = page
# If we're not at max depth, parse out the links in the page
if depth < max_depth:
link_parser = LinkParser()
subcalls = []
link_parser.feed(page)
while link_parser.links:
raw_link = link_parser.links.pop()
# Skip stuff that looks like an archive
if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
continue
# Evaluate the link relative to the page it came from.
abs_link = urlparse.urljoin(response_url, raw_link)
subcalls.append((abs_link, depth+1, max_depth))
if subcalls:
pool = Pool(processes=len(subcalls))
dicts = pool.map(_spider, subcalls)
for d in dicts:
pages.update(d)
except urllib2.HTTPError, e:
# Only report it if it's the root page. We ignore errors when spidering.
if depth == 1:
tty.warn("Could not connect to %s" % url, e.reason,
"Package.available_versions requires an internet connection.",
"Version list may be incomplete.")
return pages
def get_pages(root_url, **kwargs):
"""Gets web pages from a root URL.
If depth is specified (e.g., depth=2), then this will also fetch pages
linked from the root and its children up to depth.
This will spawn processes to fetch the children, for much improved
performance over a sequential fetch.
"""
max_depth = kwargs.setdefault('depth', 1)
pages = _spider((root_url, 1, max_depth))
return pages
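
Finally, a small usage sketch of the new module. LinkParser just collects href values, and get_pages() is the entry point that Package.fetch_available_versions() calls; the HTML and URL below are only examples, and the get_pages() call needs network access.

    from spack.util.web import LinkParser, get_pages

    # LinkParser on a scrap of fake HTML: it simply records hrefs in order.
    parser = LinkParser()
    parser.feed('<html><a href="mpich-3.0.4.tar.gz">3.0.4</a>'
                '<a href="old/">older releases</a></html>')
    print parser.links    # ['mpich-3.0.4.tar.gz', 'old/']

    # Spider a listing page (example URL) two levels deep; the result maps
    # each fetched URL to its page contents.
    pages = get_pages("http://www.mpich.org/static/downloads/", depth=2)
    for page_url, contents in pages.iteritems():
        print page_url, len(contents)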