Improved website scraping.

Todd Gamblin 2014-11-07 00:17:25 -08:00
parent d78ece658b
commit 55bf243f16
8 changed files with 136 additions and 97 deletions

View file

@@ -113,4 +113,5 @@ except SpackError, e:
     tty.die(e.message)
 
 except KeyboardInterrupt:
+    sys.stderr.write('\n')
     tty.die("Keyboard interrupt.")

View file

@@ -85,24 +85,24 @@ def checksum(parser, args):
     pkg = spack.db.get(args.package)
 
     # If the user asked for specific versions, use those.
-    versions = [ver(v) for v in args.versions]
-
-    if not all(type(v) == Version for v in versions):
-        tty.die("Cannot generate checksums for version lists or " +
-                "version ranges. Use unambiguous versions.")
-
-    if not versions:
-        versions = pkg.fetch_available_versions()
+    if args.versions:
+        versions = {}
+        for v in args.versions:
+            v = ver(v)
+            if not isinstance(v, Version):
+                tty.die("Cannot generate checksums for version lists or " +
+                        "version ranges. Use unambiguous versions.")
+            versions[v] = pkg.url_for_version(v)
+    else:
+        versions = pkg.fetch_remote_versions()
         if not versions:
-            tty.die("Could not fetch any available versions for %s." % pkg.name)
+            tty.die("Could not fetch any versions for %s." % pkg.name)
 
-    versions = list(reversed(sorted(versions)))
-    urls = [pkg.url_for_version(v) for v in versions]
+    sorted_versions = list(reversed(sorted(versions)))
 
-    tty.msg("Found %s versions of %s." % (len(urls), pkg.name),
+    tty.msg("Found %s versions of %s." % (len(versions), pkg.name),
             *spack.cmd.elide_list(
-            ["%-10s%s" % (v, u) for v, u in zip(versions, urls)]))
+            ["%-10s%s" % (v, versions[v]) for v in sorted_versions]))
     print
     archives_to_fetch = tty.get_number(
         "How many would you like to checksum?", default=5, abort='q')
@@ -112,10 +112,12 @@ def checksum(parser, args):
         return
 
     version_hashes = get_checksums(
-        versions[:archives_to_fetch], urls[:archives_to_fetch], keep_stage=args.keep_stage)
+        sorted_versions[:archives_to_fetch],
+        [versions[v] for v in sorted_versions[:archives_to_fetch]],
+        keep_stage=args.keep_stage)
 
     if not version_hashes:
-        tty.die("Could not fetch any available versions for %s." % pkg.name)
+        tty.die("Could not fetch any versions for %s." % pkg.name)
 
     version_lines = ["    version('%s', '%s')" % (v, h) for v, h in version_hashes]
     tty.msg("Checksummed new versions of %s:" % pkg.name, *version_lines)

View file

@@ -159,13 +159,12 @@ def create(parser, args):
     else:
         mkdirp(os.path.dirname(pkg_path))
 
-    versions = list(reversed(spack.package.find_versions_of_archive(url)))
+    versions = spack.package.find_versions_of_archive(url)
 
     archives_to_fetch = 1
     if not versions:
         # If the fetch failed for some reason, revert to what the user provided
-        versions = [version]
-        urls = [url]
+        versions = { version : url }
     else:
         urls = [spack.url.substitute_version(url, v) for v in versions]
         if len(urls) > 1:
@@ -181,6 +180,8 @@ def create(parser, args):
             tty.msg("Aborted.")
             return
 
+    sorted_versions = list(reversed(versions))
+
     guesser = ConfigureGuesser()
     ver_hash_tuples = spack.cmd.checksum.get_checksums(
         versions[:archives_to_fetch], urls[:archives_to_fetch],

View file

@@ -24,6 +24,7 @@
 ##############################################################################
 import os
 from llnl.util.tty.colify import colify
+import llnl.util.tty as tty
 import spack
 
 description ="List available versions of a package"
@@ -34,4 +35,21 @@ def setup_parser(subparser):
 
 def versions(parser, args):
     pkg = spack.db.get(args.package)
-    colify(reversed(pkg.fetch_available_versions()))
+
+    safe_versions = pkg.versions
+    fetched_versions = pkg.fetch_remote_versions()
+    remote_versions = set(fetched_versions).difference(safe_versions)
+
+    tty.msg("Safe versions (already checksummed):")
+    colify(sorted(safe_versions, reverse=True), indent=2)
+
+    tty.msg("Remote versions (not yet checksummed):")
+    if not remote_versions:
+        if not fetched_versions:
+            print "  Found no versions for %s" % pkg.name
+            tty.debug("Check the list_url and list_depth attribute on the "
+                      "package to help Spack find versions.")
+        else:
+            print "  Found no unchecksummed versions for %s" % pkg.name
+    else:
+        colify(sorted(remote_versions, reverse=True), indent=2)
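The rewritten command separates versions already checksummed in the package file (`pkg.versions`) from versions discovered by scraping (`pkg.fetch_remote_versions()`) with a plain set difference. A small illustration of that split, using made-up version strings rather than real package data:

    safe_versions = {'1.0', '1.1'}                     # already in the package file
    fetched_versions = {'1.0', '1.1', '1.2', '2.0'}    # found by scraping list_url
    remote_versions = set(fetched_versions).difference(safe_versions)

    print(sorted(remote_versions, reverse=True))       # ['2.0', '1.2'] -- not yet checksummed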

View file

@@ -68,7 +68,7 @@ def concretize_version(self, spec):
         # If there are known available versions, return the most recent
         # version that satisfies the spec
         pkg = spec.package
-        valid_versions = [v for v in pkg.available_versions
+        valid_versions = [v for v in pkg.versions
                           if any(v.satisfies(sv) for sv in spec.versions)]
 
         if valid_versions:

View file

@@ -39,7 +39,7 @@
 import subprocess
 import platform as py_platform
 import multiprocessing
-from urlparse import urlparse
+from urlparse import urlparse, urljoin
 
 import llnl.util.tty as tty
 from llnl.util.filesystem import *
@@ -333,9 +333,6 @@ def __init__(self, spec):
         if '.' in self.name:
             self.name = self.name[self.name.rindex('.') + 1:]
 
-        # This is set by scraping a web page.
-        self._available_versions = None
-
         # Sanity check some required variables that could be
         # overridden by package authors.
         def ensure_has_dict(attr_name):
@@ -370,14 +367,15 @@ def ensure_has_dict(attr_name):
 
         # Init fetch strategy and url to None
         self._fetcher = None
-        self.url = None
+        self.url = getattr(self.__class__, 'url', None)
 
         # Fix up self.url if this package fetches with a URLFetchStrategy.
         # This makes self.url behave sanely.
         if self.spec.versions.concrete:
-            # TODO: this is a really roundabout way of determining the type of fetch to do.
-            # TODO: figure out a more sane fetch strategy/package init order
-            # TODO: (right now it's conflated with stage, package, and the tests make assumptions)
+            # TODO: this is a really roundabout way of determining the type
+            # TODO: of fetch to do. figure out a more sane fetch strategy/package
+            # TODO: init order (right now it's conflated with stage, package, and
+            # TODO: the tests make assumptions)
             f = fs.for_package_version(self, self.version)
             if isinstance(f, fs.URLFetchStrategy):
                 self.url = self.url_for_version(self.spec.version)
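With this change, `self.url` starts out as the class-level `url` attribute when the package defines one, instead of always starting as None. The getattr fallback in isolation, with hypothetical stand-in classes rather than real Spack packages:

    class ExamplePackage(object):       # hypothetical package class that defines a url
        url = 'http://example.com/foo-1.0.tar.gz'

    class NoUrlPackage(object):         # hypothetical package class with no url attribute
        pass

    print(getattr(ExamplePackage, 'url', None))   # 'http://example.com/foo-1.0.tar.gz'
    print(getattr(NoUrlPackage, 'url', None))     # None -- same as the old default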
@@ -852,71 +850,70 @@ def do_clean_dist(self):
         self.stage.destroy()
 
 
-    def fetch_available_versions(self):
-        if not hasattr(self, 'url'):
+    @property
+    def all_urls(self):
+        urls = []
+        if self.url:
+            urls.append(self.url)
+
+        for args in self.versions.values():
+            if 'url' in args:
+                urls.append(args['url'])
+        return urls
+
+
+    def fetch_remote_versions(self):
+        """Try to find remote versions of this package using the
+           list_url and any other URLs described in the package file."""
+        if not self.all_urls:
             raise VersionFetchError(self.__class__)
 
-        # If not, then try to fetch using list_url
-        if not self._available_versions:
-            try:
-                self._available_versions = find_versions_of_archive(
-                    self.url,
-                    list_url=self.list_url,
-                    list_depth=self.list_depth)
-
-                if not self._available_versions:
-                    tty.warn("Found no versions for %s" % self.name,
-                             "Check the list_url and list_depth attribute on the "
-                             + self.name + " package.",
-                             "Use them to tell Spack where to look for versions.")
-
-            except spack.error.NoNetworkConnectionError, e:
-                tty.die("Package.fetch_available_versions couldn't connect to:",
-                        e.url, e.message)
-
-        return self._available_versions
-
-
-    @property
-    def available_versions(self):
-        # If the package overrode available_versions, then use that.
-        if self.versions is not None:
-            return VersionList(self.versions.keys())
-        else:
-            vlist = self.fetch_available_versions()
-            if not vlist:
-                vlist = ver([self.version])
-            return vlist
+        try:
+            return find_versions_of_archive(
+                *self.all_urls, list_url=self.list_url, list_depth=self.list_depth)
+        except spack.error.NoNetworkConnectionError, e:
+            tty.die("Package.fetch_versions couldn't connect to:",
+                    e.url, e.message)
 
 
-def find_versions_of_archive(archive_url, **kwargs):
+def find_versions_of_archive(*archive_urls, **kwargs):
     list_url = kwargs.get('list_url', None)
     list_depth = kwargs.get('list_depth', 1)
 
-    if not list_url:
-        list_url = url.find_list_url(archive_url)
-
-    # This creates a regex from the URL with a capture group for the
-    # version part of the URL.  The capture group is converted to a
-    # generic wildcard, so we can use this to extract things on a page
-    # that look like archive URLs.
-    url_regex = url.wildcard_version(archive_url)
-
-    # We'll be a bit more liberal and just look for the archive part,
-    # not the full path.
-    archive_regex = os.path.basename(url_regex)
+    # Generate a list of list_urls based on archive urls and any
+    # explicitly listed list_url in the package
+    list_urls = set()
+    if list_url:
+        list_urls.add(list_url)
+    for aurl in archive_urls:
+        list_urls.add(url.find_list_url(aurl))
 
     # Grab some web pages to scrape.
-    page_map = get_pages(list_url, depth=list_depth)
+    page_map = {}
+    for lurl in list_urls:
+        page_map.update(get_pages(lurl, depth=list_depth))
+
+    # Scrape them for archive URLs
+    regexes = []
+    for aurl in archive_urls:
+        # This creates a regex from the URL with a capture group for
+        # the version part of the URL.  The capture group is converted
+        # to a generic wildcard, so we can use this to extract things
+        # on a page that look like archive URLs.
+        url_regex = url.wildcard_version(aurl)
+
+        # We'll be a bit more liberal and just look for the archive
+        # part, not the full path.
+        regexes.append(os.path.basename(url_regex))
 
     # Build a version list from all the matches we find
-    versions = VersionList()
-    for site, page in page_map.iteritems():
+    versions = {}
+    for page_url, content in page_map.iteritems():
         # extract versions from matches.
-        matches = re.finditer(archive_regex, page)
-        version_strings = set(m.group(1) for m in matches)
-        for v in version_strings:
-            versions.add(Version(v))
+        for regex in regexes:
+            versions.update(
+                (Version(m.group(1)), urljoin(page_url, m.group(0)))
+                for m in re.finditer(regex, content))
 
     return versions
@@ -979,8 +976,8 @@ class VersionFetchError(PackageError):
     """Raised when a version URL cannot automatically be determined."""
     def __init__(self, cls):
         super(VersionFetchError, self).__init__(
-            "Cannot fetch version for package %s " % cls.__name__ +
-            "because it does not define a default url.")
+            "Cannot fetch versions for package %s " % cls.__name__ +
+            "because it does not define any URLs to fetch.")
 
 
 class NoURLError(PackageError):
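With `find_versions_of_archive` now taking `*archive_urls`, a package can contribute every URL it knows about (its default `url` plus any per-version `url` overrides, collected by the new `all_urls` property), and the scraper returns a dict of Version -> archive URL. A hedged sketch of a caller, with made-up URLs and a live network assumed; the real call sites are `fetch_remote_versions` and the `checksum`/`create` commands shown above:

    import spack.package

    # Hypothetical archive URLs; any explicit listing page is optional.
    versions = spack.package.find_versions_of_archive(
        'http://example.com/foo-1.2.1.tar.gz',
        'http://example.com/old/foo-1.0.0.tar.gz',
        list_url='http://example.com/downloads/',
        list_depth=1)

    for v in sorted(versions, reverse=True):
        print("%-10s %s" % (v, versions[v]))   # each Version with the URL it was found under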

View file

@@ -245,6 +245,10 @@ def wildcard_version(path):
     # Construct a case-insensitive regular expression for the package name.
     name_re = '(%s)' % insensitize(name)
 
+    # protect extensions like bz2 from wildcarding.
+    ext = comp.extension(path)
+    path = comp.strip_extension(path)
+
     # Split the string apart by things that match the name so that if the
     # name contains numbers or things that look like versions, we don't
     # catch them with the version wildcard.
@@ -261,4 +265,4 @@ def wildcard_version(path):
         name_parts[i] = vgroup.join(re.escape(vp) for vp in vparts)
 
     # Put it all back together with original name matches intact.
-    return ''.join(name_parts)
+    return ''.join(name_parts) + '.' + ext
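The extension is stripped before wildcarding and re-appended literally, so digits in extensions like `bz2` are not mistaken for part of the version. A rough illustration of the intent; the regex below is a simplified stand-in, not the actual output of `wildcard_version`:

    import re

    # Wildcard only the version digits; keep the extension literal so the
    # '2' in 'bz2' is never swallowed by the version capture group.
    path = 'http://example.com/foo-1.2.1.tar.bz2'
    regex = r'foo-(\d+(?:\.\d+)*)\.tar\.bz2'    # hypothetical result shape

    m = re.search(regex, path)
    print(m.group(1))                           # '1.2.1', extension left intact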

View file

@@ -25,7 +25,7 @@
 import re
 import sys
 import subprocess
-import urllib2
+import urllib2, cookielib
 import urlparse
 from multiprocessing import Pool
 from HTMLParser import HTMLParser, HTMLParseError
@@ -68,7 +68,7 @@ def _spider(args):
        pool. Firing off all the child links at once makes the fetch MUCH
        faster for pages with lots of children.
     """
-    url, depth, max_depth, raise_on_error = args
+    url, visited, root, opener, depth, max_depth, raise_on_error = args
 
     pages = {}
     try:
@@ -82,12 +82,12 @@ def _spider(args):
         resp = urllib2.urlopen(req, timeout=TIMEOUT)
 
         if not "Content-type" in resp.headers:
-            tty.warn("ignoring page " + url)
+            tty.debug("ignoring page " + url)
             return pages
 
         if not resp.headers["Content-type"].startswith('text/html'):
-            tty.warn("ignoring page " + url + " with content type " +
-                     resp.headers["Content-type"])
+            tty.debug("ignoring page " + url + " with content type " +
+                      resp.headers["Content-type"])
             return pages
 
         # Do the real GET request when we know it's just HTML.
@@ -114,15 +114,30 @@ def _spider(args):
                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))
+
+                # Skip things outside the root directory
+                if not abs_link.startswith(root):
+                    continue
+
+                # Skip already-visited links
+                if abs_link in visited:
+                    continue
+
+                subcalls.append((abs_link, visited, root, None, depth+1, max_depth, raise_on_error))
+                visited.add(abs_link)
 
             if subcalls:
-                pool = Pool(processes=len(subcalls))
-                dicts = pool.map(_spider, subcalls)
-                for d in dicts:
-                    pages.update(d)
+                try:
+                    pool = Pool(processes=len(subcalls))
+                    dicts = pool.map(_spider, subcalls)
+                    for d in dicts:
+                        pages.update(d)
+                finally:
+                    pool.terminate()
+                    pool.join()
 
     except urllib2.URLError, e:
+        tty.debug(e)
+
         if raise_on_error:
             raise spack.error.NoNetworkConnectionError(str(e), url)
@@ -137,7 +152,8 @@ def _spider(args):
         tty.warn(msg, url, "HTMLParseError: " + str(e))
 
     except Exception, e:
-        pass    # Other types of errors are completely ignored.
+        # Other types of errors are completely ignored, except in debug mode.
+        tty.debug("Error in _spider: %s" % e)
 
     return pages
@@ -151,5 +167,5 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, 1, max_depth, False))
+    pages = _spider((root_url, set(), root_url, None, 1, max_depth, False))
     return pages
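A `visited` set and the `root` URL are now threaded through the spider arguments, so links outside the root prefix or already seen on this page are filtered out before any sub-fetch is scheduled. The filtering logic in isolation, with hypothetical link values rather than real scraped pages:

    root = 'http://example.com/downloads/'
    visited = set([root])
    candidate_links = [
        'http://example.com/downloads/foo-1.0.tar.gz',
        'http://example.com/downloads/',              # already visited
        'http://other-site.example.org/foo/',         # outside the root
    ]

    subcalls = []
    for abs_link in candidate_links:
        if not abs_link.startswith(root):   # skip things outside the root directory
            continue
        if abs_link in visited:             # skip already-visited links
            continue
        subcalls.append(abs_link)
        visited.add(abs_link)

    print(subcalls)   # ['http://example.com/downloads/foo-1.0.tar.gz']

This keeps the recursive fetch from wandering off-site and from re-downloading the same listing page, which is most of what makes the new scraper faster and quieter than the old one.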