Improved website scraping.

commit 55bf243f16, parent d78ece658b
8 changed files with 136 additions and 97 deletions

@@ -113,4 +113,5 @@ except SpackError, e:
     tty.die(e.message)
 
 except KeyboardInterrupt:
+    sys.stderr.write('\n')
     tty.die("Keyboard interrupt.")

@@ -85,24 +85,24 @@ def checksum(parser, args):
     pkg = spack.db.get(args.package)
 
     # If the user asked for specific versions, use those.
-    versions = [ver(v) for v in args.versions]
-
-    if not all(type(v) == Version for v in versions):
-        tty.die("Cannot generate checksums for version lists or " +
-                "version ranges. Use unambiguous versions.")
-
-    if not versions:
-        versions = pkg.fetch_available_versions()
+    if args.versions:
+        versions = {}
+        for v in args.versions:
+            v = ver(v)
+            if not isinstance(v, Version):
+                tty.die("Cannot generate checksums for version lists or " +
+                        "version ranges. Use unambiguous versions.")
+            versions[v] = pkg.url_for_version(v)
+    else:
+        versions = pkg.fetch_remote_versions()
         if not versions:
-            tty.die("Could not fetch any available versions for %s." % pkg.name)
+            tty.die("Could not fetch any versions for %s." % pkg.name)
 
-    versions = list(reversed(sorted(versions)))
-    urls = [pkg.url_for_version(v) for v in versions]
+    sorted_versions = list(reversed(sorted(versions)))
 
-    tty.msg("Found %s versions of %s." % (len(urls), pkg.name),
+    tty.msg("Found %s versions of %s." % (len(versions), pkg.name),
             *spack.cmd.elide_list(
-                ["%-10s%s" % (v,u) for v, u in zip(versions, urls)]))
+                ["%-10s%s" % (v, versions[v]) for v in sorted_versions]))
     print
     archives_to_fetch = tty.get_number(
         "How many would you like to checksum?", default=5, abort='q')

@@ -112,10 +112,12 @@ def checksum(parser, args):
         return
 
     version_hashes = get_checksums(
-        versions[:archives_to_fetch], urls[:archives_to_fetch], keep_stage=args.keep_stage)
+        sorted_versions[:archives_to_fetch],
+        [versions[v] for v in sorted_versions[:archives_to_fetch]],
+        keep_stage=args.keep_stage)
 
     if not version_hashes:
-        tty.die("Could not fetch any available versions for %s." % pkg.name)
+        tty.die("Could not fetch any versions for %s." % pkg.name)
 
     version_lines = ["  version('%s', '%s')" % (v, h) for v, h in version_hashes]
     tty.msg("Checksummed new versions of %s:" % pkg.name, *version_lines)

@@ -159,13 +159,12 @@ def create(parser, args):
     else:
         mkdirp(os.path.dirname(pkg_path))
 
-    versions = list(reversed(spack.package.find_versions_of_archive(url)))
+    versions = spack.package.find_versions_of_archive(url)
 
     archives_to_fetch = 1
     if not versions:
         # If the fetch failed for some reason, revert to what the user provided
-        versions = [version]
-        urls = [url]
+        versions = { version : url }
     else:
         urls = [spack.url.substitute_version(url, v) for v in versions]
         if len(urls) > 1:

@@ -181,6 +180,8 @@ def create(parser, args):
             tty.msg("Aborted.")
             return
 
+    sorted_versions = list(reversed(versions))
+
     guesser = ConfigureGuesser()
     ver_hash_tuples = spack.cmd.checksum.get_checksums(
         versions[:archives_to_fetch], urls[:archives_to_fetch],

@@ -24,6 +24,7 @@
 ##############################################################################
 import os
 from llnl.util.tty.colify import colify
+import llnl.util.tty as tty
 import spack
 
 description ="List available versions of a package"

@@ -34,4 +35,21 @@ def setup_parser(subparser):
 
 def versions(parser, args):
     pkg = spack.db.get(args.package)
-    colify(reversed(pkg.fetch_available_versions()))
+
+    safe_versions = pkg.versions
+    fetched_versions = pkg.fetch_remote_versions()
+    remote_versions = set(fetched_versions).difference(safe_versions)
+
+    tty.msg("Safe versions (already checksummed):")
+    colify(sorted(safe_versions, reverse=True), indent=2)
+
+    tty.msg("Remote versions (not yet checksummed):")
+    if not remote_versions:
+        if not fetched_versions:
+            print "  Found no versions for %s" % pkg.name
+            tty.debug("Check the list_url and list_depth attribute on the "
+                      "package to help Spack find versions.")
+        else:
+            print "  Found no unckecksummed versions for %s" % pkg.name
+    else:
+        colify(sorted(remote_versions, reverse=True), indent=2)

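As context for the hunk above: a minimal sketch, with toy data and hypothetical values, of the safe/remote split that the new versions command performs. It assumes Version from spack.version, with plain dicts standing in for pkg.versions and the result of pkg.fetch_remote_versions().

    from spack.version import Version

    # Hypothetical stand-ins: checksummed versions map to metadata,
    # scraped versions map to the URLs they were found at.
    safe_versions = {Version('1.0'): {'md5': '0123456789abcdef'}}
    fetched_versions = {Version('1.0'): 'http://example.com/foo-1.0.tar.gz',
                        Version('1.1'): 'http://example.com/foo-1.1.tar.gz'}

    # Versions seen on the web but not yet checksummed in the package file.
    remote_versions = set(fetched_versions).difference(safe_versions)
    # remote_versions now contains only Version('1.1').
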
@@ -68,7 +68,7 @@ def concretize_version(self, spec):
         # If there are known avaialble versions, return the most recent
         # version that satisfies the spec
         pkg = spec.package
-        valid_versions = [v for v in pkg.available_versions
+        valid_versions = [v for v in pkg.versions
                           if any(v.satisfies(sv) for sv in spec.versions)]
 
         if valid_versions:

@@ -39,7 +39,7 @@
 import subprocess
 import platform as py_platform
 import multiprocessing
-from urlparse import urlparse
+from urlparse import urlparse, urljoin
 
 import llnl.util.tty as tty
 from llnl.util.filesystem import *

@@ -333,9 +333,6 @@ def __init__(self, spec):
         if '.' in self.name:
             self.name = self.name[self.name.rindex('.') + 1:]
 
-        # This is set by scraping a web page.
-        self._available_versions = None
-
         # Sanity check some required variables that could be
         # overridden by package authors.
         def ensure_has_dict(attr_name):

@@ -370,14 +367,15 @@ def ensure_has_dict(attr_name):
 
         # Init fetch strategy and url to None
         self._fetcher = None
-        self.url = None
+        self.url = getattr(self.__class__, 'url', None)
 
         # Fix up self.url if this package fetches with a URLFetchStrategy.
         # This makes self.url behave sanely.
         if self.spec.versions.concrete:
-            # TODO: this is a really roundabout way of determining the type of fetch to do.
-            # TODO: figure out a more sane fetch strategy/package init order
-            # TODO: (right now it's conflated with stage, package, and the tests make assumptions)
+            # TODO: this is a really roundabout way of determining the type
+            # TODO: of fetch to do. figure out a more sane fetch strategy/package
+            # TODO: init order (right now it's conflated with stage, package, and
+            # TODO: the tests make assumptions)
             f = fs.for_package_version(self, self.version)
             if isinstance(f, fs.URLFetchStrategy):
                 self.url = self.url_for_version(self.spec.version)

@@ -852,71 +850,70 @@ def do_clean_dist(self):
         self.stage.destroy()
 
 
-    def fetch_available_versions(self):
-        if not hasattr(self, 'url'):
+    @property
+    def all_urls(self):
+        urls = []
+        if self.url:
+            urls.append(self.url)
+
+        for args in self.versions.values():
+            if 'url' in args:
+                urls.append(args['url'])
+        return urls
+
+
+    def fetch_remote_versions(self):
+        """Try to find remote versions of this package using the
+           list_url and any other URLs described in the package file."""
+        if not self.all_urls:
             raise VersionFetchError(self.__class__)
 
-        # If not, then try to fetch using list_url
-        if not self._available_versions:
-            try:
-                self._available_versions = find_versions_of_archive(
-                    self.url,
-                    list_url=self.list_url,
-                    list_depth=self.list_depth)
-
-                if not self._available_versions:
-                    tty.warn("Found no versions for %s" % self.name,
-                             "Check the list_url and list_depth attribute on the "
-                             + self.name + " package.",
-                             "Use them to tell Spack where to look for versions.")
-
-            except spack.error.NoNetworkConnectionError, e:
-                tty.die("Package.fetch_available_versions couldn't connect to:",
-                        e.url, e.message)
-
-        return self._available_versions
-
-
-    @property
-    def available_versions(self):
-        # If the package overrode available_versions, then use that.
-        if self.versions is not None:
-            return VersionList(self.versions.keys())
-        else:
-            vlist = self.fetch_available_versions()
-            if not vlist:
-                vlist = ver([self.version])
-            return vlist
-
-
-def find_versions_of_archive(archive_url, **kwargs):
+        try:
+            return find_versions_of_archive(
+                *self.all_urls, list_url=self.list_url, list_depth=self.list_depth)
+        except spack.error.NoNetworkConnectionError, e:
+            tty.die("Package.fetch_versions couldn't connect to:",
+                    e.url, e.message)
+
+
+def find_versions_of_archive(*archive_urls, **kwargs):
     list_url = kwargs.get('list_url', None)
     list_depth = kwargs.get('list_depth', 1)
 
-    if not list_url:
-        list_url = url.find_list_url(archive_url)
-
-    # This creates a regex from the URL with a capture group for the
-    # version part of the URL. The capture group is converted to a
-    # generic wildcard, so we can use this to extract things on a page
-    # that look like archive URLs.
-    url_regex = url.wildcard_version(archive_url)
-
-    # We'll be a bit more liberal and just look for the archive part,
-    # not the full path.
-    archive_regex = os.path.basename(url_regex)
+    # Generate a list of list_urls based on archive urls and any
+    # explicitly listed list_url in the package
+    list_urls = set()
+    if list_url:
+        list_urls.add(list_url)
+    for aurl in archive_urls:
+        list_urls.add(url.find_list_url(aurl))
 
     # Grab some web pages to scrape.
-    page_map = get_pages(list_url, depth=list_depth)
+    page_map = {}
+    for lurl in list_urls:
+        page_map.update(get_pages(lurl, depth=list_depth))
+
+    # Scrape them for archive URLs
+    regexes = []
+    for aurl in archive_urls:
+        # This creates a regex from the URL with a capture group for
+        # the version part of the URL. The capture group is converted
+        # to a generic wildcard, so we can use this to extract things
+        # on a page that look like archive URLs.
+        url_regex = url.wildcard_version(aurl)
+
+        # We'll be a bit more liberal and just look for the archive
+        # part, not the full path.
+        regexes.append(os.path.basename(url_regex))
 
     # Build a version list from all the matches we find
-    versions = VersionList()
-    for site, page in page_map.iteritems():
+    versions = {}
+    for page_url, content in page_map.iteritems():
         # extract versions from matches.
-        matches = re.finditer(archive_regex, page)
-        version_strings = set(m.group(1) for m in matches)
-        for v in version_strings:
-            versions.add(Version(v))
+        for regex in regexes:
+            versions.update(
+                (Version(m.group(1)), urljoin(page_url, m.group(0)))
+                for m in re.finditer(regex, content))
 
     return versions

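As context for the hunk above: a minimal sketch, with hypothetical URLs, of how the reworked scraping can be driven. find_versions_of_archive now accepts any number of archive URLs and returns a dict mapping each scraped Version to the URL it was found at, which is how the checksum command uses it.

    from spack.package import find_versions_of_archive

    # Hypothetical project: one known archive URL plus an explicit listing page.
    versions = find_versions_of_archive(
        'http://example.com/downloads/foo-1.0.tar.gz',
        list_url='http://example.com/downloads',
        list_depth=1)

    # Newest first, mirroring what the checksum command prints.
    for v in reversed(sorted(versions)):
        print "%-10s%s" % (v, versions[v])
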
@@ -979,8 +976,8 @@ class VersionFetchError(PackageError):
     """Raised when a version URL cannot automatically be determined."""
     def __init__(self, cls):
         super(VersionFetchError, self).__init__(
-            "Cannot fetch version for package %s " % cls.__name__ +
-            "because it does not define a default url.")
+            "Cannot fetch versions for package %s " % cls.__name__ +
+            "because it does not define any URLs to fetch.")
 
 
 class NoURLError(PackageError):

@@ -245,6 +245,10 @@ def wildcard_version(path):
     # Construct a case-insensitive regular expression for the package name.
     name_re = '(%s)' % insensitize(name)
 
+    # protect extensions like bz2 from wildcarding.
+    ext = comp.extension(path)
+    path = comp.strip_extension(path)
+
     # Split the string apart by things that match the name so that if the
     # name contains numbers or things that look like versions, we don't
     # catch them with the version wildcard.

@@ -261,4 +265,4 @@ def wildcard_version(path):
         name_parts[i] = vgroup.join(re.escape(vp) for vp in vparts)
 
     # Put it all back together with original name matches intact.
-    return ''.join(name_parts)
+    return ''.join(name_parts) + '.' + ext

@@ -25,7 +25,7 @@
 import re
 import sys
 import subprocess
-import urllib2
+import urllib2, cookielib
 import urlparse
 from multiprocessing import Pool
 from HTMLParser import HTMLParser, HTMLParseError

@@ -68,7 +68,7 @@ def _spider(args):
        pool. Firing off all the child links at once makes the fetch MUCH
        faster for pages with lots of children.
     """
-    url, depth, max_depth, raise_on_error = args
+    url, visited, root, opener, depth, max_depth, raise_on_error = args
 
     pages = {}
     try:

@@ -82,12 +82,12 @@ def _spider(args):
         resp = urllib2.urlopen(req, timeout=TIMEOUT)
 
         if not "Content-type" in resp.headers:
-            tty.warn("ignoring page " + url)
+            tty.debug("ignoring page " + url)
             return pages
 
         if not resp.headers["Content-type"].startswith('text/html'):
-            tty.warn("ignoring page " + url + " with content type " +
+            tty.debug("ignoring page " + url + " with content type " +
                      resp.headers["Content-type"])
             return pages
 
         # Do the real GET request when we know it's just HTML.

@@ -114,15 +114,30 @@ def _spider(args):
 
             # Evaluate the link relative to the page it came from.
             abs_link = urlparse.urljoin(response_url, raw_link)
-            subcalls.append((abs_link, depth+1, max_depth, raise_on_error))
+
+            # Skip things outside the root directory
+            if not abs_link.startswith(root):
+                continue
+
+            # Skip already-visited links
+            if abs_link in visited:
+                continue
+
+            subcalls.append((abs_link, visited, root, None, depth+1, max_depth, raise_on_error))
+            visited.add(abs_link)
 
         if subcalls:
-            pool = Pool(processes=len(subcalls))
-            dicts = pool.map(_spider, subcalls)
-            for d in dicts:
-                pages.update(d)
+            try:
+                pool = Pool(processes=len(subcalls))
+                dicts = pool.map(_spider, subcalls)
+                for d in dicts:
+                    pages.update(d)
+            finally:
+                pool.terminate()
+                pool.join()
 
     except urllib2.URLError, e:
+        tty.debug(e)
         if raise_on_error:
             raise spack.error.NoNetworkConnectionError(str(e), url)

@@ -137,7 +152,8 @@ def _spider(args):
         tty.warn(msg, url, "HTMLParseError: " + str(e))
 
     except Exception, e:
-        pass # Other types of errors are completely ignored.
+        # Other types of errors are completely ignored, except in debug mode.
+        tty.debug("Error in _spider: %s" % e)
 
     return pages
 

@@ -151,5 +167,5 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, 1, max_depth, False))
+    pages = _spider((root_url, set(), root_url, None, 1, max_depth, False))
     return pages

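As context for the hunk above: a minimal sketch, with a hypothetical URL, of the crawl entry point. get_pages seeds _spider with an empty visited set and the root URL, so links outside the root or already seen are skipped during the parallel crawl.

    from spack.util.web import get_pages

    # Hypothetical listing page, crawled two levels deep; the result maps
    # each fetched page URL to its HTML content.
    pages = get_pages('http://example.com/downloads', depth=2)
    for page_url, content in pages.iteritems():
        print page_url, len(content)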