Add better tests for web.py; fix some bugs found with spidering.

- _spider in web.py was actually failing to spider deeper than a certain
  point.
- Fixed multiprocessing pools to not use daemons and to allow recursive
  spawning.
- Added detailed tests for spidering and for finding archive versions.
- left some xfail URL finding exercises for the reader.
- Fix noqa annotations for some @when decorators
This commit is contained in:
Todd Gamblin 2017-04-01 14:03:54 -07:00
parent 28d6d375b4
commit 221f179716
27 changed files with 306 additions and 67 deletions

View file

@ -570,7 +570,7 @@ def __init__(self, spec):
self.list_url = None
if not hasattr(self, 'list_depth'):
self.list_depth = 1
self.list_depth = 0
# Set default licensing information
if not hasattr(self, 'license_required'):
@ -966,6 +966,10 @@ def do_stage(self, mirror_only=False):
self.stage.expand_archive()
self.stage.chdir_to_source()
def patch(self):
"""Default patch implementation is a no-op."""
pass
def do_patch(self):
"""Calls do_stage(), then applies patches to the expanded tarball if they
haven't been applied already."""
@ -1686,9 +1690,7 @@ def fetch_remote_versions(self):
try:
return spack.util.web.find_versions_of_archive(
*self.all_urls,
list_url=self.list_url,
list_depth=self.list_depth)
self.all_urls, self.list_url, self.list_depth)
except spack.error.NoNetworkConnectionError as e:
tty.die("Package.fetch_versions couldn't connect to:", e.url,
e.message)

View file

@ -0,0 +1,10 @@
<html>
<head>
This is page 1.
</head>
<body>
<a href="2.html">list_depth=2 follows this.</a>
<a href="foo-1.0.0.tar.gz">foo-1.0.0.tar.gz</a>
</body>
</html>

View file

@ -0,0 +1,12 @@
<html>
<head>
This is page 2.
</head>
<body>
<a href="3.html">list_depth=3 follows this.</a>
<a href="4.html">list_depth=3 follows this too.</a>
<a href="foo-2.0.0.tar.gz">foo-2.0.0.tar.gz</a>
<a href="foo-2.0.0b2.tar.gz">foo-2.0.0b2.tar.gz</a>
</body>
</html>

View file

@ -0,0 +1,11 @@
<html>
<head>
This is page 3.
</head>
<body>
<a href="index.html">This link is already visited.</a>
<a href="foo-3.0.tar.gz">foo-3.0.tar.gz</a>
<a href="foo-3.0a1.tar.gz">foo-3.0a1.tar.gz</a>
</body>
</html>

View file

@ -0,0 +1,11 @@
<html>
<head>
This is page 4.
</head>
<body>
This page is terminal and has no links to other pages.
<a href="foo-4.5.tar.gz">foo-4.5.tar.gz.</a>
<a href="foo-4.5-rc5.tar.gz">foo-4.5-rc5.tar.gz.</a>
</body>
</html>

View file

@ -0,0 +1,10 @@
<html>
<head>
This is the root page.
</head>
<body>
<a href="1.html">list_depth=1 follows this.</a>
<a href="foo-0.0.0.tar.gz">foo-0.0.0.tar.gz</a>
</body>
</html>

165
lib/spack/spack/test/web.py Normal file
View file

@ -0,0 +1,165 @@
##############################################################################
# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
#
# This file is part of Spack.
# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
# LLNL-CODE-647188
#
# For details, see https://github.com/llnl/spack
# Please also see the LICENSE file for our notice and the LGPL.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License (as
# published by the Free Software Foundation) version 2.1, February 1999.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
# conditions of the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##############################################################################
"""Tests for web.py."""
import pytest
import os
import spack
from spack.util.web import spider, find_versions_of_archive
from spack.version import *
# Directory containing the static HTML pages used as spidering fixtures.
web_data_path = os.path.join(spack.test_path, 'data', 'web')

# file:// URLs for the fixture pages. Every URL is built the same way
# (os.path.join on the fixture directory) so they stay consistent with
# each other.
root = 'file://' + os.path.join(web_data_path, 'index.html')
root_tarball = 'file://' + os.path.join(web_data_path, 'foo-0.0.0.tar.gz')
page_1 = 'file://' + os.path.join(web_data_path, '1.html')
page_2 = 'file://' + os.path.join(web_data_path, '2.html')
page_3 = 'file://' + os.path.join(web_data_path, '3.html')
page_4 = 'file://' + os.path.join(web_data_path, '4.html')
def test_spider_0():
    """Spider the fixture site with depth=0 and check what was visited."""
    pages, links = spider(root, depth=0)

    # Only the root page itself is expected among the fetched pages.
    assert root in pages
    for unvisited in (page_1, page_2, page_3, page_4):
        assert unvisited not in pages

    assert "This is the root page." in pages[root]

    # Of the fixture pages, only page 1 is expected among the links seen.
    assert root not in links
    assert page_1 in links
    for unseen in (page_2, page_3, page_4):
        assert unseen not in links
def test_spider_1():
    """Spider the fixture site with depth=1 and check what was visited."""
    pages, links = spider(root, depth=1)

    # The root and page 1 are expected to be fetched; nothing deeper.
    for fetched in (root, page_1):
        assert fetched in pages
    for unvisited in (page_2, page_3, page_4):
        assert unvisited not in pages

    assert "This is the root page." in pages[root]
    assert "This is page 1." in pages[page_1]

    # Links seen should reach as far as page 2, but no further.
    assert root not in links
    for seen in (page_1, page_2):
        assert seen in links
    for unseen in (page_3, page_4):
        assert unseen not in links
def test_spider_2():
    """Spider the fixture site with depth=2 and check what was visited."""
    pages, links = spider(root, depth=2)

    # The root, page 1 and page 2 are expected to be fetched.
    for fetched in (root, page_1, page_2):
        assert fetched in pages
    for unvisited in (page_3, page_4):
        assert unvisited not in pages

    assert "This is the root page." in pages[root]
    assert "This is page 1." in pages[page_1]
    assert "This is page 2." in pages[page_2]

    # Every numbered page should show up as a link; the root should not.
    assert root not in links
    for seen in (page_1, page_2, page_3, page_4):
        assert seen in links
def test_spider_3():
    """Spider the fixture site with depth=3 and check what was visited."""
    pages, links = spider(root, depth=3)

    # At this depth every fixture page is expected to be fetched.
    for fetched in (root, page_1, page_2, page_3, page_4):
        assert fetched in pages

    assert "This is the root page." in pages[root]
    assert "This is page 1." in pages[page_1]
    assert "This is page 2." in pages[page_2]
    assert "This is page 3." in pages[page_3]
    assert "This is page 4." in pages[page_4]

    assert root in links  # circular link on page 3
    for seen in (page_1, page_2, page_3, page_4):
        assert seen in links
def test_find_versions_of_archive_0():
    """With list_depth=0, version 0.0.0 must be among the versions found."""
    found = find_versions_of_archive(root_tarball, root, list_depth=0)
    assert ver('0.0.0') in found
def test_find_versions_of_archive_1():
    """With list_depth=1, versions 0.0.0 and 1.0.0 must be found."""
    found = find_versions_of_archive(root_tarball, root, list_depth=1)
    for expected in ('0.0.0', '1.0.0'):
        assert ver(expected) in found
def test_find_versions_of_archive_2():
    """With list_depth=2, versions 0.0.0, 1.0.0 and 2.0.0 must be found."""
    found = find_versions_of_archive(root_tarball, root, list_depth=2)
    for expected in ('0.0.0', '1.0.0', '2.0.0'):
        assert ver(expected) in found
@pytest.mark.xfail
def test_find_exotic_versions_of_archive_2():
    """Expected failure: the 2.0.0b2 pre-release is not yet detected."""
    found = find_versions_of_archive(root_tarball, root, list_depth=2)
    # up for grabs to make this better.
    exotic = ver('2.0.0b2')
    assert exotic in found
def test_find_versions_of_archive_3():
    """With list_depth=3, all five plain release versions must be found."""
    found = find_versions_of_archive(root_tarball, root, list_depth=3)
    for expected in ('0.0.0', '1.0.0', '2.0.0', '3.0', '4.5'):
        assert ver(expected) in found
@pytest.mark.xfail
def test_find_exotic_versions_of_archive_3():
    """Expected failure: beta, alpha and rc versions are not yet detected."""
    found = find_versions_of_archive(root_tarball, root, list_depth=3)
    for exotic in ('2.0.0b2', '3.0a1', '4.5-rc5'):
        assert ver(exotic) in found

View file

@ -25,11 +25,12 @@
import re
import os
import sys
import traceback
from six.moves.urllib.request import urlopen, Request
from six.moves.urllib.error import URLError
from six.moves.urllib.parse import urljoin
from multiprocessing import Pool
import multiprocessing.pool
try:
# Python 2 had these in the HTMLParser package.
@ -67,25 +68,42 @@ def handle_starttag(self, tag, attrs):
self.links.append(val)
def _spider(args):
"""_spider(url, depth, max_depth)
class NonDaemonProcess(multiprocessing.Process):
"""Process that allows sub-processes, so pools can have sub-pools."""
def _get_daemon(self):
return False
Fetches URL and any pages it links to up to max_depth. depth should
initially be 1, and max_depth includes the root. This function will
print out a warning only if the root can't be fetched; it ignores
def _set_daemon(self, value):
pass
daemon = property(_get_daemon, _set_daemon)
class NonDaemonPool(multiprocessing.pool.Pool):
"""Pool that uses non-daemon processes"""
Process = NonDaemonProcess
def _spider(url, visited, root, depth, max_depth, raise_on_error):
"""Fetches URL and any pages it links to up to max_depth.
depth should initially be zero, and max_depth is the max depth of
links to follow from the root.
Prints out a warning only if the root can't be fetched; it ignores
errors with pages that the root links to.
This will return a list of the pages fetched, in no particular order.
Takes args as a tuple b/c it's intended to be used by a multiprocessing
pool. Firing off all the child links at once makes the fetch MUCH
faster for pages with lots of children.
Returns a tuple of:
- pages: dict of pages visited (URL) mapped to their full text.
- links: set of links encountered while visiting the pages.
"""
url, visited, root, opener, depth, max_depth, raise_on_error = args
pages = {} # dict from page URL -> text content.
links = set() # set of all links seen on visited pages.
# root may end with index.html -- chop that off.
if root.endswith('/index.html'):
root = re.sub('/index.html$', '', root)
try:
# Make a HEAD request first to check the content type. This lets
# us ignore tarballs and gigantic files.
@ -139,17 +157,19 @@ def _spider(args):
# If we're not at max depth, follow links.
if depth < max_depth:
subcalls.append((abs_link, visited, root, None,
subcalls.append((abs_link, visited, root,
depth + 1, max_depth, raise_on_error))
visited.add(abs_link)
if subcalls:
pool = NonDaemonPool(processes=len(subcalls))
try:
pool = Pool(processes=len(subcalls))
results = pool.map(_spider, subcalls)
results = pool.map(_spider_wrapper, subcalls)
for sub_pages, sub_links in results:
pages.update(sub_pages)
links.update(sub_links)
finally:
pool.terminate()
pool.join()
@ -171,46 +191,53 @@ def _spider(args):
except Exception as e:
# Other types of errors are completely ignored, except in debug mode.
tty.debug("Error in _spider: %s" % e)
tty.debug("Error in _spider: %s:%s" % (type(e), e),
traceback.format_exc())
return pages, links
def spider(root_url, **kwargs):
def _spider_wrapper(args):
    """Unpack an argument tuple and forward it to ``_spider``.

    ``pool.map`` hands each worker a single object, so the positional
    arguments for ``_spider`` travel as one tuple and are expanded here.
    """
    result = _spider(*args)
    return result
def spider(root_url, depth=0):
"""Gets web pages from a root URL.
If depth is specified (e.g., depth=2), then this will also fetches pages
linked from the root and its children up to depth.
If depth is specified (e.g., depth=2), then this will also follow
up to <depth> levels of links from the root.
This will spawn processes to fetch the children, for much improved
performance over a sequential fetch.
"""
max_depth = kwargs.setdefault('depth', 1)
pages, links = _spider((root_url, set(), root_url, None,
1, max_depth, False))
pages, links = _spider(root_url, set(), root_url, 0, depth, False)
return pages, links
def find_versions_of_archive(*archive_urls, **kwargs):
def find_versions_of_archive(archive_urls, list_url=None, list_depth=0):
"""Scrape web pages for new versions of a tarball.
Arguments:
archive_urls:
URLs for different versions of a package. Typically these
are just the tarballs from the package file itself. By
default, this searches the parent directories of archives.
URL or sequence of URLs for different versions of a
package. Typically these are just the tarballs from the package
file itself. By default, this searches the parent directories
of archives.
Keyword Arguments:
list_url:
URL for a listing of archives. Spack will scrape these
pages for download links that look like the archive URL.
list_depth:
Max depth to follow links on list_url pages.
Max depth to follow links on list_url pages. Default 0.
"""
list_url = kwargs.get('list_url', None)
list_depth = kwargs.get('list_depth', 1)
if not isinstance(archive_urls, (list, tuple)):
archive_urls = [archive_urls]
# Generate a list of list_urls based on archive urls and any
# explicitly listed list_url in the package

View file

@ -34,7 +34,7 @@ class Autogen(AutotoolsPackage):
homepage = "https://www.gnu.org/software/autogen/index.html"
url = "https://ftp.gnu.org/gnu/autogen/rel5.18.12/autogen-5.18.12.tar.gz"
list_url = "https://ftp.gnu.org/gnu/autogen"
list_depth = 2
list_depth = 1
version('5.18.12', '551d15ccbf5b5fc5658da375d5003389')

View file

@ -40,7 +40,7 @@ class Boost(Package):
homepage = "http://www.boost.org"
url = "http://downloads.sourceforge.net/project/boost/boost/1.55.0/boost_1_55_0.tar.bz2"
list_url = "http://sourceforge.net/projects/boost/files/boost/"
list_depth = 2
list_depth = 1
version('1.63.0', '1c837ecd990bb022d07e7aab32b09847')
version('1.62.0', '5fb94629535c19e48703bdb2b2e9490f')

View file

@ -31,7 +31,7 @@ class Cmake(Package):
homepage = 'https://www.cmake.org'
url = 'https://cmake.org/files/v3.4/cmake-3.4.3.tar.gz'
list_url = 'https://cmake.org/files/'
list_depth = 2
list_depth = 1
version('3.7.2', '79bd7e65cd81ea3aa2619484ad6ff25a')
version('3.7.1', 'd031d5a06e9f1c5367cdfc56fbd2a1c8')

View file

@ -37,10 +37,10 @@ class Elfutils(AutotoolsPackage):
url = "https://sourceware.org/elfutils/ftp/0.168/elfutils-0.168.tar.bz2"
list_url = "https://sourceware.org/elfutils/ftp"
list_depth = 2
list_depth = 1
version('0.168','52adfa40758d0d39e5d5c57689bf38d6')
version('0.163','77ce87f259987d2e54e4d87b86cbee41', preferred=True)
version('0.168', '52adfa40758d0d39e5d5c57689bf38d6')
version('0.163', '77ce87f259987d2e54e4d87b86cbee41', preferred=True)
provides('elf@1')

View file

@ -37,7 +37,7 @@ class Gcc(AutotoolsPackage):
url = "http://ftp.gnu.org/gnu/gcc/gcc-4.9.2/gcc-4.9.2.tar.bz2"
list_url = 'http://ftp.gnu.org/gnu/gcc/'
list_depth = 2
list_depth = 1
version('6.3.0', '677a7623c7ef6ab99881bc4e048debb6')
version('6.2.0', '9768625159663b300ae4de2f4745fcc4')

View file

@ -38,7 +38,7 @@ class Gdal(Package):
homepage = "http://www.gdal.org/"
url = "http://download.osgeo.org/gdal/2.1.2/gdal-2.1.2.tar.xz"
list_url = "http://download.osgeo.org/gdal/"
list_depth = 2
list_depth = 1
version('2.1.2', 'ae85b78888514c75e813d658cac9478e')
version('2.0.2', '940208e737c87d31a90eaae43d0efd65')

View file

@ -42,7 +42,7 @@ class Hwloc(AutotoolsPackage):
homepage = "http://www.open-mpi.org/projects/hwloc/"
url = "http://www.open-mpi.org/software/hwloc/v1.9/downloads/hwloc-1.9.tar.gz"
list_url = "http://www.open-mpi.org/software/hwloc/"
list_depth = 3
list_depth = 2
version('1.11.6', 'b4e95eadd2fbdb6d40bbd96be6f03c84')
version('1.11.5', '8f5fe6a9be2eb478409ad5e640b2d3ba')

View file

@ -34,6 +34,6 @@ class Hydra(AutotoolsPackage):
homepage = "http://www.mpich.org"
url = "http://www.mpich.org/static/downloads/3.2/hydra-3.2.tar.gz"
list_url = "http://www.mpich.org/static/downloads/"
list_depth = 2
list_depth = 1
version('3.2', '4d670916695bf7e3a869cc336a881b39')

View file

@ -32,7 +32,7 @@ class Mpich(AutotoolsPackage):
homepage = "http://www.mpich.org"
url = "http://www.mpich.org/static/downloads/3.0.4/mpich-3.0.4.tar.gz"
list_url = "http://www.mpich.org/static/downloads/"
list_depth = 2
list_depth = 1
version('3.2', 'f414cfa77099cd1fa1a5ae4e22db508a')
version('3.1.4', '2ab544607986486562e076b83937bba2')

View file

@ -61,7 +61,7 @@ class Openmpi(AutotoolsPackage):
homepage = "http://www.open-mpi.org"
url = "https://www.open-mpi.org/software/ompi/v2.1/downloads/openmpi-2.1.0.tar.bz2"
list_url = "http://www.open-mpi.org/software/ompi/"
list_depth = 3
list_depth = 2
version('2.1.0', '4838a5973115c44e14442c01d3f21d52')
version('2.0.2', 'ecd99aa436a1ca69ce936a96d6a3fa48')

View file

@ -37,7 +37,7 @@ class Openssl(Package):
# URL must remain http:// so Spack can bootstrap curl
url = "http://www.openssl.org/source/openssl-1.0.1h.tar.gz"
list_url = "https://www.openssl.org/source/old/"
list_depth = 2
list_depth = 1
version('1.1.0e', '51c42d152122e474754aea96f66928c6')
version('1.1.0d', '711ce3cd5f53a99c0e12a7d5804f0f63')

View file

@ -33,7 +33,7 @@ class Pango(AutotoolsPackage):
homepage = "http://www.pango.org"
url = "http://ftp.gnome.org/pub/GNOME/sources/pango/1.40/pango-1.40.3.tar.xz"
list_url = "http://ftp.gnome.org/pub/gnome/sources/pango/"
list_depth = 2
list_depth = 1
version('1.40.3', 'abba8b5ce728520c3a0f1535eab19eac3c14aeef7faa5aded90017ceac2711d3')
version('1.40.1', 'e27af54172c72b3ac6be53c9a4c67053e16c905e02addcf3a603ceb2005c1a40')

View file

@ -33,7 +33,7 @@ class Patchelf(AutotoolsPackage):
url = "http://nixos.org/releases/patchelf/patchelf-0.8/patchelf-0.8.tar.gz"
list_url = "http://nixos.org/releases/patchelf/"
list_depth = 2
list_depth = 1
version('0.9', '3c265508526760f233620f35d79c79fc')
version('0.8', '407b229e6a681ffb0e2cdd5915cb2d01')

View file

@ -31,8 +31,6 @@ class PyNose(PythonPackage):
homepage = "https://pypi.python.org/pypi/nose"
url = "https://pypi.io/packages/source/n/nose/nose-1.3.4.tar.gz"
list_url = "https://pypi.python.org/pypi/nose/"
list_depth = 2
import_modules = [
'nose', 'nose.ext', 'nose.plugins', 'nose.sphinx', 'nose.tools'

View file

@ -30,8 +30,6 @@ class PyScikitLearn(PythonPackage):
homepage = "https://pypi.python.org/pypi/scikit-learn"
url = "https://pypi.io/packages/source/s/scikit-learn/scikit-learn-0.18.1.tar.gz"
list_url = "https://pypi.python.org/pypi/scikit-learn"
list_depth = 2
version('0.18.1', '6b0ff1eaa5010043895dd63d1e3c60c9')
version('0.15.2', 'd9822ad0238e17b382a3c756ea94fe0d')

View file

@ -42,7 +42,7 @@ class Python(Package):
homepage = "http://www.python.org"
url = "http://www.python.org/ftp/python/2.7.8/Python-2.7.8.tgz"
list_url = "https://www.python.org/downloads/"
list_depth = 2
list_depth = 1
version('3.6.0', '3f7062ccf8be76491884d0e47ac8b251')
version('3.5.2', '3fe8434643a78630c61c6464fe2e7e72')
@ -99,11 +99,6 @@ def patch(self):
r'\1setup.py\2 --no-user-cfg \3\6'
)
@when('@:2.6,3.0:3.3')
def patch(self):
# See https://github.com/LLNL/spack/issues/1490
pass
def install(self, spec, prefix):
# TODO: The '--no-user-cfg' option for Python installation is only in
# Python v2.7 and v3.4+ (see https://bugs.python.org/issue1180) and

View file

@ -32,7 +32,7 @@ class QtCreator(Package):
url = 'http://download.qt.io/official_releases/qtcreator/4.1/4.1.0/qt-creator-opensource-src-4.1.0.tar.gz'
list_url = 'http://download.qt.io/official_releases/qtcreator/'
list_depth = 3
list_depth = 2
version('4.1.0', '657727e4209befa4bf5889dff62d9e0a')

View file

@ -33,7 +33,7 @@ class Qt(Package):
homepage = 'http://qt.io'
url = 'http://download.qt.io/archive/qt/5.7/5.7.0/single/qt-everywhere-opensource-src-5.7.0.tar.gz'
list_url = 'http://download.qt.io/archive/qt/'
list_depth = 4
list_depth = 3
version('5.7.1', '031fb3fd0c3cc0f1082644492683f18d')
version('5.7.0', '9a46cce61fc64c20c3ac0a0e0fa41b42')
@ -251,7 +251,7 @@ def common_config_args(self):
# Don't disable all the database drivers, but should
# really get them into spack at some point.
@when('@3')
@when('@3') # noqa: F811
def configure(self):
# A user reported that this was necessary to link Qt3 on ubuntu.
# However, if LD_LIBRARY_PATH is not set the qt build fails, check
@ -268,7 +268,7 @@ def configure(self):
'-release',
'-fast')
@when('@4')
@when('@4') # noqa: F811
def configure(self):
configure('-fast',
'-{0}gtkstyle'.format('' if '+gtk' in self.spec else 'no-'),
@ -276,7 +276,7 @@ def configure(self):
'-arch', str(self.spec.architecture.target),
*self.common_config_args)
@when('@5.0:5.6')
@when('@5.0:5.6') # noqa: F811
def configure(self):
webkit_args = [] if '+webkit' in self.spec else ['-skip', 'qtwebkit']
configure('-no-eglfs',
@ -284,7 +284,7 @@ def configure(self):
'-{0}gtkstyle'.format('' if '+gtk' in self.spec else 'no-'),
*(webkit_args + self.common_config_args))
@when('@5.7:')
@when('@5.7:') # noqa: F811
def configure(self):
config_args = self.common_config_args

View file

@ -31,7 +31,7 @@ class UtilLinux(AutotoolsPackage):
homepage = "http://freecode.com/projects/util-linux"
url = "https://www.kernel.org/pub/linux/utils/util-linux/v2.29/util-linux-2.29.1.tar.gz"
list_url = "https://www.kernel.org/pub/linux/utils/util-linux"
list_depth = 2
list_depth = 1
version('2.29.1', 'c7d5c111ef6bc5df65659e0b523ac9d9')
version('2.25', 'f6d7fc6952ec69c4dc62c8d7c59c1d57')