Use JSON for the database instead of YAML. (#2189)

* Use JSON for the database instead of YAML.

- JSON is much faster than YAML *and* can preserve ordered keys.
  - 170x+ faster than Python YAML when using unordered dicts
  - 55x faster than Python YAML (both using OrderedDicts)
  - 8x faster than C YAML (with OrderedDicts)

- JSON is built into Python, unlike C YAML, so doesn't add a dependency.
- Don't need human readability for the package database.
- JSON requires no major changes to the code -- same object model as YAML.
- add to_json, from_json methods to spec.

* Add tests to ensure JSON and YAML don't need to be ordered in DB.

* Write index.json first time it's not found instead of requiring reindex.

* flake8 bug.
This commit is contained in:
Todd Gamblin 2016-12-05 10:03:58 -08:00 committed by becker33
parent 161d6205b2
commit 41b8f31bcd
8 changed files with 282 additions and 90 deletions

View file

@ -41,14 +41,6 @@ SPACK_PREFIX = os.path.dirname(os.path.dirname(SPACK_FILE))
SPACK_LIB_PATH = os.path.join(SPACK_PREFIX, "lib", "spack") SPACK_LIB_PATH = os.path.join(SPACK_PREFIX, "lib", "spack")
sys.path.insert(0, SPACK_LIB_PATH) sys.path.insert(0, SPACK_LIB_PATH)
# Try to use system YAML if it is available, as it might have libyaml
# support (for faster loading via C). Load it before anything in
# lib/spack/external so it will take precedence over Spack's PyYAML.
try:
import yaml
except ImportError:
pass # ignore and use slow yaml
# Add external libs # Add external libs
SPACK_EXTERNAL_LIBS = os.path.join(SPACK_LIB_PATH, "external") SPACK_EXTERNAL_LIBS = os.path.join(SPACK_LIB_PATH, "external")
sys.path.insert(0, SPACK_EXTERNAL_LIBS) sys.path.insert(0, SPACK_EXTERNAL_LIBS)

View file

@ -55,6 +55,7 @@
import spack.spec import spack.spec
from spack.error import SpackError from spack.error import SpackError
import spack.util.spack_yaml as syaml import spack.util.spack_yaml as syaml
import spack.util.spack_json as sjson
# DB goes in this directory underneath the root # DB goes in this directory underneath the root
@ -134,10 +135,12 @@ def __init__(self, root, db_dir=None):
under ``root/.spack-db``, which is created if it does not under ``root/.spack-db``, which is created if it does not
exist. This is the ``db_dir``. exist. This is the ``db_dir``.
The Database will attempt to read an ``index.yaml`` file in The Database will attempt to read an ``index.json`` file in
``db_dir``. If it does not find one, it will be created when ``db_dir``. If it does not find one, it will fall back to read
needed by scanning the entire Database root for ``spec.yaml`` an ``index.yaml`` if one is present. If that does not exist, it
files according to Spack's ``DirectoryLayout``. will create a database when needed by scanning the entire
Database root for ``spec.yaml`` files according to Spack's
``DirectoryLayout``.
Caller may optionally provide a custom ``db_dir`` parameter Caller may optionally provide a custom ``db_dir`` parameter
where data will be stored. This is intended to be used for where data will be stored. This is intended to be used for
@ -154,7 +157,8 @@ def __init__(self, root, db_dir=None):
self._db_dir = db_dir self._db_dir = db_dir
# Set up layout of database files within the db dir # Set up layout of database files within the db dir
self._index_path = join_path(self._db_dir, 'index.yaml') self._old_yaml_index_path = join_path(self._db_dir, 'index.yaml')
self._index_path = join_path(self._db_dir, 'index.json')
self._lock_path = join_path(self._db_dir, 'lock') self._lock_path = join_path(self._db_dir, 'lock')
# This is for other classes to use to lock prefix directories. # This is for other classes to use to lock prefix directories.
@ -179,8 +183,8 @@ def read_transaction(self, timeout=_db_lock_timeout):
"""Get a read lock context manager for use in a `with` block.""" """Get a read lock context manager for use in a `with` block."""
return ReadTransaction(self.lock, self._read, timeout=timeout) return ReadTransaction(self.lock, self._read, timeout=timeout)
def _write_to_yaml(self, stream): def _write_to_file(self, stream):
"""Write out the databsae to a YAML file. """Write out the databsae to a JSON file.
This function does not do any locking or transactions. This function does not do any locking or transactions.
""" """
@ -201,12 +205,12 @@ def _write_to_yaml(self, stream):
} }
try: try:
return syaml.dump( sjson.dump(database, stream)
database, stream=stream, default_flow_style=False)
except YAMLError as e: except YAMLError as e:
raise SpackYAMLError("error writing YAML database:", str(e)) raise syaml.SpackYAMLError(
"error writing YAML database:", str(e))
def _read_spec_from_yaml(self, hash_key, installs): def _read_spec_from_dict(self, hash_key, installs):
"""Recursively construct a spec from a hash in a YAML database. """Recursively construct a spec from a hash in a YAML database.
Does not do any locking. Does not do any locking.
@ -241,24 +245,32 @@ def _assign_dependencies(self, hash_key, installs, data):
child = data[dhash].spec child = data[dhash].spec
spec._add_dependency(child, dtypes) spec._add_dependency(child, dtypes)
def _read_from_yaml(self, stream): def _read_from_file(self, stream, format='json'):
""" """
Fill database from YAML, do not maintain old data Fill database from file, do not maintain old data
Translate the spec portions from node-dict form to spec form Translate the spec portions from node-dict form to spec form
Does not do any locking. Does not do any locking.
""" """
if format.lower() == 'json':
load = sjson.load
elif format.lower() == 'yaml':
load = syaml.load
else:
raise ValueError("Invalid database format: %s" % format)
try: try:
if isinstance(stream, basestring): if isinstance(stream, basestring):
with open(stream, 'r') as f: with open(stream, 'r') as f:
yfile = syaml.load(f) fdata = load(f)
else: else:
yfile = syaml.load(stream) fdata = load(stream)
except MarkedYAMLError as e: except MarkedYAMLError as e:
raise SpackYAMLError("error parsing YAML database:", str(e)) raise syaml.SpackYAMLError("error parsing YAML database:", str(e))
except Exception as e:
raise CorruptDatabaseError("error parsing database:", str(e))
if yfile is None: if fdata is None:
return return
def check(cond, msg): def check(cond, msg):
@ -266,10 +278,10 @@ def check(cond, msg):
raise CorruptDatabaseError( raise CorruptDatabaseError(
"Spack database is corrupt: %s" % msg, self._index_path) "Spack database is corrupt: %s" % msg, self._index_path)
check('database' in yfile, "No 'database' attribute in YAML.") check('database' in fdata, "No 'database' attribute in YAML.")
# High-level file checks # High-level file checks
db = yfile['database'] db = fdata['database']
check('installs' in db, "No 'installs' in YAML DB.") check('installs' in db, "No 'installs' in YAML DB.")
check('version' in db, "No 'version' in YAML DB.") check('version' in db, "No 'version' in YAML DB.")
@ -303,7 +315,7 @@ def invalid_record(hash_key, error):
for hash_key, rec in installs.items(): for hash_key, rec in installs.items():
try: try:
# This constructs a spec DAG from the list of all installs # This constructs a spec DAG from the list of all installs
spec = self._read_spec_from_yaml(hash_key, installs) spec = self._read_spec_from_dict(hash_key, installs)
# Insert the brand new spec in the database. Each # Insert the brand new spec in the database. Each
# spec has its own copies of its dependency specs. # spec has its own copies of its dependency specs.
@ -342,7 +354,7 @@ def reindex(self, directory_layout):
def _read_suppress_error(): def _read_suppress_error():
try: try:
if os.path.isfile(self._index_path): if os.path.isfile(self._index_path):
self._read_from_yaml(self._index_path) self._read_from_file(self._index_path)
except CorruptDatabaseError as e: except CorruptDatabaseError as e:
self._error = e self._error = e
self._data = {} self._data = {}
@ -426,7 +438,7 @@ def _write(self, type, value, traceback):
# Write a temporary database file them move it into place # Write a temporary database file them move it into place
try: try:
with open(temp_file, 'w') as f: with open(temp_file, 'w') as f:
self._write_to_yaml(f) self._write_to_file(f)
os.rename(temp_file, self._index_path) os.rename(temp_file, self._index_path)
except: except:
# Clean up temp file if something goes wrong. # Clean up temp file if something goes wrong.
@ -437,11 +449,24 @@ def _write(self, type, value, traceback):
def _read(self): def _read(self):
"""Re-read Database from the data in the set location. """Re-read Database from the data in the set location.
This does no locking. This does no locking, with one exception: it will automatically
migrate an index.yaml to an index.json if possible. This requires
taking a write lock.
""" """
if os.path.isfile(self._index_path): if os.path.isfile(self._index_path):
# Read from YAML file if a database exists # Read from JSON file if a JSON database exists
self._read_from_yaml(self._index_path) self._read_from_file(self._index_path, format='json')
elif os.path.isfile(self._old_yaml_index_path):
if os.access(self._db_dir, os.R_OK | os.W_OK):
# if we can write, then read AND write a JSON file.
self._read_from_file(self._old_yaml_index_path, format='yaml')
with WriteTransaction(self.lock, timeout=_db_lock_timeout):
self._write(None, None, None)
else:
# Read chck for a YAML file if we can't find JSON.
self._read_from_file(self._old_yaml_index_path, format='yaml')
else: else:
# The file doesn't exist, try to traverse the directory. # The file doesn't exist, try to traverse the directory.

View file

@ -117,6 +117,7 @@
from spack.util.prefix import Prefix from spack.util.prefix import Prefix
from spack.util.string import * from spack.util.string import *
import spack.util.spack_yaml as syaml import spack.util.spack_yaml as syaml
import spack.util.spack_json as sjson
from spack.util.spack_yaml import syaml_dict from spack.util.spack_yaml import syaml_dict
from spack.util.crypto import prefix_bits from spack.util.crypto import prefix_bits
from spack.version import * from spack.version import *
@ -153,7 +154,6 @@
'UnsatisfiableArchitectureSpecError', 'UnsatisfiableArchitectureSpecError',
'UnsatisfiableProviderSpecError', 'UnsatisfiableProviderSpecError',
'UnsatisfiableDependencySpecError', 'UnsatisfiableDependencySpecError',
'SpackYAMLError',
'AmbiguousHashError'] 'AmbiguousHashError']
# Valid pattern for an identifier in Spack # Valid pattern for an identifier in Spack
@ -1174,15 +1174,21 @@ def to_node_dict(self):
return syaml_dict([(self.name, d)]) return syaml_dict([(self.name, d)])
def to_yaml(self, stream=None): def to_dict(self):
node_list = [] node_list = []
for s in self.traverse(order='pre', deptype=('link', 'run')): for s in self.traverse(order='pre', deptype=('link', 'run')):
node = s.to_node_dict() node = s.to_node_dict()
node[s.name]['hash'] = s.dag_hash() node[s.name]['hash'] = s.dag_hash()
node_list.append(node) node_list.append(node)
return syaml_dict([('spec', node_list)])
def to_yaml(self, stream=None):
return syaml.dump( return syaml.dump(
syaml_dict([('spec', node_list)]), self.to_dict(), stream=stream, default_flow_style=False)
stream=stream, default_flow_style=False)
def to_json(self, stream=None):
return sjson.dump(self.to_dict(), stream)
@staticmethod @staticmethod
def from_node_dict(node): def from_node_dict(node):
@ -1245,22 +1251,13 @@ def read_yaml_dep_specs(dependency_dict):
yield dep_name, dag_hash, list(deptypes) yield dep_name, dag_hash, list(deptypes)
@staticmethod @staticmethod
def from_yaml(stream): def from_dict(data):
"""Construct a spec from YAML. """Construct a spec from YAML.
Parameters: Parameters:
stream -- string or file object to read from. data -- a nested dict/list data structure read from YAML or JSON.
TODO: currently discards hashes. Include hashes when they
represent more than the DAG does.
""" """
try: nodes = data['spec']
yfile = syaml.load(stream)
except MarkedYAMLError as e:
raise SpackYAMLError("error parsing YAML spec:", str(e))
nodes = yfile['spec']
# Read nodes out of list. Root spec is the first element; # Read nodes out of list. Root spec is the first element;
# dependencies are the following elements. # dependencies are the following elements.
@ -1285,6 +1282,32 @@ def from_yaml(stream):
return spec return spec
@staticmethod
def from_yaml(stream):
"""Construct a spec from YAML.
Parameters:
stream -- string or file object to read from.
"""
try:
data = syaml.load(stream)
return Spec.from_dict(data)
except MarkedYAMLError as e:
raise syaml.SpackYAMLError("error parsing YAML spec:", str(e))
@staticmethod
def from_json(stream):
"""Construct a spec from JSON.
Parameters:
stream -- string or file object to read from.
"""
try:
data = sjson.load(stream)
return Spec.from_dict(data)
except Exception as e:
raise sjson.SpackJSONError("error parsing JSON spec:", str(e))
def _concretize_helper(self, presets=None, visited=None): def _concretize_helper(self, presets=None, visited=None):
"""Recursive helper function for concretize(). """Recursive helper function for concretize().
This concretizes everything bottom-up. As things are This concretizes everything bottom-up. As things are
@ -3064,11 +3087,6 @@ def __init__(self, provided, required):
provided, required, "dependency") provided, required, "dependency")
class SpackYAMLError(spack.error.SpackError):
def __init__(self, msg, yaml_error):
super(SpackYAMLError, self).__init__(msg, str(yaml_error))
class AmbiguousHashError(SpecError): class AmbiguousHashError(SpecError):
def __init__(self, msg, *specs): def __init__(self, msg, *specs):
super(AmbiguousHashError, self).__init__(msg) super(AmbiguousHashError, self).__init__(msg)

View file

@ -75,7 +75,7 @@ class DatabaseTest(MockDatabase):
def test_005_db_exists(self): def test_005_db_exists(self):
"""Make sure db cache file exists after creating.""" """Make sure db cache file exists after creating."""
index_file = join_path(self.install_path, '.spack-db', 'index.yaml') index_file = join_path(self.install_path, '.spack-db', 'index.json')
lock_file = join_path(self.install_path, '.spack-db', 'lock') lock_file = join_path(self.install_path, '.spack-db', 'lock')
self.assertTrue(os.path.exists(index_file)) self.assertTrue(os.path.exists(index_file))

View file

@ -496,34 +496,6 @@ def test_deptype_traversal_run(self):
traversal = dag.traverse(deptype='run') traversal = dag.traverse(deptype='run')
self.assertEqual([x.name for x in traversal], names) self.assertEqual([x.name for x in traversal], names)
def test_using_ordered_dict(self):
""" Checks that dicts are ordered
Necessary to make sure that dag_hash is stable across python
versions and processes.
"""
def descend_and_check(iterable, level=0):
from spack.util.spack_yaml import syaml_dict
from collections import Iterable, Mapping
if isinstance(iterable, Mapping):
self.assertTrue(isinstance(iterable, syaml_dict))
return descend_and_check(iterable.values(), level=level + 1)
max_level = level
for value in iterable:
if isinstance(value, Iterable) and not isinstance(value, str):
nlevel = descend_and_check(value, level=level + 1)
if nlevel > max_level:
max_level = nlevel
return max_level
specs = ['mpileaks ^zmpi', 'dttop', 'dtuse']
for spec in specs:
dag = Spec(spec)
dag.normalize()
level = descend_and_check(dag.to_node_dict())
# level just makes sure we are doing something here
self.assertTrue(level >= 5)
def test_hash_bits(self): def test_hash_bits(self):
"""Ensure getting first n bits of a base32-encoded DAG hash works.""" """Ensure getting first n bits of a base32-encoded DAG hash works."""

View file

@ -27,6 +27,10 @@
YAML format preserves DAG informatoin in the spec. YAML format preserves DAG informatoin in the spec.
""" """
import spack.util.spack_yaml as syaml
import spack.util.spack_json as sjson
from spack.util.spack_yaml import syaml_dict
from spack.spec import Spec from spack.spec import Spec
from spack.test.mock_packages_test import * from spack.test.mock_packages_test import *
@ -64,3 +68,123 @@ def test_yaml_subdag(self):
for dep in ('callpath', 'mpich', 'dyninst', 'libdwarf', 'libelf'): for dep in ('callpath', 'mpich', 'dyninst', 'libdwarf', 'libelf'):
self.assertTrue(spec[dep].eq_dag(yaml_spec[dep])) self.assertTrue(spec[dep].eq_dag(yaml_spec[dep]))
def test_using_ordered_dict(self):
""" Checks that dicts are ordered
Necessary to make sure that dag_hash is stable across python
versions and processes.
"""
def descend_and_check(iterable, level=0):
from spack.util.spack_yaml import syaml_dict
from collections import Iterable, Mapping
if isinstance(iterable, Mapping):
self.assertTrue(isinstance(iterable, syaml_dict))
return descend_and_check(iterable.values(), level=level + 1)
max_level = level
for value in iterable:
if isinstance(value, Iterable) and not isinstance(value, str):
nlevel = descend_and_check(value, level=level + 1)
if nlevel > max_level:
max_level = nlevel
return max_level
specs = ['mpileaks ^zmpi', 'dttop', 'dtuse']
for spec in specs:
dag = Spec(spec)
dag.normalize()
level = descend_and_check(dag.to_node_dict())
# level just makes sure we are doing something here
self.assertTrue(level >= 5)
def test_ordered_read_not_required_for_consistent_dag_hash(self):
"""Make sure ordered serialization isn't required to preserve hashes.
For consistent hashes, we require that YAML and json documents
have their keys serialized in a deterministic order. However, we
don't want to require them to be serialized in order. This
ensures that is not reauired.
"""
specs = ['mpileaks ^zmpi', 'dttop', 'dtuse']
for spec in specs:
spec = Spec(spec)
spec.concretize()
#
# Dict & corresponding YAML & JSON from the original spec.
#
spec_dict = spec.to_dict()
spec_yaml = spec.to_yaml()
spec_json = spec.to_json()
#
# Make a spec with reversed OrderedDicts for every
# OrderedDict in the original.
#
reversed_spec_dict = reverse_all_dicts(spec.to_dict())
#
# Dump to YAML and JSON
#
yaml_string = syaml.dump(spec_dict, default_flow_style=False)
reversed_yaml_string = syaml.dump(reversed_spec_dict,
default_flow_style=False)
json_string = sjson.dump(spec_dict)
reversed_json_string = sjson.dump(reversed_spec_dict)
#
# Do many consistency checks
#
# spec yaml is ordered like the spec dict
self.assertEqual(yaml_string, spec_yaml)
self.assertEqual(json_string, spec_json)
# reversed string is different from the original, so it
# *would* generate a different hash
self.assertNotEqual(yaml_string, reversed_yaml_string)
self.assertNotEqual(json_string, reversed_json_string)
# build specs from the "wrongly" ordered data
round_trip_yaml_spec = Spec.from_yaml(yaml_string)
round_trip_json_spec = Spec.from_json(json_string)
round_trip_reversed_yaml_spec = Spec.from_yaml(
reversed_yaml_string)
round_trip_reversed_json_spec = Spec.from_yaml(
reversed_json_string)
# TODO: remove this when build deps are in provenance.
spec = spec.copy(deps=('link', 'run'))
# specs are equal to the original
self.assertEqual(spec, round_trip_yaml_spec)
self.assertEqual(spec, round_trip_json_spec)
self.assertEqual(spec, round_trip_reversed_yaml_spec)
self.assertEqual(spec, round_trip_reversed_json_spec)
self.assertEqual(round_trip_yaml_spec,
round_trip_reversed_yaml_spec)
self.assertEqual(round_trip_json_spec,
round_trip_reversed_json_spec)
# dag_hashes are equal
self.assertEqual(
spec.dag_hash(), round_trip_yaml_spec.dag_hash())
self.assertEqual(
spec.dag_hash(), round_trip_json_spec.dag_hash())
self.assertEqual(
spec.dag_hash(), round_trip_reversed_yaml_spec.dag_hash())
self.assertEqual(
spec.dag_hash(), round_trip_reversed_json_spec.dag_hash())
def reverse_all_dicts(data):
"""Descend into data and reverse all the dictionaries"""
if isinstance(data, dict):
return syaml_dict(reversed(
[(reverse_all_dicts(k), reverse_all_dicts(v))
for k, v in data.items()]))
elif isinstance(data, (list, tuple)):
return type(data)(reverse_all_dicts(elt) for elt in data)
else:
return data

View file

@ -0,0 +1,56 @@
##############################################################################
# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
#
# This file is part of Spack.
# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
# LLNL-CODE-647188
#
# For details, see https://github.com/llnl/spack
# Please also see the LICENSE file for our notice and the LGPL.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License (as
# published by the Free Software Foundation) version 2.1, February 1999.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
# conditions of the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##############################################################################
"""Simple wrapper around JSON to guarantee consistent use of load/dump. """
import json
import spack.error
__all__ = ['load', 'dump', 'SpackJSONError']
_json_dump_args = {
'indent': True,
'separators': (',', ': ')
}
def load(stream):
"""Spack JSON needs to be ordered to support specs."""
if isinstance(stream, basestring):
return json.loads(stream)
else:
return json.load(stream)
def dump(data, stream=None):
"""Dump JSON with a reasonable amount of indentation and separation."""
if stream is None:
return json.dumps(data, **_json_dump_args)
else:
return json.dump(data, stream, **_json_dump_args)
class SpackJSONError(spack.error.SpackError):
"""Raised when there are issues with JSON parsing."""
def __init__(self, msg, yaml_error):
super(SpackJSONError, self).__init__(msg, str(yaml_error))

View file

@ -32,23 +32,21 @@
""" """
import yaml import yaml
try: from yaml import Loader, Dumper
from yaml import CLoader as Loader, CDumper as Dumper
except ImportError as e:
from yaml import Loader, Dumper
from yaml.nodes import * from yaml.nodes import *
from yaml.constructor import ConstructorError from yaml.constructor import ConstructorError
from ordereddict_backport import OrderedDict from ordereddict_backport import OrderedDict
import spack.error
# Only export load and dump # Only export load and dump
__all__ = ['load', 'dump'] __all__ = ['load', 'dump', 'SpackYAMLError']
# Make new classes so we can add custom attributes. # Make new classes so we can add custom attributes.
# Also, use OrderedDict instead of just dict. # Also, use OrderedDict instead of just dict.
class syaml_dict(OrderedDict): class syaml_dict(OrderedDict):
def __repr__(self): def __repr__(self):
mappings = ('%r: %r' % (k, v) for k, v in self.items()) mappings = ('%r: %r' % (k, v) for k, v in self.items())
return '{%s}' % ', '.join(mappings) return '{%s}' % ', '.join(mappings)
@ -153,6 +151,7 @@ def construct_mapping(self, node, deep=False):
mark(mapping, node) mark(mapping, node)
return mapping return mapping
# register above new constructors # register above new constructors
OrderedLineLoader.add_constructor( OrderedLineLoader.add_constructor(
u'tag:yaml.org,2002:map', OrderedLineLoader.construct_yaml_map) u'tag:yaml.org,2002:map', OrderedLineLoader.construct_yaml_map)
@ -223,3 +222,9 @@ def load(*args, **kwargs):
def dump(*args, **kwargs): def dump(*args, **kwargs):
kwargs['Dumper'] = OrderedLineDumper kwargs['Dumper'] = OrderedLineDumper
return yaml.dump(*args, **kwargs) return yaml.dump(*args, **kwargs)
class SpackYAMLError(spack.error.SpackError):
"""Raised when there are issues with YAML parsing."""
def __init__(self, msg, yaml_error):
super(SpackYAMLError, self).__init__(msg, str(yaml_error))