database: optimize query() by skipping unnecessary virtual checks (#40898)

Most queries will end up calling `spec.satisfies(query)` on everything in the DB, which
will cause Spack to ask whether the query spec is virtual if its name doesn't match the
target spec's. This can be expensive, because it can cause Spack to check if any new
virtuals showed up in *all* the packages it knows about. That can currently trigger
thousands of `stat()` calls.

We can avoid the virtual check for most successful queries if we consider that if there
*is* a match by name, the query spec *can't* be virtual. This PR adds an optimization to
the query loop to save any comparisons that would trigger a virtual check for last.

- [x] Add a `deferred` list to the `query()` loop.
- [x] First run through the `query()` loop *only* checks for name matches.
- [x] Query loop now returns early if there's a name match, skipping most `satisfies()` calls.
- [x] Second run through the `deferred()` list only runs if query spec is virtual.
- [x] Fix up handling of concrete specs.
- [x] Add test for querying virtuals in DB.
- [x] Avoid allocating deferred if not necessary.

---------

Co-authored-by: Harmen Stoppels <me@harmenstoppels.nl>
This commit is contained in:
Todd Gamblin 2023-11-06 17:00:37 -08:00 committed by GitHub
parent 4ce80b95f3
commit 910190f55b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 9 deletions

View file

@ -1522,14 +1522,18 @@ def _query(
# TODO: like installed and known that can be queried? Or are
# TODO: these really special cases that only belong here?
# Just look up concrete specs with hashes; no fancy search.
if isinstance(query_spec, spack.spec.Spec) and query_spec.concrete:
# TODO: handling of hashes restriction is not particularly elegant.
hash_key = query_spec.dag_hash()
if hash_key in self._data and (not hashes or hash_key in hashes):
return [self._data[hash_key].spec]
else:
return []
if query_spec is not any:
if not isinstance(query_spec, spack.spec.Spec):
query_spec = spack.spec.Spec(query_spec)
# Just look up concrete specs with hashes; no fancy search.
if query_spec.concrete:
# TODO: handling of hashes restriction is not particularly elegant.
hash_key = query_spec.dag_hash()
if hash_key in self._data and (not hashes or hash_key in hashes):
return [self._data[hash_key].spec]
else:
return []
# Abstract specs require more work -- currently we test
# against everything.
@ -1537,6 +1541,9 @@ def _query(
start_date = start_date or datetime.datetime.min
end_date = end_date or datetime.datetime.max
# save specs whose name doesn't match for last, to avoid a virtual check
deferred = []
for key, rec in self._data.items():
if hashes is not None and rec.spec.dag_hash() not in hashes:
continue
@ -1561,8 +1568,26 @@ def _query(
if not (start_date < inst_date < end_date):
continue
if query_spec is any or rec.spec.satisfies(query_spec):
if query_spec is any:
results.append(rec.spec)
continue
# check anon specs and exact name matches first
if not query_spec.name or rec.spec.name == query_spec.name:
if rec.spec.satisfies(query_spec):
results.append(rec.spec)
# save potential virtual matches for later, but not if we already found a match
elif not results:
deferred.append(rec.spec)
# Checking for virtuals is expensive, so we save it for last and only if needed.
# If we get here, we didn't find anything in the DB that matched by name.
# If we did fine something, the query spec can't be virtual b/c we matched an actual
# package installation, so skip the virtual check entirely. If we *didn't* find anything,
# check all the deferred specs *if* the query is virtual.
if not results and query_spec is not any and deferred and query_spec.virtual:
results = [spec for spec in deferred if spec.satisfies(query_spec)]
return results

View file

@ -803,6 +803,14 @@ def test_query_spec_with_non_conditional_virtual_dependency(database):
assert len(results) == 1
def test_query_virtual_spec(database):
"""Make sure we can query for virtuals in the DB"""
results = spack.store.STORE.db.query_local("mpi")
assert len(results) == 3
names = [s.name for s in results]
assert all(name in names for name in ["mpich", "mpich2", "zmpi"])
def test_failed_spec_path_error(database):
"""Ensure spec not concrete check is covered."""
s = spack.spec.Spec("a")