Pre ci optimization (#16372)
* add initial optimization script
* integrate optimization in spack ci
* make optimization opt-in
* fix import error
* flake8 fixes
* update command completion
* work around vermin errors
* fix sphinx errors
This commit is contained in:
parent 42f2c168d2
commit 224dc95159

4 changed files with 393 additions and 3 deletions
@@ -449,7 +449,8 @@ def format_job_needs(phase_name, strip_compilers, dep_jobs,

def generate_gitlab_ci_yaml(env, print_summary, output_file,
                            custom_spack_repo=None, custom_spack_ref=None):
                            custom_spack_repo=None, custom_spack_ref=None,
                            run_optimizer=False):
    # FIXME: What's the difference between one that opens with 'spack'
    # and one that opens with 'env'? This will only handle the former.
    with spack.concretize.disable_compiler_existence_check():

@@ -788,6 +789,11 @@ def generate_gitlab_ci_yaml(env, print_summary, output_file,
    for output_key, output_value in sorted(output_object.items()):
        sorted_output[output_key] = output_value

    # TODO(opadron): remove this or refactor
    if run_optimizer:
        import spack.ci_optimization as ci_opt
        sorted_output = ci_opt.optimizer(sorted_output)

    with open(output_file, 'w') as outf:
        outf.write(syaml.dump_config(sorted_output, default_flow_style=True))
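The run_optimizer branch above is the integration point: when the caller opts in (wired to the new --optimize flag added further down), the assembled job mapping is handed to the new spack.ci_optimization module before being written out. A minimal, purely illustrative sketch of that call; the job names, tags, and scripts below are made up and are not part of the commit:

    # Illustrative only: 'jobs' stands in for sorted_output above.
    import spack.ci_optimization as ci_opt
    import spack.util.spack_yaml as syaml

    jobs = {
        'build-a': {'tags': ['spack-k8s'], 'script': ['spack ci rebuild']},
        'build-b': {'tags': ['spack-k8s'], 'script': ['spack ci rebuild']},
    }

    # Runs the optimization passes and prints per-pass size deltas to stdout.
    optimized = ci_opt.optimizer(jobs)
    print(syaml.dump_config(optimized, default_flow_style=True))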
lib/spack/spack/ci_optimization.py (new file, 377 lines)

@@ -0,0 +1,377 @@
# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)

import collections

try:
    # dynamically import to keep vermin from complaining
    collections_abc = __import__('collections.abc')
except ImportError:
    collections_abc = collections

import copy
import hashlib

import spack.util.spack_yaml as syaml

def matches(obj, proto):
    """Returns True if the test object "obj" matches the prototype object
    "proto".

    If obj and proto are mappings, obj matches proto if (key in obj) and
    (obj[key] matches proto[key]) for every key in proto.

    If obj and proto are sequences, obj matches proto if they are of the same
    length and (a matches b) for every (a, b) in zip(obj, proto).

    Otherwise, obj matches proto if obj == proto.

    Precondition: proto must not have any reference cycles
    """
    if isinstance(obj, collections_abc.Mapping):
        if not isinstance(proto, collections_abc.Mapping):
            return False

        return all(
            (key in obj and matches(obj[key], val))
            for key, val in proto.items()
        )

    if (isinstance(obj, collections_abc.Sequence) and
            not isinstance(obj, str)):

        if not (isinstance(proto, collections_abc.Sequence) and
                not isinstance(proto, str)):
            return False

        if len(obj) != len(proto):
            return False

        return all(
            matches(obj[index], val)
            for index, val in enumerate(proto)
        )

    return obj == proto

def subkeys(obj, proto):
    """Returns the test mapping "obj" after factoring out the items it has in
    common with the prototype mapping "proto".

    Consider a recursive merge operation, merge(a, b) on mappings a and b, that
    returns a mapping, m, whose keys are the union of the keys of a and b, and
    for every such key, "k", its corresponding value is:

      - merge(a[key], b[key]) if a[key] and b[key] are mappings, or
      - b[key] if (key in b) and not matches(a[key], b[key]), or
      - a[key] otherwise

    If obj and proto are mappings, the returned object is the smallest object,
    "a", such that merge(a, proto) matches obj.

    Otherwise, obj is returned.
    """
    if not (isinstance(obj, collections_abc.Mapping) and
            isinstance(proto, collections_abc.Mapping)):
        return obj

    new_obj = {}
    for key, value in obj.items():
        if key not in proto:
            new_obj[key] = value
            continue

        if (matches(value, proto[key]) and
                matches(proto[key], value)):
            continue

        if isinstance(value, collections_abc.Mapping):
            new_obj[key] = subkeys(value, proto[key])
            continue

        new_obj[key] = value

    return new_obj

def add_extends(yaml, key):
    """Modifies the given object "yaml" so that it includes an "extends" key
    whose value features "key".

    If "extends" is not in yaml, then yaml is modified such that
    yaml["extends"] == key.

    If yaml["extends"] is a str, then yaml is modified such that
    yaml["extends"] == [yaml["extends"], key]

    If yaml["extends"] is a list that does not include key, then key is
    appended to the list.

    Otherwise, yaml is left unchanged.
    """

    has_key = ('extends' in yaml)
    extends = yaml.get('extends')

    if has_key and not isinstance(extends, (str, collections_abc.Sequence)):
        return

    if extends is None:
        yaml['extends'] = key
        return

    if isinstance(extends, str):
        if extends != key:
            yaml['extends'] = [extends, key]
        return

    if key not in extends:
        extends.append(key)

def common_subobject(yaml, sub):
    """Factor prototype object "sub" out of the values of mapping "yaml".

    Consider a modified copy of yaml, "new", where for each key, "key" in yaml:

      - If yaml[key] matches sub, then new[key] = subkeys(yaml[key], sub).
      - Otherwise, new[key] = yaml[key].

    If the above match criteria is not satisfied for any such key, then (yaml,
    None) is returned. The yaml object is returned unchanged.

    Otherwise, each matching value in new is modified as in
    add_extends(new[key], common_key), and then new[common_key] is set to sub.
    The common_key value is chosen such that it does not match any preexisting
    key in new. In this case, (new, common_key) is returned.
    """
    match_list = set(k for k, v in yaml.items() if matches(v, sub))

    if not match_list:
        return yaml, None

    common_prefix = '.c'
    common_index = 0

    while True:
        common_key = ''.join((common_prefix, str(common_index)))
        if common_key not in yaml:
            break
        common_index += 1

    new_yaml = {}

    for key, val in yaml.items():
        new_yaml[key] = copy.deepcopy(val)

        if not matches(val, sub):
            continue

        new_yaml[key] = subkeys(new_yaml[key], sub)
        add_extends(new_yaml[key], common_key)

    new_yaml[common_key] = sub

    return new_yaml, common_key

def print_delta(name, old, new, applied=None):
    delta = new - old
    reldelta = (1000 * delta) // old
    reldelta = (reldelta // 10, reldelta % 10)

    if applied is None:
        applied = (new <= old)

    print('\n'.join((
        '{} {}:',
        '  before: {: 10d}',
        '  after : {: 10d}',
        '  delta : {:+10d} ({:=+3d}.{}%)',
    )).format(
        name,
        ('+' if applied else 'x'),
        old,
        new,
        delta,
        reldelta[0],
        reldelta[1]
    ))

def try_optimization_pass(name, yaml, optimization_pass, *args, **kwargs):
    """Try applying an optimization pass and return information about the
    result.

    "name" is a string describing the nature of the pass. If it is a non-empty
    string, summary statistics are also printed to stdout.

    "yaml" is the object to apply the pass to.

    "optimization_pass" is the function implementing the pass to be applied.

    "args" and "kwargs" are the additional arguments to pass to the
    optimization pass. The pass is applied as

    >>> (new_yaml, *other_results) = optimization_pass(yaml, *args, **kwargs)

    The pass's results are greedily rejected if it does not modify the original
    yaml document, or if it produces a yaml document that serializes to a
    larger string.

    Returns (new_yaml, yaml, applied, other_results) if applied, or
    (yaml, new_yaml, applied, other_results) otherwise.
    """
    result = optimization_pass(yaml, *args, **kwargs)
    new_yaml, other_results = result[0], result[1:]

    if new_yaml is yaml:
        # pass was not applied
        return (yaml, new_yaml, False, other_results)

    pre_size = len(syaml.dump_config(yaml, default_flow_style=True))
    post_size = len(syaml.dump_config(new_yaml, default_flow_style=True))

    # pass makes the size worse: not applying
    applied = (post_size <= pre_size)
    if applied:
        yaml, new_yaml = new_yaml, yaml

    if name:
        print_delta(name, pre_size, post_size, applied)

    return (yaml, new_yaml, applied, other_results)

def build_histogram(iterator, key):
    """Builds a histogram of values given an iterable of mappings and a key.

    For each mapping "m" with key "key" in iterator, the value m[key] is
    considered.

    Returns a list of tuples (hash, count, proportion, value), where

      - "hash" is a sha1sum hash of the value.
      - "count" is the number of occurrences of values that hash to "hash".
      - "proportion" is the proportion of all values considered above that
        hash to "hash".
      - "value" is one of the values considered above that hash to "hash".
        Which value is chosen when multiple values hash to the same "hash" is
        undefined.

    The list is sorted in descending order by count, yielding the most
    frequently occurring hashes first.
    """
    buckets = collections.defaultdict(int)
    values = {}

    num_objects = 0
    for obj in iterator:
        num_objects += 1

        try:
            val = obj[key]
        except (KeyError, TypeError):
            continue

        value_hash = hashlib.sha1()
        value_hash.update(syaml.dump_config(val).encode())
        value_hash = value_hash.hexdigest()

        buckets[value_hash] += 1
        values[value_hash] = val

    return [(h, buckets[h], float(buckets[h]) / num_objects, values[h])
            for h in sorted(buckets.keys(), key=lambda k: -buckets[k])]

def optimizer(yaml):
    original_size = len(syaml.dump_config(yaml, default_flow_style=True))

    # try factoring out commonly repeated portions
    common_job = {
        'variables': {
            'SPACK_COMPILER_ACTION': 'NONE',
            'SPACK_RELATED_BUILDS_CDASH': ''
        },

        'after_script': ['rm -rf "./spack"'],

        'artifacts': {
            'paths': ['jobs_scratch_dir', 'cdash_report'],
            'when': 'always'
        },
    }

    # look for a list of tags that appear frequently
    _, count, proportion, tags = next(iter(
        build_histogram(yaml.values(), 'tags')),
        (None,) * 4)

    # If a list of tags is found, and more than one job uses it, *and* the
    # jobs that do use it represent at least 70% of all jobs, then add the
    # list to the prototype object.
    if tags and count > 1 and proportion >= 0.70:
        common_job['tags'] = tags

    # apply common object factorization
    yaml, other, applied, rest = try_optimization_pass(
        'general common object factorization',
        yaml, common_subobject, common_job)

    # look for a common script, and try factoring that out
    _, count, proportion, script = next(iter(
        build_histogram(yaml.values(), 'script')),
        (None,) * 4)

    if script and count > 1 and proportion >= 0.70:
        yaml, other, applied, rest = try_optimization_pass(
            'script factorization',
            yaml, common_subobject, {'script': script})

    # look for a common before_script, and try factoring that out
    _, count, proportion, script = next(iter(
        build_histogram(yaml.values(), 'before_script')),
        (None,) * 4)

    if script and count > 1 and proportion >= 0.70:
        yaml, other, applied, rest = try_optimization_pass(
            'before_script factorization',
            yaml, common_subobject, {'before_script': script})

    # Look specifically for the SPACK_ROOT_SPEC environment variables.
    # Try to factor them out.
    h = build_histogram((
        getattr(val, 'get', lambda *args: {})('variables')
        for val in yaml.values()), 'SPACK_ROOT_SPEC')

    # In this case, we try to factor out *all* instances of the
    # SPACK_ROOT_SPEC environment variable; not just the one that appears
    # with the greatest frequency. We only require that more than 1 job uses
    # a given instance's value, because we expect the value to be very large,
    # and so expect even few-to-one factorizations to yield large space
    # savings.
    counter = 0
    for _, count, proportion, spec in h:
        if count <= 1:
            continue

        counter += 1

        yaml, other, applied, rest = try_optimization_pass(
            'SPACK_ROOT_SPEC factorization ({count})'.format(count=counter),
            yaml,
            common_subobject,
            {'variables': {'SPACK_ROOT_SPEC': spec}})

    new_size = len(syaml.dump_config(yaml, default_flow_style=True))

    print('\n')
    print_delta('overall summary', original_size, new_size)
    print('\n')
    return yaml
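To make the factorization mechanics concrete, here is a small sketch of common_subobject at work on the case the last pass targets, a repeated SPACK_ROOT_SPEC value. This is purely illustrative and not part of the commit; the job names and the spec string are invented:

    # Hypothetical input: two jobs that carry the same large spec string.
    import spack.ci_optimization as ci_opt

    spec = 'zlib@1.2.11%gcc@9.3.0 arch=linux-ubuntu18.04-x86_64'
    jobs = {
        'build-a': {'variables': {'SPACK_ROOT_SPEC': spec},
                    'script': ['spack ci rebuild']},
        'build-b': {'variables': {'SPACK_ROOT_SPEC': spec},
                    'script': ['spack ci rebuild']},
    }

    new_jobs, common_key = ci_opt.common_subobject(
        jobs, {'variables': {'SPACK_ROOT_SPEC': spec}})

    # common_key is '.c0': the spec string now appears only once, under
    # new_jobs['.c0']['variables'], and each job instead carries
    # extends: '.c0', so the serialized document repeats it only once.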
@@ -54,6 +54,11 @@ def setup_parser(subparser):
        help="Provide a git branch or tag if a custom spack branch " +
             "should be checked out as a step in each generated job. " +
             "This argument is ignored if no --spack-repo is provided.")
    generate.add_argument(
        '--optimize', action='store_true',
        help="(Experimental) run the generated document through a series of "
             "optimization passes designed to reduce the size of the "
             "generated file.")
    generate.set_defaults(func=ci_generate)

    # Check a spec against mirror. Rebuild, create buildcache and push to
@@ -75,6 +80,7 @@ def ci_generate(args):
    copy_yaml_to = args.copy_to
    spack_repo = args.spack_repo
    spack_ref = args.spack_ref
    run_optimizer = args.optimize

    if not output_file:
        gen_ci_dir = os.getcwd()
@@ -86,7 +92,8 @@ def ci_generate(args):

    # Generate the jobs
    spack_ci.generate_gitlab_ci_yaml(
        env, True, output_file, spack_repo, spack_ref)
        env, True, output_file, spack_repo, spack_ref,
        run_optimizer=run_optimizer)

    if copy_yaml_to:
        copy_to_dir = os.path.dirname(copy_yaml_to)
@@ -474,7 +474,7 @@ _spack_ci() {
}

_spack_ci_generate() {
    SPACK_COMPREPLY="-h --help --output-file --copy-to --spack-repo --spack-ref"
    SPACK_COMPREPLY="-h --help --output-file --copy-to --spack-repo --spack-ref --optimize"
}

_spack_ci_rebuild() {