Pre ci optimization (#16372)

* add initial optimization script

* integrate optimization in spack ci

* make optimization opt-in

* fix import error

* flake8 fixes

* update command completion

* work around vermin errors

* fix sphinx errors
Omar Padron 2020-06-22 13:19:47 -04:00 committed by GitHub
parent 42f2c168d2
commit 224dc95159
4 changed files with 393 additions and 3 deletions

lib/spack/spack/ci.py

@@ -449,7 +449,8 @@ def format_job_needs(phase_name, strip_compilers, dep_jobs,

 def generate_gitlab_ci_yaml(env, print_summary, output_file,
-                            custom_spack_repo=None, custom_spack_ref=None):
+                            custom_spack_repo=None, custom_spack_ref=None,
+                            run_optimizer=False):
     # FIXME: What's the difference between one that opens with 'spack'
     # and one that opens with 'env'? This will only handle the former.
     with spack.concretize.disable_compiler_existence_check():
@@ -788,6 +789,11 @@ def generate_gitlab_ci_yaml(env, print_summary, output_file,
     for output_key, output_value in sorted(output_object.items()):
         sorted_output[output_key] = output_value

+    # TODO(opadron): remove this or refactor
+    if run_optimizer:
+        import spack.ci_optimization as ci_opt
+        sorted_output = ci_opt.optimizer(sorted_output)
+
     with open(output_file, 'w') as outf:
         outf.write(syaml.dump_config(sorted_output, default_flow_style=True))
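
To see what the new run_optimizer branch does end to end, here is a rough
sketch of applying the optimizer to a document that has already been written
out (illustration only, not part of the commit; the file path is invented,
and syaml.load is assumed to be the module's standard YAML loader):

    import spack.ci_optimization as ci_opt
    import spack.util.spack_yaml as syaml

    # load a previously generated GitLab CI document (hypothetical path)
    with open('.gitlab-ci.yml') as f:
        doc = syaml.load(f)

    # run the size-reduction passes; each pass prints a before/after delta
    optimized = ci_opt.optimizer(doc)

    # write it back the same way generate_gitlab_ci_yaml does
    with open('.gitlab-ci.yml', 'w') as f:
        f.write(syaml.dump_config(optimized, default_flow_style=True))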

lib/spack/spack/ci_optimization.py (new file)

@@ -0,0 +1,377 @@
# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)

import collections

try:
    # dynamically import to keep vermin from complaining
    collections_abc = __import__('collections.abc')
except ImportError:
    collections_abc = collections

import copy
import hashlib

import spack.util.spack_yaml as syaml

def matches(obj, proto):
    """Returns True if the test object "obj" matches the prototype object
    "proto".

    If obj and proto are mappings, obj matches proto if (key in obj) and
    (obj[key] matches proto[key]) for every key in proto.

    If obj and proto are sequences, obj matches proto if they are of the same
    length and (a matches b) for every (a, b) in zip(obj, proto).

    Otherwise, obj matches proto if obj == proto.

    Precondition: proto must not have any reference cycles.
    """
    if isinstance(obj, collections_abc.Mapping):
        if not isinstance(proto, collections_abc.Mapping):
            return False

        return all(
            (key in obj and matches(obj[key], val))
            for key, val in proto.items()
        )

    if (isinstance(obj, collections_abc.Sequence) and
            not isinstance(obj, str)):

        if not (isinstance(proto, collections_abc.Sequence) and
                not isinstance(proto, str)):
            return False

        if len(obj) != len(proto):
            return False

        return all(
            matches(obj[index], val)
            for index, val in enumerate(proto)
        )

    return obj == proto

def subkeys(obj, proto):
    """Returns the test mapping "obj" after factoring out the items it has in
    common with the prototype mapping "proto".

    Consider a recursive merge operation, merge(a, b) on mappings a and b,
    that returns a mapping, m, whose keys are the union of the keys of a and
    b, and for every such key, "k", its corresponding value is:

      - merge(a[k], b[k])  if a[k] and b[k] are mappings, or
      - b[k]               if (k in b) and not matches(a[k], b[k]), or
      - a[k]               otherwise

    If obj and proto are mappings, the returned object is the smallest
    object, "a", such that merge(a, proto) matches obj.

    Otherwise, obj is returned.
    """
    if not (isinstance(obj, collections_abc.Mapping) and
            isinstance(proto, collections_abc.Mapping)):
        return obj

    new_obj = {}
    for key, value in obj.items():
        if key not in proto:
            new_obj[key] = value
            continue

        if (matches(value, proto[key]) and
                matches(proto[key], value)):
            continue

        if isinstance(value, collections_abc.Mapping):
            new_obj[key] = subkeys(value, proto[key])
            continue

        new_obj[key] = value

    return new_obj

def add_extends(yaml, key):
    """Modifies the given object "yaml" so that it includes an "extends" key
    whose value features "key".

    If "extends" is not in yaml, then yaml is modified such that
    yaml["extends"] == key.

    If yaml["extends"] is a str, then yaml is modified such that
    yaml["extends"] == [yaml["extends"], key]

    If yaml["extends"] is a list that does not include key, then key is
    appended to the list.

    Otherwise, yaml is left unchanged.
    """
    has_key = ('extends' in yaml)
    extends = yaml.get('extends')

    if has_key and not isinstance(extends, (str, collections_abc.Sequence)):
        return

    if extends is None:
        yaml['extends'] = key
        return

    if isinstance(extends, str):
        if extends != key:
            yaml['extends'] = [extends, key]
        return

    if key not in extends:
        extends.append(key)

def common_subobject(yaml, sub):
    """Factor prototype object "sub" out of the values of mapping "yaml".

    Consider a modified copy of yaml, "new", where for each key, "key" in
    yaml:

      - If yaml[key] matches sub, then new[key] = subkeys(yaml[key], sub).
      - Otherwise, new[key] = yaml[key].

    If the above match criteria are not satisfied for any such key, then
    (yaml, None) is returned; the yaml object is returned unchanged.

    Otherwise, each matching value in new is modified as in
    add_extends(new[key], common_key), and then new[common_key] is set to
    sub. The common_key value is chosen such that it does not match any
    preexisting key in new. In this case, (new, common_key) is returned.
    """
    match_list = set(k for k, v in yaml.items() if matches(v, sub))

    if not match_list:
        return yaml, None

    common_prefix = '.c'
    common_index = 0

    while True:
        common_key = ''.join((common_prefix, str(common_index)))
        if common_key not in yaml:
            break
        common_index += 1

    new_yaml = {}

    for key, val in yaml.items():
        new_yaml[key] = copy.deepcopy(val)

        if not matches(val, sub):
            continue

        new_yaml[key] = subkeys(new_yaml[key], sub)
        add_extends(new_yaml[key], common_key)

    new_yaml[common_key] = sub

    return new_yaml, common_key

def print_delta(name, old, new, applied=None):
    delta = new - old
    reldelta = (1000 * delta) // old
    reldelta = (reldelta // 10, reldelta % 10)

    if applied is None:
        applied = (new <= old)

    print('\n'.join((
        '{} {}:',
        '  before: {: 10d}',
        '  after : {: 10d}',
        '  delta : {:+10d} ({:=+3d}.{}%)',
    )).format(
        name,
        ('+' if applied else 'x'),
        old,
        new,
        delta,
        reldelta[0],
        reldelta[1]
    ))

def try_optimization_pass(name, yaml, optimization_pass, *args, **kwargs):
    """Try applying an optimization pass and return information about the
    result.

    "name" is a string describing the nature of the pass. If it is a
    non-empty string, summary statistics are also printed to stdout.

    "yaml" is the object to apply the pass to.

    "optimization_pass" is the function implementing the pass to be applied.

    "args" and "kwargs" are the additional arguments to pass to the
    optimization pass. The pass is applied as

    >>> (new_yaml, *other_results) = optimization_pass(yaml, *args, **kwargs)

    The pass's results are greedily rejected if it does not modify the
    original yaml document, or if it produces a yaml document that serializes
    to a larger string.

    Returns (new_yaml, yaml, applied, other_results) if applied, or
    (yaml, new_yaml, applied, other_results) otherwise.
    """
    result = optimization_pass(yaml, *args, **kwargs)
    new_yaml, other_results = result[0], result[1:]

    if new_yaml is yaml:
        # pass was not applied
        return (yaml, new_yaml, False, other_results)

    pre_size = len(syaml.dump_config(yaml, default_flow_style=True))
    post_size = len(syaml.dump_config(new_yaml, default_flow_style=True))

    # pass makes the size worse: not applying
    applied = (post_size <= pre_size)
    if applied:
        yaml, new_yaml = new_yaml, yaml

    if name:
        print_delta(name, pre_size, post_size, applied)

    return (yaml, new_yaml, applied, other_results)

def build_histogram(iterator, key):
    """Builds a histogram of values given an iterable of mappings and a key.

    For each mapping "m" with key "key" in iterator, the value m[key] is
    considered.

    Returns a list of tuples (hash, count, proportion, value), where

      - "hash" is a sha1sum hash of the value.
      - "count" is the number of occurrences of values that hash to "hash".
      - "proportion" is the proportion of all values considered above that
        hash to "hash".
      - "value" is one of the values considered above that hash to "hash".
        Which value is chosen when multiple values hash to the same "hash" is
        undefined.

    The list is sorted in descending order by count, yielding the most
    frequently occurring hashes first.
    """
    buckets = collections.defaultdict(int)
    values = {}

    num_objects = 0
    for obj in iterator:
        num_objects += 1

        try:
            val = obj[key]
        except (KeyError, TypeError):
            continue

        value_hash = hashlib.sha1()
        value_hash.update(syaml.dump_config(val).encode())
        value_hash = value_hash.hexdigest()

        buckets[value_hash] += 1
        values[value_hash] = val

    return [(h, buckets[h], float(buckets[h]) / num_objects, values[h])
            for h in sorted(buckets.keys(), key=lambda k: -buckets[k])]

def optimizer(yaml):
    original_size = len(syaml.dump_config(yaml, default_flow_style=True))

    # try factoring out commonly repeated portions
    common_job = {
        'variables': {
            'SPACK_COMPILER_ACTION': 'NONE',
            'SPACK_RELATED_BUILDS_CDASH': ''
        },

        'after_script': ['rm -rf "./spack"'],

        'artifacts': {
            'paths': ['jobs_scratch_dir', 'cdash_report'],
            'when': 'always'
        },
    }

    # look for a list of tags that appear frequently
    _, count, proportion, tags = next(iter(
        build_histogram(yaml.values(), 'tags')),
        (None,) * 4)

    # If a list of tags is found, and more than one job uses it, *and* the
    # jobs that do use it represent at least 70% of all jobs, then add the
    # list to the prototype object.
    if tags and count > 1 and proportion >= 0.70:
        common_job['tags'] = tags

    # apply common object factorization
    yaml, other, applied, rest = try_optimization_pass(
        'general common object factorization',
        yaml, common_subobject, common_job)

    # look for a common script, and try factoring that out
    _, count, proportion, script = next(iter(
        build_histogram(yaml.values(), 'script')),
        (None,) * 4)

    if script and count > 1 and proportion >= 0.70:
        yaml, other, applied, rest = try_optimization_pass(
            'script factorization',
            yaml, common_subobject, {'script': script})

    # look for a common before_script, and try factoring that out
    _, count, proportion, script = next(iter(
        build_histogram(yaml.values(), 'before_script')),
        (None,) * 4)

    if script and count > 1 and proportion >= 0.70:
        yaml, other, applied, rest = try_optimization_pass(
            'before_script factorization',
            yaml, common_subobject, {'before_script': script})

    # Look specifically for the SPACK_ROOT_SPEC environment variables.
    # Try to factor them out.
    h = build_histogram((
        getattr(val, 'get', lambda *args: {})('variables')
        for val in yaml.values()), 'SPACK_ROOT_SPEC')

    # In this case, we try to factor out *all* instances of the
    # SPACK_ROOT_SPEC environment variable; not just the one that appears
    # with the greatest frequency. We only require that more than one job
    # uses a given instance's value, because we expect the value to be very
    # large, and so expect even few-to-one factorizations to yield large
    # space savings.
    counter = 0
    for _, count, proportion, spec in h:
        if count <= 1:
            continue

        counter += 1
        yaml, other, applied, rest = try_optimization_pass(
            'SPACK_ROOT_SPEC factorization ({count})'.format(count=counter),
            yaml,
            common_subobject,
            {'variables': {'SPACK_ROOT_SPEC': spec}})

    new_size = len(syaml.dump_config(yaml, default_flow_style=True))

    print('\n')
    print_delta('overall summary', original_size, new_size)
    print('\n')

    return yaml
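
To make the factorization concrete, here is a small worked example of
common_subobject on a toy mapping (illustration only; the job names and
values are invented):

    # two jobs that share the same 'script' value
    jobs = {
        'build-pkg-a': {'script': ['spack ci rebuild'], 'stage': 'stage-0'},
        'build-pkg-b': {'script': ['spack ci rebuild'], 'stage': 'stage-1'},
    }

    new, key = common_subobject(jobs, {'script': ['spack ci rebuild']})

    # key == '.c0' (the first generated name not already in the mapping)
    # new['.c0']         == {'script': ['spack ci rebuild']}
    # new['build-pkg-a'] == {'stage': 'stage-0', 'extends': '.c0'}
    # new['build-pkg-b'] == {'stage': 'stage-1', 'extends': '.c0'}

Because GitLab CI's extends mechanism merges '.c0' back into each job when
the pipeline is parsed, the shared script is serialized only once, which is
where the size savings come from.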

lib/spack/spack/cmd/ci.py

@@ -54,6 +54,11 @@ def setup_parser(subparser):
         help="Provide a git branch or tag if a custom spack branch " +
              "should be checked out as a step in each generated job. " +
              "This argument is ignored if no --spack-repo is provided.")
+    generate.add_argument(
+        '--optimize', action='store_true',
+        help="(Experimental) run the generated document through a series of "
+             "optimization passes designed to reduce the size of the "
+             "generated file.")
     generate.set_defaults(func=ci_generate)

     # Check a spec against mirror. Rebuild, create buildcache and push to
@@ -75,6 +80,7 @@ def ci_generate(args):
     copy_yaml_to = args.copy_to
     spack_repo = args.spack_repo
     spack_ref = args.spack_ref
+    run_optimizer = args.optimize

     if not output_file:
         gen_ci_dir = os.getcwd()
@@ -86,7 +92,8 @@ def ci_generate(args):

     # Generate the jobs
     spack_ci.generate_gitlab_ci_yaml(
-        env, True, output_file, spack_repo, spack_ref)
+        env, True, output_file, spack_repo, spack_ref,
+        run_optimizer=run_optimizer)

     if copy_yaml_to:
         copy_to_dir = os.path.dirname(copy_yaml_to)
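
With this wiring in place, the passes are reachable from the command line as
"spack ci generate --optimize". A rough programmatic equivalent of that
invocation (illustration only; the environment path and output file name are
invented, and constructing the environment directly via ev.Environment is an
assumption):

    import spack.ci as spack_ci
    import spack.environment as ev

    # an environment directory containing a gitlab-ci configuration
    env = ev.Environment('./my-env')

    spack_ci.generate_gitlab_ci_yaml(
        env, True, 'ci-jobs.yml',  # print_summary=True
        run_optimizer=True)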

share/spack/spack-completion.bash

@@ -474,7 +474,7 @@ _spack_ci() {
 }

 _spack_ci_generate() {
-    SPACK_COMPREPLY="-h --help --output-file --copy-to --spack-repo --spack-ref"
+    SPACK_COMPREPLY="-h --help --output-file --copy-to --spack-repo --spack-ref --optimize"
 }

 _spack_ci_rebuild() {