YamlFilesystemView: improve file removal performance via batching (#24355)

* Drastically improve YamlFilesystemView file removal via batching

The `remove_file` routine has to check if the file is owned by multiple packages, so it doesn't
remove necessary files. This is done by the `get_all_specs` routine, which walks the entire
package tree. With large numbers of packages on shared file systems, this can take seconds
per file tree traversal, which adds up extremely quickly. For example, a single deactivate
of a largish python package in our software stack on GPFS took approximately 40 minutes.

This patch simply replaces `remove_file` with a batch `remove_files` routine. This routine
removes a list of files rather than a single file, requiring only one traversal per batch. In
practice this means a package can be removed in seconds time, rather than potentially hours,
essentially a ~100x speedup (ignoring initial deactivation logic, which takes about 3 minutes
in our test setup).
This commit is contained in:
Robert Blackwell 2021-10-28 10:39:16 -04:00 committed by GitHub
parent c13f915735
commit 8fd94e3114
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 23 additions and 17 deletions

View file

@ -393,11 +393,15 @@ def remove_files_from_view(self, view, merge_map):
self.spec self.spec
) )
) )
to_remove = []
for src, dst in merge_map.items(): for src, dst in merge_map.items():
if ignore_namespace and namespace_init(dst): if ignore_namespace and namespace_init(dst):
continue continue
if global_view or not path_contains_subdirectory(src, bin_dir): if global_view or not path_contains_subdirectory(src, bin_dir):
view.remove_file(src, dst) to_remove.append(dst)
else: else:
os.remove(dst) os.remove(dst)
view.remove_files(to_remove)

View file

@ -442,11 +442,7 @@ def unmerge(self, spec, ignore=None):
# now unmerge the directory tree # now unmerge the directory tree
tree.unmerge_directories(view_dst, ignore_file) tree.unmerge_directories(view_dst, ignore_file)
def remove_file(self, src, dest): def remove_files(self, files):
if not os.path.lexists(dest):
tty.warn("Tried to remove %s which does not exist" % dest)
return
def needs_file(spec, file): def needs_file(spec, file):
# convert the file we want to remove to a source in this spec # convert the file we want to remove to a source in this spec
projection = self.get_projection_for_spec(spec) projection = self.get_projection_for_spec(spec)
@ -465,16 +461,23 @@ def needs_file(spec, file):
manifest = {} manifest = {}
return test_path in manifest return test_path in manifest
# remove if dest is not owned by any other package in the view specs = self.get_all_specs()
# This will only be false if two packages are merged into a prefix
# and have a conflicting file
# check all specs for whether they own the file. That include the spec for file in files:
# we are currently removing, as we remove files before unlinking the if not os.path.lexists(file):
# metadata directory. tty.warn("Tried to remove %s which does not exist" % file)
if len([s for s in self.get_all_specs() continue
if needs_file(s, dest)]) <= 1:
os.remove(dest) # remove if file is not owned by any other package in the view
# This will only be false if two packages are merged into a prefix
# and have a conflicting file
# check all specs for whether they own the file. That include the spec
# we are currently removing, as we remove files before unlinking the
# metadata directory.
if len([s for s in specs if needs_file(s, file)]) <= 1:
tty.debug("Removing file " + file)
os.remove(file)
def check_added(self, spec): def check_added(self, spec):
assert spec.concrete assert spec.concrete

View file

@ -469,8 +469,7 @@ def remove_files_from_view(self, view, merge_map):
example if two packages include the same file, it should only be example if two packages include the same file, it should only be
removed when both packages are removed. removed when both packages are removed.
""" """
for src, dst in merge_map.items(): view.remove_files(merge_map.values())
view.remove_file(src, dst)
def test_log_pathname(test_stage, spec): def test_log_pathname(test_stage, spec):