From f9112a6244dede4d52700c4e86f335b42ec69a49 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Fri, 21 Oct 2022 18:30:26 +0200 Subject: [PATCH] Relocation should take hardlinks into account (#33460) Currently `relocate_text` and `relocate_text_bin` are unsafe in the sense that they run in parallel, and lead to races when modifying different items pointing to the same inode. This leads to the issue observed in #33453. This PR: 1. Renames those functions to `unsafe_*` so people are aware 2. Adds logic to deal with hardlinks in current binary packages 3. Adds logic to deal with hardlinks when creating new binary tarballs, so the install side doesn't have to de-dupe hardlinks. 4. Adds a test for 3 The assumption is that all our relocation logic preserves inodes. That is, we should never copy a file, modify it, and then move it back. I quickly verified, and its seems like this is true for (binary) text relocation, as well as rpath patching in patchelf (even when the file grows) and mach-o fixes. --- lib/spack/spack/binary_distribution.py | 168 ++++++++++++++++++------- lib/spack/spack/filesystem_view.py | 4 +- lib/spack/spack/relocate.py | 10 +- lib/spack/spack/rewiring.py | 4 +- lib/spack/spack/test/bindist.py | 46 +++++++ lib/spack/spack/test/packaging.py | 6 +- lib/spack/spack/test/relocate.py | 4 +- 7 files changed, 185 insertions(+), 57 deletions(-) diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py index b712e82cc0..7ae91dd6bf 100644 --- a/lib/spack/spack/binary_distribution.py +++ b/lib/spack/spack/binary_distribution.py @@ -23,7 +23,7 @@ import llnl.util.filesystem as fsys import llnl.util.lang import llnl.util.tty as tty -from llnl.util.filesystem import mkdirp +from llnl.util.filesystem import BaseDirectoryVisitor, mkdirp, visit_directory_tree import spack.cmd import spack.config as config @@ -642,6 +642,51 @@ def read_buildinfo_file(prefix): return buildinfo +class BuildManifestVisitor(BaseDirectoryVisitor): + """Visitor that collects a list of files and symlinks + that can be checked for need of relocation. It knows how + to dedupe hardlinks and deal with symlinks to files and + directories.""" + + def __init__(self): + # Save unique identifiers of files to avoid + # relocating hardlink files for each path. + self.visited = set() + + # Lists of files we will check + self.files = [] + self.symlinks = [] + + def seen_before(self, root, rel_path): + stat_result = os.lstat(os.path.join(root, rel_path)) + identifier = (stat_result.st_dev, stat_result.st_ino) + if identifier in self.visited: + return True + else: + self.visited.add(identifier) + return False + + def visit_file(self, root, rel_path, depth): + if self.seen_before(root, rel_path): + return + self.files.append(rel_path) + + def visit_symlinked_file(self, root, rel_path, depth): + # Note: symlinks *can* be hardlinked, but it is unclear if + # symlinks can be relinked in-place (preserving inode). + # Therefore, we do *not* de-dupe hardlinked symlinks. + self.symlinks.append(rel_path) + + def before_visit_dir(self, root, rel_path, depth): + return os.path.basename(rel_path) not in (".spack", "man") + + def before_visit_symlinked_dir(self, root, rel_path, depth): + # Treat symlinked directories simply as symlinks. + self.visit_symlinked_file(root, rel_path, depth) + # Never recurse into symlinked directories. + return False + + def get_buildfile_manifest(spec): """ Return a data structure with information about a build, including @@ -657,57 +702,52 @@ def get_buildfile_manifest(spec): "link_to_relocate": [], "other": [], "binary_to_relocate_fullpath": [], + "hardlinks_deduped": True, } - exclude_list = (".spack", "man") + # Guard against filesystem footguns of hardlinks and symlinks by using + # a visitor to retrieve a list of files and symlinks, so we don't have + # to worry about hardlinks of symlinked dirs and what not. + visitor = BuildManifestVisitor() + root = spec.prefix + visit_directory_tree(root, visitor) - # Do this at during tarball creation to save time when tarball unpacked. - # Used by make_package_relative to determine binaries to change. - for root, dirs, files in os.walk(spec.prefix, topdown=True): - dirs[:] = [d for d in dirs if d not in exclude_list] + # Symlinks. - # Directories may need to be relocated too. - for directory in dirs: - dir_path_name = os.path.join(root, directory) - rel_path_name = os.path.relpath(dir_path_name, spec.prefix) - if os.path.islink(dir_path_name): - link = os.readlink(dir_path_name) - if os.path.isabs(link) and link.startswith(spack.store.layout.root): - data["link_to_relocate"].append(rel_path_name) + # Obvious bugs: + # 1. relative links are not relocated. + # 2. paths are used as strings. + for rel_path in visitor.symlinks: + abs_path = os.path.join(root, rel_path) + link = os.readlink(abs_path) + if os.path.isabs(link) and link.startswith(spack.store.layout.root): + data["link_to_relocate"].append(rel_path) - for filename in files: - path_name = os.path.join(root, filename) - m_type, m_subtype = fsys.mime_type(path_name) - rel_path_name = os.path.relpath(path_name, spec.prefix) - added = False + # Non-symlinks. + for rel_path in visitor.files: + abs_path = os.path.join(root, rel_path) + m_type, m_subtype = fsys.mime_type(abs_path) - if os.path.islink(path_name): - link = os.readlink(path_name) - if os.path.isabs(link): - # Relocate absolute links into the spack tree - if link.startswith(spack.store.layout.root): - data["link_to_relocate"].append(rel_path_name) - added = True + if relocate.needs_binary_relocation(m_type, m_subtype): + # Why is this branch not part of needs_binary_relocation? :( + if ( + ( + m_subtype in ("x-executable", "x-sharedlib", "x-pie-executable") + and sys.platform != "darwin" + ) + or (m_subtype in ("x-mach-binary") and sys.platform == "darwin") + or (not rel_path.endswith(".o")) + ): + data["binary_to_relocate"].append(rel_path) + data["binary_to_relocate_fullpath"].append(abs_path) + continue - if relocate.needs_binary_relocation(m_type, m_subtype): - if ( - ( - m_subtype in ("x-executable", "x-sharedlib", "x-pie-executable") - and sys.platform != "darwin" - ) - or (m_subtype in ("x-mach-binary") and sys.platform == "darwin") - or (not filename.endswith(".o")) - ): - data["binary_to_relocate"].append(rel_path_name) - data["binary_to_relocate_fullpath"].append(path_name) - added = True + elif relocate.needs_text_relocation(m_type, m_subtype): + data["text_to_relocate"].append(rel_path) + continue - if relocate.needs_text_relocation(m_type, m_subtype): - data["text_to_relocate"].append(rel_path_name) - added = True + data["other"].append(abs_path) - if not added: - data["other"].append(path_name) return data @@ -734,6 +774,7 @@ def write_buildinfo_file(spec, workdir, rel=False): buildinfo["relocate_textfiles"] = manifest["text_to_relocate"] buildinfo["relocate_binaries"] = manifest["binary_to_relocate"] buildinfo["relocate_links"] = manifest["link_to_relocate"] + buildinfo["hardlinks_deduped"] = manifest["hardlinks_deduped"] buildinfo["prefix_to_hash"] = prefix_to_hash filename = buildinfo_file_name(workdir) with open(filename, "w") as outfile: @@ -1441,6 +1482,38 @@ def check_package_relocatable(workdir, spec, allow_root): relocate.raise_if_not_relocatable(cur_path_names, allow_root) +def dedupe_hardlinks_if_necessary(root, buildinfo): + """Updates a buildinfo dict for old archives that did + not dedupe hardlinks. De-duping hardlinks is necessary + when relocating files in parallel and in-place. This + means we must preserve inodes when relocating.""" + + # New archives don't need this. + if buildinfo.get("hardlinks_deduped", False): + return + + # Clearly we can assume that an inode is either in the + # textfile or binary group, but let's just stick to + # a single set of visited nodes. + visited = set() + + # Note: we do *not* dedupe hardlinked symlinks, since + # it seems difficult or even impossible to relink + # symlinks while preserving inode. + for key in ("relocate_textfiles", "relocate_binaries"): + if key not in buildinfo: + continue + new_list = [] + for rel_path in buildinfo[key]: + stat_result = os.lstat(os.path.join(root, rel_path)) + identifier = (stat_result.st_dev, stat_result.st_ino) + if identifier in visited: + continue + visited.add(identifier) + new_list.append(rel_path) + buildinfo[key] = new_list + + def relocate_package(spec, allow_root): """ Relocate the given package @@ -1503,6 +1576,9 @@ def relocate_package(spec, allow_root): tty.debug("Relocating package from", "%s to %s." % (old_layout_root, new_layout_root)) + # Old archives maybe have hardlinks repeated. + dedupe_hardlinks_if_necessary(workdir, buildinfo) + def is_backup_file(file): return file.endswith("~") @@ -1548,7 +1624,7 @@ def is_backup_file(file): # For all buildcaches # relocate the install prefixes in text files including dependencies - relocate.relocate_text(text_names, prefix_to_prefix_text) + relocate.unsafe_relocate_text(text_names, prefix_to_prefix_text) paths_to_relocate = [old_prefix, old_layout_root] paths_to_relocate.extend(prefix_to_hash.keys()) @@ -1564,13 +1640,13 @@ def is_backup_file(file): ) ) # relocate the install prefixes in binary files including dependencies - relocate.relocate_text_bin(files_to_relocate, prefix_to_prefix_bin) + relocate.unsafe_relocate_text_bin(files_to_relocate, prefix_to_prefix_bin) # If we are installing back to the same location # relocate the sbang location if the spack directory changed else: if old_spack_prefix != new_spack_prefix: - relocate.relocate_text(text_names, prefix_to_prefix_text) + relocate.unsafe_relocate_text(text_names, prefix_to_prefix_text) def _extract_inner_tarball(spec, filename, extract_to, unsigned, remote_checksum): diff --git a/lib/spack/spack/filesystem_view.py b/lib/spack/spack/filesystem_view.py index 2373ec5e45..db2d6c9477 100644 --- a/lib/spack/spack/filesystem_view.py +++ b/lib/spack/spack/filesystem_view.py @@ -95,11 +95,11 @@ def view_copy(src, dst, view, spec=None): prefix_to_projection[dep.prefix] = view.get_projection_for_spec(dep) if spack.relocate.is_binary(dst): - spack.relocate.relocate_text_bin(binaries=[dst], prefixes=prefix_to_projection) + spack.relocate.unsafe_relocate_text_bin(binaries=[dst], prefixes=prefix_to_projection) else: prefix_to_projection[spack.store.layout.root] = view._root prefix_to_projection[orig_sbang] = new_sbang - spack.relocate.relocate_text(files=[dst], prefixes=prefix_to_projection) + spack.relocate.unsafe_relocate_text(files=[dst], prefixes=prefix_to_projection) try: stat = os.stat(src) os.chown(dst, stat.st_uid, stat.st_gid) diff --git a/lib/spack/spack/relocate.py b/lib/spack/spack/relocate.py index 3ef332c204..3297965b6e 100644 --- a/lib/spack/spack/relocate.py +++ b/lib/spack/spack/relocate.py @@ -744,12 +744,15 @@ def relocate_links(links, orig_layout_root, orig_install_prefix, new_install_pre tty.warn(msg.format(link_target, abs_link, new_install_prefix)) -def relocate_text(files, prefixes, concurrency=32): +def unsafe_relocate_text(files, prefixes, concurrency=32): """Relocate text file from the original installation prefix to the new prefix. Relocation also affects the the path in Spack's sbang script. + Note: unsafe when files contains duplicates, such as repeated paths, + symlinks, hardlinks. + Args: files (list): Text files to be relocated prefixes (OrderedDict): String prefixes which need to be changed @@ -786,11 +789,14 @@ def relocate_text(files, prefixes, concurrency=32): tp.join() -def relocate_text_bin(binaries, prefixes, concurrency=32): +def unsafe_relocate_text_bin(binaries, prefixes, concurrency=32): """Replace null terminated path strings hard coded into binaries. The new install prefix must be shorter than the original one. + Note: unsafe when files contains duplicates, such as repeated paths, + symlinks, hardlinks. + Args: binaries (list): binaries to be relocated prefixes (OrderedDict): String prefixes which need to be changed. diff --git a/lib/spack/spack/rewiring.py b/lib/spack/spack/rewiring.py index 8a2dcad035..6b6a37ddd1 100644 --- a/lib/spack/spack/rewiring.py +++ b/lib/spack/spack/rewiring.py @@ -70,7 +70,7 @@ def rewire_node(spec, explicit): for rel_path in manifest.get("text_to_relocate", []) ] if text_to_relocate: - relocate.relocate_text(files=text_to_relocate, prefixes=prefix_to_prefix) + relocate.unsafe_relocate_text(files=text_to_relocate, prefixes=prefix_to_prefix) bins_to_relocate = [ os.path.join(tempdir, spec.dag_hash(), rel_path) @@ -97,7 +97,7 @@ def rewire_node(spec, explicit): spec.build_spec.prefix, spec.prefix, ) - relocate.relocate_text_bin(binaries=bins_to_relocate, prefixes=prefix_to_prefix) + relocate.unsafe_relocate_text_bin(binaries=bins_to_relocate, prefixes=prefix_to_prefix) # Copy package into place, except for spec.json (because spec.json # describes the old spec and not the new spliced spec). shutil.copytree( diff --git a/lib/spack/spack/test/bindist.py b/lib/spack/spack/test/bindist.py index 7e9066ccbf..d0a8a0a4f0 100644 --- a/lib/spack/spack/test/bindist.py +++ b/lib/spack/spack/test/bindist.py @@ -11,6 +11,8 @@ import py import pytest +from llnl.util.filesystem import visit_directory_tree + import spack.binary_distribution as bindist import spack.config import spack.hooks.sbang as sbang @@ -633,3 +635,47 @@ def test_FetchCacheError_pretty_printing_single(): assert "Multiple errors" not in str_e assert "RuntimeError: Oops!" in str_e assert str_e.rstrip() == str_e + + +def test_build_manifest_visitor(tmpdir): + dir = "directory" + file = os.path.join("directory", "file") + + with tmpdir.as_cwd(): + # Create a file inside a directory + os.mkdir(dir) + with open(file, "wb") as f: + f.write(b"example file") + + # Symlink the dir + os.symlink(dir, "symlink_to_directory") + + # Symlink the file + os.symlink(file, "symlink_to_file") + + # Hardlink the file + os.link(file, "hardlink_of_file") + + # Hardlinked symlinks: seems like this is only a thing on Linux, + # on Darwin the symlink *target* is hardlinked, on Linux the + # symlink *itself* is hardlinked. + if sys.platform.startswith("linux"): + os.link("symlink_to_file", "hardlink_of_symlink_to_file") + os.link("symlink_to_directory", "hardlink_of_symlink_to_directory") + + visitor = bindist.BuildManifestVisitor() + visit_directory_tree(str(tmpdir), visitor) + + # We de-dupe hardlinks of files, so there should really be just one file + assert len(visitor.files) == 1 + + # We do not de-dupe symlinks, cause it's unclear how to update symlinks + # in-place, preserving inodes. + if sys.platform.startswith("linux"): + assert len(visitor.symlinks) == 4 # includes hardlinks of symlinks. + else: + assert len(visitor.symlinks) == 2 + + with tmpdir.as_cwd(): + assert not any(os.path.islink(f) or os.path.isdir(f) for f in visitor.files) + assert all(os.path.islink(f) for f in visitor.symlinks) diff --git a/lib/spack/spack/test/packaging.py b/lib/spack/spack/test/packaging.py index f0d1c240e8..0ed4f943a3 100644 --- a/lib/spack/spack/test/packaging.py +++ b/lib/spack/spack/test/packaging.py @@ -36,7 +36,7 @@ needs_binary_relocation, needs_text_relocation, relocate_links, - relocate_text, + unsafe_relocate_text, ) from spack.spec import Spec @@ -190,7 +190,7 @@ def test_buildcache(mock_archive, tmpdir): @pytest.mark.usefixtures("install_mockery") -def test_relocate_text(tmpdir): +def test_unsafe_relocate_text(tmpdir): spec = Spec("trivial-install-test-package") spec.concretize() with tmpdir.as_cwd(): @@ -203,7 +203,7 @@ def test_relocate_text(tmpdir): filenames = [filename] new_dir = "/opt/rh/devtoolset/" # Singleton dict doesn't matter if Ordered - relocate_text(filenames, {old_dir: new_dir}) + unsafe_relocate_text(filenames, {old_dir: new_dir}) with open(filename, "r") as script: for line in script: assert new_dir in line diff --git a/lib/spack/spack/test/relocate.py b/lib/spack/spack/test/relocate.py index b79f0ba1a4..ab306759cd 100644 --- a/lib/spack/spack/test/relocate.py +++ b/lib/spack/spack/test/relocate.py @@ -406,7 +406,7 @@ def test_relocate_text_bin(hello_world, copy_binary, tmpdir): orig_path_bytes = str(orig_binary.dirpath()).encode("utf-8") new_path_bytes = str(new_binary.dirpath()).encode("utf-8") - spack.relocate.relocate_text_bin([str(new_binary)], {orig_path_bytes: new_path_bytes}) + spack.relocate.unsafe_relocate_text_bin([str(new_binary)], {orig_path_bytes: new_path_bytes}) # Check original directory is not there anymore and it was # substituted with the new one @@ -421,7 +421,7 @@ def test_relocate_text_bin_raise_if_new_prefix_is_longer(tmpdir): with open(fpath, "w") as f: f.write("/short") with pytest.raises(spack.relocate.BinaryTextReplaceError): - spack.relocate.relocate_text_bin([fpath], {short_prefix: long_prefix}) + spack.relocate.unsafe_relocate_text_bin([fpath], {short_prefix: long_prefix}) @pytest.mark.requires_executables("install_name_tool", "file", "cc")