_replace_prefix_bin performance improvements (#33590)

- single pass over the binary data matching all prefixes
- collect offsets and replacement strings
- do in-place updates with `fseek` / `fwrite`, since typically our
  replacement touch O(few bytes) while the file is O(many megabytes)
- be nice: leave the file untouched if some string can't be
  replaced
This commit is contained in:
Harmen Stoppels 2022-10-31 10:08:16 +01:00 committed by GitHub
parent fb4be98f26
commit 616d5a89d4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 51 additions and 26 deletions

View file

@ -467,40 +467,35 @@ def _replace_prefix_bin(filename, byte_prefixes):
"""Replace all the occurrences of the old install prefix with a
new install prefix in binary files.
The new install prefix is prefixed with ``os.sep`` until the
The new install prefix is prefixed with ``b'/'`` until the
lengths of the prefixes are the same.
Args:
filename (str): target binary file
byte_prefixes (OrderedDict): OrderedDictionary where the keys are
precompiled regex of the old prefixes and the values are the new
prefixes (uft-8 encoded)
binary strings of the old prefixes and the values are the new
binary prefixes
"""
all_prefixes = re.compile(b"|".join(re.escape(prefix) for prefix in byte_prefixes.keys()))
def padded_replacement(old):
new = byte_prefixes[old]
pad = len(old) - len(new)
if pad < 0:
raise BinaryTextReplaceError(old, new)
return new + b"/" * pad
with open(filename, "rb+") as f:
data = f.read()
f.seek(0)
for orig_bytes, new_bytes in byte_prefixes.items():
original_data_len = len(data)
# Skip this hassle if not found
if orig_bytes not in data:
continue
# We only care about this problem if we are about to replace
length_compatible = len(new_bytes) <= len(orig_bytes)
if not length_compatible:
tty.debug("Binary failing to relocate is %s" % filename)
raise BinaryTextReplaceError(orig_bytes, new_bytes)
pad_length = len(orig_bytes) - len(new_bytes)
padding = os.sep * pad_length
padding = padding.encode("utf-8")
data = data.replace(orig_bytes, new_bytes + padding)
# Really needs to be the same length
if not len(data) == original_data_len:
print("Length of pad:", pad_length, "should be", len(padding))
print(new_bytes, "was to replace", orig_bytes)
raise BinaryStringReplacementError(filename, original_data_len, len(data))
f.write(data)
f.truncate()
# Register what replacement string to put on what offsets in the file.
replacements_at_offset = [
(padded_replacement(m.group(0)), m.start())
for m in re.finditer(all_prefixes, f.read())
]
# Apply the replacements
for replacement, offset in replacements_at_offset:
f.seek(offset)
f.write(replacement)
def relocate_macho_binaries(

View file

@ -7,6 +7,7 @@
import re
import shutil
import sys
from collections import OrderedDict
import pytest
@ -501,3 +502,32 @@ def test_utf8_paths_to_single_binary_regex():
# Match "unsafe" dir name
string = b"don't match /safe/a/path but do match /safe/[a-z]/file"
assert regex.search(string).group(0) == b"/safe/[a-z]/file"
def test_ordered_replacement(tmpdir):
# This tests whether binary text replacement respects order, so that
# a long package prefix is replaced before a shorter sub-prefix like
# the root of the spack store (as a fallback).
def replace_and_expect(prefix_map, before, after):
file = str(tmpdir.join("file"))
with open(file, "wb") as f:
f.write(before)
spack.relocate._replace_prefix_bin(file, prefix_map)
with open(file, "rb") as f:
assert f.read() == after
replace_and_expect(
OrderedDict(
[(b"/old-spack/opt/specific-package", b"/first"), (b"/old-spack/opt", b"/second")]
),
b"Binary with /old-spack/opt/specific-package and /old-spack/opt",
b"Binary with /first///////////////////////// and /second///////",
)
replace_and_expect(
OrderedDict(
[(b"/old-spack/opt", b"/second"), (b"/old-spack/opt/specific-package", b"/first")]
),
b"Binary with /old-spack/opt/specific-package and /old-spack/opt",
b"Binary with /second////////specific-package and /second///////",
)