_replace_prefix_bin performance improvements (#33590)

- single pass over the binary data matching all prefixes
- collect offsets and replacement strings
- do in-place updates with `fseek` / `fwrite`, since typically our
  replacement touch O(few bytes) while the file is O(many megabytes)
- be nice: leave the file untouched if some string can't be
  replaced
This commit is contained in:
Harmen Stoppels 2022-10-31 10:08:16 +01:00 committed by GitHub
parent fb4be98f26
commit 616d5a89d4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 51 additions and 26 deletions

View file

@ -467,40 +467,35 @@ def _replace_prefix_bin(filename, byte_prefixes):
"""Replace all the occurrences of the old install prefix with a """Replace all the occurrences of the old install prefix with a
new install prefix in binary files. new install prefix in binary files.
The new install prefix is prefixed with ``os.sep`` until the The new install prefix is prefixed with ``b'/'`` until the
lengths of the prefixes are the same. lengths of the prefixes are the same.
Args: Args:
filename (str): target binary file filename (str): target binary file
byte_prefixes (OrderedDict): OrderedDictionary where the keys are byte_prefixes (OrderedDict): OrderedDictionary where the keys are
precompiled regex of the old prefixes and the values are the new binary strings of the old prefixes and the values are the new
prefixes (uft-8 encoded) binary prefixes
""" """
all_prefixes = re.compile(b"|".join(re.escape(prefix) for prefix in byte_prefixes.keys()))
def padded_replacement(old):
new = byte_prefixes[old]
pad = len(old) - len(new)
if pad < 0:
raise BinaryTextReplaceError(old, new)
return new + b"/" * pad
with open(filename, "rb+") as f: with open(filename, "rb+") as f:
data = f.read() # Register what replacement string to put on what offsets in the file.
f.seek(0) replacements_at_offset = [
for orig_bytes, new_bytes in byte_prefixes.items(): (padded_replacement(m.group(0)), m.start())
original_data_len = len(data) for m in re.finditer(all_prefixes, f.read())
# Skip this hassle if not found ]
if orig_bytes not in data:
continue # Apply the replacements
# We only care about this problem if we are about to replace for replacement, offset in replacements_at_offset:
length_compatible = len(new_bytes) <= len(orig_bytes) f.seek(offset)
if not length_compatible: f.write(replacement)
tty.debug("Binary failing to relocate is %s" % filename)
raise BinaryTextReplaceError(orig_bytes, new_bytes)
pad_length = len(orig_bytes) - len(new_bytes)
padding = os.sep * pad_length
padding = padding.encode("utf-8")
data = data.replace(orig_bytes, new_bytes + padding)
# Really needs to be the same length
if not len(data) == original_data_len:
print("Length of pad:", pad_length, "should be", len(padding))
print(new_bytes, "was to replace", orig_bytes)
raise BinaryStringReplacementError(filename, original_data_len, len(data))
f.write(data)
f.truncate()
def relocate_macho_binaries( def relocate_macho_binaries(

View file

@ -7,6 +7,7 @@
import re import re
import shutil import shutil
import sys import sys
from collections import OrderedDict
import pytest import pytest
@ -501,3 +502,32 @@ def test_utf8_paths_to_single_binary_regex():
# Match "unsafe" dir name # Match "unsafe" dir name
string = b"don't match /safe/a/path but do match /safe/[a-z]/file" string = b"don't match /safe/a/path but do match /safe/[a-z]/file"
assert regex.search(string).group(0) == b"/safe/[a-z]/file" assert regex.search(string).group(0) == b"/safe/[a-z]/file"
def test_ordered_replacement(tmpdir):
# This tests whether binary text replacement respects order, so that
# a long package prefix is replaced before a shorter sub-prefix like
# the root of the spack store (as a fallback).
def replace_and_expect(prefix_map, before, after):
file = str(tmpdir.join("file"))
with open(file, "wb") as f:
f.write(before)
spack.relocate._replace_prefix_bin(file, prefix_map)
with open(file, "rb") as f:
assert f.read() == after
replace_and_expect(
OrderedDict(
[(b"/old-spack/opt/specific-package", b"/first"), (b"/old-spack/opt", b"/second")]
),
b"Binary with /old-spack/opt/specific-package and /old-spack/opt",
b"Binary with /first///////////////////////// and /second///////",
)
replace_and_expect(
OrderedDict(
[(b"/old-spack/opt", b"/second"), (b"/old-spack/opt/specific-package", b"/first")]
),
b"Binary with /old-spack/opt/specific-package and /old-spack/opt",
b"Binary with /second////////specific-package and /second///////",
)