Spack install: handle failed restore of backup (#25647)

Spack has logic to preserve an installation prefix when it is being
overwritten: if the new install fails, the old files are restored.
This PR adds error handling for when this backup restoration fails
(i.e. the new install fails, and then some unexpected error prevents
restoration from the backup).
This commit is contained in:
Harmen Stoppels 2021-10-01 20:40:48 +02:00 committed by GitHub
parent df590bb6ee
commit 18760de972
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 200 additions and 35 deletions

View file

@ -656,6 +656,12 @@ def working_dir(dirname, **kwargs):
os.chdir(orig_dir)
class CouldNotRestoreDirectoryBackup(RuntimeError):
def __init__(self, inner_exception, outer_exception):
self.inner_exception = inner_exception
self.outer_exception = outer_exception
@contextmanager
def replace_directory_transaction(directory_name, tmp_root=None):
"""Moves a directory to a temporary space. If the operations executed
@ -683,29 +689,33 @@ def replace_directory_transaction(directory_name, tmp_root=None):
assert os.path.isabs(tmp_root)
tmp_dir = tempfile.mkdtemp(dir=tmp_root)
tty.debug('TEMPORARY DIRECTORY CREATED [{0}]'.format(tmp_dir))
tty.debug('Temporary directory created [{0}]'.format(tmp_dir))
shutil.move(src=directory_name, dst=tmp_dir)
tty.debug('DIRECTORY MOVED [src={0}, dest={1}]'.format(
directory_name, tmp_dir
))
tty.debug('Directory moved [src={0}, dest={1}]'.format(directory_name, tmp_dir))
try:
yield tmp_dir
except (Exception, KeyboardInterrupt, SystemExit):
# Delete what was there, before copying back the original content
if os.path.exists(directory_name):
shutil.rmtree(directory_name)
shutil.move(
src=os.path.join(tmp_dir, directory_basename),
dst=os.path.dirname(directory_name)
)
tty.debug('DIRECTORY RECOVERED [{0}]'.format(directory_name))
except (Exception, KeyboardInterrupt, SystemExit) as inner_exception:
# Try to recover the original directory, if this fails, raise a
# composite exception.
try:
# Delete what was there, before copying back the original content
if os.path.exists(directory_name):
shutil.rmtree(directory_name)
shutil.move(
src=os.path.join(tmp_dir, directory_basename),
dst=os.path.dirname(directory_name)
)
except Exception as outer_exception:
raise CouldNotRestoreDirectoryBackup(inner_exception, outer_exception)
tty.debug('Directory recovered [{0}]'.format(directory_name))
raise
else:
# Otherwise delete the temporary directory
shutil.rmtree(tmp_dir)
tty.debug('TEMPORARY DIRECTORY DELETED [{0}]'.format(tmp_dir))
shutil.rmtree(tmp_dir, ignore_errors=True)
tty.debug('Temporary directory deleted [{0}]'.format(tmp_dir))
def hash_directory(directory, ignore=[]):

View file

@ -85,6 +85,15 @@
STATUS_REMOVED = 'removed'
class InstallAction(object):
#: Don't perform an install
NONE = 0
#: Do a standard install
INSTALL = 1
#: Do an overwrite install
OVERWRITE = 2
def _check_last_phase(pkg):
"""
Ensures the specified package has a valid last phase before proceeding
@ -1418,6 +1427,42 @@ def _init_queue(self):
for dependent_id in dependents.difference(task.dependents):
task.add_dependent(dependent_id)
def _install_action(self, task):
"""
Determine whether the installation should be overwritten (if it already
exists) or skipped (if has been handled by another process).
If the package has not been installed yet, this will indicate that the
installation should proceed as normal (i.e. no need to transactionally
preserve the old prefix).
"""
# If we don't have to overwrite, do a normal install
if task.pkg.spec.dag_hash() not in task.request.overwrite:
return InstallAction.INSTALL
# If it's not installed, do a normal install as well
rec, installed = self._check_db(task.pkg.spec)
if not installed:
return InstallAction.INSTALL
# Ensure install_tree projections have not changed.
assert task.pkg.prefix == rec.path
# If another process has overwritten this, we shouldn't install at all
if rec.installation_time >= task.request.overwrite_time:
return InstallAction.NONE
# If the install prefix is missing, warn about it, and proceed with
# normal install.
if not os.path.exists(task.pkg.prefix):
tty.debug("Missing installation to overwrite")
return InstallAction.INSTALL
# Otherwise, do an actual overwrite install. We backup the original
# install directory, put the old prefix
# back on failure
return InstallAction.OVERWRITE
def install(self):
"""
Install the requested package(s) and or associated dependencies.
@ -1561,26 +1606,12 @@ def install(self):
# Proceed with the installation since we have an exclusive write
# lock on the package.
try:
if pkg.spec.dag_hash() in task.request.overwrite:
rec, _ = self._check_db(pkg.spec)
if rec and rec.installed:
if rec.installation_time < task.request.overwrite_time:
# If it's actually overwriting, do a fs transaction
if os.path.exists(rec.path):
with fs.replace_directory_transaction(
rec.path):
# fs transaction will put the old prefix
# back on failure, so make sure to keep it.
keep_prefix = True
self._install_task(task)
else:
tty.debug("Missing installation to overwrite")
self._install_task(task)
else:
# overwriting nothing
self._install_task(task)
else:
action = self._install_action(task)
if action == InstallAction.INSTALL:
self._install_task(task)
elif action == InstallAction.OVERWRITE:
OverwriteInstall(self, spack.store.db, task).install()
self._update_installed(task)
@ -1631,7 +1662,7 @@ def install(self):
finally:
# Remove the install prefix if anything went wrong during
# install.
if not keep_prefix:
if not keep_prefix and not action == InstallAction.OVERWRITE:
pkg.remove_prefix()
# The subprocess *may* have removed the build stage. Mark it
@ -1883,6 +1914,35 @@ def build_process(pkg, install_args):
return installer.run()
class OverwriteInstall(object):
def __init__(self, installer, database, task, tmp_root=None):
self.installer = installer
self.database = database
self.task = task
self.tmp_root = tmp_root
def install(self):
"""
Try to run the install task overwriting the package prefix.
If this fails, try to recover the original install prefix. If that fails
too, mark the spec as uninstalled. This function always the original
install error if installation fails.
"""
try:
with fs.replace_directory_transaction(self.task.pkg.prefix, self.tmp_root):
self.installer._install_task(self.task)
except fs.CouldNotRestoreDirectoryBackup as e:
self.database.remove(self.task.pkg.spec)
tty.error('Recovery of install dir of {0} failed due to '
'{1}: {2}. The spec is now uninstalled.'.format(
self.task.pkg.name,
e.outer_exception.__class__.__name__,
str(e.outer_exception)))
# Unwrap the actual installation exception.
raise e.inner_exception
class BuildTask(object):
"""Class for representing the build task for a package."""

View file

@ -4,6 +4,7 @@
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import os
import shutil
import py
import pytest
@ -1177,3 +1178,97 @@ def test_install_skip_patch(install_mockery, mock_fetch):
spec, install_args = const_arg[0]
assert inst.package_id(spec.package) in installer.installed
def test_overwrite_install_backup_success(temporary_store, config, mock_packages,
tmpdir):
"""
When doing an overwrite install that fails, Spack should restore the backup
of the original prefix, and leave the original spec marked installed.
"""
# Where to store the backups
backup = str(tmpdir.mkdir("backup"))
# Get a build task. TODO: refactor this to avoid calling internal methods
const_arg = installer_args(["b"])
installer = create_installer(const_arg)
installer._init_queue()
task = installer._pop_task()
# Make sure the install prefix exists with some trivial file
installed_file = os.path.join(task.pkg.prefix, 'some_file')
fs.touchp(installed_file)
class InstallerThatWipesThePrefixDir:
def _install_task(self, task):
shutil.rmtree(task.pkg.prefix, ignore_errors=True)
fs.mkdirp(task.pkg.prefix)
raise Exception("Some fatal install error")
class FakeDatabase:
called = False
def remove(self, spec):
self.called = True
fake_installer = InstallerThatWipesThePrefixDir()
fake_db = FakeDatabase()
overwrite_install = inst.OverwriteInstall(
fake_installer, fake_db, task, tmp_root=backup)
# Installation should throw the installation exception, not the backup
# failure.
with pytest.raises(Exception, match='Some fatal install error'):
overwrite_install.install()
# Make sure the package is not marked uninstalled and the original dir
# is back.
assert not fake_db.called
assert os.path.exists(installed_file)
def test_overwrite_install_backup_failure(temporary_store, config, mock_packages,
tmpdir):
"""
When doing an overwrite install that fails, Spack should try to recover the
original prefix. If that fails, the spec is lost, and it should be removed
from the database.
"""
# Where to store the backups
backup = str(tmpdir.mkdir("backup"))
class InstallerThatAccidentallyDeletesTheBackupDir:
def _install_task(self, task):
# Remove the backup directory so that restoring goes terribly wrong
shutil.rmtree(backup)
raise Exception("Some fatal install error")
class FakeDatabase:
called = False
def remove(self, spec):
self.called = True
# Get a build task. TODO: refactor this to avoid calling internal methods
const_arg = installer_args(["b"])
installer = create_installer(const_arg)
installer._init_queue()
task = installer._pop_task()
# Make sure the install prefix exists
installed_file = os.path.join(task.pkg.prefix, 'some_file')
fs.touchp(installed_file)
fake_installer = InstallerThatAccidentallyDeletesTheBackupDir()
fake_db = FakeDatabase()
overwrite_install = inst.OverwriteInstall(
fake_installer, fake_db, task, tmp_root=backup)
# Installation should throw the installation exception, not the backup
# failure.
with pytest.raises(Exception, match='Some fatal install error'):
overwrite_install.install()
# Make sure that `remove` was called on the database after an unsuccessful
# attempt to restore the backup.
assert fake_db.called