###############################################################################
# Copyright (c), Forschungszentrum Jülich GmbH, IAS-1/PGI-1, Germany.        #
# All rights reserved.                                                        #
# This file is part of the AiiDA-FLEUR package.                               #
#                                                                             #
# The code is hosted on GitHub at https://github.com/JuDFTteam/aiida-fleur    #
# For further information on the license, see the LICENSE.txt file            #
# For further information please visit http://www.flapw.de or                 #
# http://aiida-fleur.readthedocs.io/en/develop/                               #
###############################################################################
"""
This module contains the FleurBaseWorkChain.
FleurBaseWorkChain is a workchain that wraps the submission of
the FLEUR calculation. Inheriting from the BaseRestartWorkChain
allows scenarios to be added that restart a calculation
automatically if an expected failure occurred.
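
A minimal usage sketch (the entry point name ``fleur.base``, the code label
and the node pk are illustrative and must match your own AiiDA profile)::

    from aiida import orm
    from aiida.engine import submit
    from aiida.plugins import WorkflowFactory

    FleurBaseWorkChain = WorkflowFactory('fleur.base')  # assumed entry point

    builder = FleurBaseWorkChain.get_builder()
    builder.code = orm.load_code('fleur@localhost')  # your FLEUR code
    builder.fleurinp = orm.load_node(1234)           # an existing FleurinpData
    builder.options = orm.Dict(dict={
        'resources': {'num_machines': 1, 'num_mpiprocs_per_machine': 2},
        'max_wallclock_seconds': 3600,
    })
    node = submit(builder)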
"""
from aiida import orm
from aiida.common import AttributeDict
from aiida.engine import while_
from aiida.engine.processes.workchains import BaseRestartWorkChain
from aiida.engine.processes.workchains.utils import process_handler, ProcessHandlerReport
from aiida_fleur.tools.common_fleur_wf import optimize_calc_options
from aiida_fleur.calculation.fleur import FleurCalculation
from aiida_fleur.data.fleurinp import get_fleurinp_from_remote_data


class FleurBaseWorkChain(BaseRestartWorkChain):
"""Workchain to run a FLEUR calculation with automated error handling and restarts"""
_workflowversion = '0.2.1'
_process_class = FleurCalculation

    @classmethod
def define(cls, spec):
super().define(spec)
spec.expose_inputs(FleurCalculation, exclude=('metadata.options',))
spec.input('options', valid_type=orm.Dict, help='Optional parameters to set up computational details.')
spec.input('description', valid_type=str, required=False, non_db=True, help='Calculation description.')
spec.input('label', valid_type=str, required=False, non_db=True, help='Calculation label.')
spec.input(
'add_comp_para',
valid_type=orm.Dict,
default=lambda: orm.Dict(dict={
'only_even_MPI': False,
'forbid_single_mpi': False,
'max_queue_nodes': 20,
'max_queue_wallclock_sec': 86400
}),
            help='Gives additional control over computational parameters. '
            'only_even_MPI: set to true to suppress odd numbers of MPI processes in the parallelisation. '
            'This might speed up a calculation on machines with an even number of sockets per node. '
            'forbid_single_mpi: set to true to forbid a parallelisation with a single MPI process. '
            'max_queue_nodes: maximal number of nodes allowed on the remote machine. Used only to automatically resolve some FLEUR failures. '
            'max_queue_wallclock_sec: maximal wallclock time allowed on the remote machine. Used only to automatically resolve some FLEUR failures.')
spec.outline(
cls.setup,
cls.validate_inputs,
while_(cls.should_run_process)(
cls.run_process,
cls.inspect_process,
),
cls.results,
)
spec.expose_outputs(FleurCalculation)
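
        # These exit codes are returned by check_kpts and the error handlers
        # below when a failure cannot be resolved automatically.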
        spec.exit_code(311,
                       'ERROR_VACUUM_SPILL_RELAX',
                       message='FLEUR calculation failed because an atom spilled to the '
                       'vacuum during relaxation')
spec.exit_code(313, 'ERROR_MT_RADII_RELAX', message='Overlapping MT-spheres during relaxation.')
spec.exit_code(388, 'ERROR_TIME_LIMIT_NO_SOLUTION', message='Computational resources are not optimal.')
spec.exit_code(389, 'ERROR_MEMORY_ISSUE_NO_SOLUTION', message='Computational resources are not optimal.')
spec.exit_code(390, 'ERROR_NOT_OPTIMAL_RESOURCES', message='Computational resources are not optimal.')
spec.exit_code(399,
'ERROR_SOMETHING_WENT_WRONG',
message='FleurCalculation failed and FleurBaseWorkChain has no strategy '
'to resolve this')

    def check_kpts(self):
        """
        This routine checks if the total number of requested CPUs
        is a factor of the number of k-points and performs an optimisation.
        If the suggested num_mpiprocs_per_machine is more than 60% smaller
        than the requested value, it returns an exit code and the
        calculation stops without being submitted.
        """
if 'fleurinp' in self.ctx.inputs:
fleurinp = self.ctx.inputs.fleurinp
else:
fleurinp = get_fleurinp_from_remote_data(self.ctx.inputs.parent_folder)
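        # user-provided parallelisation constraints (see the add_comp_para input)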
only_even_MPI = self.inputs.add_comp_para['only_even_MPI']
forbid_single_mpi = self.inputs.add_comp_para['forbid_single_mpi']
try:
machines, mpi_tasks, omp_threads, message = optimize_calc_options(self.ctx.num_machines,
self.ctx.num_mpiprocs_per_machine,
self.ctx.num_cores_per_mpiproc,
self.ctx.use_omp,
self.ctx.suggest_mpi_omp_ratio,
fleurinp,
only_even_MPI=only_even_MPI,
forbid_single_mpi=forbid_single_mpi)
except ValueError as exc:
            self.report(str(exc))
return self.exit_codes.ERROR_NOT_OPTIMAL_RESOURCES
self.report(message)
self.ctx.inputs.metadata.options['resources']['num_machines'] = machines
self.ctx.inputs.metadata.options['resources']['num_mpiprocs_per_machine'] = mpi_tasks
if self.ctx.use_omp:
self.ctx.inputs.metadata.options['resources']['num_cores_per_mpiproc'] = omp_threads
if 'environment_variables' not in self.ctx.inputs.metadata.options:
self.ctx.inputs.metadata.options['environment_variables'] = {}
self.ctx.inputs.metadata.options['environment_variables']['OMP_NUM_THREADS'] = str(omp_threads)
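
    # Handlers run in order of decreasing priority; this one has the lowest
    # priority and acts as a catch-all for failures that have no dedicated
    # recovery strategy.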
@process_handler(priority=1,
exit_codes=[
FleurCalculation.exit_codes.ERROR_FLEUR_CALC_FAILED,
FleurCalculation.exit_codes.ERROR_MT_RADII,
FleurCalculation.exit_codes.ERROR_NO_RETRIEVED_FOLDER,
FleurCalculation.exit_codes.ERROR_OPENING_OUTPUTS,
FleurCalculation.exit_codes.ERROR_NO_OUTXML,
FleurCalculation.exit_codes.ERROR_XMLOUT_PARSING_FAILED,
FleurCalculation.exit_codes.ERROR_RELAX_PARSING_FAILED,
FleurCalculation.exit_codes.ERROR_MISSING_DEPENDENCY,
])
def _handle_general_error(self, calculation):
"""
Calculation failed for unknown reason.
"""
self.ctx.restart_calc = calculation
self.ctx.is_finished = True
        self.report('Calculation failed for a reason that cannot be resolved automatically')
self.results()
return ProcessHandlerReport(True, self.exit_codes.ERROR_SOMETHING_WENT_WRONG)

    @process_handler(priority=48, exit_codes=FleurCalculation.exit_codes.ERROR_DROP_CDN)
    def _handle_dirac_equation(self, calculation):
        """
        Sometimes a relaxation calculation fails with a Dirac problem, which is
        usually caused by problems with reusing the charge density. In this case
        we resubmit the calculation, dropping the input cdn.
        """
# try to drop remote folder and see if it helps
is_fleurinp_from_relax = False
if 'fleurinp' in self.ctx.inputs:
if 'relax.xml' in self.ctx.inputs.fleurinp.files:
is_fleurinp_from_relax = True
if 'parent_folder' in self.ctx.inputs and is_fleurinp_from_relax:
del self.ctx.inputs.parent_folder
self.ctx.restart_calc = None
self.ctx.is_finished = False
            self.report('Calculation seems to fail due to a corrupted charge density (can happen '
                        'during relaxation). I drop the cdn from the previous step')
return ProcessHandlerReport(True)
self.ctx.restart_calc = calculation
self.ctx.is_finished = True
        self.report('Cannot drop the charge density: without the remote folder there would be no inp.xml')
self.results()
return ProcessHandlerReport(True, self.exit_codes.ERROR_SOMETHING_WENT_WRONG)

    @process_handler(priority=52, exit_codes=FleurCalculation.exit_codes.ERROR_VACUUM_SPILL_RELAX)
    def _handle_vacuum_spill_error(self, calculation):
        """
        An atom spilled into the vacuum region during relaxation.
        Can be fixed via the RelaxBaseWorkChain.
        """
        self.ctx.restart_calc = calculation
        self.ctx.is_finished = True
        self.report('FLEUR calculation failed because an atom spilled to the vacuum during '
                    'relaxation. Can be fixed via RelaxBaseWorkChain.')
        self.results()
        return ProcessHandlerReport(True, self.exit_codes.ERROR_VACUUM_SPILL_RELAX)

    @process_handler(priority=51, exit_codes=FleurCalculation.exit_codes.ERROR_MT_RADII_RELAX)
    def _handle_mt_relax_error(self, calculation):
        """
        MT spheres overlapped during relaxation.
        Can be fixed via the RelaxBaseWorkChain.
        """
        self.ctx.restart_calc = calculation
        self.ctx.is_finished = True
        self.report('FLEUR calculation failed due to MT overlap. Can be fixed via RelaxBaseWorkChain.')
        self.results()
        return ProcessHandlerReport(True, self.exit_codes.ERROR_MT_RADII_RELAX)

    @process_handler(priority=50, exit_codes=FleurCalculation.exit_codes.ERROR_NOT_ENOUGH_MEMORY)
def _handle_not_enough_memory(self, calculation):
"""
Calculation failed due to lack of memory.
Probably works for JURECA only, has to be tested for other systems.
"""
if not self.ctx.can_be_optimised:
self.ctx.restart_calc = calculation
self.ctx.is_finished = True
            self.report('I am not allowed to optimize your settings. Consider providing at least '
                        'num_machines and num_mpiprocs_per_machine')
self.results()
return ProcessHandlerReport(True, self.exit_codes.ERROR_MEMORY_ISSUE_NO_SOLUTION)
self.ctx.restart_calc = None
self.ctx.is_finished = False
        self.report('Calculation failed due to lack of memory, I resubmit it with twice the '
                    'number of computational nodes and a smaller MPI/OMP ratio')
# increase number of nodes
propose_nodes = self.ctx.num_machines * 2
if propose_nodes > self.ctx.max_queue_nodes:
propose_nodes = self.ctx.max_queue_nodes
self.ctx.num_machines = propose_nodes
self.ctx.suggest_mpi_omp_ratio = self.ctx.suggest_mpi_omp_ratio / 2
status = self.check_kpts()
if status is not None:
self.ctx.is_finished = True
self.results()
return ProcessHandlerReport(True, self.exit_codes.ERROR_NOT_OPTIMAL_RESOURCES)
if 'settings' not in self.ctx.inputs:
settings = {}
else:
settings = self.ctx.inputs.settings.get_dict()
settings.setdefault('remove_from_remotecopy_list', [])
if 'mixing_history*' not in settings['remove_from_remotecopy_list']:
settings['remove_from_remotecopy_list'].append('mixing_history*')
self.ctx.inputs.settings = orm.Dict(dict=settings)
        # check if the cdn.hdf can be reused
        # Out of memory can also occur after a couple of iterations if the mixing_history gets too large
remote = calculation.base.links.get_outgoing().get_node_by_label('remote_folder')
if _is_remote_reusable(self.ctx.inputs, calculation):
if 'fleurinp' in self.ctx.inputs:
del self.ctx.inputs.fleurinp
self.ctx.inputs.parent_folder = remote
return ProcessHandlerReport(True)

    @process_handler(priority=47, exit_codes=FleurCalculation.exit_codes.ERROR_TIME_LIMIT)
def _handle_time_limits(self, calculation):
"""
        If the calculation failed due to time limits, we restart it from where it
        ended, doubling the requested wallclock time and number of nodes (capped
        by the queue limits).
"""
from aiida.common.exceptions import NotExistent
# if previous calculation failed for the same reason, do not restart
try:
prev_calculation_remote = calculation.base.links.get_incoming().get_node_by_label('parent_folder')
prev_calculation_status = prev_calculation_remote.creator.exit_status
if prev_calculation_status in FleurCalculation.get_exit_statuses(['ERROR_TIME_LIMIT']):
self.ctx.is_finished = True
self.results()
return ProcessHandlerReport(True)
except NotExistent:
pass
self.report('FleurCalculation failed due to time limits, I restart it from where it ended')
# increase wallclock time
propose_wallclock = self.ctx.inputs.metadata.options['max_wallclock_seconds'] * 2
if propose_wallclock > self.ctx.max_queue_wallclock_sec:
propose_wallclock = self.ctx.max_queue_wallclock_sec
self.ctx.inputs.metadata.options['max_wallclock_seconds'] = propose_wallclock
# increase number of nodes
propose_nodes = self.ctx.num_machines * 2
if propose_nodes > self.ctx.max_queue_nodes:
propose_nodes = self.ctx.max_queue_nodes
self.ctx.num_machines = propose_nodes
remote = calculation.base.links.get_outgoing().get_node_by_label('remote_folder')
# resubmit providing inp.xml and cdn from the remote folder
self.ctx.is_finished = False
if _is_remote_reusable(self.ctx.inputs, calculation):
if 'fleurinp' in self.ctx.inputs:
del self.ctx.inputs.fleurinp
self.ctx.inputs.parent_folder = remote
return ProcessHandlerReport(True)


def _is_remote_reusable(inputs, calculation):
    """
    Check whether the remote folder of the given calculation
    can be reused for a restart.
    """
can_use_remote = False
    # If no charge density file is available to restart from, the calculation
    # will fail with an unhelpful error message. So we can only reuse the
    # charge density if these files are available.
retrieved_filenames = calculation.base.links.get_outgoing().get_node_by_label('retrieved').list_object_names()
if any(file in retrieved_filenames for file in (
'cdn_last.hdf',
'cdn1',
)):
can_use_remote = True
if 'fleurinp' in inputs:
modes = inputs.fleurinp.get_fleur_modes()
if modes['force_theorem'] or modes['dos'] or modes['band']:
# in modes listed above it makes no sense copying cdn.hdf
can_use_remote = False
# without fleurinp it is harder to extract modes in this case
# - simply try to reuse cdn.hdf and hope it works
return can_use_remote