doc/.venv/Lib/site-packages/cerulean/torque_scheduler.py

211 lines
7.6 KiB
Python
Raw Normal View History

import logging
from typing import Optional
from cerulean.job_description import JobDescription
from cerulean.job_status import JobStatus
from cerulean.scheduler import Scheduler
from cerulean.terminal import Terminal
from defusedxml import ElementTree
logger = logging.getLogger(__name__)
class TorqueScheduler(Scheduler):
"""Represents a Torque scheduler.
This class represents a Torque scheduler, to which it talks through \
a :class:`Terminal`.
On some machines, an additional command is needed to make Torque
available to the user, e.g. 'module load torque'. If you specify a
prefix, it will be prepended to any Torque command run by this
class. Note that this is a plain string concatenation, so you'll
probably need something like 'module load torque;', with a
semicolon to separate the commands.
Arguments:
terminal: The terminal to use to talk to the scheduler.
prefix: A string to prefix the Torque commands with.
"""
def __init__(self, terminal: Terminal, prefix: str = '') -> None:
self.__terminal = terminal
self.__prefix = prefix
logger.debug('Running qsub --version')
command = self.__prefix + ' qsub'
exit_code, output, error = self.__terminal.run(10, 'qsub',
['--version'])
logger.debug('qsub --version exit_code: %s', exit_code)
logger.debug('qsub --version std output: %s', output)
logger.debug('qsub --version std error: %s', error)
def submit(self, job_description: JobDescription) -> str:
if job_description.command is None:
raise ValueError('Job description is missing a command')
job_script = _job_desc_to_job_script(job_description)
logger.debug('Running qsub with job script:\n%s', job_script)
command = self.__prefix + ' qsub'
exit_code, output, error = self.__terminal.run(10, command, ['-'],
job_script)
logger.debug('qsub exit code: %s', exit_code)
logger.debug('qsub std output: %s', output)
logger.debug('qsub std error: %s', error)
if exit_code != 0:
raise RuntimeError('Torque qsub error: {}'.format(error))
job_id = output.strip().split(' ')[-1]
return job_id
def get_status(self, job_id: str) -> JobStatus:
logger.debug('Running qstat with job id %s', job_id)
command = self.__prefix + ' qstat'
exit_code, output, error = self.__terminal.run(10, command,
['-x', job_id])
logger.debug('qstat exit code: %s', exit_code)
logger.debug('qstat output: %s', output)
logger.debug('qstat error: %s', error)
if output == '':
raise RuntimeError(
'No output from qstat, could not determine job status.')
xml_data = ElementTree.fromstring(output)
if len(xml_data) == 0:
return JobStatus.DONE
status = _get_field_from_qstat_xml(xml_data, 'job_state')
logger.debug('qstat status: %s', status)
status_map = {
'W': JobStatus.WAITING, # waiting for start time
'Q': JobStatus.WAITING, # queued
'H': JobStatus.RUNNING, # held
'S': JobStatus.RUNNING, # suspended
'R': JobStatus.RUNNING, # running
'T': JobStatus.RUNNING, # being moved
'E': JobStatus.RUNNING, # exiting
'C': JobStatus.DONE # completed
}
try:
job_status = status_map[status]
except KeyError:
raise RuntimeError(
'Received an unexpected job status {} from qstat'.format(
status))
logger.debug('get_status returning %s', job_status.name)
return job_status
def get_exit_code(self, job_id: str) -> Optional[int]:
if self.get_status(job_id) != JobStatus.DONE:
return None
logger.debug('get_exit_code() running qstat -x %s', job_id)
command = self.__prefix + ' qstat'
exit_code, output, error = self.__terminal.run(10, command,
['-x', job_id])
logger.debug('qstat exit code: %s', exit_code)
logger.debug('qstat output: %s', output)
logger.debug('qstat error: %s', error)
xml_data = ElementTree.fromstring(output)
job_exit_code = int(_get_field_from_qstat_xml(xml_data, 'exit_status'))
return job_exit_code
def cancel(self, job_id: str) -> None:
logger.debug('cancel() running qdel %s', job_id)
command = self.__prefix + ' qdel'
exit_code, output, error = self.__terminal.run(10, command, [job_id])
logger.debug('qdel exit code: %s', exit_code)
logger.debug('qdel output: %s', output)
logger.debug('qdel error: %s', error)
def _job_desc_to_job_script(job_description: JobDescription) -> str:
job_script = '#!/bin/bash\n'
job_script += '#PBS -N {}\n'.format(job_description.name)
resources = []
if job_description.time_reserved is not None:
resources.append('walltime={}'.format(
_seconds_to_time(job_description.time_reserved)))
if job_description.num_nodes is not None:
resources.append('nodes={}'.format(job_description.num_nodes))
if job_description.mpi_processes_per_node is not None:
resources.append('ppn={}'.format(
job_description.mpi_processes_per_node))
if len(resources) > 0:
job_script += '#PBS -l {}\n'.format(':'.join(resources))
if job_description.queue_name is not None:
job_script += '#PBS -q {}\n'.format(job_description.queue_name)
if job_description.system_out_file is not None:
job_script += '#PBS -o {}\n'.format(job_description.system_out_file)
if job_description.system_err_file is not None:
job_script += '#PBS -e {}\n'.format(job_description.system_err_file)
if job_description.extra_scheduler_options is not None:
job_script += '#PBS {}\n'.format(
job_description.extra_scheduler_options)
for name, value in job_description.environment.items():
job_script += "export {}='{}'\n".format(name, value)
if job_description.working_directory is not None:
job_script += 'cd {}\n'.format(job_description.working_directory)
job_script += '{}'.format(job_description.command)
args = map(str, job_description.arguments)
job_script += ' {}'.format(' '.join(args))
if job_description.stdout_file is not None:
job_script += ' >{}'.format(job_description.stdout_file)
if job_description.stderr_file is not None:
job_script += ' 2>{}'.format(job_description.stderr_file)
return job_script
def _get_field_from_qstat_xml(xml_data: ElementTree, field_name: str) -> str:
for job in xml_data:
for field in job:
if field.tag == field_name:
return field.text
raise RuntimeError(
'Expected field {} not returned by qstat'.format(field_name))
def _seconds_to_time(seconds: int) -> str:
"""Converts seconds to a Torque allocation duration.
Args:
seconds: The number of seconds to reserve.
Returns:
A string of the form DD:HH:MM:SS.
"""
seconds_per_day = 60 * 60 * 24
seconds_per_hour = 60 * 60
seconds_per_minute = 60
days = seconds // seconds_per_day
seconds -= days * seconds_per_day
hours = seconds // seconds_per_hour
seconds -= hours * seconds_per_hour
minutes = seconds // seconds_per_minute
seconds -= minutes * seconds_per_minute
return '{:02d}:{:02d}:{:02d}:{:02d}'.format(days, hours, minutes, seconds)