diff --git a/PandaPkgInfo.py b/PandaPkgInfo.py index e7b057782..c93370d47 100644 --- a/PandaPkgInfo.py +++ b/PandaPkgInfo.py @@ -1 +1 @@ -release_version = "0.7.2" +release_version = "0.8.0" diff --git a/pandajedi/jedicore/JediDBSchemaInfo.py b/pandajedi/jedicore/JediDBSchemaInfo.py index ee5961c57..fb25f745a 100644 --- a/pandajedi/jedicore/JediDBSchemaInfo.py +++ b/pandajedi/jedicore/JediDBSchemaInfo.py @@ -12,6 +12,6 @@ class JediDBSchemaInfo: schema_version = None def method(self): - schema_version = "0.0.30" + schema_version = "0.1.1" _logger.debug(f"PanDA schema version required for JEDI is : {schema_version}") return schema_version diff --git a/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py b/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py new file mode 100644 index 000000000..38e96fefb --- /dev/null +++ b/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py @@ -0,0 +1,63 @@ +import sys +import traceback + +# logger +from pandacommon.pandalogger.PandaLogger import PandaLogger + +from pandajedi.jedicore.MsgWrapper import MsgWrapper +from pandaserver.workflow.workflow_core import WorkflowInterface + +from .WatchDogBase import WatchDogBase + +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + + +class AtlasWorkflowManagerWatchDog(WatchDogBase): + """ + Workflow manager watchdog for ATLAS + """ + + # constructor + def __init__(self, taskBufferIF, ddmIF): + WatchDogBase.__init__(self, taskBufferIF, ddmIF) + self.vo = "atlas" + self.workflow_interface = WorkflowInterface(taskBufferIF) + + def doProcessWorkflows(self): + """ + Action to process active workflows + """ + tmpLog = MsgWrapper(logger, " #ATM #KV doProcessWorkflows") + tmpLog.debug("start") + try: + # watchdog lock + got_lock = self.get_process_lock("AtlasWFManagerDog.doProcessWorkflows", timeLimit=5) + if not got_lock: + tmpLog.debug("locked by another watchdog process. 
Skipped") + return + tmpLog.debug("got watchdog lock") + # process active workflows + stats = self.workflow_interface.process_active_workflows() + tmpLog.info(f"processed workflows: {stats}") + # done + tmpLog.debug("done") + except Exception: + errtype, errvalue = sys.exc_info()[:2] + tmpLog.error(f"failed with {errtype} {errvalue} {traceback.format_exc()}") + + # main + def doAction(self): + try: + # get logger + origTmpLog = MsgWrapper(logger) + origTmpLog.debug("start") + # clean up + # check + # process workflows + self.doProcessWorkflows() + except Exception: + errtype, errvalue = sys.exc_info()[:2] + origTmpLog.error(f"failed with {errtype} {errvalue}") + # return + origTmpLog.debug("done") + return self.SC_SUCCEEDED diff --git a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py new file mode 100644 index 000000000..1129e652e --- /dev/null +++ b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py @@ -0,0 +1,89 @@ +import json + +from pandacommon.pandalogger import logger_utils + +from pandajedi.jedimsgprocessor.base_msg_processor import BaseMsgProcPlugin +from pandaserver.workflow.workflow_core import WorkflowInterface + +base_logger = logger_utils.setup_logger(__name__.split(".")[-1]) + + +# Workflow manager message processor plugin +class WorkflowManagerMsgProcPlugin(BaseMsgProcPlugin): + """ + Message-driven workflow manager + """ + + def initialize(self): + """ + Initialize the plugin + """ + BaseMsgProcPlugin.initialize(self) + self.workflow_interface = WorkflowInterface(self.tbIF) + + def process(self, msg_obj): + """ + Process the message + Typical message data looks like: + {"msg_type":"workflow", "workflow_id": 123, "timestamp": 987654321} + {"msg_type":"wfstep", "step_id": 456, "timestamp": 987654321} + {"msg_type":"wfdata", "data_id": 789, "timestamp": 987654321} + + Args: + msg_obj: message object + """ + tmp_log = logger_utils.make_logger(base_logger, 
token=self.get_pid(), method_name="process") + # start + tmp_log.info("start") + tmp_log.debug(f"sub_id={msg_obj.sub_id} ; msg_id={msg_obj.msg_id}") + # parse json + try: + msg_dict = json.loads(msg_obj.data) + except Exception as e: + err_str = f"failed to parse message json {msg_obj.data} , skipped. {e.__class__.__name__} : {e}" + tmp_log.error(err_str) + raise + # sanity check + try: + msg_type = msg_dict["msg_type"] + except Exception as e: + err_str = f"failed to parse message object dict {msg_dict} , skipped. {e.__class__.__name__} : {e}" + tmp_log.error(err_str) + raise + if msg_type not in ("workflow", "wfstep", "wfdata"): + err_str = f"got unknown msg_type {msg_type} , skipped " + tmp_log.error(err_str) + raise + # run + try: + tmp_log.info(f"got message {msg_dict}") + if msg_type == "workflow": + workflow_id = msg_dict["workflow_id"] + workflow_spec = self.tbIF.get_workflow(workflow_id) + if workflow_spec is None: + tmp_log.warning(f"workflow_id={workflow_id} not found; skipped") + return + stats, workflow_spec = self.workflow_interface.process_workflow(workflow_spec, by="msg") + tmp_log.info(f"processed workflow_id={workflow_id}") + elif msg_type == "wfstep": + step_id = msg_dict["step_id"] + step_spec = self.tbIF.get_workflow_step(step_id) + if step_spec is None: + tmp_log.warning(f"step_id={step_id} not found; skipped") + return + stats, step_spec = self.workflow_interface.process_step(step_spec, by="msg") + tmp_log.info(f"processed step_id={step_id}") + elif msg_type == "wfdata": + data_id = msg_dict["data_id"] + data_spec = self.tbIF.get_workflow_data(data_id) + if data_spec is None: + tmp_log.warning(f"data_id={data_id} not found; skipped") + return + stats, data_spec = self.workflow_interface.process_data(data_spec, by="msg") + tmp_log.info(f"processed data_id={data_id}") + except Exception as e: + err_str = f"failed to run, skipped. 
{e.__class__.__name__} : {e}" + tmp_log.error(err_str) + raise + # done + tmp_log.info("done") diff --git a/pandajedi/jediorder/ContentsFeeder.py b/pandajedi/jediorder/ContentsFeeder.py index 3cc08a194..38999df07 100644 --- a/pandajedi/jediorder/ContentsFeeder.py +++ b/pandajedi/jediorder/ContentsFeeder.py @@ -185,6 +185,7 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): parentOutDatasets.add(tmpParentOutDataset.containerName + "/") # loop over all datasets nFilesMaster = 0 + nFilesMasterReady = 0 checkedMaster = False setFrozenTime = True master_offset = None @@ -240,6 +241,9 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): ): # dummy metadata when parent is running tmpMetadata = {"state": "mutable"} + # set mutable when workflow holdup is set + if taskSpec.is_workflow_holdup(): + tmpMetadata = {"state": "mutable"} except Exception: errtype, errvalue = sys.exc_info()[:2] tmpLog.error(f"{self.__class__.__name__} failed to get metadata to {errtype.__name__}:{errvalue}") @@ -529,7 +533,7 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): orderBy = None # feed files to the contents table tmpLog.debug("update contents") - retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI( + res_dict = self.taskBufferIF.insertFilesForDataset_JEDI( datasetSpec, tmpRet, tmpMetadata["state"], @@ -565,6 +569,10 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): maxFileRecords, skip_short_output, ) + retDB = res_dict["ret_val"] + missingFileList = res_dict["missingFileList"] + nFilesUnique = res_dict["numUniqueLfn"] + diagMap = res_dict["diagMap"] if retDB is False: taskSpec.setErrDiag(f"failed to insert files for {datasetSpec.datasetName}. 
{diagMap['errMsg']}") allUpdated = False @@ -595,6 +603,7 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): if datasetSpec.isMaster(): checkedMaster = True nFilesMaster += nFilesUnique + nFilesMasterReady += res_dict.get("nReady", 0) master_offset = datasetSpec.getOffset() # running task if diagMap["isRunningTask"]: @@ -618,6 +627,11 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): setFrozenTime = False skip_secondaries = True tmpLog.debug("end loop") + # task holdup by workflow if no master inputs are ready + if not taskOnHold and not taskBroken and allUpdated and nFilesMasterReady == 0 and checkedMaster and taskSpec.is_workflow_holdup(): + # hold up by the workflow + taskOnHold = True + tmpLog.debug("task to hold up by workflow") # no master input if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster: tmpErrStr = "no master input files. input dataset is empty" diff --git a/pandajedi/jedirefine/TaskRefinerBase.py b/pandajedi/jedirefine/TaskRefinerBase.py index 74fdd4fbf..461b33544 100644 --- a/pandajedi/jedirefine/TaskRefinerBase.py +++ b/pandajedi/jedirefine/TaskRefinerBase.py @@ -432,6 +432,8 @@ def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper, splitRule): self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken["allowIncompleteInDS"]) if "noAutoPause" in taskParamMap and taskParamMap["noAutoPause"]: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken["noAutoPause"]) + if "workflowHoldup" in taskParamMap and taskParamMap["workflowHoldup"]: + self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken["workflowHoldup"]) # work queue workQueue = None if "workQueueName" in taskParamMap: diff --git a/pandaserver/api/v1/workflow_api.py b/pandaserver/api/v1/workflow_api.py new file mode 100644 index 000000000..065e359f1 --- /dev/null +++ b/pandaserver/api/v1/workflow_api.py @@ -0,0 +1,136 @@ +import datetime +import json +from concurrent.futures import ThreadPoolExecutor +from 
threading import Lock +from typing import Any, Dict, List + +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger +from pandacommon.pandautils.PandaUtils import naive_utcnow + +from pandaserver.api.v1.common import ( + MESSAGE_DATABASE, + TIME_OUT, + TimedMethod, + generate_response, + get_dn, + has_production_role, + request_validation, +) +from pandaserver.srvcore.panda_request import PandaRequest +from pandaserver.taskbuffer.TaskBuffer import TaskBuffer +from pandaserver.workflow.workflow_core import WorkflowInterface + +_logger = PandaLogger().getLogger("api_workflow") + +# These global variables are initialized in the init_task_buffer method +global_task_buffer = None +global_wfif = None + +# These global variables don't depend on DB access and can be initialized here +# global_proxy_cache = panda_proxy_cache.MyProxyInterface() +# global_token_cache = token_cache.TokenCache() + + +def init_task_buffer(task_buffer: TaskBuffer) -> None: + """ + Initialize the task buffer and other interfaces. This method needs to be called before any other method in this module. + """ + global global_task_buffer + global_task_buffer = task_buffer + + global global_wfif + global_wfif = WorkflowInterface(global_task_buffer) + + +@request_validation(_logger, secure=True, production=False, request_method="POST") +def submit_workflow_raw_request(req: PandaRequest, params: dict | str) -> dict: + """ + Submit raw request of PanDA native workflow. 
+ + API details: + HTTP Method: POST + Path: /v1/workflow/submit_workflow_raw_request + + Args: + req(PandaRequest): internally generated request object containing the env variables + params (dict|str): dictionary or JSON of parameters of the raw request + + Returns: + dict: dictionary `{'success': True/False, 'message': 'Description of error', 'data': }` + """ + + user_dn = get_dn(req) + prodsourcelabel = "user" + + # FIXME: only for analysis temporarily + # if has_production_role(req): + # prodsourcelabel = "managed" + + tmp_logger = LogWrapper(_logger, f'submit_workflow_raw_request prodsourcelabel={prodsourcelabel} user_dn="{user_dn}" ') + tmp_logger.debug("Start") + success, message, data = False, "", None + time_start = naive_utcnow() + + if isinstance(params, str): + try: + params = json.loads(params) + except Exception as exc: + message = f"Failed to parse params: {params} {str(exc)}" + tmp_logger.error(message) + return generate_response(success, message, data) + + workflow_id = global_wfif.register_workflow(prodsourcelabel, user_dn, raw_request_params=params) + + if workflow_id is not None: + success = True + data = {"workflow_id": workflow_id} + else: + message = "Failed to submit raw workflow request" + + time_delta = naive_utcnow() - time_start + tmp_logger.debug(f"Done. Took {time_delta.seconds}.{time_delta.microseconds // 1000:03d} sec") + + return generate_response(success, message, data) + + +@request_validation(_logger, secure=True, production=False, request_method="POST") +def submit_workflow(req: PandaRequest, workflow_definition: dict) -> dict: + """ + Submit a PanDA native workflow. 
+ + API details: + HTTP Method: POST + Path: /v1/workflow/submit_workflow + + Args: + req(PandaRequest): internally generated request object containing the env variables + workflow_definition (dict): dictionary of workflow definition + + Returns: + dict: dictionary `{'success': True/False, 'message': 'Description of error', 'data': }` + """ + + user_dn = get_dn(req) + prodsourcelabel = "user" + if has_production_role(req): + prodsourcelabel = "managed" + workflow_name = workflow_definition.get("workflow_name", None) + + tmp_logger = LogWrapper(_logger, f'submit_workflow prodsourcelabel={prodsourcelabel} user_dn="{user_dn}" workflow_name={workflow_name}') + tmp_logger.debug("Start") + success, message, data = False, "", None + time_start = naive_utcnow() + + workflow_id = global_wfif.register_workflow(prodsourcelabel, user_dn, workflow_name, workflow_definition) + + if workflow_id is not None: + success = True + data = {"workflow_id": workflow_id} + else: + message = "Failed to submit workflow" + + time_delta = naive_utcnow() - time_start + tmp_logger.debug(f"Done. Took {time_delta.seconds}.{time_delta.microseconds // 1000:03d} sec") + + return generate_response(success, message, data) diff --git a/pandaserver/dataservice/ddm.py b/pandaserver/dataservice/ddm.py index 7578bf228..b3c4282b3 100755 --- a/pandaserver/dataservice/ddm.py +++ b/pandaserver/dataservice/ddm.py @@ -791,21 +791,28 @@ def get_number_of_files(self, dataset_name: str, preset_scope: str = None): Tuple[bool, Union[int, str]]: A tuple containing a boolean indicating the success of the operation and the number of files or an error message. If an exception occurs, the boolean is False and the string contains the error message. 
""" + # make logger + method_name = "get_number_of_files" + method_name = f"{method_name} dataset_name={dataset_name}" + tmp_log = LogWrapper(_logger, method_name) + tmp_log.debug("start") # extract scope from dataset scope, dataset_name = self.extract_scope(dataset_name) if preset_scope is not None: scope = preset_scope - client = RucioClient() - n_files = 0 try: + client = RucioClient() + n_files = 0 for _ in client.list_files(scope, dataset_name): n_files += 1 return True, n_files except DataIdentifierNotFound: + tmp_log.debug("dataset not found") return None, "dataset not found" except Exception: err_type, err_value = sys.exc_info()[:2] err_msg = f"{err_type.__name__} {err_value}" + tmp_log.error(f"got error ; {traceback.format_exc()}") return False, err_msg # list datasets with GUIDs @@ -1093,7 +1100,7 @@ def get_dataset_metadata(self, dataset_name, ignore_missing=False): return None # get files in dataset - def get_files_in_dataset(self, dataset_name, ski_duplicate=True, ignore_unknown=False, long_format=False, lfn_only=False): + def get_files_in_dataset(self, dataset_name, skip_duplicate=True, ignore_unknown=False, long_format=False, lfn_only=False): method_name = "get_files_in_dataset" method_name += f" " tmp_log = LogWrapper(_logger, method_name) @@ -1131,7 +1138,7 @@ def get_files_in_dataset(self, dataset_name, ski_duplicate=True, ignore_unknown= guid = str(f"{x['guid'][0:8]}-{x['guid'][8:12]}-{x['guid'][12:16]}-{x['guid'][16:20]}-{x['guid'][20:32]}") attrs["guid"] = guid # skip duplicated files - if ski_duplicate: + if skip_duplicate: # extract base LFN and attempt number baseLFN = re.sub("(\.(\d+))$", "", lfn) attNr = re.sub(baseLFN + "\.*", "", lfn) diff --git a/pandaserver/server/panda.py b/pandaserver/server/panda.py index 6091929d6..b613e3d2d 100755 --- a/pandaserver/server/panda.py +++ b/pandaserver/server/panda.py @@ -38,6 +38,7 @@ from pandaserver.api.v1 import statistics_api as statistics_api_v1 from pandaserver.api.v1 import system_api as 
system_api_v1 from pandaserver.api.v1 import task_api as task_api_v1 +from pandaserver.api.v1 import workflow_api as workflow_api_v1 from pandaserver.api.v1.common import extract_allowed_methods from pandaserver.config import panda_config @@ -167,6 +168,7 @@ statistics_api_v1_methods = extract_allowed_methods(statistics_api_v1) system_api_v1_methods = extract_allowed_methods(system_api_v1) task_api_v1_methods = extract_allowed_methods(task_api_v1) +workflow_api_v1_methods = extract_allowed_methods(workflow_api_v1) # initialize oracledb using dummy connection initializer.init() @@ -195,6 +197,7 @@ statistics_api_v1.init_task_buffer(taskBuffer) # System API does not need to be initialized. system_api_v1.init_task_buffer(taskBuffer) task_api_v1.init_task_buffer(taskBuffer) + workflow_api_v1.init_task_buffer(taskBuffer) # initialize JobDispatcher jobDispatcher.init(taskBuffer) @@ -364,6 +367,7 @@ def module_mapping(version, api_module): "statistics": {"module": statistics_api_v1, "allowed_methods": statistics_api_v1_methods}, "system": {"module": system_api_v1, "allowed_methods": system_api_v1_methods}, "task": {"module": task_api_v1, "allowed_methods": task_api_v1_methods}, + "workflow": {"module": workflow_api_v1, "allowed_methods": workflow_api_v1_methods}, }, } try: diff --git a/pandaserver/taskbuffer/JediTaskSpec.py b/pandaserver/taskbuffer/JediTaskSpec.py index 791e16604..2a8f8a6d9 100644 --- a/pandaserver/taskbuffer/JediTaskSpec.py +++ b/pandaserver/taskbuffer/JediTaskSpec.py @@ -840,7 +840,7 @@ def removeSplitRule(self, ruleName): items = self.splitRule.split(",") newItems = [] for item in items: - # remove rile + # remove rule tmpRuleName = item.split("=")[0] if ruleName == tmpRuleName: continue @@ -1626,6 +1626,17 @@ def is_msg_driven(self): def allow_incomplete_input(self): return self.check_split_rule("allowIncompleteInDS") + # check if workflow holdup + def is_workflow_holdup(self): + return self.check_split_rule("workflowHoldup") + + # set workflow holdup 
+ def set_workflow_holdup(self, value: bool): + if value: + self.setSplitRule("workflowHoldup", "1") + else: + self.removeSplitRule(self.splitRuleToken["workflowHoldup"]) + # get queued time def get_queued_time(self): """ diff --git a/pandaserver/taskbuffer/OraDBProxy.py b/pandaserver/taskbuffer/OraDBProxy.py index 8435f923a..30316bc49 100644 --- a/pandaserver/taskbuffer/OraDBProxy.py +++ b/pandaserver/taskbuffer/OraDBProxy.py @@ -22,6 +22,7 @@ task_standalone_module, task_utils_module, worker_module, + workflow_module, ) from pandaserver.taskbuffer.WrappedCursor import WrappedCursor @@ -73,6 +74,7 @@ class DBProxy( task_complex_module.TaskComplexModule, task_standalone_module.TaskStandaloneModule, task_utils_module.TaskUtilsModule, + workflow_module.WorkflowModule, ): # constructor def __init__(self, useOtherError=False): diff --git a/pandaserver/taskbuffer/PandaDBSchemaInfo.py b/pandaserver/taskbuffer/PandaDBSchemaInfo.py index 5a5f077be..4787ac599 100644 --- a/pandaserver/taskbuffer/PandaDBSchemaInfo.py +++ b/pandaserver/taskbuffer/PandaDBSchemaInfo.py @@ -13,6 +13,6 @@ class PandaDBSchemaInfo: schema_version = None def method(self): - schema_version = "0.0.28" + schema_version = "0.1.1" _logger.debug(f"PanDA schema version required for Server is : {schema_version}") return schema_version diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index f8fe5397a..432f593f9 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -75,9 +75,9 @@ def cleanup(self, requester=None): # transaction as a context manager # CANNOT be used with ConBridge or TaskBufferInterface which uses multiprocess.pipe @contextmanager - def transaction(self, name: str): + def transaction(self, name=None, tmp_log=None): with self.proxyPool.get() as proxy: - with proxy.transaction(name) as txn: + with proxy.transaction(name, tmp_log) as txn: if txn is None: raise RuntimeError(f"Failed to start transaction {name}") # 
yield the transaction @@ -2024,6 +2024,14 @@ def getTaskStatus(self, jediTaskID): res = proxy.getTaskStatus(jediTaskID) return res + # get task status and superstatus + def getTaskStatusSuperstatus(self, jediTaskID): + # get DB proxy + with self.proxyPool.get() as proxy: + # exec + res = proxy.getTaskStatusSuperstatus(jediTaskID) + return res + # reactivate task def reactivateTask(self, jediTaskID, keep_attempt_nr=False, trigger_job_generation=False): # get DB proxy @@ -2685,6 +2693,13 @@ def disable_job_cloning(self, jedi_task_id): ret = proxy.disable_job_cloning(jedi_task_id) return ret + # gets statistics on the number of jobs with a specific status for each nucleus at each site + def get_num_jobs_with_status_by_nucleus(self, vo, job_status): + with self.proxyPool.get() as proxy: + return proxy.get_num_jobs_with_status_by_nucleus(vo, job_status) + + # ==== JEDI taskbuffer functions =========================== + # get JEDI task with jediTaskID def getTaskWithID_JEDI(self, jediTaskID, fullFlag=False, lockTask=False, pid=None, lockInterval=None, clearError=False): with self.proxyPool.get() as proxy: @@ -2695,6 +2710,8 @@ def updateInputFilesStaged_JEDI(self, jeditaskid, scope, filenames_dict, chunk_s with self.proxyPool.get() as proxy: return proxy.updateInputFilesStaged_JEDI(jeditaskid, scope, filenames_dict, chunk_size, by, check_scope) + # ==== Data Carousel functions ============================= + # query data carousel request ID by dataset def get_data_carousel_request_id_by_dataset_JEDI(self, dataset): with self.proxyPool.get() as proxy: @@ -2762,10 +2779,89 @@ def resubmit_data_carousel_request_JEDI(self, request_id, exclude_prev_dst=False with self.proxyPool.get() as proxy: return proxy.resubmit_data_carousel_request_JEDI(request_id, exclude_prev_dst) - # gets statistics on the number of jobs with a specific status for each nucleus at each site - def get_num_jobs_with_status_by_nucleus(self, vo, job_status): + # ==== Workflow fucntions 
================================== + + def get_workflow(self, workflow_id): with self.proxyPool.get() as proxy: - return proxy.get_num_jobs_with_status_by_nucleus(vo, job_status) + return proxy.get_workflow(workflow_id) + + def get_workflow_step(self, step_id): + with self.proxyPool.get() as proxy: + return proxy.get_workflow_step(step_id) + + def get_workflow_data(self, data_id): + with self.proxyPool.get() as proxy: + return proxy.get_workflow_data(data_id) + + def get_workflow_data_by_name(self, name, workflow_id=None): + with self.proxyPool.get() as proxy: + return proxy.get_workflow_data_by_name(name, workflow_id) + + def get_steps_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None): + with self.proxyPool.get() as proxy: + return proxy.get_steps_of_workflow(workflow_id, status_filter_list, status_exclusion_list) + + def get_data_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None, type_filter_list=None): + with self.proxyPool.get() as proxy: + return proxy.get_data_of_workflow(workflow_id, status_filter_list, status_exclusion_list, type_filter_list) + + def query_workflows(self, status_filter_list=None, status_exclusion_list=None, check_interval_sec=300): + with self.proxyPool.get() as proxy: + return proxy.query_workflows(status_filter_list, status_exclusion_list, check_interval_sec) + + def lock_workflow(self, workflow_id, locked_by, lock_expiration_sec=120): + with self.proxyPool.get() as proxy: + return proxy.lock_workflow(workflow_id, locked_by, lock_expiration_sec) + + def unlock_workflow(self, workflow_id, locked_by): + with self.proxyPool.get() as proxy: + return proxy.unlock_workflow(workflow_id, locked_by) + + def lock_workflow_step(self, step_id, locked_by, lock_expiration_sec=120): + with self.proxyPool.get() as proxy: + return proxy.lock_workflow_step(step_id, locked_by, lock_expiration_sec) + + def unlock_workflow_step(self, step_id, locked_by): + with self.proxyPool.get() as proxy: + 
return proxy.unlock_workflow_step(step_id, locked_by) + + def lock_workflow_data(self, data_id, locked_by, lock_expiration_sec=120): + with self.proxyPool.get() as proxy: + return proxy.lock_workflow_data(data_id, locked_by, lock_expiration_sec) + + def unlock_workflow_data(self, data_id, locked_by): + with self.proxyPool.get() as proxy: + return proxy.unlock_workflow_data(data_id, locked_by) + + def insert_workflow(self, workflow_spec): + with self.proxyPool.get() as proxy: + return proxy.insert_workflow(workflow_spec) + + def insert_workflow_step(self, wf_step_spec): + with self.proxyPool.get() as proxy: + return proxy.insert_workflow_step(wf_step_spec) + + def insert_workflow_data(self, wf_data_spec): + with self.proxyPool.get() as proxy: + return proxy.insert_workflow_data(wf_data_spec) + + def update_workflow(self, workflow_spec): + with self.proxyPool.get() as proxy: + return proxy.update_workflow(workflow_spec) + + def update_workflow_step(self, wf_step_spec): + with self.proxyPool.get() as proxy: + return proxy.update_workflow_step(wf_step_spec) + + def update_workflow_data(self, wf_data_spec): + with self.proxyPool.get() as proxy: + return proxy.update_workflow_data(wf_data_spec) + + def upsert_workflow_entities(self, workflow_id, actions_dict=None, workflow_spec=None, step_specs=None, data_specs=None): + with self.proxyPool.get() as proxy: + return proxy.upsert_workflow_entities(workflow_id, actions_dict, workflow_spec, step_specs, data_specs) + + # ========================================================== # Singleton diff --git a/pandaserver/taskbuffer/db_proxy_mods/base_module.py b/pandaserver/taskbuffer/db_proxy_mods/base_module.py index 7abd693e9..16af0b928 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/base_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/base_module.py @@ -476,12 +476,13 @@ def wakeUp(self): # transaction as a context manager @contextmanager - def transaction(self, name: str): + def transaction(self, name: str | None = None, 
tmp_log=None): """ Context manager for transaction Args: - name (str): name of the transaction to be shown in the log + name (str, optional): name of the transaction to be shown in the log + tmp_log (LogWrapper, optional): logger to use. If None, a new logger will be created Yields: Any: the cursor object for executing SQL commands @@ -489,16 +490,17 @@ def transaction(self, name: str): """ comment = " /* DBProxy.transaction */" try: - tmp_log = self.create_tagged_logger(comment, tag=name) - tmp_log.debug("start") + if tmp_log is None: + tmp_log = self.create_tagged_logger(comment, tag=name) + tmp_log.debug("transaction start") # begin transaction self.conn.begin() # cursor and logger for the with block yield (self.cur, tmp_log) # commit transaction if not self._commit(): - raise RuntimeError("Commit error") - tmp_log.debug("done") + raise RuntimeError("commit error") + tmp_log.debug("transaction done") except Exception as e: # roll back self._rollback() diff --git a/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py b/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py index 309ac1123..2ba7093e2 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py @@ -532,6 +532,38 @@ def getTaskStatus(self, jediTaskID): self.dump_error_message(tmp_log) return [] + # get task status and superstatus + def getTaskStatusSuperstatus(self, jediTaskID): + comment = " /* DBProxy.getTaskStatusSuperstatus */" + tmp_log = self.create_tagged_logger(comment, f"jediTaskID={jediTaskID}") + tmp_log.debug("start") + try: + # sql to update input file status + varMap = {} + varMap[":jediTaskID"] = jediTaskID + sql = f"SELECT status,superStatus FROM {panda_config.schemaJEDI}.JEDI_Tasks " + sql += "WHERE jediTaskID=:jediTaskID " + # start transaction + self.conn.begin() + self.cur.arraysize = 1000 + self.cur.execute(sql + comment, varMap) + res = self.cur.fetchone() + # commit + if not 
self._commit(): + raise RuntimeError("Commit error") + if res: + tmp_log.debug(f"task {jediTaskID} has status={res[0]} superstatus={res[1]}") + else: + res = [] + tmp_log.debug(f"task {jediTaskID} not found") + return res + except Exception: + # roll back + self._rollback() + # error + self.dump_error_message(tmp_log) + return [] + # reactivate task def reactivateTask(self, jediTaskID, keep_attempt_nr=False, trigger_job_generation=False): comment = " /* DBProxy.reactivateTask */" diff --git a/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py b/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py index 622a1e692..b80c4f818 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py @@ -222,8 +222,10 @@ def insertFilesForDataset_JEDI( tmpLog.debug(f"skipShortInput={skipShortInput} skipShortOutput={skip_short_output} inputPreStaging={inputPreStaging} order_by={order_by}") # return value for failure diagMap = {"errMsg": "", "nChunksForScout": nChunksForScout, "nActivatedPending": 0, "isRunningTask": False} - failedRet = False, 0, None, diagMap - harmlessRet = None, 0, None, diagMap + # failedRet = False, 0, None, diagMap + # harmlessRet = None, 0, None, diagMap + failedRet = {"ret_val": False, "missingFileList": 0, "numUniqueLfn": None, "diagMap": diagMap} + harmlessRet = {"ret_val": None, "missingFileList": 0, "numUniqueLfn": None, "diagMap": diagMap} regStart = naive_utcnow() # mutable fake_mutable_for_skip_short_output = False @@ -632,7 +634,8 @@ def insertFilesForDataset_JEDI( nEventsLost = 0 nEventsExist = 0 stagingLB = set() - retVal = None, missingFileList, None, diagMap + # retVal = None, missingFileList, None, diagMap + retVal = {"ret_val": None, "missingFileList": missingFileList, "numUniqueLfn": None, "diagMap": diagMap, "nReady": nReady} # begin transaction self.conn.begin() # check task @@ -741,7 +744,8 @@ def insertFilesForDataset_JEDI( self.cur.execute(sqlCo + 
comment, varMap) resCo = self.cur.fetchone() numUniqueLfn = resCo[0] - retVal = True, missingFileList, numUniqueLfn, diagMap + # retVal = True, missingFileList, numUniqueLfn, diagMap + retVal = {"ret_val": True, "missingFileList": missingFileList, "numUniqueLfn": numUniqueLfn, "diagMap": diagMap, "nReady": nReady} else: oldDsStatus, nFilesUnprocessed, dsStateInDB, nFilesToUseDS, nFilesUsedInDS = resDs tmpLog.debug(f"ds.state={dsStateInDB} in DB") @@ -1258,7 +1262,8 @@ def insertFilesForDataset_JEDI( if nFilesUnprocessed not in [0, None]: diagMap["nActivatedPending"] += nFilesUnprocessed # set return value - retVal = True, missingFileList, numUniqueLfn, diagMap + # retVal = True, missingFileList, numUniqueLfn, diagMap + retVal = {"ret_val": True, "missingFileList": missingFileList, "numUniqueLfn": numUniqueLfn, "diagMap": diagMap, "nReady": nReady} # fix secondary files in staging if inputPreStaging and datasetSpec.isSeqNumber(): get_task_utils_module(self).fix_associated_files_in_staging(datasetSpec.jediTaskID, secondary_id=datasetSpec.datasetID) diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py new file mode 100644 index 000000000..31f06238a --- /dev/null +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -0,0 +1,832 @@ +import json +import os +import re +import sys +from datetime import datetime, timedelta + +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandautils.PandaUtils import get_sql_IN_bind_variables, naive_utcnow + +from pandaserver.config import panda_config +from pandaserver.srvcore import CoreUtils +from pandaserver.taskbuffer import ErrorCode, JobUtils +from pandaserver.taskbuffer.db_proxy_mods.base_module import BaseModule, varNUMBER +from pandaserver.taskbuffer.db_proxy_mods.entity_module import get_entity_module +from pandaserver.taskbuffer.JobSpec import JobSpec +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + 
WFDataStatus, + WFStepSpec, + WFStepStatus, + WorkflowSpec, + WorkflowStatus, +) + + +# Module class to define methods related to workflow +class WorkflowModule(BaseModule): + # constructor + def __init__(self, log_stream: LogWrapper): + super().__init__(log_stream) + + def get_workflow(self, workflow_id: int) -> WorkflowSpec | None: + """ + Retrieve a workflow specification by its ID + + Args: + workflow_id (int): ID of the workflow to retrieve + + Returns: + WorkflowSpec | None: The workflow specification if found, otherwise None + """ + comment = " /* DBProxy.get_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") + sql = f"SELECT {WorkflowSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflows " f"WHERE workflow_id=:workflow_id " + var_map = {":workflow_id": workflow_id} + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + if len(res_list) > 1: + tmp_log.error("more than one workflows; unexpected") + else: + for res in res_list: + workflow_spec = WorkflowSpec() + workflow_spec.pack(res) + return workflow_spec + else: + tmp_log.warning("no workflow found; skipped") + return None + + def get_workflow_step(self, step_id: int) -> WFStepSpec | None: + """ + Retrieve a workflow step specification by its ID + + Args: + step_id (int): ID of the workflow step to retrieve + + Returns: + WFStepSpec | None: The workflow step specification if found, otherwise None + """ + comment = " /* DBProxy.get_workflow_step */" + tmp_log = self.create_tagged_logger(comment, f"step_id={step_id}") + sql = f"SELECT {WFStepSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_steps " f"WHERE step_id=:step_id " + var_map = {":step_id": step_id} + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + if len(res_list) > 1: + tmp_log.error("more than one steps; unexpected") + else: + for res in res_list: + step_spec = WFStepSpec() + 
step_spec.pack(res) + return step_spec + else: + tmp_log.warning("no step found; skipped") + return None + + def get_workflow_data(self, data_id: int) -> WFDataSpec | None: + """ + Retrieve a workflow data specification by its ID + + Args: + data_id (int): ID of the workflow data to retrieve + + Returns: + WFDataSpec | None: The workflow data specification if found, otherwise None + """ + comment = " /* DBProxy.get_workflow_data */" + tmp_log = self.create_tagged_logger(comment, f"data_id={data_id}") + sql = f"SELECT {WFDataSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_data " f"WHERE data_id=:data_id " + var_map = {":data_id": data_id} + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + if len(res_list) > 1: + tmp_log.error("more than one data; unexpected") + else: + for res in res_list: + data_spec = WFDataSpec() + data_spec.pack(res) + return data_spec + else: + tmp_log.warning("no data found; skipped") + return None + + def get_workflow_data_by_name(self, name: str, workflow_id: int | None) -> WFDataSpec | None: + """ + Retrieve a workflow data specification by its name and workflow ID + + Args: + name (str): Name of the workflow data to retrieve + workflow_id (int | None): ID of the workflow to which the data belongs (optional) + + Returns: + WFDataSpec | None: The workflow data specification if found, otherwise None + """ + comment = " /* DBProxy.get_workflow_data_by_name */" + tmp_log = self.create_tagged_logger(comment, f"name={name}, workflow_id={workflow_id}") + sql = f"SELECT {WFDataSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_data " f"WHERE name=:name " + var_map = {":name": name} + if workflow_id is not None: + sql += "AND workflow_id=:workflow_id " + var_map[":workflow_id"] = workflow_id + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + if len(res_list) > 1: + tmp_log.error("more than one data; unexpected") + 
return None + else: + for res in res_list: + data_spec = WFDataSpec() + data_spec.pack(res) + return data_spec + else: + tmp_log.warning("no data found; skipped") + return None + + def get_steps_of_workflow(self, workflow_id: int, status_filter_list: list | None = None, status_exclusion_list: list | None = None) -> list[WFStepSpec]: + """ + Retrieve all workflow steps for a given workflow ID + + Args: + workflow_id (int): ID of the workflow to retrieve steps for + status_filter_list (list | None): List of statuses to filter the steps by (optional) + status_exclusion_list (list | None): List of statuses to exclude the steps by (optional) + + Returns: + list[WFStepSpec]: List of workflow step specifications + """ + comment = " /* DBProxy.get_steps_of_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") + sql = f"SELECT {WFStepSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_steps " f"WHERE workflow_id=:workflow_id " + var_map = {":workflow_id": workflow_id} + if status_filter_list: + status_var_names_str, status_var_map = get_sql_IN_bind_variables(status_filter_list, prefix=":status") + sql += f"AND status IN ({status_var_names_str}) " + var_map.update(status_var_map) + if status_exclusion_list: + antistatus_var_names_str, antistatus_var_map = get_sql_IN_bind_variables(status_exclusion_list, prefix=":antistatus") + sql += f"AND status NOT IN ({antistatus_var_names_str}) " + var_map.update(antistatus_var_map) + sql += "ORDER BY step_id " + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + step_specs = [] + for res in res_list: + step_spec = WFStepSpec() + step_spec.pack(res) + step_specs.append(step_spec) + return step_specs + else: + tmp_log.warning("no steps found; skipped") + return [] + + def get_data_of_workflow( + self, workflow_id: int, status_filter_list: list | None = None, status_exclusion_list: list | None = None, type_filter_list: list | None = None + ) 
-> list[WFDataSpec]: + """ + Retrieve all workflow data for a given workflow ID + + Args: + workflow_id (int): ID of the workflow to retrieve data for + status_filter_list (list | None): List of statuses to filter the data by (optional) + status_exclusion_list (list | None): List of statuses to exclude the data by (optional) + type_filter_list (list | None): List of types to filter the data by (optional) + + Returns: + list[WFDataSpec]: List of workflow data specifications + """ + comment = " /* DBProxy.get_data_of_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") + sql = f"SELECT {WFDataSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_data " f"WHERE workflow_id=:workflow_id " + var_map = {":workflow_id": workflow_id} + if status_filter_list: + status_var_names_str, status_var_map = get_sql_IN_bind_variables(status_filter_list, prefix=":status") + sql += f"AND status IN ({status_var_names_str}) " + var_map.update(status_var_map) + if status_exclusion_list: + antistatus_var_names_str, antistatus_var_map = get_sql_IN_bind_variables(status_exclusion_list, prefix=":antistatus") + sql += f"AND status NOT IN ({antistatus_var_names_str}) " + var_map.update(antistatus_var_map) + if type_filter_list: + type_var_names_str, type_var_map = get_sql_IN_bind_variables(type_filter_list, prefix=":type") + sql += f"AND type IN ({type_var_names_str}) " + var_map.update(type_var_map) + sql += "ORDER BY data_id " + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + data_specs = [] + for res in res_list: + data_spec = WFDataSpec() + data_spec.pack(res) + data_specs.append(data_spec) + return data_specs + else: + tmp_log.warning("no data found; skipped") + return [] + + def query_workflows( + self, status_filter_list: list | None = None, status_exclusion_list: list | None = None, check_interval_sec: int = 300 + ) -> list[WorkflowSpec]: + """ + Retrieve list of workflows with 
optional status filtering + + Args: + status_filter_list (list | None): List of statuses to filter the workflows by (optional) + status_exclusion_list (list | None): List of statuses to exclude the workflows by (optional) + check_interval_sec (int): Time in seconds to wait between checks (default: 300) + + Returns: + list[WorkflowSpec]: List of workflow specifications + """ + comment = " /* DBProxy.query_workflows */" + tmp_log = self.create_tagged_logger(comment, "query_workflows") + tmp_log.debug(f"start, status_filter_list={status_filter_list} status_exclusion_list={status_exclusion_list} check_interval_sec={check_interval_sec}") + sql = f"SELECT {WorkflowSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflows " f"WHERE (check_time IS NULL OR check_time<:check_time) " + now_time = naive_utcnow() + var_map = {":check_time": now_time - timedelta(seconds=check_interval_sec)} + if status_filter_list: + status_var_names_str, status_var_map = get_sql_IN_bind_variables(status_filter_list, prefix=":status") + sql += f"AND status IN ({status_var_names_str}) " + var_map.update(status_var_map) + if status_exclusion_list: + antistatus_var_names_str, antistatus_var_map = get_sql_IN_bind_variables(status_exclusion_list, prefix=":antistatus") + sql += f"AND status NOT IN ({antistatus_var_names_str}) " + var_map.update(antistatus_var_map) + sql += "ORDER BY check_time, creation_time " + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + workflow_specs = [] + for res in res_list: + workflow_spec = WorkflowSpec() + workflow_spec.pack(res) + workflow_specs.append(workflow_spec) + tmp_log.debug(f"got {len(workflow_specs)} workflows") + return workflow_specs + else: + tmp_log.warning("no workflows found; skipped") + return [] + + def lock_workflow(self, workflow_id: int, locked_by: str, lock_expiration_sec: int = 120) -> bool | None: + """ + Lock a workflow to prevent concurrent modifications + + Args: + workflow_id (int): 
ID of the workflow to lock + locked_by (str): Identifier of the entity locking the workflow + lock_expiration_sec (int): Time in seconds after which the lock expires + + Returns: + bool | None: True if the lock was acquired, False if not, None if an error occurred + """ + comment = " /* DBProxy.lock_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + now_time = naive_utcnow() + sql_lock = ( + f"UPDATE {panda_config.schemaJEDI}.workflows " + "SET locked_by=:locked_by, lock_time=:lock_time " + "WHERE workflow_id=:workflow_id " + "AND (locked_by IS NULL OR locked_by=:locked_by OR lock_time<:min_lock_time)" + ) + var_map = { + ":locked_by": locked_by, + ":lock_time": now_time, + ":workflow_id": workflow_id, + ":min_lock_time": now_time - timedelta(seconds=lock_expiration_sec), + } + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_lock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to lock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one workflow updated to lock; unexpected") + elif row_count == 0: + # no row updated; did not get the lock + tmp_log.debug(f"did not get lock; skipped") + return False + elif row_count == 1: + # successfully locked the workflow + tmp_log.debug(f"got lock") + return True + except Exception as e: + tmp_log.error(f"failed to lock workflow: {e}") + + def unlock_workflow(self, workflow_id: int, locked_by: str) -> bool | None: + """ + Unlock a workflow to allow modifications + + Args: + workflow_id (int): ID of the workflow to unlock + locked_by (str): Identifier of the entity unlocking the workflow + + Returns: + bool | None: True if the unlock was successful, False if not, None if an error occurred + """ + comment = " /* DBProxy.unlock_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}, locked_by={locked_by}") + 
tmp_log.debug("start") + try: + sql_unlock = ( + f"UPDATE {panda_config.schemaJEDI}.workflows " "SET locked_by=NULL, lock_time=NULL " "WHERE workflow_id=:workflow_id AND locked_by=:locked_by" + ) + var_map = {":workflow_id": workflow_id, ":locked_by": locked_by} + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_unlock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to unlock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one workflow updated to unlock; unexpected") + elif row_count == 0: + # no row updated; did not get the unlock + tmp_log.debug(f"no workflow updated to unlock; skipped") + return False + elif row_count == 1: + # successfully unlocked the workflow + tmp_log.debug(f"released lock") + return True + except Exception as e: + tmp_log.error(f"failed to unlock workflow: {e}") + + def lock_workflow_step(self, step_id: int, locked_by: str, lock_expiration_sec: int = 120) -> bool | None: + """ + Lock a workflow step to prevent concurrent modifications + + Args: + step_id (int): ID of the workflow step to lock + locked_by (str): Identifier of the entity locking the workflow step + lock_expiration_sec (int): Time in seconds after which the lock expires + + Returns: + bool | None: True if the lock was acquired, False if not, None if an error occurred + """ + comment = " /* DBProxy.lock_workflow_step */" + tmp_log = self.create_tagged_logger(comment, f"step_id={step_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + now_time = naive_utcnow() + sql_lock = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_steps " + "SET locked_by=:locked_by, lock_time=:lock_time " + "WHERE step_id=:step_id " + "AND (locked_by IS NULL OR locked_by=:locked_by OR lock_time<:min_lock_time)" + ) + var_map = { + ":locked_by": locked_by, + ":lock_time": now_time, + ":step_id": step_id, + ":min_lock_time": now_time - timedelta(seconds=lock_expiration_sec), + } + with 
self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_lock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to lock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one step updated to lock; unexpected") + elif row_count == 0: + # no row updated; did not get the lock + tmp_log.debug(f"did not get lock; skipped") + return False + elif row_count == 1: + # successfully locked the workflow step + tmp_log.debug(f"got lock") + return True + except Exception as e: + tmp_log.error(f"failed to lock workflow step: {e}") + + def unlock_workflow_step(self, step_id: int, locked_by: str) -> bool | None: + """ + Unlock a workflow step to allow modifications + + Args: + step_id (int): ID of the workflow step to unlock + locked_by (str): Identifier of the entity unlocking the workflow step + + Returns: + bool | None: True if the unlock was successful, False if not, None if an error occurred + """ + comment = " /* DBProxy.unlock_workflow_step */" + tmp_log = self.create_tagged_logger(comment, f"step_id={step_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + sql_unlock = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_steps " "SET locked_by=NULL, lock_time=NULL " "WHERE step_id=:step_id AND locked_by=:locked_by" + ) + var_map = {":step_id": step_id, ":locked_by": locked_by} + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_unlock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to unlock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one step updated to unlock; unexpected") + elif row_count == 0: + # no row updated; did not get the unlock + tmp_log.debug(f"no step updated to unlock; skipped") + return False + elif row_count == 1: + # successfully unlocked the workflow step + tmp_log.debug(f"released lock") + return True + except Exception as e: + tmp_log.error(f"failed to unlock workflow step: 
{e}") + + def lock_workflow_data(self, data_id: int, locked_by: str, lock_expiration_sec: int = 120) -> bool | None: + """ + Lock a workflow data to prevent concurrent modifications + + Args: + data_id (int): ID of the workflow data to lock + locked_by (str): Identifier of the entity locking the workflow data + lock_expiration_sec (int): Time in seconds after which the lock expires + + Returns: + bool | None: True if the lock was acquired, False if not, None if an error occurred + """ + comment = " /* DBProxy.lock_workflow_data */" + tmp_log = self.create_tagged_logger(comment, f"data_id={data_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + now_time = naive_utcnow() + sql_lock = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_data " + "SET locked_by=:locked_by, lock_time=:lock_time " + "WHERE data_id=:data_id " + "AND (locked_by IS NULL OR locked_by=:locked_by OR lock_time<:min_lock_time)" + ) + var_map = { + ":locked_by": locked_by, + ":lock_time": now_time, + ":data_id": data_id, + ":min_lock_time": now_time - timedelta(seconds=lock_expiration_sec), + } + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_lock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to lock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one data updated to lock; unexpected") + elif row_count == 0: + # no row updated; did not get the lock + tmp_log.debug(f"did not get lock; skipped") + return False + elif row_count == 1: + # successfully locked the workflow data + tmp_log.debug(f"got lock") + return True + except Exception as e: + tmp_log.error(f"failed to lock workflow data: {e}") + + def unlock_workflow_data(self, data_id: int, locked_by: str) -> bool | None: + """ + Unlock a workflow data to allow modifications + + Args: + data_id (int): ID of the workflow data to unlock + locked_by (str): Identifier of the entity unlocking the workflow data + + Returns: + bool | None: True if 
the unlock was successful, False if not, None if an error occurred + """ + comment = " /* DBProxy.unlock_workflow_data */" + tmp_log = self.create_tagged_logger(comment, f"data_id={data_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + sql_unlock = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_data " "SET locked_by=NULL, lock_time=NULL " "WHERE data_id=:data_id AND locked_by=:locked_by" + ) + var_map = {":data_id": data_id, ":locked_by": locked_by} + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_unlock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to unlock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one data updated to unlock; unexpected") + elif row_count == 0: + # no row updated; did not get the unlock + tmp_log.debug(f"no data updated to unlock; skipped") + return False + elif row_count == 1: + # successfully unlocked the workflow data + tmp_log.debug(f"released lock") + return True + except Exception as e: + tmp_log.error(f"failed to unlock workflow data: {e}") + + def insert_workflow(self, workflow_spec: WorkflowSpec) -> int | None: + """ + Insert a new workflow specification into the database + + Args: + workflow_spec (WorkflowSpec): The workflow specification to insert + + Returns: + int | None: The ID of the inserted workflow if successful, otherwise None + """ + comment = " /* DBProxy.insert_workflow */" + tmp_log = self.create_tagged_logger(comment, "") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to insert workflow + workflow_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflows ({workflow_spec.columnNames()}) " + f"{workflow_spec.bindValuesExpression()} " + f"RETURNING workflow_id INTO :new_workflow_id " + ) + var_map = workflow_spec.valuesMap(useSeq=True) + var_map[":new_workflow_id"] = self.cur.var(varNUMBER) + 
self.cur.execute(sql_insert + comment, var_map) + workflow_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_workflow_id"]))) + tmp_log.debug(f"inserted workflow_id={workflow_id}") + return workflow_id + except Exception: + return None + + def insert_workflow_step(self, step_spec: WFStepSpec) -> int | None: + """ + Insert a new workflow step specification into the database + + Args: + step_spec (WFStepSpec): The workflow step specification to insert + + Returns: + int | None: The ID of the inserted workflow step if successful, otherwise None + """ + comment = " /* DBProxy.insert_workflow_step */" + tmp_log = self.create_tagged_logger(comment, "") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to insert workflow step + step_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflow_steps ({step_spec.columnNames()}) " + f"{step_spec.bindValuesExpression()} " + f"RETURNING step_id INTO :new_step_id " + ) + var_map = step_spec.valuesMap(useSeq=True) + var_map[":new_step_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + step_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_step_id"]))) + tmp_log.debug(f"inserted step_id={step_id}") + return step_id + except Exception: + return None + + def insert_workflow_data(self, data_spec: WFDataSpec) -> int | None: + """ + Insert a new workflow data specification into the database + + Args: + data_spec (WFDataSpec): The workflow data specification to insert + + Returns: + int | None: The ID of the inserted workflow data if successful, otherwise None + """ + comment = " /* DBProxy.insert_workflow_data */" + tmp_log = self.create_tagged_logger(comment, "") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to insert workflow data + data_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO 
{panda_config.schemaJEDI}.workflow_data ({data_spec.columnNames()}) " + f"{data_spec.bindValuesExpression()} " + f"RETURNING data_id INTO :new_data_id " + ) + var_map = data_spec.valuesMap(useSeq=True) + var_map[":new_data_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + data_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_data_id"]))) + tmp_log.debug(f"inserted data_id={data_id}") + return data_id + except Exception: + return None + + def update_workflow(self, workflow_spec: WorkflowSpec) -> WorkflowSpec | None: + """ + Update a workflow specification in the database + + Args: + workflow_spec (WorkflowSpec): The workflow specification to update + + Returns: + WorkflowSpec | None: The updated workflow specification if successful, otherwise None + """ + comment = " /* DBProxy.update_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to update workflow + workflow_spec.modification_time = naive_utcnow() + sql_update = ( + f"UPDATE {panda_config.schemaJEDI}.workflows " f"SET {workflow_spec.bindUpdateChangesExpression()} " "WHERE workflow_id=:workflow_id " + ) + var_map = workflow_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":workflow_id"] = workflow_spec.workflow_id + cur.execute(sql_update + comment, var_map) + tmp_log.debug(f"updated {workflow_spec.bindUpdateChangesExpression()}") + return workflow_spec + except Exception: + return None + + def update_workflow_step(self, step_spec: WFStepSpec) -> WFStepSpec | None: + """ + Update a workflow step specification in the database + + Args: + step_spec (WFStepSpec): The workflow step specification to update + + Returns: + WFStepSpec | None: The updated workflow step specification if successful, otherwise None + """ + comment = " /* DBProxy.update_workflow_step */" + tmp_log = self.create_tagged_logger(comment, 
f"step_id={step_spec.step_id}") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to update workflow step + step_spec.modification_time = naive_utcnow() + sql_update = f"UPDATE {panda_config.schemaJEDI}.workflow_steps " f"SET {step_spec.bindUpdateChangesExpression()} " "WHERE step_id=:step_id " + var_map = step_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":step_id"] = step_spec.step_id + cur.execute(sql_update + comment, var_map) + tmp_log.debug(f"updated {step_spec.bindUpdateChangesExpression()}") + return step_spec + except Exception: + return None + + def update_workflow_data(self, data_spec: WFDataSpec) -> WFDataSpec | None: + """ + Update a workflow data specification in the database + + Args: + data_spec (WFDataSpec): The workflow data specification to update + + Returns: + WFDataSpec | None: The updated workflow data specification if successful, otherwise None + """ + comment = " /* DBProxy.update_workflow_data */" + tmp_log = self.create_tagged_logger(comment, f"data_id={data_spec.data_id}") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to update workflow data + data_spec.modification_time = naive_utcnow() + sql_update = f"UPDATE {panda_config.schemaJEDI}.workflow_data " f"SET {data_spec.bindUpdateChangesExpression()} " "WHERE data_id=:data_id " + var_map = data_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":data_id"] = data_spec.data_id + cur.execute(sql_update + comment, var_map) + tmp_log.debug(f"updated {data_spec.bindUpdateChangesExpression()}") + return data_spec + except Exception: + return None + + def upsert_workflow_entities( + self, + workflow_id: int | None, + actions_dict: dict | None = None, + workflow_spec: WorkflowSpec | None = None, + step_specs: list[WFStepSpec] | None = None, + data_specs: list[WFDataSpec] | None = None, + ) -> dict | None: + """ + Update or insert (if not existing) steps and data associated with a workflow 
within a transaction + + Args: + workflow_id (int | None): ID of the workflow to update, or None if to insert + actions_dict (dict | None): Dictionary of actions (insert, update, or None) to perform on the entities (workflow, steps, data), e.g. {"workflow": None, "steps": "insert", "data": "update"} + workflow_spec (WorkflowSpec|None): The workflow specification to update or insert + step_specs (list[WFStepSpec]|None): List of workflow step specifications to update or insert + data_specs (list[WFDataSpec]|None): List of workflow data specifications to update or insert + + Returns: + dict | None: Dictionary containing the number of steps and data upserted, or None if an error occurred + """ + comment = " /* DBProxy.upsert_workflow_entities */" + # Determine actions of each entity + action_of_workflow = None + action_of_steps = None + action_of_data = None + if actions_dict: + if (tmp_action_of_workflow := actions_dict.get("workflow")) and workflow_spec: + if tmp_action_of_workflow == "insert" and workflow_id is None: + action_of_workflow = "insert" + elif tmp_action_of_workflow == "update" and workflow_id is not None and workflow_spec.workflow_id == workflow_id: + action_of_workflow = "update" + action_of_steps = actions_dict.get("steps") if (workflow_id and step_specs) else None + action_of_data = actions_dict.get("data") if (workflow_id and data_specs) else None + actions_dict = { + "workflow": action_of_workflow, + "steps": action_of_steps, + "data": action_of_data, + } + # log + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") + tmp_log.debug(f"start, actions={actions_dict}") + # skip if no action specified + if not any(actions_dict.values()): + tmp_log.warning("no action specified; skipped") + return None + try: + n_steps_upserted = 0 + n_data_upserted = 0 + with self.transaction(tmp_log=tmp_log) as (cur, _): + # action for data + if action_of_data == "insert": + for data_spec in data_specs: + data_spec.creation_time = naive_utcnow() + 
sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflow_data ({data_spec.columnNames()}) " + f"{data_spec.bindValuesExpression()} " + f"RETURNING data_id INTO :new_data_id " + ) + var_map = data_spec.valuesMap(useSeq=True) + var_map[":new_data_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + data_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_data_id"]))) + data_spec.data_id = data_id + n_data_upserted += 1 + tmp_log.debug(f"inserted a data workflow_id={workflow_id} data_id={data_id}") + elif action_of_data == "update": + for data_spec in data_specs: + data_spec.modification_time = naive_utcnow() + sql_update = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_data " f"SET {data_spec.bindUpdateChangesExpression()} " "WHERE data_id=:data_id " + ) + var_map = data_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":data_id"] = data_spec.data_id + self.cur.execute(sql_update + comment, var_map) + n_data_upserted += 1 + tmp_log.debug(f"updated a data workflow_id={workflow_id} data_id={data_spec.data_id}") + # action for steps + if action_of_steps == "insert": + for step_spec in step_specs: + step_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflow_steps ({step_spec.columnNames()}) " + f"{step_spec.bindValuesExpression()} " + f"RETURNING step_id INTO :new_step_id " + ) + var_map = step_spec.valuesMap(useSeq=True) + var_map[":new_step_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + step_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_step_id"]))) + step_spec.step_id = step_id + n_steps_upserted += 1 + tmp_log.debug(f"inserted a step workflow_id={workflow_id} step_id={step_id}") + elif action_of_steps == "update": + for step_spec in step_specs: + step_spec.modification_time = naive_utcnow() + sql_update = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_steps " f"SET 
{step_spec.bindUpdateChangesExpression()} " "WHERE step_id=:step_id " + ) + var_map = step_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":step_id"] = step_spec.step_id + self.cur.execute(sql_update + comment, var_map) + n_steps_upserted += 1 + tmp_log.debug(f"updated a step workflow_id={workflow_id} step_id={step_spec.step_id}") + # action for workflow + if action_of_workflow == "insert": + workflow_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflows ({workflow_spec.columnNames()}) " + f"{workflow_spec.bindValuesExpression()} " + f"RETURNING workflow_id INTO :new_workflow_id " + ) + var_map = workflow_spec.valuesMap(useSeq=True) + var_map[":new_workflow_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + workflow_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_workflow_id"]))) + workflow_spec.workflow_id = workflow_id + tmp_log.debug(f"inserted a workflow workflow_id={workflow_id}") + elif action_of_workflow == "update": + workflow_spec.modification_time = naive_utcnow() + sql_update = ( + f"UPDATE {panda_config.schemaJEDI}.workflows " f"SET {workflow_spec.bindUpdateChangesExpression()} " "WHERE workflow_id=:workflow_id " + ) + var_map = workflow_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":workflow_id"] = workflow_spec.workflow_id + self.cur.execute(sql_update + comment, var_map) + tmp_log.debug(f"updated a workflow workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("actions completed") + # Summary + tmp_log.debug(f"done, actions={actions_dict}, upserted workflow_id={workflow_id} with {n_steps_upserted} steps and {n_data_upserted} data") + return {"workflow_id": workflow_id, "steps": n_steps_upserted, "data": n_data_upserted} + except Exception: + return None diff --git a/pandaserver/taskbuffer/task_split_rules.py b/pandaserver/taskbuffer/task_split_rules.py index c1243b4ae..2bc715407 100644 --- 
a/pandaserver/taskbuffer/task_split_rules.py +++ b/pandaserver/taskbuffer/task_split_rules.py @@ -99,6 +99,7 @@ "useExhausted": "UX", "useZipToPin": "UZ", "writeInputToFile": "WF", + "workflowHoldup": "WH", "waitInput": "WI", "maxAttemptES": "XA", "decAttOnFailedES": "XF", diff --git a/pandaserver/taskbuffer/workflow_processor.py b/pandaserver/taskbuffer/workflow_processor.py index c3861d277..00a0768fd 100644 --- a/pandaserver/taskbuffer/workflow_processor.py +++ b/pandaserver/taskbuffer/workflow_processor.py @@ -21,6 +21,9 @@ _logger = PandaLogger().getLogger("workflow_processor") +SUPPORTED_WORKFLOW_LANGUAGES = ["cwl", "snakemake"] + + # process workflow class WorkflowProcessor(object): # constructor @@ -170,34 +173,19 @@ def core_exec(sandbox_url, log_token, dump_workflow, ops_file, user_name, test_m if is_OK: tmpLog.info("parse workflow") workflow_name = None - if ops["data"]["language"] == "cwl": - workflow_name = ops["data"].get("workflow_name") - nodes, root_in = pcwl_utils.parse_workflow_file(ops["data"]["workflowSpecFile"], tmpLog) - with open(ops["data"]["workflowInputFile"]) as workflow_input: - yaml = YAML(typ="safe", pure=True) - data = yaml.load(workflow_input) - # noinspection DuplicatedCode - s_id, t_nodes, nodes = pcwl_utils.resolve_nodes(nodes, root_in, data, 0, set(), ops["data"]["outDS"], tmpLog) - workflow_utils.set_workflow_outputs(nodes) - id_node_map = workflow_utils.get_node_id_map(nodes) - [node.resolve_params(ops["data"]["taskParams"], id_node_map) for node in nodes] - dump_str = "the description was internally converted as follows\n" + workflow_utils.dump_nodes(nodes) - tmpLog.info(dump_str) - for node in nodes: - s_check, o_check = node.verify() - tmp_str = f"Verification failure in ID:{node.id} {o_check}" - if not s_check: - tmpLog.error(tmp_str) - dump_str += tmp_str - dump_str += "\n" - is_fatal = True - is_OK = False - elif ops["data"]["language"] == "snakemake": - parser = Parser(ops["data"]["workflowSpecFile"], logger=tmpLog) - 
nodes, root_in = parser.parse_nodes() - data = dict() - # noinspection DuplicatedCode - s_id, t_nodes, nodes = pcwl_utils.resolve_nodes(nodes, root_in, data, 0, set(), ops["data"]["outDS"], tmpLog) + if (wf_lang := ops["data"]["language"]) in SUPPORTED_WORKFLOW_LANGUAGES: + if wf_lang == "cwl": + workflow_name = ops["data"].get("workflow_name") + nodes, root_in = pcwl_utils.parse_workflow_file(ops["data"]["workflowSpecFile"], tmpLog) + with open(ops["data"]["workflowInputFile"]) as workflow_input: + yaml = YAML(typ="safe", pure=True) + data = yaml.load(workflow_input) + elif wf_lang == "snakemake": + parser = Parser(ops["data"]["workflowSpecFile"], logger=tmpLog) + nodes, root_in = parser.parse_nodes() + data = dict() + # resolve nodes + s_id, t_nodes, nodes = workflow_utils.resolve_nodes(nodes, root_in, data, 0, set(), ops["data"]["outDS"], tmpLog) workflow_utils.set_workflow_outputs(nodes) id_node_map = workflow_utils.get_node_id_map(nodes) [node.resolve_params(ops["data"]["taskParams"], id_node_map) for node in nodes] diff --git a/pandaserver/workflow/data_handler_plugins/base_data_handler.py b/pandaserver/workflow/data_handler_plugins/base_data_handler.py new file mode 100644 index 000000000..bd2f3a19e --- /dev/null +++ b/pandaserver/workflow/data_handler_plugins/base_data_handler.py @@ -0,0 +1,48 @@ +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFDataTargetCheckResult, + WFDataType, + WFStepSpec, + WFStepStatus, + WFStepTargetCheckResult, + WFStepTargetSubmitResult, + WFStepType, + WorkflowSpec, + WorkflowStatus, +) + + +class BaseDataHandler: + """ + Base class for data handlers in the workflow system. + This class provides a common interface and some utility methods for data handlers. + """ + + def __init__(self, task_buffer, ddm_if, *args, **kwargs): + """ + Initialize the step handler with necessary parameters. + + Args: + task_buffer: The task buffer interface to interact with the task database. 
+ ddm_if: The DDM interface to interact with the DDM system. + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + """ + self.tbif = task_buffer + self.ddm_if = ddm_if + + def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: + """ + Check the status of the data target. + This method should be implemented by subclasses to handle the specifics of data target status checking. + This method should NOT modify data_spec. Any update information should be stored in the WFStepTargetCheckResult returned instead. + + Args: + data_spec (WFDataSpec): The data specification to check. + **kwargs: Additional keyword arguments. + + Returns: + WFDataTargetCheckResult: The result of the target check. + """ + raise NotImplementedError("Subclasses must implement this method.") diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py new file mode 100644 index 000000000..70500cb46 --- /dev/null +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -0,0 +1,100 @@ +import json +import traceback +import uuid + +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger + +from pandaserver.workflow.data_handler_plugins.base_data_handler import BaseDataHandler +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFDataTargetCheckResult, + WFDataTargetCheckStatus, + WFDataType, + WFStepSpec, + WFStepStatus, + WFStepType, + WorkflowSpec, + WorkflowStatus, +) + +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + + +class DDMCollectionDIDType: + """ + Data Identifier Types for DDM Collections + """ + + DATASET = "DATASET" + CONTAINER = "CONTAINER" + + +class DDMCollectionState: + """ + States for DDM Collections + """ + + open = "open" + closed = "closed" + missing = "missing" + + +class 
DDMCollectionDataHandler(BaseDataHandler): + """ + Handler for DDM collection data in the workflow. + This class is responsible for managing the DDM collection data within a workflow. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the data handler with necessary parameters. + """ + # Initialize base class or any required modules here + super().__init__(*args, **kwargs) + self.plugin_flavor = "ddm_collection" + + def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: + """ + Check the status of the DDM collection data target. + This method should be implemented to handle the specifics of DDM collection data status checking. + + Args: + data_spec (WFDataSpec): The data specification containing details about the data to be checked. + **kwargs: Additional keyword arguments that may be required for checking. + + Returns: + WFDataTargetCheckResult: An object containing the result of the check, including success status, current data status, and message. 
+ """ + tmp_log = LogWrapper(logger, f"check_target workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + # Initialize + check_result = WFDataTargetCheckResult() + # Check data flavor + if data_spec.flavor != self.plugin_flavor: + tmp_log.warning(f"flavor={data_spec.flavor} not {self.plugin_flavor}; skipped") + check_result.message = f"flavor not {self.plugin_flavor}; skipped" + return check_result + # Check DDM collection status + collection = data_spec.target_id + collection_meta = self.ddm_if.get_dataset_metadata(collection, ignore_missing=True) + if collection_meta is None: + check_result.success = False + check_result.message = f"Failed to get metadata for collection {collection}" + tmp_log.error(f"{check_result.message}") + return check_result + match collection_meta.get("state"): + case DDMCollectionState.missing: + check_result.check_status = WFDataTargetCheckStatus.nonexist + case DDMCollectionState.open: + if collection_meta.get("length", 0) == 0: + check_result.check_status = WFDataTargetCheckStatus.insuffi + else: + check_result.check_status = WFDataTargetCheckStatus.suffice + case DDMCollectionState.closed: + check_result.check_status = WFDataTargetCheckStatus.complete + check_result.metadata = collection_meta + check_result.success = True + tmp_log.info(f"Got collection {collection} check_status={check_result.check_status}") + return check_result diff --git a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py new file mode 100644 index 000000000..a80c96857 --- /dev/null +++ b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py @@ -0,0 +1,137 @@ +import json +import traceback +import uuid + +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger + +from pandaserver.workflow.data_handler_plugins.base_data_handler import BaseDataHandler +from 
pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFDataTargetCheckResult, + WFDataTargetCheckStatus, + WFDataType, + WFStepSpec, + WFStepStatus, + WFStepType, + WorkflowSpec, + WorkflowStatus, +) + +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + + +class DDMCollectionDIDType: + """ + Data Identifier Types for DDM Collections + """ + + DATASET = "DATASET" + CONTAINER = "CONTAINER" + + +class DDMCollectionState: + """ + States for DDM Collections + """ + + open = "open" + closed = "closed" + missing = "missing" + + +class PandaTaskDataHandler(BaseDataHandler): + """ + Handler for PanDA task intermediate/output data in the workflow. + This class is responsible for managing the data generated by PanDA task within a workflow. + The output data from a PanDA task is usually a DDM container, which remains open even after the task completion. + Thus, the handler not only checks the status of the DDM collection to determine if there are files available, but also verifies the step status of source workflow steps to ensure that the data generation process has been completed successfully. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the data handler with necessary parameters. + """ + # Initialize base class or any required modules here + super().__init__(*args, **kwargs) + self.plugin_flavor = "panda_task" + + def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: + """ + Check the status of the PanDA task data target. + This method should be implemented to handle the specifics of PanDA task data status checking. + + Args: + data_spec (WFDataSpec): The data specification containing details about the data to be checked. + **kwargs: Additional keyword arguments that may be required for checking. + + Returns: + WFDataTargetCheckResult: An object containing the result of the check, including success status, current data status, and message. 
+ """ + tmp_log = LogWrapper(logger, f"check_target workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + # Initialize + check_result = WFDataTargetCheckResult() + # Check data flavor + if data_spec.flavor != self.plugin_flavor: + tmp_log.warning(f"flavor={data_spec.flavor} not {self.plugin_flavor}; skipped") + check_result.message = f"flavor not {self.plugin_flavor}; skipped" + return check_result + # Check source step status + if data_spec.source_step_id is not None: + source_step_spec = self.tbif.get_workflow_step(data_spec.source_step_id) + if source_step_spec is None: + check_result.success = False + check_result.message = f"Failed to get source step step_id={data_spec.source_step_id}; skipped" + tmp_log.error(f"{check_result.message}") + return check_result + if source_step_spec.status == WFStepStatus.done: + # Source step done; consider data fully available + check_result.success = True + check_result.check_status = WFDataTargetCheckStatus.complete + tmp_log.info(f"Source step step_id={source_step_spec.step_id} done, data considered fully available; check_status={check_result.check_status}") + return check_result + elif source_step_spec.status in WFStepStatus.final_statuses: + # Source step in final status but not done; skip data availability + check_result.success = True + check_result.message = f"Source step step_id={source_step_spec.step_id} {source_step_spec.status}; skip data availability check" + tmp_log.warning(f"{check_result.message}") + return check_result + else: + tmp_log.info(f"Source step step_id={source_step_spec.step_id} status={source_step_spec.status}; checking data availability") + else: + tmp_log.info("No source step yet; checking data availability") + # Without source step or source step not terminated; check number of files in DDM collections + total_n_files = 0 + none_exist = True + output_types = data_spec.get_parameter("output_types") + if output_types is None: + output_types = [] + for output_type in output_types: + 
collection = f"{data_spec.target_id}_{output_type}" + tmp_stat, tmp_res = self.ddm_if.get_number_of_files(collection) + if tmp_stat is None: + tmp_log.debug(f"Collection {collection} does not exist") + elif not tmp_stat: + # Error in getting number of files + check_result.success = False + check_result.message = f"Failed to get number of files for collection {collection}: {tmp_res}" + tmp_log.error(f"{check_result.message}") + return check_result + else: + none_exist = False + n_files = tmp_res + total_n_files += n_files + tmp_log.debug(f"Got collection {collection} n_files={n_files}") + # Check number of files + if none_exist: + check_result.check_status = WFDataTargetCheckStatus.nonexist + elif total_n_files == 0: + check_result.check_status = WFDataTargetCheckStatus.insuffi + else: + # At least 1 file is sufficient for step input + check_result.check_status = WFDataTargetCheckStatus.suffice + check_result.success = True + tmp_log.info(f"Got total_n_files={total_n_files}; check_status={check_result.check_status}") + return check_result diff --git a/pandaserver/workflow/psnakemake_test.py b/pandaserver/workflow/psnakemake_test.py index 1427a2dc0..2e4cc8cf5 100644 --- a/pandaserver/workflow/psnakemake_test.py +++ b/pandaserver/workflow/psnakemake_test.py @@ -7,11 +7,11 @@ from snakeparser import Parser -from pandaserver.workflow.pcwl_utils import resolve_nodes from pandaserver.workflow.workflow_utils import ( convert_nodes_to_workflow, dump_nodes, get_node_id_map, + resolve_nodes, set_workflow_outputs, ) diff --git a/pandaserver/workflow/step_handler_plugins/base_step_handler.py b/pandaserver/workflow/step_handler_plugins/base_step_handler.py new file mode 100644 index 000000000..c352fd538 --- /dev/null +++ b/pandaserver/workflow/step_handler_plugins/base_step_handler.py @@ -0,0 +1,85 @@ +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFDataType, + WFStepSpec, + WFStepStatus, + WFStepTargetCancelResult, + WFStepTargetCheckResult, + 
WFStepTargetSubmitResult, + WFStepType, + WorkflowSpec, + WorkflowStatus, +) + + +class BaseStepHandler: + """ + Base class for step handlers in the workflow. + This class provides a common interface and some utility methods for step handlers. + """ + + def __init__(self, task_buffer, *args, **kwargs): + """ + Initialize the step handler with necessary parameters. + + Args: + task_buffer: The task buffer interface to interact with the task database. + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + """ + self.tbif = task_buffer + + def submit_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetSubmitResult: + """ + Submit a target for processing the step. + This method should be implemented by subclasses to handle the specifics of target submission. + This method should NOT modify step_spec. Any update information should be stored in the WFStepTargetSubmitResult returned instead. + + Args: + step_spec (WFStepSpec): Specifications of the workflow step whose target is to be submitted. + + Returns: + WFStepTargetSubmitResult: An object containing the result of the submission, including success status, target ID, and message. + + """ + raise NotImplementedError("Subclasses must implement this method.") + + def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResult: + """ + Check the status of the submitted target. + This method should be implemented by subclasses to handle the specifics of target status checking. + This method should NOT modify step_spec. Any update information should be stored in the WFStepTargetCheckResult returned instead. + + Args: + step_spec (WFStepSpec): Specifications of the workflow step to be checked. + + Returns: + WFStepTargetCheckResult: An object containing the result of the check, including success status, current step status, and message. 
+ """ + raise NotImplementedError("Subclasses must implement this method.") + + def on_all_inputs_done(self, step_spec: WFStepSpec, **kwargs) -> None: + """ + Hook method called when all inputs for the step are done. + This method can be overridden by subclasses to perform actions when all inputs are ready. + + Args: + step_spec (WFStepSpec): Specifications of the workflow step whose inputs are done. + **kwargs: Additional keyword arguments. + """ + raise NotImplementedError("Subclasses must implement this method.") + + def cancel_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCancelResult: + """ + Cancel the submitted target. + This method can be overridden by subclasses to handle target cancellation. + + Args: + step_spec (WFStepSpec): Specifications of the workflow step whose target is to be cancelled. + **kwargs: Additional keyword arguments. + + Returns: + WFStepTargetCancelResult: An object containing the result of the cancellation, including success status and message. + """ + raise NotImplementedError("Subclasses must implement this method.") diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py new file mode 100644 index 000000000..022fd08fb --- /dev/null +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -0,0 +1,280 @@ +import traceback + +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger + +from pandaserver.workflow.step_handler_plugins.base_step_handler import BaseStepHandler +from pandaserver.workflow.workflow_base import ( + WFStepSpec, + WFStepStatus, + WFStepTargetCancelResult, + WFStepTargetCheckResult, + WFStepTargetSubmitResult, + WFStepType, +) + +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + + +class PandaTaskStepHandler(BaseStepHandler): + """ + Handler for PanDA task steps in the workflow. 
+ This class is responsible for managing the execution of PanDA tasks within a workflow. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the step handler with necessary parameters. + """ + # Initialize base class or any required modules here + super().__init__(*args, **kwargs) + # plugin flavor + self.plugin_flavor = "panda_task" + + def submit_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetSubmitResult: + """ + Submit a target for processing the PanDA task step. + This method should be implemented to handle the specifics of PanDA task submission. + + Args: + step_spec (WFStepSpec): The workflow step specification containing details about the step to be processed. + **kwargs: Additional keyword arguments that may be required for submission. + + Returns: + WFStepTargetSubmitResult: An object containing the result of the submission, including success status, target ID (task ID), and message. + """ + tmp_log = LogWrapper(logger, f"submit_target workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + # Initialize + submit_result = WFStepTargetSubmitResult() + # Check step flavor + if step_spec.flavor != self.plugin_flavor: + tmp_log.warning(f"flavor={step_spec.flavor} not {self.plugin_flavor}; skipped") + submit_result.message = f"flavor not {self.plugin_flavor}; skipped" + return submit_result + ... 
+ # task_param_map = {} + # task_param_map["taskName"] = step_spec.name + # task_param_map["userName"] = workflow_spec.username + # task_param_map["vo"] = "atlas" + # task_param_map["taskPriority"] = 1000 + # # task_param_map["architecture"] = "i686-slc5-gcc43-opt" + # # task_param_map["transUses"] = "Atlas-17.2.7" + # task_param_map["transUses"] = None + # # task_param_map["transHome"] = "AtlasProduction-17.2.8.10" + # task_param_map["transHome"] = None + # task_param_map["transPath"] = "runGen-00-00-02" + # task_param_map["processingType"] = "reco" + # task_param_map["prodSourceLabel"] = "user" + # # task_param_map["prodSourceLabel"] = "managed" + # task_param_map["taskType"] = "anal" + # # task_param_map["taskType"] = "prod" + # task_param_map["inputPreStaging"] = True + # # task_param_map["panda_data_carousel"] = True + # task_param_map["remove_rule_when_done"] = True + # # task_param_map["workingGroup"] = "AP_Higgs" + # task_param_map["coreCount"] = 1 + # task_param_map["nFiles"] = 1 + # # task_param_map["cloud"] = "US" + # logDatasetName = f"panda.jeditest.log.{uuid.uuid4()}" + # task_param_map["log"] = { + # "dataset": logDatasetName, + # "type": "template", + # "param_type": "log", + # "token": "ATLASDATADISK", + # "value": f"{logDatasetName}.${{SN}}.log.tgz", + # } + # outDatasetName = f"panda.jeditest.NTUP_EMBLLDN.{uuid.uuid4()}" + # task_param_map["jobParameters"] = [ + # { + # "type": "template", + # "param_type": "input", + # "value": "inputAODFile=${IN}", + # "dataset": "mc23_13p6TeV:mc23_13p6TeV.602027.PhH7EG_NLO_LQ_S43_ResProd_lam22_5000_3p5.merge.AOD.e8531_e8528_s4162_s4114_r14622_r14663_tid34033945_00", + # "expand": True, + # }, + # {"type": "template", "param_type": "pseudo_input", "value": "dummy_value", "dataset": "pseudo_dataset"}, + # {"type": "constant", "value": "AMITag=p1462"}, + # { + # "type": "template", + # "param_type": "output", + # "token": "ATLASDATADISK", + # "value": f"outputNTUP_EMBLLDNFile={outDatasetName}.${{SN}}.pool.root", 
+ # "dataset": outDatasetName, + # }, + # ] + try: + # Get step definition + step_definition = step_spec.definition_json_map + user_name = step_definition.get("user_name") + user_dn = step_definition.get("user_dn") + task_param_map = step_definition.get("task_params", {}) + # task_param_map["userName"] = user_name + if not step_spec.get_parameter("all_inputs_complete"): + # Some inputs are not complete, set workflowHoldup to True to hold up the workflow until released by workflow processor + task_param_map["workflowHoldup"] = True + # Submit task + tmp_ret_flag, temp_ret_val = self.tbif.insertTaskParamsPanda(task_param_map, user_dn, False, decode=False) + if tmp_ret_flag: + submit_result.success = True + submit_result.target_id = str(temp_ret_val) + tmp_log.info(f"Submitted task target_id={submit_result.target_id}") + else: + submit_result.message = temp_ret_val + tmp_log.error(f"Failed to submit task: {submit_result.message}") + except Exception as e: + submit_result.message = f"exception {str(e)}" + tmp_log.error(f"Failed to submit task: {traceback.format_exc()}") + return submit_result + + def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResult: + """ + Check the status of a submitted target for the given step. + This method should be implemented to handle the specifics of status checking. + + Args: + step_spec (WFStepSpec): The workflow step specification containing details about the step to be processed. + **kwargs: Additional keyword arguments that may be required for status checking. + + Returns: + WFStepTargetCheckResult: An object containing the result of the status check, including success status, step status, native status, and message. 
+ """ + tmp_log = LogWrapper(logger, f"check_target workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + allowed_step_statuses = [WFStepStatus.starting, WFStepStatus.running] + try: + # Initialize + check_result = WFStepTargetCheckResult() + # Check preconditions + if step_spec.status not in allowed_step_statuses: + check_result.message = f"not in status to check; skipped" + tmp_log.warning(f"status={step_spec.status} not in status to check; skipped") + return check_result + if step_spec.flavor != self.plugin_flavor: + check_result.message = f"flavor not {self.plugin_flavor}; skipped" + tmp_log.warning(f"flavor={step_spec.flavor} not {self.plugin_flavor}; skipped") + return check_result + if step_spec.target_id is None: + check_result.message = f"target_id is None; skipped" + tmp_log.warning(f"target_id is None; skipped") + return check_result + # Get task ID and status + task_id = int(step_spec.target_id) + res = self.tbif.getTaskStatusSuperstatus(task_id) + if not res: + check_result.message = f"task_id={task_id} not found" + tmp_log.error(f"{check_result.message}") + return check_result + # Interpret status + task_status = res[0] + task_superstatus = res[1] + check_result.success = True + check_result.native_status = task_status + if task_status in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: + check_result.step_status = WFStepStatus.running + elif task_status in ["defined", "assigned", "activated", "starting", "ready"]: + check_result.step_status = WFStepStatus.starting + elif task_status in ["pending"]: + # Check superstatus for repetitive status (e.g. 
pending) to distinguish between starting and running + if task_superstatus in ["running"]: + check_result.step_status = WFStepStatus.running + else: + check_result.step_status = WFStepStatus.starting + elif task_status in ["done", "finished"]: + check_result.step_status = WFStepStatus.done + elif task_status in ["failed", "exhausted", "aborted", "toabort", "aborting", "broken", "tobroken"]: + check_result.step_status = WFStepStatus.failed + else: + check_result.success = False + check_result.message = f"unknown task_status {task_status}" + tmp_log.error(f"{check_result.message}") + return check_result + tmp_log.info(f"Got task_id={task_id} task_status={task_status}") + except Exception as e: + check_result.success = False + check_result.message = f"exception {str(e)}" + tmp_log.error(f"Failed to check status: {traceback.format_exc()}") + return check_result + + def on_all_inputs_done(self, step_spec: WFStepSpec, **kwargs) -> None: + """ + Hook method called when all inputs for the step are done. + For PanDA task steps, unset workflowHoldup of the target task to allow it to proceed. + + Args: + step_spec (WFStepSpec): The workflow step specification containing details about the step. + **kwargs: Additional keyword arguments. 
+ """ + tmp_log = LogWrapper(logger, f"on_all_inputs_done workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + try: + # Check step flavor + if step_spec.flavor != self.plugin_flavor: + tmp_log.warning(f"flavor={step_spec.flavor} not {self.plugin_flavor}; skipped") + return + if step_spec.target_id is None: + tmp_log.warning(f"target_id is None; skipped") + return + # Get task ID + task_id = int(step_spec.target_id) + # Get task spec + _, task_spec = self.tbif.getTaskWithID_JEDI(task_id) + if task_spec is None: + tmp_log.error(f"task_id={task_id} not found; skipped") + return + # Unset workflowHoldup and release the task + if task_spec.is_workflow_holdup(): + task_spec.set_workflow_holdup(False) + self.tbif.updateTask_JEDI(task_spec, {"jediTaskID": task_spec.jediTaskID}) + tmp_log.info(f"task_id={task_id} unset workflowHoldup") + if task_spec.status == "pending": + tmp_ret = self.tbif.release_task_on_hold(task_id) + if not tmp_ret: + tmp_log.error(f"task_id={task_id} failed to release from pending") + else: + tmp_log.info(f"task_id={task_id} released from pending") + # Done + tmp_log.debug(f"Done") + except Exception as e: + tmp_log.error(f"Failed with: {traceback.format_exc()}") + + def cancel_target(self, step_spec, **kwargs) -> WFStepTargetCancelResult: + """ + Cancel the target task for the given step. + This method should be implemented to handle the specifics of task cancellation. + + Args: + step_spec (WFStepSpec): The workflow step specification containing details about the step to be processed. + **kwargs: Additional keyword arguments that may be required for cancellation. + + Returns: + WFStepTargetCancelResult: An object containing the result of the cancellation, including success status and message. 
+ """ + tmp_log = LogWrapper(logger, f"cancel_target workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + cancel_result = WFStepTargetCancelResult() + try: + # Check step flavor + if step_spec.flavor != self.plugin_flavor: + cancel_result.message = f"flavor not {self.plugin_flavor}; skipped" + tmp_log.warning(f"flavor={step_spec.flavor} not {self.plugin_flavor}; skipped") + return cancel_result + if step_spec.target_id is None: + # If target_id is None, consider it as already cancelled since there is no task to cancel + cancel_result.success = True + cancel_result.message = f"target_id is None so considered already cancelled; skipped" + tmp_log.debug(f"{cancel_result.message}") + return cancel_result + # Get task ID + task_id = int(step_spec.target_id) + # Cancel task + ret_val, ret_str = self.tbif.sendCommandTaskPanda(task_id, "PanDA Task Step Handler cancel_target", True, "kill", properErrorCode=True) + # check if ok + if ret_val == 0: + cancel_result.success = True + tmp_log.info(f"target_id={step_spec.target_id} cancelled") + else: + cancel_result.success = False + cancel_result.message = f"failed to cancel the task: error_code={ret_val} {ret_str}" + tmp_log.warning(f"{cancel_result.message}") + except Exception as e: + cancel_result.message = f"exception {str(e)}" + tmp_log.error(f"Failed to cancel task: {traceback.format_exc()}") + return cancel_result diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py new file mode 100644 index 000000000..ad04911fa --- /dev/null +++ b/pandaserver/workflow/workflow_base.py @@ -0,0 +1,531 @@ +import json +from collections import namedtuple +from dataclasses import MISSING, InitVar, asdict, dataclass, field +from datetime import datetime, timedelta +from typing import Any, Dict, List + +# from pandacommon.pandalogger.PandaLogger import PandaLogger +from pandacommon.pandautils.base import SpecBase + +from pandaserver.config import panda_config + +# main logger +# logger = 
# named tuple for attribute with type, used by the *Spec classes below to
# declare their DB-backed attributes together with the expected python type
AttributeWithType = namedtuple("AttributeWithType", ["attribute", "type"])


# ==== Status of Entities ======================================


class WorkflowStatus(object):
    """
    Class to define the status of workflows
    """

    registered = "registered"
    parsed = "parsed"
    checking = "checking"
    checked = "checked"
    starting = "starting"
    running = "running"
    done = "done"
    failed = "failed"
    cancelled = "cancelled"

    # statuses in which the workflow is still being worked on
    active_statuses = (registered, parsed, checking, checked, starting, running)
    # terminal statuses; no further transitions expected
    final_statuses = (done, failed, cancelled)
    # short-lived statuses expected to advance promptly
    transient_statuses = (parsed, checking, checked, starting)


class WFStepStatus(object):
    """
    Class to define the status of workflow steps
    """

    registered = "registered"
    checking = "checking"
    checked_true = "checked_true"
    checked_false = "checked_false"
    pending = "pending"
    ready = "ready"
    starting = "starting"
    running = "running"
    done = "done"
    failed = "failed"
    closed = "closed"
    cancelled = "cancelled"

    checked_statuses = (checked_true, checked_false)
    # statuses from which the step can still be advanced towards running
    to_advance_step_statuses = (registered, checking, checked_true, checked_false, pending, ready, starting)
    after_starting_statuses = (running, done, failed, cancelled)
    after_starting_uninterrupted_statuses = (running, done, failed)
    after_running_statuses = (done, failed, cancelled)
    final_statuses = (done, failed, closed, cancelled)
    transient_statuses = (checked_true, checked_false, ready)


class WFDataStatus(object):
    """
    Class to define the status of workflow data
    """

    registered = "registered"
    checking = "checking"
    checked_nonexist = "checked_nonexist"  # data does not exist
    checked_insuffi = "checked_insuffi"  # data available but insufficient to be step input
    checked_suffice = "checked_suffice"  # data partially available and sufficient to be step input
    checked_complete = "checked_complete"  # data completely available
    binding = "binding"  # data being bound to a step to generate
    generating_bound = "generating_bound"
    generating_insuffi = "generating_insuffi"
    generating_suffice = "generating_suffice"
    waiting_insuffi = "waiting_insuffi"
    waiting_suffice = "waiting_suffice"
    done_generated = "done_generated"
    done_waited = "done_waited"
    done_skipped = "done_skipped"
    cancelled = "cancelled"
    retired = "retired"

    checked_statuses = (checked_nonexist, checked_insuffi, checked_suffice, checked_complete)
    generating_statuses = (generating_bound, generating_insuffi, generating_suffice)
    waiting_statuses = (waiting_insuffi, waiting_suffice)
    done_statuses = (done_generated, done_waited, done_skipped)
    # statuses in which the data can serve as input of a step
    good_input_statuses = (generating_suffice, waiting_suffice, done_generated, done_waited, done_skipped)
    good_output_statuses = (done_generated, done_waited, done_skipped)
    after_generating_bound_statuses = (generating_suffice, done_generated, cancelled)
    after_generating_suffice_statuses = (done_generated, cancelled)
    after_waiting_suffice_statuses = (done_waited, cancelled)
    terminated_statuses = (done_generated, done_waited, done_skipped, cancelled, retired)
    nonreusable_statuses = (cancelled, retired)
    transient_statuses = (checked_nonexist, checked_insuffi, checked_suffice, checked_complete)


# ==== Types ===================================================


class WFStepType(object):
    """
    Class to define the types of workflow steps
    """

    # NOTE(review): placeholder kept from the original; more step types are
    # presumably to be added here — confirm with the authors
    ...
    ordinary = "ordinary"


class WFDataType(object):
    """
    Class to define the types of workflow data
    """

    # "input" mirrors the stored value; it shadows the builtin only inside
    # this class namespace
    input = "input"
    output = "output"
    mid = "mid"


# ==== Specifications ==========================================


class WorkflowBaseSpec(SpecBase):
    """
    Base class for workflow related specifications
    """

    @property
    def parameter_map(self) -> dict:
        """
        Get the dictionary parsed by the parameters attribute in JSON
        Possible parameters:
            ...

        Returns:
            dict : dict of parameters if it is JSON or empty dict if null
        """
        if self.parameters is None:
            return {}
        else:
            return json.loads(self.parameters)

    @parameter_map.setter
    def parameter_map(self, value_map: dict):
        """
        Set the dictionary and store in parameters attribute in JSON

        Args:
            value_map (dict): dict to set the parameter map
        """
        self.parameters = json.dumps(value_map)

    def get_parameter(self, param: str) -> Any:
        """
        Get the value of one parameter. None as default

        Args:
            param (str): parameter name

        Returns:
            Any : value of the parameter; None if parameter not set
        """
        tmp_dict = self.parameter_map
        return tmp_dict.get(param)

    def set_parameter(self, param: str, value):
        """
        Set the value of one parameter and store in parameters attribute in JSON

        Args:
            param (str): parameter name
            value (Any): value of the parameter to set; must be JSON-serializable
        """
        tmp_dict = self.parameter_map
        tmp_dict[param] = value
        self.parameter_map = tmp_dict

    def update_parameters(self, params: dict):
        """
        Update values of parameters with a dict and store in parameters attribute in JSON

        Args:
            params (dict): dict of parameter names and values to set
        """
        tmp_dict = self.parameter_map
        tmp_dict.update(params)
        self.parameter_map = tmp_dict


class WorkflowSpec(WorkflowBaseSpec):
    """
    Workflow specification
    """

    # attributes with types
    attributes_with_types = (
        AttributeWithType("workflow_id", int),
        AttributeWithType("name", str),
        AttributeWithType("parent_id", int),
        AttributeWithType("loop_count", int),
        AttributeWithType("status", str),
        AttributeWithType("prodsourcelabel", str),
        AttributeWithType("username", str),
        AttributeWithType("creation_time", datetime),
        AttributeWithType("start_time", datetime),
        AttributeWithType("end_time", datetime),
        AttributeWithType("modification_time", datetime),
        AttributeWithType("check_time", datetime),
        AttributeWithType("locked_by", str),
        AttributeWithType("lock_time", datetime),
        AttributeWithType("raw_request_json", str),
        AttributeWithType("definition_json", str),
        AttributeWithType("parameters", str),
    )
    # attributes
    attributes = tuple([attr.attribute for attr in attributes_with_types])
    # attributes which have 0 by default
    _zeroAttrs = ()
    # attributes to force update
    _forceUpdateAttrs = ()
    # mapping between sequence and attr
    _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_ID_SEQ.nextval"}

    @property
    def raw_request_json_map(self) -> dict:
        """
        Get the dictionary parsed by raw_request_json attribute in JSON

        Returns:
            dict : dict of raw_request_json if it is JSON or empty dict if null
        """
        if self.raw_request_json is None:
            return {}
        else:
            return json.loads(self.raw_request_json)

    @raw_request_json_map.setter
    def raw_request_json_map(self, value_map: dict):
        """
        Set the dictionary and store in raw_request_json attribute in JSON

        Args:
            value_map (dict): dict to set the raw_request_json map
        """
        self.raw_request_json = json.dumps(value_map)

    @property
    def definition_json_map(self) -> dict:
        """
        Get the dictionary parsed by definition_json attribute in JSON

        Returns:
            dict : dict of definition_json if it is JSON or empty dict if null
        """
        if self.definition_json is None:
            return {}
        else:
            return json.loads(self.definition_json)

    @definition_json_map.setter
    def definition_json_map(self, value_map: dict):
        """
        Set the dictionary and store in definition_json attribute in JSON

        Args:
            value_map (dict): dict to set the definition_json map
        """
        self.definition_json = json.dumps(value_map)


class WFStepSpec(WorkflowBaseSpec):
    """
    Workflow Step specification
    """

    # attributes with types
    attributes_with_types = (
        AttributeWithType("step_id", int),
        AttributeWithType("name", str),
        AttributeWithType("workflow_id", int),
        AttributeWithType("member_id", int),
        AttributeWithType("type", str),
        AttributeWithType("status", str),
        AttributeWithType("flavor", str),
        AttributeWithType("target_id", str),
        AttributeWithType("creation_time", datetime),
        AttributeWithType("start_time", datetime),
        AttributeWithType("end_time", datetime),
        AttributeWithType("modification_time", datetime),
        AttributeWithType("check_time", datetime),
        AttributeWithType("locked_by", str),
        AttributeWithType("lock_time", datetime),
        AttributeWithType("definition_json", str),
        AttributeWithType("parameters", str),
    )
    # attributes
    attributes = tuple([attr.attribute for attr in attributes_with_types])
    # attributes which have 0 by default
    _zeroAttrs = ()
    # attributes to force update
    _forceUpdateAttrs = ()
    # mapping between sequence and attr
    _seqAttrMap = {"step_id": f"{panda_config.schemaJEDI}.WORKFLOW_STEP_ID_SEQ.nextval"}

    @property
    def definition_json_map(self) -> dict:
        """
        Get the dictionary parsed by definition_json attribute in JSON

        Returns:
            dict : dict of definition_json if it is JSON or empty dict if null
        """
        if self.definition_json is None:
            return {}
        else:
            return json.loads(self.definition_json)

    @definition_json_map.setter
    def definition_json_map(self, value_map: dict):
        """
        Set the dictionary and store in definition_json attribute in JSON

        Args:
            value_map (dict): dict to set the definition_json map
        """
        self.definition_json = json.dumps(value_map)


class WFDataSpec(WorkflowBaseSpec):
    """
    Workflow Data specification
    """

    # attributes with types
    attributes_with_types = (
        AttributeWithType("data_id", int),
        AttributeWithType("name", str),
        AttributeWithType("workflow_id", int),
        AttributeWithType("source_step_id", int),
        AttributeWithType("type", str),
        AttributeWithType("status", str),
        AttributeWithType("flavor", str),
        AttributeWithType("target_id", str),
        AttributeWithType("creation_time", datetime),
        AttributeWithType("start_time", datetime),
        AttributeWithType("end_time", datetime),
        AttributeWithType("modification_time", datetime),
        AttributeWithType("check_time", datetime),
        AttributeWithType("locked_by", str),
        AttributeWithType("lock_time", datetime),
        AttributeWithType("metadata", str),
        AttributeWithType("parameters", str),
    )
    # attributes
    attributes = tuple([attr.attribute for attr in attributes_with_types])
    # attributes which have 0 by default
    _zeroAttrs = ()
    # attributes to force update
    _forceUpdateAttrs = ()
    # mapping between sequence and attr
    _seqAttrMap = {"data_id": f"{panda_config.schemaJEDI}.WORKFLOW_DATA_ID_SEQ.nextval"}

    @property
    def metadata_map(self) -> dict:
        """
        Get the dictionary parsed by metadata attribute in JSON

        Returns:
            dict : dict of metadata if it is JSON or empty dict if null
        """
        if self.metadata is None:
            return {}
        else:
            return json.loads(self.metadata)

    @metadata_map.setter
    def metadata_map(self, value_map: dict):
        """
        Set the dictionary and store in metadata attribute in JSON

        Args:
            value_map (dict): dict to set the metadata map
        """
        self.metadata = json.dumps(value_map)


# === Return objects of core methods which process status ======


@dataclass(slots=True)
class WFDataProcessResult:
    """
    Result of processing data.

    Fields:
        success (bool | None): Indicates if the processing was successful.
        new_status (WFDataStatus | None): The new status of the data after processing, None if no change.
        message (str): A message providing additional information about the processing result.
    """

    # NOTE(review): the annotations below use the status holder classes, while
    # the values actually stored are the status strings — confirm intent
    success: bool | None = None
    new_status: WFDataStatus | None = None
    message: str = ""


@dataclass(slots=True)
class WFStepProcessResult:
    """
    Result of processing a step.

    Fields:
        success (bool | None): Indicates if the processing was successful.
        new_status (WFStepStatus | None): The new status of the step after processing, None if no change.
        message (str): A message providing additional information about the processing result.
    """

    success: bool | None = None
    new_status: WFStepStatus | None = None
    message: str = ""


@dataclass(slots=True)
class WorkflowProcessResult:
    """
    Result of processing a workflow.

    Fields:
        success (bool | None): Indicates if the processing was successful.
        new_status (WorkflowStatus | None): The new status of the workflow after processing, None if no change.
        message (str): A message providing additional information about the processing result.
    """

    success: bool | None = None
    new_status: WorkflowStatus | None = None
    message: str = ""


# === Return objects of step handler methods ===================


@dataclass(slots=True)
class WFStepTargetSubmitResult:
    """
    Result of submitting a target of a step.

    Fields:
        success (bool | None): Indicates if the submission was successful.
        target_id (str | None): The ID of the submitted target (e.g., task ID).
        message (str): A message providing additional information about the submission result.
    """

    success: bool | None = None
    target_id: str | None = None
    message: str = ""


@dataclass(slots=True)
class WFStepTargetCheckResult:
    """
    Result of checking the status of a submitted target.

    Fields:
        success (bool | None): Indicates if the status check was successful.
        step_status (WFStepStatus | None): The status of the step to move to.
        native_status (str | None): The native status string from the target system.
        message (str): A message providing additional information about the status check result.
    """

    success: bool | None = None
    step_status: WFStepStatus | None = None
    native_status: str | None = None
    message: str = ""


@dataclass(slots=True)
class WFStepTargetCancelResult:
    """
    Result of cancelling a target of a step.

    Fields:
        success (bool | None): Indicates if the cancellation was successful.
        target_id (str | None): The ID of the cancelled target (e.g., task ID).
        message (str): A message providing additional information about the cancellation result.
    """

    success: bool | None = None
    target_id: str | None = None
    message: str = ""


# ==== Return objects of data handler methods ==================


class WFDataTargetCheckStatus:
    """
    Possible statuses returned by data target check
    """

    complete = "complete"  # data completely exists
    suffice = "suffice"  # data partially exists and suffices to be step input
    insuffi = "insuffi"  # data partially exists but is insufficient to be step input
    nonexist = "nonexist"  # data does not exist


@dataclass(slots=True)
class WFDataTargetCheckResult:
    """
    Result of checking the status of a data target.

    Fields:
        success (bool | None): Indicates if the status check was successful.
        check_status (WFDataTargetCheckStatus | None): The status of the data target.
        metadata (dict | None): The native metadata from the target system.
        message (str): A message providing additional information about the status check result.
    """

    success: bool | None = None
    check_status: WFDataTargetCheckStatus | None = None
    metadata: dict | None = None
    message: str = ""


# ==============================================================

# NOTE(review): in the original patch a new file starts here:
# pandaserver/workflow/workflow_core.py (new file, the lines below are its
# header and module-level constants)

import atexit
import copy
import functools
import importlib
import json
import os
import random
import re
import socket
import time
import traceback
from collections import namedtuple
from contextlib import contextmanager
from datetime import datetime, timedelta
from typing import Any, Dict, List

from pandacommon.pandalogger.LogWrapper import LogWrapper
from pandacommon.pandalogger.PandaLogger import PandaLogger
from pandacommon.pandautils.PandaUtils import get_sql_IN_bind_variables, naive_utcnow

from pandaserver.config import panda_config
from pandaserver.dataservice.ddm import rucioAPI
from pandaserver.srvcore.CoreUtils import clean_user_id
from pandaserver.workflow.workflow_base import (
    WFDataProcessResult,
    WFDataSpec,
    WFDataStatus,
    WFDataTargetCheckStatus,
    WFDataType,
    WFStepProcessResult,
    WFStepSpec,
    WFStepStatus,
    WFStepTargetCheckResult,
    WFStepTargetSubmitResult,
    WFStepType,
    WorkflowProcessResult,
    WorkflowSpec,
    WorkflowStatus,
)
from pandaserver.workflow.workflow_parser import (
    json_serialize_default,
    parse_raw_request,
)

# import polars as pl # isort:skip


# main logger
logger = PandaLogger().getLogger(__name__.split(".")[-1])

# named tuple for attribute with type
AttributeWithType = namedtuple("AttributeWithType", ["attribute", "type"])

# ==== Global Parameters =======================================

# interval between periodic workflow checks by the watchdog
WORKFLOW_CHECK_INTERVAL_SEC = 60
# message queue consumed by the workflow-manager message processor
MESSAGE_QUEUE_NAME = "jedi_workflow_manager"

# ==== Plugin Map ==============================================
+PLUGIN_RAW_MAP = { + "step_handler": { + "panda_task": ("panda_task_step_handler", "PandaTaskStepHandler"), + # Add more step handler plugins here + }, + "data_handler": { + "ddm_collection": ("ddm_collection_data_handler", "DDMCollectionDataHandler"), + "panda_task": ("panda_task_data_handler", "PandaTaskDataHandler"), + # Add more data handler plugins here + }, + # Add more plugin types here +} + + +@functools.lru_cache(maxsize=1) +def _get_flavor_plugin_class_map() -> Dict[str, Dict[str, Any]]: + """Lazily import plugin classes once per process and cache the map.""" + logger.debug("Initializing workflow plugin class map (lazy, one-time per process)") + flavor_plugin_class_map = {} + for plugin_type, plugins in PLUGIN_RAW_MAP.items(): + flavor_plugin_class_map[plugin_type] = {} + for flavor, (module_name, class_name) in plugins.items(): + try: + full_module_name = f"pandaserver.workflow.{plugin_type}_plugins.{module_name}" + module = importlib.import_module(full_module_name) + cls = getattr(module, class_name) + flavor_plugin_class_map[plugin_type][flavor] = cls + logger.debug(f"Imported {plugin_type} plugin {flavor} from {module_name}.{class_name}") + except Exception as e: + logger.error(f"Failed to import {plugin_type} plugin {flavor} from {module_name}.{class_name}: {e}") + return flavor_plugin_class_map + + +# ==== Functions =============================================== + + +def get_plugin_class(plugin_type: str, flavor: str): + """ + Get the plugin class for the given type and flavor + + Args: + plugin_type (str): Type of the plugin (e.g., "step_handler", "data_handler") + flavor (str): Flavor of the plugin (e.g., "panda_task") + + Returns: + class: The plugin class if found, otherwise None + """ + flavor_plugin_class_map = _get_flavor_plugin_class_map() + return flavor_plugin_class_map.get(plugin_type, {}).get(flavor) + + +# ==== Workflow Interface ====================================== + + +class WorkflowInterface(object): + """ + Interface for 
workflow management methods + """ + + def __init__(self, task_buffer, *args, **kwargs): + """ + Constructor + + Args: + task_buffer (TaskBufferInterface): Interface to the task buffer + *args: Additional arguments + **kwargs: Additional keyword arguments + """ + self.tbif = task_buffer + self.ddm_if = rucioAPI + self.full_pid = f"{socket.getfqdn().split('.')[0]}-{os.getpgrp()}-{os.getpid()}" + self.plugin_map = {} + self.mb_proxy = None + self.set_mb_proxy() + + def get_plugin(self, plugin_type: str, flavor: str): + """ + Get the plugin instance for the given type and flavor + + Args: + plugin_type (str): Type of the plugin (e.g., "step_handler", "data_handler") + flavor (str): Flavor of the plugin (e.g., "panda_task") + + Returns: + Any: The plugin instance if found, otherwise None + """ + plugin = self.plugin_map.get(plugin_type, {}).get(flavor) + if plugin is not None: + return plugin + else: + # not yet loaded, try to load + cls = get_plugin_class(plugin_type, flavor) + if cls is not None: + self.plugin_map.setdefault(plugin_type, {})[flavor] = cls(task_buffer=self.tbif, ddm_if=self.ddm_if) + plugin = self.plugin_map[plugin_type][flavor] + return plugin + + def set_mb_proxy(self): + """ + Set the message broker proxy for workflow manager messaging + """ + try: + jedi_config = None + try: + jedi_config = importlib.import_module("pandajedi.jediconfig.jedi_config") + except Exception: + jedi_config = importlib.import_module("pandajedi.jediconfig").jedi_config + if hasattr(jedi_config, "mq") and hasattr(jedi_config.mq, "configFile") and jedi_config.mq.configFile: + MsgProcAgent = importlib.import_module(f"pandajedi.jediorder.JediMsgProcessor").MsgProcAgent + else: + logger.warning("Message queue config not found in jedi_config; skipped workflow manager messaging") + return None + out_q_list = [MESSAGE_QUEUE_NAME] + mq_agent = MsgProcAgent(config_file=jedi_config.mq.configFile) + mb_proxy_dict = mq_agent.start_passive_mode(in_q_list=[], out_q_list=out_q_list) + # 
stop with atexit + atexit.register(mq_agent.stop_passive_mode) + # set mb_proxy + self.mb_proxy = mb_proxy_dict["out"].get(MESSAGE_QUEUE_NAME) + if self.mb_proxy is None: + logger.warning(f"Message queue {MESSAGE_QUEUE_NAME} not found in mb_proxy_dict; skipped workflow manager messaging") + return None + # logger.debug(f"Set mb_proxy about queue {MESSAGE_QUEUE_NAME} for workflow manager messaging") + except Exception: + logger.warning(f"Failed to set mb_proxy about queue {MESSAGE_QUEUE_NAME}; skipped workflow manager messaging: {traceback.format_exc()}") + return None + + def _send_message(self, tmp_log, msg_type: str, data_dict: Dict[str, Any] = None): + """ + Send a message to the workflow manager message queue + + Args: + tmp_log (logging.Logger): Logger for logging messages + msg_type (str): Type of the message (e.g., "workflow", "wfstep", "wfdata") + data_dict (Dict[str, Any], optional): Additional data to include in the message + """ + if self.mb_proxy is None: + return None + try: + now_time = naive_utcnow() + now_ts = int(now_time.timestamp()) + # get mbproxy + msg_dict = {} + if data_dict: + msg_dict.update(data_dict) + msg_dict.update( + { + "msg_type": msg_type, + "timestamp": now_ts, + } + ) + msg = json.dumps(msg_dict) + self.mb_proxy.send(msg) + tmp_log.debug(f"Sent message") + except Exception: + tmp_log.error(f"Failed to send message to workflow manager queue {MESSAGE_QUEUE_NAME}: {traceback.format_exc()}") + + def send_workflow_message(self, workflow_id: int): + """ + Send a message about the workflow to the workflow manager message queue + + Args: + workflow_id (int): ID of the workflow + """ + tmp_log = LogWrapper(logger, f"send_workflow_message ") + self._send_message(tmp_log, "workflow", {"workflow_id": workflow_id}) + + def send_step_message(self, step_id: int): + """ + Send a message about the workflow step to the workflow manager message queue + + Args: + step_id (int): ID of the workflow step + """ + tmp_log = LogWrapper(logger, 
f"send_step_message ") + self._send_message(tmp_log, "wfstep", {"step_id": step_id}) + + def send_data_message(self, data_id: int): + """ + Send a message about the workflow data to the workflow manager message queue + + Args: + data_id (int): ID of the workflow data + """ + tmp_log = LogWrapper(logger, f"send_data_message ") + self._send_message(tmp_log, "wfdata", {"data_id": data_id}) + + # --- Context managers for locking ------------------------- + + @contextmanager + def workflow_lock(self, workflow_id: int, lock_expiration_sec: int = 120): + """ + Context manager to lock a workflow + + Args: + workflow_id (int): ID of the workflow to lock + lock_expiration_sec (int): Time in seconds after which the lock expires + + Yields: + WorkflowSpec | None: The locked workflow specification if the lock was acquired, otherwise None + """ + if self.tbif.lock_workflow(workflow_id, self.full_pid, lock_expiration_sec): + try: + # get the workflow spec locked + locked_spec = self.tbif.get_workflow(workflow_id) + # yield and run wrapped function + yield locked_spec + finally: + self.tbif.unlock_workflow(workflow_id, self.full_pid) + else: + # lock not acquired + yield None + + @contextmanager + def workflow_step_lock(self, step_id: int, lock_expiration_sec: int = 120): + """ + Context manager to lock a workflow step + + Args: + step_id (int): ID of the workflow step to lock + lock_expiration_sec (int): Time in seconds after which the lock expires + + Yields: + WFStepSpec | None: The locked workflow step specification if the lock was acquired, otherwise None + """ + if self.tbif.lock_workflow_step(step_id, self.full_pid, lock_expiration_sec): + try: + # get the workflow step spec locked + locked_spec = self.tbif.get_workflow_step(step_id) + # yield and run wrapped function + yield locked_spec + finally: + self.tbif.unlock_workflow_step(step_id, self.full_pid) + else: + # lock not acquired + yield None + + @contextmanager + def workflow_data_lock(self, data_id: int, 
lock_expiration_sec: int = 120): + """ + Context manager to lock workflow data + + Args: + data_id (int): ID of the workflow data to lock + lock_expiration_sec (int): Time in seconds after which the lock expires + + Yields: + WFDataSpec | None: The locked workflow data specification if the lock was acquired, otherwise None + """ + if self.tbif.lock_workflow_data(data_id, self.full_pid, lock_expiration_sec): + try: + # get the workflow data spec locked + locked_spec = self.tbif.get_workflow_data(data_id) + # yield and run wrapped function + yield locked_spec + finally: + self.tbif.unlock_workflow_data(data_id, self.full_pid) + else: + # lock not acquired + yield None + + # --- Workflow operation ----------------------------------- + + def register_workflow( + self, + prodsourcelabel: str, + user_dn: str, + workflow_name: str | None = None, + workflow_definition: dict | None = None, + raw_request_params: dict | None = None, + *args, + **kwargs, + ) -> int | None: + """ + Register a new workflow + + Args: + prodsourcelabel (str): Production source label for the workflow + user_dn (str): Distinguished name of the user submitting the workflow + workflow_name (str | None): Name of the workflow + workflow_definition (dict | None): Dictionary of workflow definition + raw_request_params (dict | None): Dictionary of parameters of the raw request + *args: Additional arguments + **kwargs: Additional keyword arguments + + Returns: + int | None: The ID of the registered workflow if successful, otherwise None + """ + username = clean_user_id(user_dn) + tmp_log = LogWrapper(logger, f"register_workflow prodsourcelabel={prodsourcelabel} username={username} name={workflow_name}") + tmp_log.debug(f'Start, user_dn is "{user_dn}"') + # Implementation of workflow registration logic + ... 
+ workflow_spec = WorkflowSpec() + workflow_spec.prodsourcelabel = prodsourcelabel + workflow_spec.username = username + if workflow_name is not None: + workflow_spec.name = workflow_name + if workflow_definition is not None: + workflow_definition["user_dn"] = user_dn + workflow_spec.definition_json = json.dumps(workflow_definition, default=json_serialize_default) + elif raw_request_params is not None: + raw_request_params["user_dn"] = user_dn + workflow_spec.raw_request_json = json.dumps(raw_request_params, default=json_serialize_default) + else: + tmp_log.error(f"Either workflow_definition or raw_request_params must be provided") + return None + workflow_spec.creation_time = naive_utcnow() + workflow_spec.status = WorkflowStatus.registered + # Insert to DB + ret_workflow_id = self.tbif.insert_workflow(workflow_spec) + if ret_workflow_id is None: + tmp_log.error(f"Failed to register workflow") + return None + tmp_log.info(f"Registered workflow ") + return ret_workflow_id + + def cancel_workflow(self, workflow_id: int, force: bool = False) -> bool: + """ + Cancel the workflow + + Args: + workflow_id (int): ID of the workflow to cancel + force (bool): Whether to force into cancelled status + + Returns: + bool: True if the workflow was successfully cancelled, otherwise False + """ + tmp_log = LogWrapper(logger, f"cancel_workflow ") + # tmp_log.debug("Start") + try: + with self.workflow_lock(workflow_id) as workflow_spec: + if workflow_spec is None: + tmp_log.warning(f"Failed to acquire lock; skipped") + return False + if workflow_spec.status in WorkflowStatus.final_statuses: + tmp_log.debug(f"Workflow already in final status {workflow_spec.status}; skipped") + return True + # Cancel all steps and data of the workflow + all_cancelled = True + step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id) + if step_specs is None: + tmp_log.warning(f"Failed to get steps of the workflow; skipped cancelling steps") + all_cancelled = False + else: + 
for step_spec in step_specs: + if not self.cancel_step(step_spec.step_id, force): + all_cancelled = False + data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) + if data_specs is None: + tmp_log.warning(f"Failed to get data of the workflow; skipped cancelling data") + all_cancelled = False + else: + for data_spec in data_specs: + if not self.cancel_data(data_spec.data_id, force): + all_cancelled = False + # Update workflow status to cancelled if all steps and data are cancelled + if not all_cancelled and not force: + tmp_log.warning(f"Not all steps and data could be cancelled; skipped updating workflow status") + return False + else: + workflow_spec.status = WorkflowStatus.cancelled + workflow_spec.end_time = naive_utcnow() + self.tbif.update_workflow(workflow_spec) + if force and not all_cancelled: + tmp_log.warning(f"Force cancelled workflow without cancelling all steps and data") + else: + tmp_log.info(f"Cancelled workflow, updated status to {workflow_spec.status}") + return True + except Exception as e: + tmp_log.error(f"Got error {str(e)}") + return False + + # --- Step operation --------------------------------------- + + def cancel_step(self, step_id: int, force: bool = False) -> bool: + """ + Cancel the workflow step + + Args: + step_id (int): ID of the workflow step to cancel + force (bool): Whether to force into cancelled status; if False, the step will only be cancelled if the target cancellation is successful, while if True, the step will be marked as cancelled regardless of the target cancellation result + + Returns: + bool: True if the step was successfully cancelled, otherwise False + """ + log_prefix = f"cancel_step " + tmp_log = LogWrapper(logger, log_prefix) + # tmp_log.debug("Start") + try: + with self.workflow_step_lock(step_id) as step_spec: + if step_spec is None: + tmp_log.warning(f"Failed to acquire lock; skipped") + return False + log_prefix += f" workflow_id={step_spec.workflow_id} 
member_id={step_spec.member_id}" + tmp_log = LogWrapper(logger, log_prefix) + if step_spec.status in WFStepStatus.final_statuses: + tmp_log.debug(f"Step already in final status {step_spec.status}; skipped") + return True + # Call plugin to cancel the target of the step + target_is_cancelled = False + step_handler = self.get_plugin("step_handler", step_spec.flavor) + if step_handler is None: + tmp_log.warning(f"Step handler plugin not found for flavor {step_spec.flavor}; skipped target cancellation") + else: + cancel_result = step_handler.cancel_target(step_spec) + if not cancel_result.success: + tmp_log.warning(f"Failed to cancel target with plugin {step_spec.flavor}; got message: {cancel_result.message}") + else: + tmp_log.debug(f"Cancelled target with flavor {step_spec.flavor}") + target_is_cancelled = True + # Update step status to cancelled + if not target_is_cancelled and not force: + tmp_log.warning(f"Target not cancelled; skipped updating step status") + return False + else: + step_spec.status = WFStepStatus.cancelled + step_spec.end_time = naive_utcnow() + self.tbif.update_workflow_step(step_spec) + if force and not target_is_cancelled: + tmp_log.warning(f"Force cancelled step without cancelling target") + else: + tmp_log.info(f"Cancelled step, updated status to {step_spec.status}") + return True + except Exception as e: + tmp_log.error(f"Got error {str(e)}") + return False + + # --- Data operation --------------------------------------- + + def cancel_data(self, data_id: int, force: bool = False) -> bool: + """ + Cancel the workflow data + + Args: + data_id (int): ID of the workflow data to cancel + force (bool): Whether to force into cancelled status; currently has no effect since data cancellation is not implemented in plugins, but reserved for future use + + Returns: + bool: True if the data was successfully cancelled, otherwise False + """ + log_prefix = f"cancel_data " + tmp_log = LogWrapper(logger, log_prefix) + # tmp_log.debug("Start") + try: + with 
self.workflow_data_lock(data_id) as data_spec: + if data_spec is None: + tmp_log.warning(f"Failed to acquire lock; skipped") + return False + log_prefix += f" workflow_id={data_spec.workflow_id}" + tmp_log = LogWrapper(logger, log_prefix) + if data_spec.status in WFDataStatus.terminated_statuses: + tmp_log.debug(f"Data already terminated with status {data_spec.status}; skipped") + return True + data_spec.status = WFDataStatus.cancelled + data_spec.end_time = naive_utcnow() + self.tbif.update_workflow_data(data_spec) + tmp_log.info(f"Cancelled data, updated status to {data_spec.status}") + return True + except Exception as e: + tmp_log.error(f"Got error {str(e)}") + return False + + # ---- Data status transitions ----------------------------- + + def process_data_registered(self, data_spec: WFDataSpec) -> WFDataProcessResult: + """ + Process data in registered status + To prepare for checking the data + + Args: + data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data + """ + tmp_log = LogWrapper(logger, f"process_data_registered workflow_id={data_spec.workflow_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status != WFDataStatus.registered: + process_result.message = f"Data status changed unexpectedly from {WFDataStatus.registered} to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # For now, just update status to checking + data_spec.status = WFDataStatus.checking + self.tbif.update_workflow_data(data_spec) + process_result.success = True + process_result.new_status = data_spec.status + tmp_log.info(f"Done, status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{traceback.format_exc()}") + return process_result + + def process_data_checking(self, data_spec: WFDataSpec) -> 
WFDataProcessResult: + """ + Process data in checking status + To check the conditions about whether the data is available + + Args: + data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data + """ + tmp_log = LogWrapper(logger, f"process_data_checking workflow_id={data_spec.workflow_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status != WFDataStatus.checking: + process_result.message = f"Data status changed unexpectedly from {WFDataStatus.checking} to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # Check data availability + original_status = data_spec.status + # Get the data handler plugin + data_handler = self.get_plugin("data_handler", data_spec.flavor) + # Check the data status + check_result = data_handler.check_target(data_spec) + if check_result.success and check_result.check_status is None: + # No status change + process_result.message = f"Skipped; {check_result.message}" + tmp_log.debug(f"{process_result.message}") + process_result.success = True + return process_result + elif not check_result.success or check_result.check_status is None: + process_result.message = f"Failed to check data; {check_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result + # Update data status + now_time = naive_utcnow() + match check_result.check_status: + case WFDataTargetCheckStatus.nonexist: + data_spec.status = WFDataStatus.checked_nonexist + case WFDataTargetCheckStatus.insuffi: + data_spec.status = WFDataStatus.checked_insuffi + case WFDataTargetCheckStatus.suffice: + data_spec.status = WFDataStatus.checked_suffice + case WFDataTargetCheckStatus.complete: + data_spec.status = WFDataStatus.checked_complete + data_spec.check_time = now_time + self.tbif.update_workflow_data(data_spec) + process_result.new_status = 
data_spec.status + process_result.success = True + tmp_log.info(f"Done, status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{traceback.format_exc()}") + return process_result + + def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: + """ + Process data in checked status + To advance to next status based on check result + + Args: + data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data + """ + tmp_log = LogWrapper(logger, f"process_data_checked workflow_id={data_spec.workflow_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status not in WFDataStatus.checked_statuses: + process_result.message = f"Data status changed unexpectedly from checked_* to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + original_status = data_spec.status + # Update data status based on check result + now_time = naive_utcnow() + match data_spec.status: + case WFDataStatus.checked_nonexist: + # Data does not exist, advance to binding + data_spec.status = WFDataStatus.binding + data_spec.start_time = now_time + self.tbif.update_workflow_data(data_spec) + case WFDataStatus.checked_insuffi: + # Data insufficient, advance to waiting_insuffi + data_spec.status = WFDataStatus.waiting_insuffi + self.tbif.update_workflow_data(data_spec) + case WFDataStatus.checked_suffice: + # Data partially exist, advance to waiting_suffice + data_spec.status = WFDataStatus.waiting_suffice + self.tbif.update_workflow_data(data_spec) + case WFDataStatus.checked_complete: + # Data already fully exist, advance to done_skipped + data_spec.status = WFDataStatus.done_skipped + data_spec.end_time = now_time + self.tbif.update_workflow_data(data_spec) + process_result.success = True + process_result.new_status = 
data_spec.status + tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{traceback.format_exc()}") + return process_result + + def process_data_binding(self, data_spec: WFDataSpec, step_spec: WFStepSpec) -> WFDataProcessResult: + """ + Process data in binding status + To bind the data to the step that will generate it + + Args: + data_spec (WFDataSpec): The workflow data specification to process + step_spec (WFStepSpec): The workflow step specification to bind the data to + + Returns: + WFDataProcessResult: The result of processing the data + """ + tmp_log = LogWrapper(logger, f"process_data_binding workflow_id={data_spec.workflow_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status != WFDataStatus.binding: + process_result.message = f"Data status changed unexpectedly from {WFDataStatus.binding} to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + original_status = data_spec.status + data_spec.source_step_id = step_spec.step_id + data_spec.status = WFDataStatus.generating_bound + self.tbif.update_workflow_data(data_spec) + process_result.success = True + process_result.new_status = data_spec.status + tmp_log.info(f"Done, bound to step_id={step_spec.step_id}, from {original_status} to status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{traceback.format_exc()}") + return process_result + + def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: + """ + Process data in generating status + To check the status of the data being generated + + Args: + data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data + """ + tmp_log = LogWrapper(logger, 
f"process_data_generating workflow_id={data_spec.workflow_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status not in WFDataStatus.generating_statuses: + process_result.message = f"Data status changed unexpectedly from generating_* to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + original_status = data_spec.status + # Get the data handler plugin + data_handler = self.get_plugin("data_handler", data_spec.flavor) + # Check the data status + check_result = data_handler.check_target(data_spec) + if check_result.success and check_result.check_status is None: + # No status change + process_result.message = f"Skipped; {check_result.message}" + tmp_log.debug(f"{process_result.message}") + process_result.success = True + return process_result + elif not check_result.success or check_result.check_status is None: + process_result.message = f"Failed to check data; {check_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result + # Update data status + now_time = naive_utcnow() + if original_status == WFDataStatus.generating_bound: + match check_result.check_status: + case WFDataTargetCheckStatus.suffice | WFDataTargetCheckStatus.complete: + # Data exist, advance to generating_suffice + data_spec.status = WFDataStatus.generating_suffice + process_result.new_status = data_spec.status + case WFDataTargetCheckStatus.insuffi: + # Data insufficient, move to generating_insuffi + data_spec.status = WFDataStatus.generating_insuffi + process_result.new_status = data_spec.status + case WFDataTargetCheckStatus.nonexist: + # Data not yet exist, stay in generating_bound + pass + case _: + # Unexpected status, log and skip + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") + elif original_status == WFDataStatus.generating_insuffi: + match 
check_result.check_status: + case WFDataTargetCheckStatus.suffice | WFDataTargetCheckStatus.complete: + # Data now exist, advance to generating_suffice + data_spec.status = WFDataStatus.generating_suffice + process_result.new_status = data_spec.status + case WFDataTargetCheckStatus.insuffi: + # Data still insufficient, stay in generating_insuffi + pass + case WFDataTargetCheckStatus.nonexist: + # Data not exist anymore, unexpected, log and skip + tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") + case _: + # Unexpected status, log and skip + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") + elif original_status == WFDataStatus.generating_suffice: + match check_result.check_status: + case WFDataTargetCheckStatus.complete: + # Data fully exist, advance to final status done_generated + data_spec.status = WFDataStatus.done_generated + process_result.new_status = data_spec.status + data_spec.end_time = now_time + case WFDataTargetCheckStatus.suffice: + # Data still partially exist, stay in generating_suffice + pass + case WFDataTargetCheckStatus.insuffi: + # Data not sufficient anymore, unexpected, log and skip + tmp_log.warning(f"Data are not sufficient anymore, unexpected; skipped") + case WFDataTargetCheckStatus.nonexist: + # Data not exist anymore, unexpected, log and skip + tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") + case _: + # Unexpected status, log and skip + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") + data_spec.check_time = now_time + self.tbif.update_workflow_data(data_spec) + process_result.success = True + if data_spec.status == original_status: + tmp_log.info(f"Done, status stays in {data_spec.status}") + else: + tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + 
tmp_log.error(f"{traceback.format_exc()}") + return process_result + + def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: + """ + Process data in waiting status + To check the status of the data being waited for, probably generating by other workflow steps or external sources + + Args: + data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data + """ + tmp_log = LogWrapper(logger, f"process_data_waiting workflow_id={data_spec.workflow_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status not in WFDataStatus.waiting_statuses: + process_result.message = f"Data status changed unexpectedly from waiting_* to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + original_status = data_spec.status + # Get the data handler plugin + data_handler = self.get_plugin("data_handler", data_spec.flavor) + # Check the data status + check_result = data_handler.check_target(data_spec) + if check_result.success and check_result.check_status is None: + # No status change + process_result.message = f"Skipped; {check_result.message}" + tmp_log.debug(f"{process_result.message}") + process_result.success = True + return process_result + elif not check_result.success or check_result.check_status is None: + process_result.message = f"Failed to check data; {check_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result + # Update data status + now_time = naive_utcnow() + if original_status == WFDataStatus.waiting_suffice: + match check_result.check_status: + case WFDataTargetCheckStatus.complete: + # Data fully exist, advance to final status done_waited + data_spec.status = WFDataStatus.done_waited + process_result.new_status = data_spec.status + data_spec.end_time = now_time + case WFDataTargetCheckStatus.suffice: + # 
Data still partially exist, stay in waiting_suffice + pass + case WFDataTargetCheckStatus.insuffi: + # Data not sufficient anymore, unexpected, log and skip + tmp_log.warning(f"Data are not sufficient anymore, unexpected; skipped") + case WFDataTargetCheckStatus.nonexist: + # Data not exist anymore, unexpected, log and skip + tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") + case _: + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") + elif original_status == WFDataStatus.waiting_insuffi: + match check_result.check_status: + case WFDataTargetCheckStatus.suffice: + # Data partially exist, advance to waiting_suffice + data_spec.status = WFDataStatus.waiting_suffice + process_result.new_status = data_spec.status + case WFDataTargetCheckStatus.complete: + # Data fully exist, advance to final status done_waited + data_spec.status = WFDataStatus.done_waited + process_result.new_status = data_spec.status + data_spec.end_time = now_time + case WFDataTargetCheckStatus.insuffi: + # Data still insufficient, stay in waiting_insuffi + pass + case WFDataTargetCheckStatus.nonexist: + # Data not exist anymore, unexpected, log and skip + tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") + case _: + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") + data_spec.check_time = now_time + self.tbif.update_workflow_data(data_spec) + process_result.success = True + if data_spec.status == original_status: + tmp_log.info(f"Done, status stays in {data_spec.status}") + else: + tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{traceback.format_exc()}") + return process_result + + def process_data(self, data_spec: WFDataSpec, by: str = "dog") -> tuple[WFDataProcessResult | None, WFDataSpec]: + """ + Process a single workflow data 
specification + + Args: + data_spec (WFDataSpec): The workflow data specification to process + by (str): Identifier of the entity processing the data specification + + Returns: + WFDataProcessResult | None: The result of processing the data specification, or None if skipped + WFDataSpec: The updated workflow data specification + """ + tmp_log = LogWrapper(logger, f"process_data workflow_id={data_spec.workflow_id} by={by}") + # tmp_log.debug("Start") + tmp_res = None + with self.workflow_data_lock(data_spec.data_id) as locked_data_spec: + if locked_data_spec is None: + tmp_log.warning(f"Failed to acquire lock for data_id={data_spec.data_id}; skipped") + return None, data_spec + data_spec = locked_data_spec + orig_status = data_spec.status + # Process the data + if data_spec.status == WFDataStatus.registered: + tmp_res = self.process_data_registered(data_spec) + elif data_spec.status == WFDataStatus.checking: + tmp_res = self.process_data_checking(data_spec) + elif data_spec.status in WFDataStatus.checked_statuses: + tmp_res = self.process_data_checked(data_spec) + elif data_spec.status == WFDataStatus.binding: + # dummy result since binding data are handled in step processing + dummy_process_result = WFDataProcessResult() + dummy_process_result.success = True + tmp_res = dummy_process_result + tmp_log.debug(f"Data status {data_spec.status} ; wait for step processing") + elif data_spec.status in WFDataStatus.generating_statuses: + tmp_res = self.process_data_generating(data_spec) + elif data_spec.status in WFDataStatus.waiting_statuses: + tmp_res = self.process_data_waiting(data_spec) + else: + tmp_log.debug(f"Data status {data_spec.status} is not handled in this context; skipped") + # For changes into transient status, send message to trigger processing immediately + if data_spec.status != orig_status and data_spec.status in WFDataStatus.transient_statuses: + self.send_data_message(data_spec.data_id) + return tmp_res, data_spec + + def process_datas(self, 
data_specs: List[WFDataSpec], by: str = "dog") -> Dict: + """ + Process a list of workflow data specifications + + Args: + data_specs (List[WFDataSpec]): List of workflow data specifications to process + by (str): Identifier of the entity processing the data specifications + + Returns: + Dict: Statistics of the processing results + """ + tmp_log = LogWrapper(logger, f"process_datas by={by}") + n_data = len(data_specs) + tmp_log.debug(f"Start, processing {n_data} data specs") + data_status_stats = {"n_data": n_data, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} + for data_spec in data_specs: + orig_status = data_spec.status + tmp_res, data_spec = self.process_data(data_spec, by=by) + if tmp_res and tmp_res.success: + # update stats + if tmp_res.new_status and data_spec.status != orig_status: + data_status_stats["changed"].setdefault(data_spec.status, 0) + data_status_stats["changed"][data_spec.status] += 1 + else: + data_status_stats["unchanged"].setdefault(data_spec.status, 0) + data_status_stats["unchanged"][data_spec.status] += 1 + data_status_stats["processed"].setdefault(data_spec.status, 0) + data_status_stats["processed"][data_spec.status] += 1 + data_status_stats["n_processed"] += 1 + tmp_log.info( + f"Done, processed {data_status_stats['n_processed']}/{n_data} data specs, unchanged: {data_status_stats['unchanged']}, changed: {data_status_stats['changed']}" + ) + return data_status_stats + + # ---- Step status transitions ----------------------------- + + def _check_all_inputs_of_step(self, tmp_log: LogWrapper, input_data_list: List[str], data_spec_map: Dict[str, WFDataSpec]) -> Dict[str, bool]: + """ + Check whether all input data of a step are sufficient or complete + + Args: + tmp_log (LogWrapper): Logger for logging messages + input_data_list (List[str]): List of input data names for the step + data_spec_map (Dict[str, WFDataSpec]): Mapping of data names to their specifications + + Returns: + Dict[str, bool]: Dictionary indicating 
whether all inputs sufficient and complete + """ + # Check if all input data sufficient or complete + ret_dict = {"all_inputs_sufficient": True, "all_inputs_complete": True} + for input_data_name in input_data_list: + data_spec = data_spec_map.get(input_data_name) + if data_spec is None: + tmp_log.warning(f"Input data {input_data_name} not found in workflow data") + ret_dict["all_inputs_sufficient"] = False + ret_dict["all_inputs_complete"] = False + break + elif data_spec.status not in WFDataStatus.good_input_statuses: + tmp_log.debug(f"Input data {input_data_name} status {data_spec.status} is not sufficient as input") + ret_dict["all_inputs_sufficient"] = False + ret_dict["all_inputs_complete"] = False + break + elif data_spec.status not in WFDataStatus.done_statuses: + ret_dict["all_inputs_complete"] = False + if ret_dict["all_inputs_complete"]: + tmp_log.debug("All input data are complete") + elif ret_dict["all_inputs_sufficient"]: + tmp_log.debug("All input data are sufficient as input") + return ret_dict + + def process_step_registered(self, step_spec: WFStepSpec) -> WFStepProcessResult: + """ + Process a step in registered status + To prepare for checking the step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper( + logger, f"process_step_registered workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}" + ) + # tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() + # Check status + if step_spec.status != WFStepStatus.registered: + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.registered} to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + step_spec.status = WFStepStatus.checking + self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = 
step_spec.status + tmp_log.info(f"Done, status={step_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return process_result + + def process_step_checking(self, step_spec: WFStepSpec) -> WFStepProcessResult: + """ + Process a step in checking status + To check the conditions about whether to process the step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper(logger, f"process_step_checking workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() + # Check status + if step_spec.status != WFStepStatus.checking: + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.checking} to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # Decide whether to run the step: True = must run, False = can skip, None = undecided yet and must check later + to_run_step = False + # FIXME: For now, always check outputs, not customizable + check_outputs = True + if check_outputs and to_run_step is False: + to_generate_output = False + output_data_names = step_spec.definition_json_map.get("output_data_list", []) + for output_data_name in output_data_names: + data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) + if data_spec is None: + tmp_log.warning(f"Output {output_data_name} not found in workflow data; skipped") + to_run_step = None + break + if data_spec.status == WFDataStatus.binding: + tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} requires step to generate") + to_generate_output = True + break + elif data_spec.status in (WFDataStatus.registered, WFDataStatus.checking) or data_spec.status in WFDataStatus.checked_statuses: + 
tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} is not after checked; skipped") + to_run_step = None + break + else: + tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} does not require step to generate") + continue + if to_run_step is not None and to_generate_output: + # Outputs are not all good; need to run the step + to_run_step = True + # Update step status + now_time = naive_utcnow() + if to_run_step is None: + step_spec.check_time = now_time + self.tbif.update_workflow_step(step_spec) + process_result.success = True + tmp_log.info(f"Done, status stays in {step_spec.status}") + else: + if to_run_step is True: + step_spec.status = WFStepStatus.checked_true + elif to_run_step is False: + step_spec.status = WFStepStatus.checked_false + step_spec.check_time = now_time + self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = step_spec.status + tmp_log.info(f"Done, status={step_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return process_result + + def process_step_checked(self, step_spec: WFStepSpec) -> WFStepProcessResult: + """ + Process a step in checked status + To advance to pending or closed based on check result + + Args: + step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper(logger, f"process_step_checked workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() + # Check status + if step_spec.status not in WFStepStatus.checked_statuses: + process_result.message = f"Step status changed unexpectedly from checked_* to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + original_status = step_spec.status + try: 
+ now_time = naive_utcnow() + match step_spec.status: + case WFStepStatus.checked_true: + # Conditions met, advance to pending + step_spec.status = WFStepStatus.pending + step_spec.check_time = now_time + self.tbif.update_workflow_step(step_spec) + case WFStepStatus.checked_false: + # Conditions not met, advanced to closed + step_spec.status = WFStepStatus.closed + step_spec.check_time = now_time + self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = step_spec.status + tmp_log.info(f"Done, from {original_status} to status={step_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return process_result + + def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None) -> WFStepProcessResult: + """ + Process a step in pending status + To check the inputs of the step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data name to WFDataSpec for the workflow + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper(logger, f"process_step_pending workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() + # Check status + if step_spec.status != WFStepStatus.pending: + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.pending} to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # Input data list of the step + step_spec_definition = step_spec.definition_json_map + input_data_list = step_spec_definition.get("input_data_list") + if input_data_list is None: + process_result.message = f"Step definition does not have input_data_list; skipped" + 
tmp_log.warning(f"{process_result.message}") + return process_result + # Get data spec map of the workflow + if data_spec_map is None: + data_specs = self.tbif.get_data_of_workflow(workflow_id=step_spec.workflow_id) + data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} + # Check if all input data are good + all_inputs_stats = self._check_all_inputs_of_step(tmp_log, input_data_list, data_spec_map) + # If not all inputs are sufficient as input, just return and wait for next round + if not all_inputs_stats["all_inputs_sufficient"]: + tmp_log.debug(f"Some input data are not sufficient as input; skipped") + process_result.success = True + return process_result + # All inputs are good, register outputs of the step and update step status to ready + output_data_list = step_spec_definition.get("output_data_list", []) + # outputs_raw_dict = step_spec_definition.get("outputs", {}) + # output_types = step_spec_definition.get("output_types", []) + # now_time = naive_utcnow() + # New code: for all output data, set source_step_id to this step + for output_data_name in output_data_list: + data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) + if data_spec is not None: + if data_spec.status == WFDataStatus.binding: + self.process_data_binding(data_spec, step_spec) + tmp_log.debug(f"Bound output data_id={data_spec.data_id} name={output_data_name} to the step") + else: + tmp_log.debug(f"Output data_id={data_spec.data_id} name={output_data_name} status={data_spec.status} not in binding; skipped") + else: + tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") + if all_inputs_stats["all_inputs_complete"]: + # All inputs are complete, mark in step_spec + step_spec.set_parameter("all_inputs_complete", True) + # Old code for reference + # if step_spec_definition.get("is_tail"): + # # Tail step, set root output source_step_id + # for output_data_name in output_data_list: + # data_spec = 
self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) + # if data_spec is not None: + # data_spec.source_step_id = step_spec.step_id + # self.tbif.update_workflow_data(data_spec) + # tmp_log.debug(f"Updated output data_id={data_spec.data_id} name={output_data_name} about source_step_id") + # else: + # tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") + # else: + # # Intermediate step, update mid output data specs source_step_id + # for output_data_name in output_data_list: + # data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) + # if data_spec is None: + # tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") + # continue + # elif data_spec.status == WFDataStatus.binding: + # # mid data in binding, bind it to the step + # data_spec.source_step_id = step_spec.step_id + # data_spec.name = output_data_name + # data_spec.target_id = outputs_raw_dict.get(output_data_name, {}).get("value") # caution: may be None + # data_spec.set_parameter("output_types", output_types) + # data_spec.status = WFDataStatus.registered + # data_spec.type = WFDataType.mid + # data_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable + # data_spec.creation_time = now_time + # self.tbif.update_workflow_data(data_spec) + # tmp_log.debug(f"Updated mid data {output_data_name} about source step") + # # update data_spec_map + # data_spec_map[output_data_name] = data_spec + step_spec.status = WFStepStatus.ready + self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = step_spec.status + tmp_log.info(f"Done, status={step_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result + + def process_step_ready(self, step_spec: WFStepSpec) -> WFStepProcessResult: + """ + Process a step in ready 
status + To start the step by submitting its target + + Args: + step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper(logger, f"process_step_ready workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") + # tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() + # Check status + if step_spec.status != WFStepStatus.ready: + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.ready} to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # Get the step handler plugin + step_handler = self.get_plugin("step_handler", step_spec.flavor) + # Submit the step target + submit_result = step_handler.submit_target(step_spec) + if not submit_result.success or submit_result.target_id is None: + process_result.message = f"Failed to submit step target; {submit_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result + # Update step status to starting + step_spec.target_id = submit_result.target_id + step_spec.status = WFStepStatus.starting + self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = step_spec.status + tmp_log.info(f"Done, submitted target flavor={step_spec.flavor} target_id={step_spec.target_id}, status={step_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result + + def process_step_starting(self, step_spec: WFStepSpec) -> WFStepProcessResult: + """ + Process a step in starting status + To check the status of the starting step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper(logger, f"process_step_starting 
workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}")
        # tmp_log.debug("Start")
        # Initialize
        process_result = WFStepProcessResult()
        # Check status: only steps still in starting are processed here
        if step_spec.status != WFStepStatus.starting:
            process_result.message = f"Step status changed unexpectedly from {WFStepStatus.starting} to {step_spec.status}; skipped"
            tmp_log.warning(f"{process_result.message}")
            return process_result
        # Process
        try:
            # Input data list of the step
            step_spec_definition = step_spec.definition_json_map
            input_data_list = step_spec_definition.get("input_data_list")
            if input_data_list is None:
                process_result.message = f"Step definition does not have input_data_list; skipped"
                tmp_log.warning(f"{process_result.message}")
                return process_result
            # Get data spec map of the workflow
            data_specs = self.tbif.get_data_of_workflow(workflow_id=step_spec.workflow_id)
            data_spec_map = {data_spec.name: data_spec for data_spec in data_specs}
            # Check if all input data are good
            all_inputs_stats = self._check_all_inputs_of_step(tmp_log, input_data_list, data_spec_map)
            # Get the step handler plugin
            step_handler = self.get_plugin("step_handler", step_spec.flavor)
            # Check the step status
            check_result = step_handler.check_target(step_spec)
            if not check_result.success or check_result.step_status is None:
                process_result.message = f"Failed to check step; {check_result.message}"
                tmp_log.error(f"{process_result.message}")
                return process_result
            # If all inputs are complete, mark in step_spec and call the hook of step_handler
            if all_inputs_stats["all_inputs_complete"]:
                step_spec.set_parameter("all_inputs_complete", True)
                step_handler.on_all_inputs_done(step_spec)
            # Update step status
            if check_result.step_status in WFStepStatus.after_starting_statuses:
                # Step status advanced
                step_spec.status = check_result.step_status
                process_result.new_status = step_spec.status
            elif check_result.step_status == WFStepStatus.starting:
                # Still in starting, do nothing
                pass
            else:
                tmp_log.warning(f"Invalid step_status {check_result.step_status} from target check result; skipped")
            now_time = naive_utcnow()
            step_spec.check_time = now_time
            if step_spec.status in WFStepStatus.after_starting_uninterrupted_statuses and step_spec.start_time is None:
                # step has run, set start_time if not yet set
                step_spec.start_time = now_time
            if step_spec.status in WFStepStatus.final_statuses and step_spec.start_time is not None and step_spec.end_time is None:
                # step has ended, set end_time if not yet set
                step_spec.end_time = now_time
            self.tbif.update_workflow_step(step_spec)
            process_result.success = True
            tmp_log.info(f"Checked step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}")
        except Exception as e:
            process_result.message = f"Got error {str(e)}"
            tmp_log.error(f"Got error ; {traceback.format_exc()}")
        return process_result

    def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult:
        """
        Process a step in running status
        To check the status of the running step

        Args:
            step_spec (WFStepSpec): The workflow step specification to process

        Returns:
            WFStepProcessResult: The result of processing the step
        """
        tmp_log = LogWrapper(logger, f"process_step_running workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}")
        # tmp_log.debug("Start")
        # Initialize
        process_result = WFStepProcessResult()
        # Check status: only steps still in running are processed here
        if step_spec.status != WFStepStatus.running:
            process_result.message = f"Step status changed unexpectedly from {WFStepStatus.running} to {step_spec.status}; skipped"
            tmp_log.warning(f"{process_result.message}")
            return process_result
        # Process
        try:
            # Input data list of the step
            step_spec_definition = step_spec.definition_json_map
            input_data_list = step_spec_definition.get("input_data_list")
            if input_data_list is None:
                process_result.message = f"Step definition does not have input_data_list; skipped"
                tmp_log.warning(f"{process_result.message}")
                return process_result
            # Get data spec map of the workflow
            data_specs = self.tbif.get_data_of_workflow(workflow_id=step_spec.workflow_id)
            data_spec_map = {data_spec.name: data_spec for data_spec in data_specs}
            # Check if all input data are good
            all_inputs_stats = self._check_all_inputs_of_step(tmp_log, input_data_list, data_spec_map)
            # Get the step handler plugin
            step_handler = self.get_plugin("step_handler", step_spec.flavor)
            # If all inputs are complete, mark in step_spec and call the hook of step_handler
            # NOTE(review): here the hook runs BEFORE check_target, while in
            # process_step_starting it runs after — confirm the ordering difference is intentional
            if all_inputs_stats["all_inputs_complete"]:
                step_spec.set_parameter("all_inputs_complete", True)
                step_handler.on_all_inputs_done(step_spec)
            # Check the step status
            check_result = step_handler.check_target(step_spec)
            if not check_result.success or check_result.step_status is None:
                process_result.message = f"Failed to check step; {check_result.message}"
                tmp_log.error(f"{process_result.message}")
                return process_result
            # Update step status
            if check_result.step_status in WFStepStatus.after_running_statuses:
                # Step status advanced
                step_spec.status = check_result.step_status
                process_result.new_status = step_spec.status
            elif check_result.step_status == WFStepStatus.running:
                # Still in running, do nothing
                pass
            else:
                tmp_log.warning(f"Invalid step_status {check_result.step_status} from target check result; skipped")
            now_time = naive_utcnow()
            step_spec.check_time = now_time
            if step_spec.status in WFStepStatus.after_starting_uninterrupted_statuses and step_spec.start_time is None:
                # step has run, set start_time if not yet set
                step_spec.start_time = now_time
            if step_spec.status in WFStepStatus.final_statuses and step_spec.start_time is not None and step_spec.end_time is None:
                # step has ended, set end_time if not yet set
                step_spec.end_time = now_time
            self.tbif.update_workflow_step(step_spec)
            process_result.success = True
            tmp_log.info(f"Checked step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}")
        except Exception as e:
            process_result.message = f"Got error {str(e)}"
            tmp_log.error(f"Got error ; {traceback.format_exc()}")
        return process_result

    def process_step(
        self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "dog"
    ) -> tuple[WFStepProcessResult | None, WFStepSpec]:
        """
        Process a single workflow step

        Dispatches to the per-status handler under the step lock; if the lock
        cannot be acquired the step is skipped and (None, step_spec) is returned.

        Args:
            step_spec (WFStepSpec): The workflow step specification to process
            data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data name to WFDataSpec for the workflow
            by (str): The entity processing the step, e.g., "dog" or "user"

        Returns:
            WFStepProcessResult | None: The result of processing the step, or None if the step was skipped
            WFStepSpec: The updated workflow step specification
        """
        tmp_log = LogWrapper(logger, f"process_step workflow_id={step_spec.workflow_id} member_id={step_spec.member_id} by={by}")
        # tmp_log.debug("Start")
        tmp_res = None
        with self.workflow_step_lock(step_spec.step_id) as locked_step_spec:
            if locked_step_spec is None:
                tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped")
                return None, step_spec
            step_spec = locked_step_spec
            orig_status = step_spec.status
            # Process the step according to its current status
            if step_spec.status == WFStepStatus.registered:
                tmp_res = self.process_step_registered(step_spec)
            elif step_spec.status == WFStepStatus.checking:
                tmp_res = self.process_step_checking(step_spec)
            elif step_spec.status in WFStepStatus.checked_statuses:
                tmp_res = self.process_step_checked(step_spec)
            elif step_spec.status == WFStepStatus.pending:
                tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map)
            elif step_spec.status == WFStepStatus.ready:
                tmp_res = self.process_step_ready(step_spec)
            elif step_spec.status == WFStepStatus.starting:
                tmp_res = self.process_step_starting(step_spec)
            elif step_spec.status == WFStepStatus.running:
                tmp_res = self.process_step_running(step_spec)
            elif step_spec.status in WFStepStatus.final_statuses:
                # dummy result since final steps need no processing
                dummy_process_result = WFStepProcessResult()
                dummy_process_result.success = True
                tmp_res = dummy_process_result
                tmp_log.debug(f"Step in final status {step_spec.status} ; skipped")
            else:
                tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped")
        # For changes into transient status, send message to trigger processing immediately
        if step_spec.status != orig_status and step_spec.status in WFStepStatus.transient_statuses:
            self.send_step_message(step_spec.step_id)
        return tmp_res, step_spec

    def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "dog") -> Dict:
        """
        Process a list of workflow steps

        Args:
            step_specs (List[WFStepSpec]): List of workflow step specifications to process
            data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data name to WFDataSpec for the workflow
            by (str): The entity processing the steps, e.g., "dog" or "user"

        Returns:
            Dict: Statistics of the processing results
        """
        tmp_log = LogWrapper(logger, f"process_steps by={by}")
        n_steps = len(step_specs)
        tmp_log.debug(f"Start, processing {n_steps} steps")
        steps_status_stats = {"n_steps": n_steps, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0}
        for step_spec in step_specs:
            orig_status = step_spec.status
            tmp_res, step_spec = self.process_step(step_spec, data_spec_map=data_spec_map, by=by)
            if tmp_res and tmp_res.success:
                # update stats; a step counts as changed only when the handler
                # reported a new status AND the status actually differs
                if tmp_res.new_status and step_spec.status != orig_status:
                    steps_status_stats["changed"].setdefault(step_spec.status, 0)
                    steps_status_stats["changed"][step_spec.status] += 1
                else:
                    steps_status_stats["unchanged"].setdefault(step_spec.status, 0)
                    steps_status_stats["unchanged"][step_spec.status] += 1
                steps_status_stats["processed"].setdefault(step_spec.status, 0)
                steps_status_stats["processed"][step_spec.status] += 1
                steps_status_stats["n_processed"] += 1
        tmp_log.info(
            f"Done, processed {steps_status_stats['n_processed']}/{n_steps} steps, unchanged: {steps_status_stats['unchanged']}, changed: {steps_status_stats['changed']}"
        )
        return steps_status_stats

    # ---- Workflow status transitions -------------------------

    def process_workflow_registered(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult:
        """
        Process a workflow in registered status
        To parse to get workflow definition from raw request

        Args:
            workflow_spec (WorkflowSpec): The workflow specification to process

        Returns:
            WorkflowProcessResult: The result of processing the workflow
        """
        tmp_log = LogWrapper(logger, f"process_workflow_registered ")
        # tmp_log.debug("Start")
        # Initialize
        process_result = WorkflowProcessResult()
        # Check status
        if workflow_spec.status != WorkflowStatus.registered:
            process_result.message = f"Workflow status changed unexpectedly from {WorkflowStatus.registered} to {workflow_spec.status}; skipped"
            tmp_log.warning(f"{process_result.message}")
            return process_result
        # Process
        try:
            if workflow_spec.definition_json is not None:
                # Already has definition, skip parsing
                tmp_log.debug(f"Workflow already has definition; skipped parsing")
            else:
                # Parse the workflow definition from raw request
                raw_request_dict = workflow_spec.raw_request_json_map
                sandbox_url = os.path.join(raw_request_dict["sourceURL"], "cache", raw_request_dict["sandbox"])
                log_token = f'< user="{workflow_spec.username}" outDS={raw_request_dict["outDS"]}>'
                is_ok, is_fatal, workflow_definition = parse_raw_request(
                    sandbox_url=sandbox_url,
                    log_token=log_token,
                    user_name=workflow_spec.username,
                    raw_request_dict=raw_request_dict,
                )
                # Failure handling
                # if is_fatal:
                if False:  # disable fatal for now
                    process_result.message = f"Fatal error in parsing raw request; cancelled the workflow"
                    tmp_log.error(f"{process_result.message}")
                    workflow_spec.status = WorkflowStatus.cancelled
                    workflow_spec.set_parameter("cancel_reason", "Fatal error in parsing raw request")
                    self.tbif.update_workflow(workflow_spec)
                    return process_result
                if not is_ok:
                    process_result.message = f"Failed to parse raw request; skipped"
                    tmp_log.warning(f"{process_result.message}")
                    return process_result
                # extra info from raw request
                workflow_definition["user_dn"] = raw_request_dict.get("user_dn")
                # Parsed successfully, update definition
                workflow_spec.definition_json = json.dumps(workflow_definition, default=json_serialize_default)
                tmp_log.debug(f"Parsed raw request into definition")
            # Update status to parsed
            # workflow_spec.status = WorkflowStatus.parsed
            workflow_spec.status = WorkflowStatus.checked  # skip parsed for now
            # Update DB
            self.tbif.update_workflow(workflow_spec)
            process_result.success = True
            process_result.new_status = workflow_spec.status
            tmp_log.info(f"Done, status={workflow_spec.status}")
        except Exception as e:
            process_result.message = f"Got error {str(e)}"
            tmp_log.error(f"Got error ; {traceback.format_exc()}")
        return process_result

    def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult:
        """
        Process a workflow in checked status
        Register steps, and update its status
        Parse raw request into workflow definition, register steps, and update its status

        Args:
            workflow_spec (WorkflowSpec): The workflow specification to process

        Returns:
            WorkflowProcessResult: The result of processing the workflow
        """
        tmp_log = LogWrapper(logger, f"process_workflow_checked ")
        # tmp_log.debug("Start")
        # Initialize
        process_result = WorkflowProcessResult()
        # Check status
        if workflow_spec.status != WorkflowStatus.checked:
            process_result.message = f"Workflow status changed unexpectedly from {WorkflowStatus.checked} to {workflow_spec.status}; skipped"
            tmp_log.warning(f"{process_result.message}")
            return process_result
        # Process
        try:
            # Parse the workflow definition
            workflow_definition = workflow_spec.definition_json_map
            if workflow_definition is None:
                process_result.message = f"Workflow definition is None; cancelled the workflow"
                tmp_log.error(f"{process_result.message}")
                workflow_spec.status = WorkflowStatus.cancelled
                workflow_spec.set_parameter("cancel_reason", "Workflow definition is None")
                self.tbif.update_workflow(workflow_spec)
                return process_result
            # initialize
            data_specs = []
            step_specs = []
            now_time = naive_utcnow()
            # Register root outputs
            for output_name, output_dict in workflow_definition["root_outputs"].items():
                data_spec = WFDataSpec()
                data_spec.workflow_id = workflow_spec.workflow_id
                data_spec.source_step_id = None  # to be set when the step producing it starts
                data_spec.name = output_name
                data_spec.target_id = output_dict.get("value")
                data_spec.set_parameter("output_types", output_dict.get("output_types"))
                data_spec.status = WFDataStatus.registered
                data_spec.type = WFDataType.output
                data_spec.flavor = "panda_task"  # FIXME: hardcoded flavor, should be configurable
                data_spec.creation_time = now_time
                data_specs.append(data_spec)
            # Register root inputs
            for input_name, input_target in workflow_definition["root_inputs"].items():
                data_spec = WFDataSpec()
                data_spec.workflow_id = workflow_spec.workflow_id
                data_spec.name = input_name
                data_spec.target_id = input_target
                data_spec.status = WFDataStatus.registered
                data_spec.type = WFDataType.input
                data_spec.flavor = "ddm_collection"  # FIXME: hardcoded flavor, should be configurable
                data_spec.creation_time = now_time
                data_specs.append(data_spec)
            # Register steps and their intermediate outputs based on nodes in the definition
            for node in workflow_definition["nodes"]:
                # FIXME: not yet consider scatter, condition, loop, etc.
                if not (node.get("condition") or node.get("scatter") or node.get("loop")):
                    step_spec = WFStepSpec()
                    step_spec.workflow_id = workflow_spec.workflow_id
                    step_spec.member_id = node["id"]
                    step_spec.name = node["name"]
                    step_spec.status = WFStepStatus.registered
                    step_spec.type = WFStepType.ordinary
                    step_spec.flavor = "panda_task"  # FIXME: hardcoded flavor, should be configurable
                    # step definition
                    step_definition = copy.deepcopy(node)
                    # propagate user name and DN from workflow to step
                    step_definition["user_name"] = workflow_spec.username
                    step_definition["user_dn"] = workflow_definition.get("user_dn")
                    # resolve inputs and outputs
                    input_data_set = set()
                    output_data_dict = dict()
                    for input_target in step_definition.get("inputs", {}).values():
                        if not input_target.get("source"):
                            continue
                        sources = []
                        if isinstance(input_target["source"], list):
                            sources = copy.deepcopy(input_target["source"])
                        else:
                            sources = [input_target["source"]]
                        input_data_set.update(sources)
                    for output_name, output_value in step_definition.get("outputs", {}).items():
                        output_data_dict[output_name] = output_value.get("value")
                    step_definition["input_data_list"] = list(input_data_set)
                    step_definition["output_data_list"] = list(output_data_dict.keys())
                    step_spec.definition_json_map = step_definition
                    step_spec.creation_time = now_time
                    step_specs.append(step_spec)
                    # intermediate outputs of the step
                    for output_data_name in output_data_dict.keys():
                        if output_data_name not in workflow_definition["root_outputs"]:
                            data_spec = WFDataSpec()
                            data_spec.workflow_id = workflow_spec.workflow_id
                            data_spec.source_step_id = None  # to be set when step starts
                            data_spec.name = output_data_name
                            data_spec.target_id = output_data_dict[output_data_name]
                            data_spec.set_parameter("output_types", step_definition.get("output_types", []))
                            data_spec.status = WFDataStatus.registered
                            data_spec.type = WFDataType.mid
                            data_spec.flavor = "panda_task"  # FIXME: hardcoded flavor, should be configurable
                            data_spec.creation_time = now_time
                            data_specs.append(data_spec)
            # Update status to starting
            workflow_spec.status = WorkflowStatus.starting
            # Upsert DB
            self.tbif.upsert_workflow_entities(
                workflow_spec.workflow_id,
                actions_dict={"workflow": "update", "steps": "insert", "data": "insert"},
                workflow_spec=workflow_spec,
                step_specs=step_specs,
                data_specs=data_specs,
            )
            process_result.success = True
            process_result.new_status = workflow_spec.status
            tmp_log.info(f"Done, inserted {len(step_specs)} steps and {len(data_specs)} data, status={workflow_spec.status}")
        except Exception as e:
            process_result.message = f"Got error {str(e)}"
            tmp_log.error(f"Got error ; {traceback.format_exc()}")
        return process_result

    def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult:
        """
        Process a workflow in starting status
        To start the steps in the workflow

        Args:
            workflow_spec (WorkflowSpec): The workflow specification to process

        Returns:
            WorkflowProcessResult: The result of processing the workflow
        """
        tmp_log = LogWrapper(logger, f"process_workflow_starting ")
        # tmp_log.debug("Start")
        # Initialize
        process_result = WorkflowProcessResult()
        # Check status
        if workflow_spec.status != WorkflowStatus.starting:
            process_result.message = f"Workflow status changed unexpectedly from {WorkflowStatus.starting} to {workflow_spec.status}; skipped"
            tmp_log.warning(f"{process_result.message}")
            return process_result
        # Process
        try:
            # Process data specs first
            data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFDataStatus.terminated_statuses))
            if data_specs:
                data_status_stats = self.process_datas(data_specs)
            # Get steps in registered status
            required_step_statuses = list(WFStepStatus.to_advance_step_statuses)
            over_advanced_step_statuses = list(WFStepStatus.after_starting_uninterrupted_statuses)
            step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=required_step_statuses)
            over_advanced_step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=over_advanced_step_statuses)
            if not step_specs:
                process_result.message = f"No step in required status; skipped"
                tmp_log.warning(f"{process_result.message}")
                return process_result
            if over_advanced_step_specs:
                process_result.message = f"Some steps are not in required status; force to advance the workflow"
                tmp_log.warning(f"{process_result.message}")
                # Advance the workflow to running directly
                workflow_spec.status = WorkflowStatus.running
                workflow_spec.start_time = naive_utcnow()
                self.tbif.update_workflow(workflow_spec)
                process_result.success = True
                process_result.new_status = workflow_spec.status
                tmp_log.info(f"Done, forced advanced to status={workflow_spec.status}")
                return process_result
            # Get data spec map of the workflow
            data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id)
            data_spec_map = {data_spec.name: data_spec for data_spec in data_specs}
            # Process steps
            steps_status_stats = self.process_steps(step_specs, data_spec_map=data_spec_map)
            # Update workflow status to running if any of step is starting
            now_time = naive_utcnow()
            if steps_status_stats["processed"].get(WFStepStatus.starting):
                workflow_spec.status = WorkflowStatus.running
                workflow_spec.start_time = now_time
                workflow_spec.check_time = now_time
                self.tbif.update_workflow(workflow_spec)
                process_result.success = True
                process_result.new_status = workflow_spec.status
                tmp_log.info(f"Done, advanced to status={workflow_spec.status}")
            else:
                workflow_spec.check_time = now_time
                self.tbif.update_workflow(workflow_spec)
                process_result.success = True
                tmp_log.info(f"Done, status remains {workflow_spec.status}")
        except Exception as e:
            process_result.message = f"Got error {str(e)}"
            tmp_log.error(f"Got error ; {traceback.format_exc()}")
        return process_result

    def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult:
        """
        Process a workflow in running status
        To monitor the steps in the workflow

        Args:
            workflow_spec (WorkflowSpec): The workflow specification to process

        Returns:
            WorkflowProcessResult: The result of processing the workflow
        """
        tmp_log = LogWrapper(logger, f"process_workflow_running ")
        # tmp_log.debug("Start")
        # Initialize
        process_result = WorkflowProcessResult()
        # Check status
        if workflow_spec.status != WorkflowStatus.running:
            process_result.message = f"Workflow status changed unexpectedly from {WorkflowStatus.running} to {workflow_spec.status}; skipped"
            tmp_log.warning(f"{process_result.message}")
            return process_result
        # Process
        try:
            # Process data specs first
            data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFDataStatus.terminated_statuses))
            if data_specs:
                data_status_stats = self.process_datas(data_specs)
            # Get steps
            step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id)
            if not step_specs:
                process_result.message = f"No step in required status; skipped"
                tmp_log.warning(f"{process_result.message}")
                return process_result
            # Get data spec map of the workflow
            data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id)
            data_spec_map = {data_spec.name: data_spec for data_spec in data_specs}
            output_data_spec_map = {data_spec.name: data_spec for data_spec in data_specs if data_spec.type == WFDataType.output}
            # Check if all output data are good
            # all_outputs_good stays None when there is no output data, so an
            # output-less workflow is never marked done by this shortcut
            all_outputs_good = None
            for output_data_name, output_data_spec in output_data_spec_map.items():
                if output_data_spec.status in WFDataStatus.good_output_statuses:
                    if all_outputs_good is None:
                        all_outputs_good = True
                else:
                    all_outputs_good = False
                    break
            if all_outputs_good is True:
                # All outputs are good, mark the workflow as done
                workflow_spec.status = WorkflowStatus.done
                workflow_spec.end_time = naive_utcnow()
                self.tbif.update_workflow(workflow_spec)
                process_result.success = True
                process_result.new_status = workflow_spec.status
                tmp_log.info(f"Done, all output data are good; advanced to status={workflow_spec.status}")
                return process_result
            # Process each step
            steps_status_stats = self.process_steps(step_specs, data_spec_map=data_spec_map)
            # Update workflow status by steps
            now_time = naive_utcnow()
            # the walrus binds processed_steps_stats for use in both branches below
            if (processed_steps_stats := steps_status_stats["processed"]) and (
                processed_steps_stats.get(WFStepStatus.failed) or processed_steps_stats.get(WFStepStatus.cancelled)
            ):
                # TODO: cancel all unfinished steps
                # self.cancel_step(...)
                # mark workflow as failed
                tmp_log.warning(f"workflow failed due to some steps failed or cancelled")
                workflow_spec.status = WorkflowStatus.failed
                workflow_spec.end_time = now_time
                workflow_spec.check_time = now_time
                self.tbif.update_workflow(workflow_spec)
                process_result.success = True
                process_result.new_status = workflow_spec.status
                tmp_log.info(f"Done, advanced to status={workflow_spec.status}")
            else:
                workflow_spec.check_time = now_time
                self.tbif.update_workflow(workflow_spec)
                process_result.success = True
                tmp_log.info(f"Done, status remains {workflow_spec.status}")
                if processed_steps_stats.get(WFStepStatus.done) == len(step_specs):
                    # all steps are done, trigger re-check to update workflow status
                    self.send_workflow_message(workflow_spec.workflow_id)
        except Exception as e:
            process_result.message = f"Got error {str(e)}"
            tmp_log.error(f"Got error ; {traceback.format_exc()}")
        return process_result

    def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "dog") -> tuple[WorkflowProcessResult, WorkflowSpec]:
        """
        Process a workflow based on its current status

        Args:
            workflow_spec (WorkflowSpec): The workflow specification to process
            by (str): The entity processing the workflow

        Returns:
            WorkflowProcessResult: The result of processing the workflow
            WorkflowSpec: The updated workflow specification
        """
        tmp_log = LogWrapper(logger, f"process_workflow by={by}")
        tmp_log.debug(f"Start, current status={workflow_spec.status}")
        # Initialize
        process_result = WorkflowProcessResult()
        orig_status = workflow_spec.status
        # Process based on status
        match workflow_spec.status:
            case WorkflowStatus.registered:
                process_result = self.process_workflow_registered(workflow_spec)
            case WorkflowStatus.checked:
                process_result = self.process_workflow_checked(workflow_spec)
            case WorkflowStatus.starting:
                process_result = self.process_workflow_starting(workflow_spec)
            case WorkflowStatus.running:
                process_result = self.process_workflow_running(workflow_spec)
            case _:
                process_result.message = f"Workflow status {workflow_spec.status} is not handled in this context; skipped"
                tmp_log.warning(f"{process_result.message}")
        # For changes into transient status, send message to trigger processing immediately
        if workflow_spec.status != orig_status and workflow_spec.status in WorkflowStatus.transient_statuses:
            self.send_workflow_message(workflow_spec.workflow_id)
        return process_result, workflow_spec

    # ---- Process all workflows -------------------------------------

    def process_active_workflows(self) -> Dict:
        """
        Process all active workflows in the system

        Returns:
            Dict: Statistics of the processing results
        """
        tmp_log = LogWrapper(logger, "process_active_workflows")
        # tmp_log.debug("Start")
        # Initialize
        workflows_status_stats = {"n_workflows": 0, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0}
        try:
            # Query active workflows to process
            workflow_specs = self.tbif.query_workflows(status_filter_list=WorkflowStatus.active_statuses, check_interval_sec=WORKFLOW_CHECK_INTERVAL_SEC)
            n_workflows = len(workflow_specs)
            tmp_log.debug(f"Got {n_workflows} workflows to process")
            if n_workflows == 0:
                tmp_log.info("Done, no workflow to process")
                return workflows_status_stats
            # Process each workflow
            for workflow_spec in workflow_specs:
                with self.workflow_lock(workflow_spec.workflow_id) as locked_workflow_spec:
                    if locked_workflow_spec is None:
                        tmp_log.warning(f"Failed to acquire lock for workflow_id={workflow_spec.workflow_id}; skipped")
                        continue
                    workflow_spec = locked_workflow_spec
                    orig_status = workflow_spec.status
                    # Process the workflow
                    tmp_res, workflow_spec = self.process_workflow(workflow_spec)
                    if tmp_res and tmp_res.success:
                        # update stats
                        if tmp_res.new_status and workflow_spec.status != orig_status:
                            workflows_status_stats["changed"].setdefault(workflow_spec.status, 0)
                            workflows_status_stats["changed"][workflow_spec.status] += 1
                        else:
                            workflows_status_stats["unchanged"].setdefault(workflow_spec.status, 0)
                            workflows_status_stats["unchanged"][workflow_spec.status] += 1
                        workflows_status_stats["processed"].setdefault(workflow_spec.status, 0)
                        workflows_status_stats["processed"][workflow_spec.status] += 1
                        workflows_status_stats["n_processed"] += 1
            workflows_status_stats["n_workflows"] = n_workflows
            tmp_log.info(
                f"Done, processed {workflows_status_stats['n_processed']}/{n_workflows} workflows, unchanged: {workflows_status_stats['unchanged']}, changed: {workflows_status_stats['changed']}"
            )
        except Exception as e:
            tmp_log.error(f"Got error ; {traceback.format_exc()}")
        return workflows_status_stats
import argparse
import sys


def parse_args():
    """
    Parse command-line arguments for the workflow core smoke test.

    Returns:
        argparse.Namespace: parsed arguments with attributes
            ``action`` (currently only "cancel_workflow"),
            ``force`` (bool), and ``workflow_id`` (str).
    """
    parser = argparse.ArgumentParser(description="Workflow core smoke test helper")
    parser.add_argument("action", choices=["cancel_workflow"], help="Action to perform in the smoke test")
    parser.add_argument("--force", action="store_true", help="Force into cancelled status")
    parser.add_argument("workflow_id", help="Workflow ID to operate on")
    return parser.parse_args()


def main():
    """
    Run the selected smoke-test action against the workflow interface.

    Initializes the task buffer (DB interface) and dispatches on the
    parsed ``action`` argument.
    """
    # Heavy PanDA imports are kept local so this module can be imported
    # (e.g. to reuse parse_args) without the full server stack installed.
    from pandacommon.pandautils.thread_utils import GenericThread

    from pandaserver.config import panda_config
    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    from pandaserver.workflow.workflow_core import WorkflowInterface

    args = parse_args()
    workflow_id = args.workflow_id

    # initialize the task buffer (DB interface)
    requester_id = GenericThread().get_full_id(__name__, sys.modules[__name__].__file__)
    taskBuffer.init(
        panda_config.dbhost,
        panda_config.dbpasswd,
        nDBConnection=panda_config.nDBConnection,
        useTimeout=True,
        requester=requester_id,
    )

    # interface for workflow operations
    wfif = WorkflowInterface(taskBuffer)

    if args.action == "cancel_workflow":
        print(f"Cancelling workflow_id={workflow_id} ...")
        res = wfif.cancel_workflow(workflow_id=workflow_id, force=args.force)
        if res:
            print(f"Cancelled workflow_id={workflow_id} successfully.")
        else:
            print(f"Failed to cancel workflow_id={workflow_id}.")


if __name__ == "__main__":
    main()
pandaserver.srvcore.CoreUtils import clean_user_id +from pandaserver.workflow import pcwl_utils, workflow_utils +from pandaserver.workflow.snakeparser import Parser + +# supported workflow description languages +SUPPORTED_WORKFLOW_LANGUAGES = ["cwl", "snakemake"] + +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + + +# ============================================================================== +# Native PanDA workflow functions +# ============================================================================== + + +def json_serialize_default(obj): + """ + Default JSON serializer for non-serializable objects of Node object + + Args: + obj (Any): Object to serialize + + Returns: + Any: JSON serializable object + """ + # convert set to list + if isinstance(obj, set): + return list(obj) + elif isinstance(obj, workflow_utils.Node): + return obj.id + return obj + + +def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tuple[bool, bool, dict]: + """ + Parse raw request with files in sandbox into workflow definition + + Args: + sandbox_url (str): URL to download sandbox + log_token (str): Log token + user_name (str): User name + raw_request_dict (dict): Raw request dictionary + + Returns: + bool: Whether the parsing is successful + bool: Whether the failure is fatal + dict: Workflow definition dictionary + """ + tmp_log = LogWrapper(logger, log_token) + is_ok = True + is_fatal = False + # request_id = None + workflow_definition_dict = dict() + cur_dir = os.getcwd() + + def _is_within_directory(base_dir: str, target_path: str) -> bool: + abs_base_dir = os.path.abspath(base_dir) + abs_target_path = os.path.abspath(target_path) + return os.path.commonpath([abs_base_dir, abs_target_path]) == abs_base_dir + + def _safe_extract_tar_gz(tar_path: str, extract_dir: str): + with tarfile.open(tar_path, mode="r:gz") as tar: + members = tar.getmembers() + for member in members: + member_name = member.name + normalized_name = 
os.path.normpath(member_name) + # security checks for tar member name + if os.path.isabs(member_name): + raise ValueError(f"absolute path in tar member is not allowed: {member_name}") + if normalized_name in ("", ".", "..") or normalized_name.startswith(".." + os.path.sep): + raise ValueError(f"path traversal in tar member is not allowed: {member_name}") + if member.issym() or member.islnk(): + raise ValueError(f"links in tar archive are not allowed: {member_name}") + if member.ischr() or member.isblk() or member.isfifo(): + raise ValueError(f"special file in tar archive is not allowed: {member_name}") + # check that the extraction target is within the extract_dir + extraction_target = os.path.join(extract_dir, normalized_name) + if not _is_within_directory(extract_dir, extraction_target): + raise ValueError(f"tar member extracts outside target directory: {member_name}") + # all checks passed, safe to extract + tar.extractall(path=extract_dir, members=members) + + try: + # go to temp dir + with tempfile.TemporaryDirectory() as tmp_dirname: + os.chdir(tmp_dirname) + # download sandbox + tmp_log.info(f"downloading sandbox from {sandbox_url}") + with requests.get(sandbox_url, allow_redirects=True, stream=True) as r: + if r.status_code == 400: + tmp_log.error("not found") + is_fatal = True + is_ok = False + elif r.status_code != 200: + tmp_log.error(f"bad HTTP response {r.status_code}") + is_ok = False + # validate sandbox filename + sandbox_name = raw_request_dict.get("sandbox") + if is_ok: + if not isinstance(sandbox_name, str): + tmp_log.error("sandbox filename is missing or not a string") + is_fatal = True + is_ok = False + else: + # sandbox filename must not contain any path separators + seps = [os.path.sep] + if os.path.altsep: + seps.append(os.path.altsep) + if any(sep in sandbox_name for sep in seps): + tmp_log.error("sandbox filename must not contain path separators") + is_fatal = True + is_ok = False + else: + sandbox_name = os.path.basename(sandbox_name) + # 
extract sandbox + if is_ok: + with open(sandbox_name, "wb") as fs: + for chunk in r.raw.stream(1024, decode_content=False): + if chunk: + fs.write(chunk) + fs.close() + try: + _safe_extract_tar_gz(sandbox_name, tmp_dirname) + except Exception as e: + dump_str = f"failed to extract {sandbox_name}: {traceback.format_exc()}" + tmp_log.error(dump_str) + is_fatal = True + is_ok = False + # parse workflow files + if is_ok: + tmp_log.info("parse workflow") + workflow_name = None + if (wf_lang := raw_request_dict["language"]) in SUPPORTED_WORKFLOW_LANGUAGES: + if wf_lang == "cwl": + workflow_name = raw_request_dict.get("workflow_name") + nodes, root_in = pcwl_utils.parse_workflow_file(raw_request_dict["workflowSpecFile"], tmp_log) + with open(raw_request_dict["workflowInputFile"]) as workflow_input: + yaml = YAML(typ="safe", pure=True) + data = yaml.load(workflow_input) + elif wf_lang == "snakemake": + parser = Parser(raw_request_dict["workflowSpecFile"], logger=tmp_log) + nodes, root_in = parser.parse_nodes() + data = dict() + # resolve nodes + s_id, t_nodes, nodes = workflow_utils.resolve_nodes(nodes, root_in, data, 0, set(), raw_request_dict["outDS"], tmp_log) + workflow_utils.set_workflow_outputs(nodes) + id_node_map = workflow_utils.get_node_id_map(nodes) + [node.resolve_params(raw_request_dict["taskParams"], id_node_map) for node in nodes] + dump_str = "the description was internally converted as follows\n" + workflow_utils.dump_nodes(nodes) + tmp_log.info(dump_str) + for node in nodes: + s_check, o_check = node.verify() + tmp_str = f"Verification failure in ID:{node.id} {o_check}" + if not s_check: + tmp_log.error(tmp_str) + dump_str += tmp_str + dump_str += "\n" + is_fatal = True + is_ok = False + else: + dump_str = f"{wf_lang} is not supported to describe the workflow" + tmp_log.error(dump_str) + is_fatal = True + is_ok = False + # genertate workflow definition + if is_ok: + # root inputs + root_inputs_dict = dict() + for k in root_in: + kk = k.split("#")[-1] + if 
kk in data: + root_inputs_dict[k] = data[kk] + # root outputs + root_outputs_dict = dict() + nodes_list = [] + # nodes + for node in nodes: + nodes_list.append(vars(node)) + if node.is_tail: + root_outputs_dict.update(node.outputs) + for out_val in root_outputs_dict.values(): + out_val["output_types"] = node.output_types + # workflow definition + workflow_definition_dict = { + "workflow_name": workflow_name, + "user_name": user_name, + "root_inputs": root_inputs_dict, + "root_outputs": root_outputs_dict, + "nodes": nodes_list, + } + except Exception as e: + is_ok = False + is_fatal = True + tmp_log.error(f"failed to run with {str(e)} {traceback.format_exc()}") + finally: + try: + os.chdir(cur_dir) + except Exception as e: + tmp_log.error(f"failed to restore working directory to {cur_dir}: {traceback.format_exc()}") + + # with tempfile.NamedTemporaryFile(delete=False, mode="w") as tmp_json: + # json.dump([is_ok, is_fatal, request_id, tmp_log.dumpToString()], tmp_json) + # print(tmp_json.name) + + return is_ok, is_fatal, workflow_definition_dict diff --git a/pandaserver/workflow/workflow_utils.py b/pandaserver/workflow/workflow_utils.py index d2b561bca..5ed1e60ab 100644 --- a/pandaserver/workflow/workflow_utils.py +++ b/pandaserver/workflow/workflow_utils.py @@ -423,9 +423,9 @@ def make_task_params(self, task_template, id_map, workflow_node): tmp_item["value"] = f"-a {task_params['buildSpec']['archiveName']}" del task_params["buildSpec"] # parent - if self.parents and len(self.parents) == 1: - task_params["noWaitParent"] = True - task_params["parentTaskName"] = id_map[list(self.parents)[0]].task_params["taskName"] + # if self.parents and len(self.parents) == 1: + # task_params["noWaitParent"] = True + # task_params["parentTaskName"] = id_map[list(self.parents)[0]].task_params["taskName"] # notification if not self.is_workflow_output: task_params["noEmail"] = True @@ -626,6 +626,181 @@ def set_workflow_outputs(node_list, all_parents=None): 
set_workflow_outputs(node.sub_nodes, all_parents) +# convert parameter names to parent IDs +def convert_params_in_condition_to_parent_ids(condition_item, input_data, id_map): + for item in ["left", "right"]: + param = getattr(condition_item, item) + if isinstance(param, str): + m = re.search(r"^[^\[]+\[(\d+)\]", param) + if m: + param = param.split("[")[0] + idx = int(m.group(1)) + else: + idx = None + isOK = False + for tmp_name, tmp_data in input_data.items(): + if param == tmp_name.split("/")[-1]: + isOK = True + if isinstance(tmp_data["parent_id"], list): + if idx is not None: + if idx < 0 or idx >= len(tmp_data["parent_id"]): + raise IndexError(f"index {idx} is out of bounds for parameter {param} with {len(tmp_data['parent_id'])} parents") + parent_id = tmp_data["parent_id"][idx] + if parent_id not in id_map: + raise ReferenceError(f"unresolved parent_id {parent_id} for parameter {param}[{idx}]") + setattr(condition_item, item, id_map[parent_id]) + else: + resolved_parent_ids = set() + for parent_id in tmp_data["parent_id"]: + if parent_id not in id_map: + raise ReferenceError(f"unresolved parent_id {parent_id} for parameter {param}") + resolved_parent_ids |= id_map[parent_id] + setattr(condition_item, item, list(resolved_parent_ids)) + else: + if tmp_data["parent_id"] not in id_map: + raise ReferenceError(f"unresolved parent_id {tmp_data['parent_id']} for parameter {param}") + setattr(condition_item, item, id_map[tmp_data["parent_id"]]) + break + if not isOK: + raise ReferenceError(f"unresolved parameter {param} in the condition string") + elif isinstance(param, ConditionItem): + convert_params_in_condition_to_parent_ids(param, input_data, id_map) + + +# resolve nodes +def resolve_nodes(node_list, root_inputs, data, serial_id, parent_ids, out_ds_name, log_stream): + for k in root_inputs: + kk = k.split("#")[-1] + if kk in data: + root_inputs[k] = data[kk] + tmp_to_real_id_map = {} + resolved_map = {} + # map of object identity to original temporary node ID 
used in resolved_map keys + node_key_map = {} + all_nodes = [] + for node in node_list: + # resolve input + for tmp_name, tmp_data in node.inputs.items(): + if not tmp_data["source"]: + continue + if isinstance(tmp_data["source"], list): + tmp_sources = tmp_data["source"] + if "parent_id" in tmp_data: + # Make a copy to avoid mutating the original list stored in node.inputs + tmp_parent_ids = list(tmp_data["parent_id"]) + tmp_parent_ids += [None] * (len(tmp_sources) - len(tmp_parent_ids)) + else: + tmp_parent_ids = [None] * len(tmp_sources) + else: + tmp_sources = [tmp_data["source"]] + if "parent_id" in tmp_data: + tmp_parent_ids = [tmp_data["parent_id"]] + else: + tmp_parent_ids = [None] * len(tmp_sources) + for tmp_source, tmp_parent_id in zip(tmp_sources, tmp_parent_ids): + isOK = False + # check root input + if tmp_source in root_inputs: + node.is_head = True + node.set_input_value(tmp_name, tmp_source, root_inputs[tmp_source]) + continue + # check parent output + for i in node.parents: + for r_node in resolved_map[i]: + if tmp_source in r_node.outputs: + node.set_input_value( + tmp_name, + tmp_source, + r_node.outputs[tmp_source]["value"], + ) + isOK = True + break + if isOK: + break + if isOK: + continue + # check resolved parent outputs + if tmp_parent_id is not None: + values = [list(r_node.outputs.values())[0]["value"] for r_node in resolved_map[tmp_parent_id]] + if len(values) == 1: + values = values[0] + node.set_input_value(tmp_name, tmp_source, values) + continue + # scatter + if node.scatter: + # resolve scattered parameters + scatters = None + sc_nodes = [] + for item in node.scatter: + if scatters is None: + scatters = [{item: v} for v in node.inputs[item]["value"]] + else: + [i.update({item: v}) for i, v in zip(scatters, node.inputs[item]["value"])] + for idx, item in enumerate(scatters): + sc_node = copy.deepcopy(node) + for k, v in item.items(): + sc_node.inputs[k]["value"] = v + for tmp_node in sc_node.sub_nodes: + tmp_node.scatter_index = idx 
+ tmp_node.upper_root_inputs = sc_node.root_inputs + sc_nodes.append(sc_node) + else: + sc_nodes = [node] + # loop over scattered nodes + for sc_node in sc_nodes: + original_node_id = sc_node.id + all_nodes.append(sc_node) + node_key_map[id(sc_node)] = original_node_id + # set real node ID + resolved_map.setdefault(original_node_id, []) + tmp_to_real_id_map.setdefault(original_node_id, set()) + # resolve parents + real_parens = set() + for i in sc_node.parents: + real_parens |= tmp_to_real_id_map[i] + sc_node.parents = real_parens + if sc_node.is_head: + sc_node.parents |= parent_ids + if sc_node.is_leaf: + resolved_map[original_node_id].append(sc_node) + tmp_to_real_id_map[original_node_id].add(serial_id) + sc_node.id = serial_id + serial_id += 1 + else: + serial_id, sub_tail_nodes, sc_node.sub_nodes = resolve_nodes( + sc_node.sub_nodes, + sc_node.root_inputs, + sc_node.convert_dict_inputs(), + serial_id, + sc_node.parents, + out_ds_name, + log_stream, + ) + resolved_map[original_node_id] += sub_tail_nodes + tmp_to_real_id_map[original_node_id] |= set([n.id for n in sub_tail_nodes]) + sc_node.id = serial_id + serial_id += 1 + # convert parameters to parent IDs in conditions + if sc_node.condition: + convert_params_in_condition_to_parent_ids(sc_node.condition, sc_node.inputs, tmp_to_real_id_map) + # resolve outputs + if sc_node.is_leaf: + for tmp_name, tmp_data in sc_node.outputs.items(): + tmp_data["value"] = f"{out_ds_name}_{sc_node.id:03d}_{sc_node.name}" + # add loop count for nodes in a loop + if sc_node.in_loop: + tmp_data["value"] += ".___idds___num_run___" + # return tails + tail_nodes = [] + for node in all_nodes: + original_node_id = node_key_map.get(id(node), node.id) + if node.is_tail: + tail_nodes.append(node) + else: + tail_nodes += resolved_map[original_node_id] + return serial_id, tail_nodes, all_nodes + + # condition item class ConditionItem(object): def __init__(self, left, right=None, operator=None): diff --git a/pyproject.toml b/pyproject.toml 
index d7a72e7b0..27ea4702a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ authors = [ { name = "PanDA Team", email = "panda-support@cern.ch" }, ] dependencies = [ - 'panda-common>=0.1.6', + 'panda-common>=0.1.8', 'panda-client-light>=1.5.55', 'pyOpenSSL', 'python-daemon', diff --git a/templates/sysconfig/panda_jedi.template b/templates/sysconfig/panda_jedi.template index 61f830267..2eb87f227 100644 --- a/templates/sysconfig/panda_jedi.template +++ b/templates/sysconfig/panda_jedi.template @@ -10,9 +10,11 @@ if [[ -n "${VIRTUAL_ENV}" ]]; then PATH=${VIRTUAL_ENV}/bin:${VIRTUAL_ENV}/usr/local/bin:${VIRTUAL_ENV}/usr/bin:${VIRTUAL_ENV}:${PATH} fi -# for DQ2 +# for Rucio export X509_CERT_DIR=/etc/grid-security/certificates export X509_USER_PROXY=/data/atlpan/x509up_u25606 +export RUCIO_ACCOUNT=panda +export RUCIO_APPID=pandasrv # panda home if [[ -n "${VIRTUAL_ENV}" ]]; then diff --git a/templates/sysconfig/panda_server.sysconfig.rpmnew.template b/templates/sysconfig/panda_server.sysconfig.rpmnew.template index 5d0aa25c7..e1e589fa4 100644 --- a/templates/sysconfig/panda_server.sysconfig.rpmnew.template +++ b/templates/sysconfig/panda_server.sysconfig.rpmnew.template @@ -18,7 +18,7 @@ if [[ -n "${VIRTUAL_ENV}" ]]; then fi unset LD_LIBRARY_PATH -# for DQ2 +# for Rucio export X509_CERT_DIR=/etc/grid-security/certificates if [[ -z "${PANDA_RUCIO_ACCOUNT}" ]]; then export RUCIO_ACCOUNT=panda diff --git a/templates/sysconfig/panda_server_env.systemd.rpmnew.template b/templates/sysconfig/panda_server_env.systemd.rpmnew.template index af31a47c2..61dbe6a87 100644 --- a/templates/sysconfig/panda_server_env.systemd.rpmnew.template +++ b/templates/sysconfig/panda_server_env.systemd.rpmnew.template @@ -7,7 +7,7 @@ OPTIONS="-f @@virtual_env@@/etc/panda/panda_server-httpd.conf" PATH=@@virtual_env@@/bin:/bin:/usr/local/bin:/usr/bin LD_LIBRARY_PATH= -# for DQ2 +# for Rucio X509_CERT_DIR=/etc/grid-security/certificates RUCIO_ACCOUNT=panda RUCIO_APPID=pandasrv