From a0d72acca093ad7f2fced5e733e3c91609e3c526 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 20 May 2025 13:25:27 +0200 Subject: [PATCH 001/101] workflow: regain changes --- pandaserver/taskbuffer/workflow_processor.py | 44 ++---- pandaserver/workflow/psnakemake_test.py | 2 +- pandaserver/workflow/workflow_utils.py | 158 +++++++++++++++++++ 3 files changed, 175 insertions(+), 29 deletions(-) diff --git a/pandaserver/taskbuffer/workflow_processor.py b/pandaserver/taskbuffer/workflow_processor.py index c3861d277..00a0768fd 100644 --- a/pandaserver/taskbuffer/workflow_processor.py +++ b/pandaserver/taskbuffer/workflow_processor.py @@ -21,6 +21,9 @@ _logger = PandaLogger().getLogger("workflow_processor") +SUPPORTED_WORKFLOW_LANGUAGES = ["cwl", "snakemake"] + + # process workflow class WorkflowProcessor(object): # constructor @@ -170,34 +173,19 @@ def core_exec(sandbox_url, log_token, dump_workflow, ops_file, user_name, test_m if is_OK: tmpLog.info("parse workflow") workflow_name = None - if ops["data"]["language"] == "cwl": - workflow_name = ops["data"].get("workflow_name") - nodes, root_in = pcwl_utils.parse_workflow_file(ops["data"]["workflowSpecFile"], tmpLog) - with open(ops["data"]["workflowInputFile"]) as workflow_input: - yaml = YAML(typ="safe", pure=True) - data = yaml.load(workflow_input) - # noinspection DuplicatedCode - s_id, t_nodes, nodes = pcwl_utils.resolve_nodes(nodes, root_in, data, 0, set(), ops["data"]["outDS"], tmpLog) - workflow_utils.set_workflow_outputs(nodes) - id_node_map = workflow_utils.get_node_id_map(nodes) - [node.resolve_params(ops["data"]["taskParams"], id_node_map) for node in nodes] - dump_str = "the description was internally converted as follows\n" + workflow_utils.dump_nodes(nodes) - tmpLog.info(dump_str) - for node in nodes: - s_check, o_check = node.verify() - tmp_str = f"Verification failure in ID:{node.id} {o_check}" - if not s_check: - tmpLog.error(tmp_str) - dump_str += tmp_str - dump_str += "\n" - is_fatal = True - is_OK 
= False - elif ops["data"]["language"] == "snakemake": - parser = Parser(ops["data"]["workflowSpecFile"], logger=tmpLog) - nodes, root_in = parser.parse_nodes() - data = dict() - # noinspection DuplicatedCode - s_id, t_nodes, nodes = pcwl_utils.resolve_nodes(nodes, root_in, data, 0, set(), ops["data"]["outDS"], tmpLog) + if (wf_lang := ops["data"]["language"]) in SUPPORTED_WORKFLOW_LANGUAGES: + if wf_lang == "cwl": + workflow_name = ops["data"].get("workflow_name") + nodes, root_in = pcwl_utils.parse_workflow_file(ops["data"]["workflowSpecFile"], tmpLog) + with open(ops["data"]["workflowInputFile"]) as workflow_input: + yaml = YAML(typ="safe", pure=True) + data = yaml.load(workflow_input) + elif wf_lang == "snakemake": + parser = Parser(ops["data"]["workflowSpecFile"], logger=tmpLog) + nodes, root_in = parser.parse_nodes() + data = dict() + # resolve nodes + s_id, t_nodes, nodes = workflow_utils.resolve_nodes(nodes, root_in, data, 0, set(), ops["data"]["outDS"], tmpLog) workflow_utils.set_workflow_outputs(nodes) id_node_map = workflow_utils.get_node_id_map(nodes) [node.resolve_params(ops["data"]["taskParams"], id_node_map) for node in nodes] diff --git a/pandaserver/workflow/psnakemake_test.py b/pandaserver/workflow/psnakemake_test.py index ae03c0950..e8cf5aaa3 100644 --- a/pandaserver/workflow/psnakemake_test.py +++ b/pandaserver/workflow/psnakemake_test.py @@ -7,11 +7,11 @@ from snakeparser import Parser -from pandaserver.workflow.pcwl_utils import resolve_nodes from pandaserver.workflow.workflow_utils import ( convert_nodes_to_workflow, dump_nodes, get_node_id_map, + resolve_nodes, set_workflow_outputs, ) diff --git a/pandaserver/workflow/workflow_utils.py b/pandaserver/workflow/workflow_utils.py index d2b561bca..dce483be5 100644 --- a/pandaserver/workflow/workflow_utils.py +++ b/pandaserver/workflow/workflow_utils.py @@ -626,6 +626,164 @@ def set_workflow_outputs(node_list, all_parents=None): set_workflow_outputs(node.sub_nodes, all_parents) +# convert 
parameter names to parent IDs +def convert_params_in_condition_to_parent_ids(condition_item, input_data, id_map): + for item in ["left", "right"]: + param = getattr(condition_item, item) + if isinstance(param, str): + m = re.search(r"^[^\[]+\[(\d+)\]", param) + if m: + param = param.split("[")[0] + idx = int(m.group(1)) + else: + idx = None + isOK = False + for tmp_name, tmp_data in input_data.items(): + if param == tmp_name.split("/")[-1]: + isOK = True + if isinstance(tmp_data["parent_id"], list): + if idx is not None: + setattr(condition_item, item, id_map[tmp_data["parent_id"][idx]]) + else: + setattr(condition_item, item, id_map[tmp_data["parent_id"]]) + else: + setattr(condition_item, item, id_map[tmp_data["parent_id"]]) + break + if not isOK: + raise ReferenceError(f"unresolved paramter {param} in the condition string") + elif isinstance(param, ConditionItem): + convert_params_in_condition_to_parent_ids(param, input_data, id_map) + + +# resolve nodes +def resolve_nodes(node_list, root_inputs, data, serial_id, parent_ids, out_ds_name, log_stream): + for k in root_inputs: + kk = k.split("#")[-1] + if kk in data: + root_inputs[k] = data[kk] + tmp_to_real_id_map = {} + resolved_map = {} + all_nodes = [] + for node in node_list: + # resolve input + for tmp_name, tmp_data in node.inputs.items(): + if not tmp_data["source"]: + continue + if isinstance(tmp_data["source"], list): + tmp_sources = tmp_data["source"] + if "parent_id" in tmp_data: + tmp_parent_ids = tmp_data["parent_id"] + tmp_parent_ids += [None] * (len(tmp_sources) - len(tmp_parent_ids)) + else: + tmp_parent_ids = [None] * len(tmp_sources) + else: + tmp_sources = [tmp_data["source"]] + if "parent_id" in tmp_data: + tmp_parent_ids = [tmp_data["parent_id"]] + else: + tmp_parent_ids = [None] * len(tmp_sources) + for tmp_source, tmp_parent_id in zip(tmp_sources, tmp_parent_ids): + isOK = False + # check root input + if tmp_source in root_inputs: + node.is_head = True + node.set_input_value(tmp_name, 
tmp_source, root_inputs[tmp_source]) + continue + # check parent output + for i in node.parents: + for r_node in resolved_map[i]: + if tmp_source in r_node.outputs: + node.set_input_value( + tmp_name, + tmp_source, + r_node.outputs[tmp_source]["value"], + ) + isOK = True + break + if isOK: + break + if isOK: + continue + # check resolved parent outputs + if tmp_parent_id is not None: + values = [list(r_node.outputs.values())[0]["value"] for r_node in resolved_map[tmp_parent_id]] + if len(values) == 1: + values = values[0] + node.set_input_value(tmp_name, tmp_source, values) + continue + # scatter + if node.scatter: + # resolve scattered parameters + scatters = None + sc_nodes = [] + for item in node.scatter: + if scatters is None: + scatters = [{item: v} for v in node.inputs[item]["value"]] + else: + [i.update({item: v}) for i, v in zip(scatters, node.inputs[item]["value"])] + for idx, item in enumerate(scatters): + sc_node = copy.deepcopy(node) + for k, v in item.items(): + sc_node.inputs[k]["value"] = v + for tmp_node in sc_node.sub_nodes: + tmp_node.scatter_index = idx + tmp_node.upper_root_inputs = sc_node.root_inputs + sc_nodes.append(sc_node) + else: + sc_nodes = [node] + # loop over scattered nodes + for sc_node in sc_nodes: + all_nodes.append(sc_node) + # set real node ID + resolved_map.setdefault(sc_node.id, []) + tmp_to_real_id_map.setdefault(sc_node.id, set()) + # resolve parents + real_parens = set() + for i in sc_node.parents: + real_parens |= tmp_to_real_id_map[i] + sc_node.parents = real_parens + if sc_node.is_head: + sc_node.parents |= parent_ids + if sc_node.is_leaf: + resolved_map[sc_node.id].append(sc_node) + tmp_to_real_id_map[sc_node.id].add(serial_id) + sc_node.id = serial_id + serial_id += 1 + else: + serial_id, sub_tail_nodes, sc_node.sub_nodes = resolve_nodes( + sc_node.sub_nodes, + sc_node.root_inputs, + sc_node.convert_dict_inputs(), + serial_id, + sc_node.parents, + out_ds_name, + log_stream, + ) + resolved_map[sc_node.id] += 
sub_tail_nodes + tmp_to_real_id_map[sc_node.id] |= set([n.id for n in sub_tail_nodes]) + sc_node.id = serial_id + serial_id += 1 + # convert parameters to parent IDs in conditions + if sc_node.condition: + convert_params_in_condition_to_parent_ids(sc_node.condition, sc_node.inputs, tmp_to_real_id_map) + # resolve outputs + if sc_node.is_leaf: + for tmp_name, tmp_data in sc_node.outputs.items(): + tmp_data["value"] = f"{out_ds_name}_{sc_node.id:03d}_{sc_node.name}" + # add loop count for nodes in a loop + if sc_node.in_loop: + tmp_data["value"] += ".___idds___num_run___" + # return tails + tail_nodes = [] + for node in all_nodes: + if node.is_tail: + if node.is_tail: + tail_nodes.append(node) + else: + tail_nodes += resolved_map[node.id] + return serial_id, tail_nodes, all_nodes + + # condition item class ConditionItem(object): def __init__(self, left, right=None, operator=None): From abadf0e9d8fa36094ccd19f0a74e62cf4aebf5a5 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 21 May 2025 15:48:52 +0200 Subject: [PATCH 002/101] workflow: temp func --- pandaserver/workflow/workflow_utils.py | 202 +++++++++++++++++++++++++ 1 file changed, 202 insertions(+) diff --git a/pandaserver/workflow/workflow_utils.py b/pandaserver/workflow/workflow_utils.py index dce483be5..7a1600a63 100644 --- a/pandaserver/workflow/workflow_utils.py +++ b/pandaserver/workflow/workflow_utils.py @@ -1025,3 +1025,205 @@ def convert_nodes_to_workflow(nodes, workflow_node=None, workflow=None, workflow if not is_top: return id_work_map, dump_str_list return workflow, dump_str_list + + +# register panda workflow +def register_panda_workflow(nodes, workflow_node=None, workflow=None, workflow_name=None): + """ + Register nodes as PanDA workflow + """ + # if workflow is None: + # is_top = True + # workflow = Workflow() + # workflow.name = workflow_name + # else: + # is_top = False + # id_work_map = {} + # all_sub_id_work_map = {} + # sub_to_id_map = {} + # cond_dump_str = " Conditions\n" + # 
class_dump_str = f"===== Workflow ID:{workflow_node.id if workflow_node else workflow_name} ====\n" + # class_dump_str += " Works\n" + # dump_str_list = [] + # # create works or workflows + # for node in nodes: + # if node.is_leaf: + # # work + # if node.type == "junction": + # work = ATLASLocalPandaWork(task_parameters=node.task_params) + # work.add_custom_condition("to_exit", True) + # else: + # work = ATLASPandaWork(task_parameters=node.task_params) + # workflow.add_work(work) + # id_work_map[node.id] = work + # class_dump_str += f" {node.short_desc()} Class:{work.__class__.__name__}\n" + # else: + # # sub workflow + # sub_workflow = Workflow() + # id_work_map[node.id] = sub_workflow + # class_dump_str += f" {node.short_desc()} Class:{sub_workflow.__class__.__name__}\n" + # sub_id_work_map, tmp_dump_str_list = convert_nodes_to_workflow(node.sub_nodes, node, sub_workflow) + # dump_str_list += tmp_dump_str_list + # for sub_id in node.get_all_sub_node_ids(): + # all_sub_id_work_map[sub_id] = sub_workflow + # sub_to_id_map[sub_id] = node.id + # # add loop condition + # if node.loop: + # for sub_node in node.sub_nodes: + # if sub_node.type == "junction": + # # use to_continue for loop termination + # j_work = sub_id_work_map[sub_node.id] + # j_work.add_custom_condition(key="to_continue", value=True) + # cond = Condition(cond=j_work.get_custom_condition_status) + # sub_workflow.add_loop_condition(cond) + # cond_dump_str += f" Loop in ID:{node.id} with terminator ID:{sub_node.id}\n" + # break + # workflow.add_work(sub_workflow) + # # add conditions + # for node in nodes: + # if node.parents: + # c_work = id_work_map[node.id] + # if not node.condition: + # # default conditions if unspecified + # cond_func_list = [] + # for p_id in node.parents: + # if p_id in id_work_map: + # p_work = id_work_map[p_id] + # str_p_id = p_id + # elif p_id in all_sub_id_work_map: + # p_work = all_sub_id_work_map[p_id] + # str_p_id = sub_to_id_map[p_id] + # else: + # # head node + # continue 
+ # if len(node.parents) > 1 or isinstance(p_work, Workflow) or node.type in ["junction", "reana", "gitlab"]: + # cond_function = p_work.is_processed + # else: + # cond_function = p_work.is_started + # if cond_function not in cond_func_list: + # cond_func_list.append(cond_function) + # cond_dump_str += f" Default Link ID:{str_p_id} {cond_function.__name__} -> ID:{node.id}\n" + # cond = AndCondition(true_works=[c_work], conditions=cond_func_list) + # workflow.add_condition(cond) + # else: + # # convert conditions + # cond_list = node.condition.get_dict_form() + # base_cond_map = {} + # str_cond_map = {} + # root_condition = None + # for tmp_idx, base_cond in cond_list: + # # leaf condition + # if base_cond["right"] is None: + # # condition based on works + # cond_func_list = [] + # str_func_list = [] + # for p_id in base_cond["left"]: + # if p_id in id_work_map: + # p_work = id_work_map[p_id] + # str_p_id = p_id + # else: + # p_work = all_sub_id_work_map[p_id] + # str_p_id = sub_to_id_map[p_id] + # # finished or failed + # if base_cond["operator"] is None: + # cond_function = p_work.is_processed + # else: + # cond_function = p_work.is_failed + # cond_func_list.append(cond_function) + # str_func_list.append(f"ID:{str_p_id} {cond_function.__name__}") + # cond = AndCondition(conditions=cond_func_list) + # base_cond_map[tmp_idx] = cond + # str_func = "AND ".join(str_func_list) + # str_cond_map[tmp_idx] = str_func + # cond_dump_str += f" Unary Ops {cond.__class__.__name__}({str_func}) -> ID:{node.id}\n" + # root_condition = cond + # else: + # # composite condition + # l_str_func_list = [] + # r_str_func_list = [] + # if isinstance(base_cond["left"], set): + # cond_func_list = [] + # for p_id in base_cond["left"]: + # if p_id in id_work_map: + # p_work = id_work_map[p_id] + # str_p_id = p_id + # else: + # p_work = all_sub_id_work_map[p_id] + # str_p_id = sub_to_id_map[p_id] + # cond_function = p_work.is_processed + # cond_func_list.append(cond_function) + # 
l_str_func_list.append(f"ID:{str_p_id} {cond_function.__name__}") + # l_cond = AndCondition(conditions=cond_func_list) + # l_str_func = "AND ".join(l_str_func_list) + # str_cond_map[base_cond["left"]] = l_str_func + # else: + # l_cond = base_cond_map[base_cond["left"]] + # l_str_func = str_cond_map[base_cond["left"]] + # if isinstance(base_cond["right"], set): + # cond_func_list = [] + # for p_id in base_cond["right"]: + # if p_id in id_work_map: + # p_work = id_work_map[p_id] + # str_p_id = p_id + # else: + # p_work = all_sub_id_work_map[p_id] + # str_p_id = sub_to_id_map[p_id] + # cond_function = p_work.is_processed + # cond_func_list.append(cond_function) + # r_str_func_list.append(f"ID:{str_p_id} {cond_function.__name__}") + # r_cond = AndCondition(conditions=cond_func_list) + # r_str_func = "AND ".join(r_str_func_list) + # str_cond_map[base_cond["right"]] = r_str_func + # else: + # r_cond = base_cond_map[base_cond["right"]] + # r_str_func = str_cond_map[base_cond["right"]] + # if base_cond["operator"] == "and": + # cond = AndCondition( + # conditions=[ + # l_cond.is_condition_true, + # r_cond.is_condition_true, + # ] + # ) + # else: + # cond = OrCondition( + # conditions=[ + # l_cond.is_condition_true, + # r_cond.is_condition_true, + # ] + # ) + # base_cond_map[tmp_idx] = cond + # cond_dump_str += f" Binary Ops {cond.__class__.__name__}({l_str_func}, {r_str_func}) for ID:{node.id}\n" + # root_condition = cond + # # set root condition + # if root_condition: + # root_condition.true_works = [c_work] + # workflow.add_condition(root_condition) + # # global parameters + # if workflow_node: + # tmp_global, tmp_workflow_global = workflow_node.get_global_parameters() + # if tmp_global: + # loop_locals = {} + # loop_slices = [] + # for k, v in tmp_global.items(): + # if not isinstance(v, dict): + # # normal looping locals + # loop_locals["user_" + k] = tmp_global[k] + # else: + # # sliced locals + # v["src"] = "user_" + v["src"] + # loop_slices.append([k, v]) + # if 
loop_locals: + # workflow.set_global_parameters(loop_locals) + # for k, v in loop_slices: + # workflow.set_sliced_global_parameters(source=v["src"], index=v["idx"], name="user_" + k) + # cond_dump_str += "\n Looping local variables\n" + # cond_dump_str += f" {tmp_global}\n" + # if tmp_workflow_global: + # cond_dump_str += "\n Workflow local variable\n" + # cond_dump_str += f" {tmp_workflow_global}\n" + # # dump strings + # dump_str_list.insert(0, class_dump_str + "\n" + cond_dump_str + "\n\n") + # # return + # if not is_top: + # return id_work_map, dump_str_list + # return workflow, dump_str_list From 120633449f05f586138bacedfad9b3d56221da87 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 29 Jul 2025 14:23:05 +0200 Subject: [PATCH 003/101] preliminary workflow core --- pandaserver/taskbuffer/TaskBuffer.py | 4 +- .../taskbuffer/db_proxy_mods/base_module.py | 14 +- .../db_proxy_mods/workflow_module.py | 241 ++++++++++++++++++ pandaserver/taskbuffer/workflow_core.py | 220 ++++++++++++++++ 4 files changed, 471 insertions(+), 8 deletions(-) create mode 100644 pandaserver/taskbuffer/db_proxy_mods/workflow_module.py create mode 100644 pandaserver/taskbuffer/workflow_core.py diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index dba12ca7b..9d4523fad 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -75,9 +75,9 @@ def cleanup(self, requester=None): # transaction as a context manager # CANNOT be used with ConBridge or TaskBufferInterface which uses multiprocess.pipe @contextmanager - def transaction(self, name: str): + def transaction(self, name=None, tmp_log=None): with self.proxyPool.get() as proxy: - with proxy.transaction(name) as txn: + with proxy.transaction(name, tmp_log) as txn: if txn is None: raise RuntimeError(f"Failed to start transaction {name}") # yield the transaction diff --git a/pandaserver/taskbuffer/db_proxy_mods/base_module.py 
b/pandaserver/taskbuffer/db_proxy_mods/base_module.py index 7abd693e9..16af0b928 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/base_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/base_module.py @@ -476,12 +476,13 @@ def wakeUp(self): # transaction as a context manager @contextmanager - def transaction(self, name: str): + def transaction(self, name: str | None = None, tmp_log=None): """ Context manager for transaction Args: - name (str): name of the transaction to be shown in the log + name (str, optional): name of the transaction to be shown in the log + tmp_log (LogWrapper, optional): logger to use. If None, a new logger will be created Yields: Any: the cursor object for executing SQL commands @@ -489,16 +490,17 @@ def transaction(self, name: str): """ comment = " /* DBProxy.transaction */" try: - tmp_log = self.create_tagged_logger(comment, tag=name) - tmp_log.debug("start") + if tmp_log is None: + tmp_log = self.create_tagged_logger(comment, tag=name) + tmp_log.debug("transaction start") # begin transaction self.conn.begin() # cursor and logger for the with block yield (self.cur, tmp_log) # commit transaction if not self._commit(): - raise RuntimeError("Commit error") - tmp_log.debug("done") + raise RuntimeError("commit error") + tmp_log.debug("transaction done") except Exception as e: # roll back self._rollback() diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py new file mode 100644 index 000000000..09b22aa56 --- /dev/null +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -0,0 +1,241 @@ +import datetime +import json +import os +import re +import sys + +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandautils.PandaUtils import get_sql_IN_bind_variables, naive_utcnow + +from pandaserver.config import panda_config +from pandaserver.srvcore import CoreUtils +from pandaserver.taskbuffer import ErrorCode, JobUtils +from 
pandaserver.taskbuffer.db_proxy_mods.base_module import BaseModule +from pandaserver.taskbuffer.db_proxy_mods.entity_module import get_entity_module +from pandaserver.taskbuffer.JobSpec import JobSpec +from pandaserver.taskbuffer.workflow_core import WFDataSpec, WFStepSpec, WorkflowSpec + + +# Module class to define methods related to workflow +class WorkflowModule(BaseModule): + # constructor + def __init__(self, log_stream: LogWrapper): + super().__init__(log_stream) + + def get_workflow(self, workflow_id: int) -> WorkflowSpec | None: + """ + Retrieve a workflow specification by its ID + + Args: + workflow_id (int): ID of the workflow to retrieve + + Returns: + WorkflowSpec | None: The workflow specification if found, otherwise None + """ + comment = " /* DBProxy.get_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") + sql = f"SELECT {WorkflowSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflows " f"WHERE workflow_id=:workflow_id " + var_map = {":workflow_id": workflow_id} + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + if len(res_list) > 1: + tmp_log.error("more than one workflows; unexpected") + else: + for res in res_list: + workflow_spec = WorkflowSpec() + workflow_spec.pack(res) + return workflow_spec + else: + tmp_log.warning("no request found; skipped") + return None + + def get_workflow_step(self, step_id: int) -> WFStepSpec | None: + """ + Retrieve a workflow step specification by its ID + + Args: + step_id (int): ID of the workflow step to retrieve + + Returns: + WFStepSpec | None: The workflow step specification if found, otherwise None + """ + comment = " /* DBProxy.get_workflow_step */" + tmp_log = self.create_tagged_logger(comment, f"step_id={step_id}") + sql = f"SELECT {WFStepSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_steps " f"WHERE step_id=:step_id " + var_map = {":step_id": step_id} + self.cur.execute(sql + comment, 
var_map) + res_list = self.cur.fetchall() + if res_list is not None: + if len(res_list) > 1: + tmp_log.error("more than one steps; unexpected") + else: + for res in res_list: + wf_step_spec = WFStepSpec() + wf_step_spec.pack(res) + return wf_step_spec + else: + tmp_log.warning("no request found; skipped") + return None + + def get_workflow_data(self, data_id: int) -> WFDataSpec | None: + """ + Retrieve a workflow data specification by its ID + + Args: + data_id (int): ID of the workflow data to retrieve + + Returns: + WFDataSpec | None: The workflow data specification if found, otherwise None + """ + comment = " /* DBProxy.get_workflow_data */" + tmp_log = self.create_tagged_logger(comment, f"data_id={data_id}") + sql = f"SELECT {WFDataSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_data " f"WHERE data_id=:data_id " + var_map = {":data_id": data_id} + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + if len(res_list) > 1: + tmp_log.error("more than one data; unexpected") + else: + for res in res_list: + wf_data_spec = WFDataSpec() + wf_data_spec.pack(res) + return wf_data_spec + else: + tmp_log.warning("no request found; skipped") + return None + + def get_steps_of_workflow(self, workflow_id: int) -> list[WFStepSpec]: + """ + Retrieve all workflow steps for a given workflow ID + + Args: + workflow_id (int): ID of the workflow to retrieve steps for + + Returns: + list[WFStepSpec]: List of workflow step specifications + """ + comment = " /* DBProxy.get_steps_of_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") + sql = f"SELECT {WFStepSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_steps " f"WHERE workflow_id=:workflow_id " + var_map = {":workflow_id": workflow_id} + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + wf_step_specs = [] + for res in res_list: + wf_step_spec = WFStepSpec() + 
wf_step_spec.pack(res) + wf_step_specs.append(wf_step_spec) + return wf_step_specs + else: + tmp_log.warning("no steps found; skipped") + return [] + + def get_data_of_workflow(self, workflow_id: int) -> list[WFDataSpec]: + """ + Retrieve all workflow data for a given workflow ID + + Args: + workflow_id (int): ID of the workflow to retrieve data for + + Returns: + list[WFDataSpec]: List of workflow data specifications + """ + comment = " /* DBProxy.get_data_of_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") + sql = f"SELECT {WFDataSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_data " f"WHERE workflow_id=:workflow_id " + var_map = {":workflow_id": workflow_id} + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + wf_data_specs = [] + for res in res_list: + wf_data_spec = WFDataSpec() + wf_data_spec.pack(res) + wf_data_specs.append(wf_data_spec) + return wf_data_specs + else: + tmp_log.warning("no data found; skipped") + return [] + + def update_workflow(self, workflow_spec: WorkflowSpec) -> WorkflowSpec | None: + """ + Update a workflow specification in the database + + Args: + workflow_spec (WorkflowSpec): The workflow specification to update + + Returns: + WorkflowSpec | None: The updated workflow specification if successful, otherwise None + """ + comment = " /* DBProxy.update_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to update workflow + workflow_spec.modification_time = naive_utcnow() + sql_update = ( + f"UPDATE {panda_config.schemaJEDI}.workflows " f"SET {workflow_spec.bindUpdateChangesExpression()} " "WHERE workflow_id=:workflow_id " + ) + var_map = workflow_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":workflow_id"] = workflow_spec.workflow_id + cur.execute(sql_update + comment, 
var_map) + tmp_log.debug(f"updated {workflow_spec.bindUpdateChangesExpression()}") + return workflow_spec + except Exception: + return None + + def update_workflow_step(self, wf_step_spec: WFStepSpec) -> WFStepSpec | None: + """ + Update a workflow step specification in the database + + Args: + wf_step_spec (WFStepSpec): The workflow step specification to update + + Returns: + WFStepSpec | None: The updated workflow step specification if successful, otherwise None + """ + comment = " /* DBProxy.update_workflow_step */" + tmp_log = self.create_tagged_logger(comment, f"step_id={wf_step_spec.step_id}") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to update workflow step + wf_step_spec.modification_time = naive_utcnow() + sql_update = f"UPDATE {panda_config.schemaJEDI}.workflow_steps " f"SET {wf_step_spec.bindUpdateChangesExpression()} " "WHERE step_id=:step_id " + var_map = wf_step_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":step_id"] = wf_step_spec.step_id + cur.execute(sql_update + comment, var_map) + tmp_log.debug(f"updated {wf_step_spec.bindUpdateChangesExpression()}") + return wf_step_spec + except Exception: + return None + + def update_workflow_data(self, wf_data_spec: WFDataSpec) -> WFDataSpec | None: + """ + Update a workflow data specification in the database + + Args: + wf_data_spec (WFDataSpec): The workflow data specification to update + + Returns: + WFDataSpec | None: The updated workflow data specification if successful, otherwise None + """ + comment = " /* DBProxy.update_workflow_data */" + tmp_log = self.create_tagged_logger(comment, f"data_id={wf_data_spec.data_id}") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to update workflow data + wf_data_spec.modification_time = naive_utcnow() + sql_update = f"UPDATE {panda_config.schemaJEDI}.workflow_data " f"SET {wf_data_spec.bindUpdateChangesExpression()} " "WHERE data_id=:data_id " + var_map = 
wf_data_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":data_id"] = wf_data_spec.data_id + cur.execute(sql_update + comment, var_map) + tmp_log.debug(f"updated {wf_data_spec.bindUpdateChangesExpression()}") + return wf_data_spec + except Exception: + return None diff --git a/pandaserver/taskbuffer/workflow_core.py b/pandaserver/taskbuffer/workflow_core.py new file mode 100644 index 000000000..d57de49f1 --- /dev/null +++ b/pandaserver/taskbuffer/workflow_core.py @@ -0,0 +1,220 @@ +import copy +import functools +import json +import os +import random +import re +import socket +import time +import traceback +from collections import namedtuple +from contextlib import contextmanager +from dataclasses import MISSING, InitVar, asdict, dataclass, field +from datetime import datetime, timedelta +from typing import Any, Dict, List + +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger +from pandacommon.pandautils.base import SpecBase +from pandacommon.pandautils.PandaUtils import get_sql_IN_bind_variables, naive_utcnow + +from pandaserver.config import panda_config +from pandaserver.dataservice.ddm import rucioAPI + +import polars as pl # isort:skip + + +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + +# named tuple for attribute with type +AttributeWithType = namedtuple("AttributeWithType", ["attribute", "type"]) + + +# ============================================================== + + +class WorkflowBaseSpec(SpecBase): + """ + Base class for workflow related specifications + """ + + @property + def parameter_map(self) -> dict: + """ + Get the dictionary parsed by the parameters attribute in JSON + Possible parameters: + ... 
+ + Returns: + dict : dict of parameters if it is JSON or empty dict if null + """ + if self.parameters is None: + return {} + else: + return json.loads(self.parameters) + + @parameter_map.setter + def parameter_map(self, value_map: dict): + """ + Set the dictionary and store in parameters attribute in JSON + + Args: + value_map (dict): dict to set the parameter map + """ + self.parameters = json.dumps(value_map) + + def get_parameter(self, param: str) -> Any: + """ + Get the value of one parameter. None as default + + Args: + param (str): parameter name + + Returns: + Any : value of the parameter; None if parameter not set + """ + tmp_dict = self.parameter_map + return tmp_dict.get(param) + + def set_parameter(self, param: str, value): + """ + Set the value of one parameter and store in parameters attribute in JSON + + Args: + param (str): parameter name + value (Any): value of the parameter to set; must be JSON-serializable + """ + tmp_dict = self.parameter_map + tmp_dict[param] = value + self.parameter_map = tmp_dict + + def update_parameters(self, params: dict): + """ + Update values of parameters with a dict and store in parameters attribute in JSON + + Args: + params (dict): dict of parameter names and values to set + """ + tmp_dict = self.parameter_map + tmp_dict.update(params) + self.parameter_map = tmp_dict + + +class WorkflowSpec(WorkflowBaseSpec): + """ + Workflow specification + """ + + # attributes with types + attributes_with_types = ( + AttributeWithType("workflow_id", int), + AttributeWithType("name", str), + AttributeWithType("status", str), + AttributeWithType("prodsourcelabel", str), + AttributeWithType("gshare", str), + AttributeWithType("creation_time", datetime), + AttributeWithType("start_time", datetime), + AttributeWithType("end_time", datetime), + AttributeWithType("modification_time", datetime), + AttributeWithType("check_time", datetime), + AttributeWithType("locked_by", str), + AttributeWithType("lock_time", datetime), + 
AttributeWithType("definition_json", str), + AttributeWithType("parameters", str), + ) + # attributes + attributes = tuple([attr.attribute for attr in attributes_with_types]) + # attributes which have 0 by default + _zeroAttrs = () + # attributes to force update + _forceUpdateAttrs = () + # mapping between sequence and attr + _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_ID_SEQ.nextval"} + + +class WFStepSpec(WorkflowBaseSpec): + """ + Workflow Step specification + """ + + # attributes with types + attributes_with_types = ( + AttributeWithType("step_id", int), + AttributeWithType("workflow_id", int), + AttributeWithType("member_id", int), + AttributeWithType("status", str), + AttributeWithType("type", str), + AttributeWithType("task_id", int), + AttributeWithType("creation_time", datetime), + AttributeWithType("start_time", datetime), + AttributeWithType("end_time", datetime), + AttributeWithType("modification_time", datetime), + AttributeWithType("check_time", datetime), + AttributeWithType("locked_by", str), + AttributeWithType("lock_time", datetime), + AttributeWithType("parameters", str), + ) + # attributes + attributes = tuple([attr.attribute for attr in attributes_with_types]) + # attributes which have 0 by default + _zeroAttrs = () + # attributes to force update + _forceUpdateAttrs = () + # mapping between sequence and attr + _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_STEP_ID_SEQ.nextval"} + + +class WFDataSpec(WorkflowBaseSpec): + """ + Workflow Data specification + """ + + # attributes with types + attributes_with_types = ( + AttributeWithType("data_id", int), + AttributeWithType("workflow_id", int), + AttributeWithType("status", str), + AttributeWithType("type", str), + AttributeWithType("lfn", str), + AttributeWithType("creation_time", datetime), + AttributeWithType("start_time", datetime), + AttributeWithType("end_time", datetime), + AttributeWithType("modification_time", datetime), + 
AttributeWithType("check_time", datetime), + AttributeWithType("locked_by", str), + AttributeWithType("lock_time", datetime), + AttributeWithType("parameters", str), + ) + # attributes + attributes = tuple([attr.attribute for attr in attributes_with_types]) + # attributes which have 0 by default + _zeroAttrs = () + # attributes to force update + _forceUpdateAttrs = () + # mapping between sequence and attr + _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_DATA_ID_SEQ.nextval"} + + +# ============================================================== + + +class WorkflowInterface(object): + """ + Interface for workflow management methods + """ + + def __init__(self, taskbuffer_if, *args, **kwargs): + """ + Constructor + + Args: + taskbuffer_if (TaskBufferInterface): Interface to the task buffer + *args: Additional arguments + **kwargs: Additional keyword arguments + """ + self.tbif = taskbuffer_if + self.ddm_if = rucioAPI + + # Add methods for workflow management here + # e.g., create_workflow, update_workflow, delete_workflow, etc. 
From b0dd610d2c6987c2ddac8ebc0dea04a1eb9481ac Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 12 Aug 2025 11:44:06 +0200 Subject: [PATCH 004/101] workflow: preliminary dbproxy methods --- .../db_proxy_mods/workflow_module.py | 260 +++++++++++++++++- pandaserver/taskbuffer/workflow_core.py | 147 +++++++++- 2 files changed, 397 insertions(+), 10 deletions(-) diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index 09b22aa56..8cc468658 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -1,8 +1,8 @@ -import datetime import json import os import re import sys +from datetime import datetime, timedelta from pandacommon.pandalogger.LogWrapper import LogWrapper from pandacommon.pandautils.PandaUtils import get_sql_IN_bind_variables, naive_utcnow @@ -47,7 +47,7 @@ def get_workflow(self, workflow_id: int) -> WorkflowSpec | None: workflow_spec.pack(res) return workflow_spec else: - tmp_log.warning("no request found; skipped") + tmp_log.warning("no workflow found; skipped") return None def get_workflow_step(self, step_id: int) -> WFStepSpec | None: @@ -75,7 +75,7 @@ def get_workflow_step(self, step_id: int) -> WFStepSpec | None: wf_step_spec.pack(res) return wf_step_spec else: - tmp_log.warning("no request found; skipped") + tmp_log.warning("no step found; skipped") return None def get_workflow_data(self, data_id: int) -> WFDataSpec | None: @@ -103,7 +103,7 @@ def get_workflow_data(self, data_id: int) -> WFDataSpec | None: wf_data_spec.pack(res) return wf_data_spec else: - tmp_log.warning("no request found; skipped") + tmp_log.warning("no data found; skipped") return None def get_steps_of_workflow(self, workflow_id: int) -> list[WFStepSpec]: @@ -160,6 +160,258 @@ def get_data_of_workflow(self, workflow_id: int) -> list[WFDataSpec]: tmp_log.warning("no data found; skipped") return [] + def lock_workflow(self, 
workflow_id: int, locked_by: str, lock_expiration_sec: int = 120) -> bool | None: + """ + Lock a workflow to prevent concurrent modifications + + Args: + workflow_id (int): ID of the workflow to lock + locked_by (str): Identifier of the entity locking the workflow + lock_expiration_sec (int): Time in seconds after which the lock expires + + Returns: + bool | None: True if the lock was acquired, False if not, None if an error occurred + """ + comment = " /* DBProxy.lock_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + now_time = naive_utcnow() + sql_lock = ( + f"UPDATE {panda_config.schemaJEDI}.workflows " + "SET locked_by=:locked_by, lock_time=:lock_time " + "WHERE workflow_id=:workflow_id " + "AND (locked_by IS NULL OR locked_by=:locked_by OR lock_time<:min_lock_time)" + ) + var_map = { + ":locked_by": self.full_pid, + ":lock_time": now_time, + ":workflow_id": workflow_id, + ":min_lock_time": now_time - timedelta(seconds=lock_expiration_sec), + } + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_lock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to lock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one workflow updated to lock; unexpected") + elif row_count == 0: + # no row updated; did not get the lock + tmp_log.debug(f"did not get lock; skipped") + return False + elif row_count == 1: + # successfully locked the workflow + tmp_log.debug(f"got lock") + return True + except Exception as e: + tmp_log.error(f"failed to lock workflow: {e}") + + def unlock_workflow(self, workflow_id: int, locked_by: str) -> bool | None: + """ + Unlock a workflow to allow modifications + + Args: + workflow_id (int): ID of the workflow to unlock + locked_by (str): Identifier of the entity unlocking the workflow + + Returns: + bool | None: True if the unlock was successful, False if not, None 
if an error occurred + """ + comment = " /* DBProxy.unlock_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + sql_unlock = ( + f"UPDATE {panda_config.schemaJEDI}.workflows " "SET locked_by=NULL, lock_time=NULL " "WHERE workflow_id=:workflow_id AND locked_by=:locked_by" + ) + var_map = {":workflow_id": workflow_id, ":locked_by": locked_by} + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_unlock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to unlock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one workflow updated to unlock; unexpected") + elif row_count == 0: + # no row updated; did not get the unlock + tmp_log.debug(f"no workflow updated to unlock; skipped") + return False + elif row_count == 1: + # successfully unlocked the workflow + tmp_log.debug(f"released lock") + return True + except Exception as e: + tmp_log.error(f"failed to unlock workflow: {e}") + + def lock_workflow_step(self, step_id: int, locked_by: str, lock_expiration_sec: int = 120) -> bool | None: + """ + Lock a workflow step to prevent concurrent modifications + + Args: + step_id (int): ID of the workflow step to lock + locked_by (str): Identifier of the entity locking the workflow step + lock_expiration_sec (int): Time in seconds after which the lock expires + + Returns: + bool | None: True if the lock was acquired, False if not, None if an error occurred + """ + comment = " /* DBProxy.lock_workflow_step */" + tmp_log = self.create_tagged_logger(comment, f"step_id={step_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + now_time = naive_utcnow() + sql_lock = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_steps " + "SET locked_by=:locked_by, lock_time=:lock_time " + "WHERE step_id=:step_id " + "AND (locked_by IS NULL OR locked_by=:locked_by OR lock_time<:min_lock_time)" + ) + var_map = { + 
":locked_by": self.full_pid, + ":lock_time": now_time, + ":step_id": step_id, + ":min_lock_time": now_time - timedelta(seconds=lock_expiration_sec), + } + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_lock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to lock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one step updated to lock; unexpected") + elif row_count == 0: + # no row updated; did not get the lock + tmp_log.debug(f"did not get lock; skipped") + return False + elif row_count == 1: + # successfully locked the workflow step + tmp_log.debug(f"got lock") + return True + except Exception as e: + tmp_log.error(f"failed to lock workflow step: {e}") + + def unlock_workflow_step(self, step_id: int, locked_by: str) -> bool | None: + """ + Unlock a workflow step to allow modifications + + Args: + step_id (int): ID of the workflow step to unlock + locked_by (str): Identifier of the entity unlocking the workflow step + + Returns: + bool | None: True if the unlock was successful, False if not, None if an error occurred + """ + comment = " /* DBProxy.unlock_workflow_step */" + tmp_log = self.create_tagged_logger(comment, f"step_id={step_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + sql_unlock = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_steps " "SET locked_by=NULL, lock_time=NULL " "WHERE step_id=:step_id AND locked_by=:locked_by" + ) + var_map = {":step_id": step_id, ":locked_by": locked_by} + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_unlock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to unlock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one step updated to unlock; unexpected") + elif row_count == 0: + # no row updated; did not get the unlock + tmp_log.debug(f"no step updated to unlock; skipped") + return False + elif row_count == 1: + # 
successfully unlocked the workflow step + tmp_log.debug(f"released lock") + return True + except Exception as e: + tmp_log.error(f"failed to unlock workflow step: {e}") + + def lock_workflow_data(self, data_id: int, locked_by: str, lock_expiration_sec: int = 120) -> bool | None: + """ + Lock a workflow data to prevent concurrent modifications + + Args: + data_id (int): ID of the workflow data to lock + locked_by (str): Identifier of the entity locking the workflow data + lock_expiration_sec (int): Time in seconds after which the lock expires + + Returns: + bool | None: True if the lock was acquired, False if not, None if an error occurred + """ + comment = " /* DBProxy.lock_workflow_data */" + tmp_log = self.create_tagged_logger(comment, f"data_id={data_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + now_time = naive_utcnow() + sql_lock = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_data " + "SET locked_by=:locked_by, lock_time=:lock_time " + "WHERE data_id=:data_id " + "AND (locked_by IS NULL OR locked_by=:locked_by OR lock_time<:min_lock_time)" + ) + var_map = { + ":locked_by": self.full_pid, + ":lock_time": now_time, + ":data_id": data_id, + ":min_lock_time": now_time - timedelta(seconds=lock_expiration_sec), + } + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_lock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to lock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one data updated to lock; unexpected") + elif row_count == 0: + # no row updated; did not get the lock + tmp_log.debug(f"did not get lock; skipped") + return False + elif row_count == 1: + # successfully locked the workflow data + tmp_log.debug(f"got lock") + return True + except Exception as e: + tmp_log.error(f"failed to lock workflow data: {e}") + + def unlock_workflow_data(self, data_id: int, locked_by: str) -> bool | None: + """ + Unlock a workflow data to allow modifications + + 
Args: + data_id (int): ID of the workflow data to unlock + locked_by (str): Identifier of the entity unlocking the workflow data + + Returns: + bool | None: True if the unlock was successful, False if not, None if an error occurred + """ + comment = " /* DBProxy.unlock_workflow_data */" + tmp_log = self.create_tagged_logger(comment, f"data_id={data_id}, locked_by={locked_by}") + tmp_log.debug("start") + try: + sql_unlock = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_data " "SET locked_by=NULL, lock_time=NULL " "WHERE data_id=:data_id AND locked_by=:locked_by" + ) + var_map = {":data_id": data_id, ":locked_by": locked_by} + with self.transaction(tmp_log=tmp_log) as (cur, _): + cur.execute(sql_unlock + comment, var_map) + row_count = cur.rowcount + if row_count is None: + tmp_log.error(f"failed to update DB to unlock; skipped") + elif row_count > 1: + tmp_log.error(f"more than one data updated to unlock; unexpected") + elif row_count == 0: + # no row updated; did not get the unlock + tmp_log.debug(f"no data updated to unlock; skipped") + return False + elif row_count == 1: + # successfully unlocked the workflow data + tmp_log.debug(f"released lock") + return True + except Exception as e: + tmp_log.error(f"failed to unlock workflow data: {e}") + def update_workflow(self, workflow_spec: WorkflowSpec) -> WorkflowSpec | None: """ Update a workflow specification in the database diff --git a/pandaserver/taskbuffer/workflow_core.py b/pandaserver/taskbuffer/workflow_core.py index d57de49f1..16977bf4a 100644 --- a/pandaserver/taskbuffer/workflow_core.py +++ b/pandaserver/taskbuffer/workflow_core.py @@ -112,7 +112,7 @@ class WorkflowSpec(WorkflowBaseSpec): AttributeWithType("name", str), AttributeWithType("status", str), AttributeWithType("prodsourcelabel", str), - AttributeWithType("gshare", str), + AttributeWithType("username", str), AttributeWithType("creation_time", datetime), AttributeWithType("start_time", datetime), AttributeWithType("end_time", datetime), @@ 
-141,11 +141,13 @@ class WFStepSpec(WorkflowBaseSpec): # attributes with types attributes_with_types = ( AttributeWithType("step_id", int), + AttributeWithType("name", str), AttributeWithType("workflow_id", int), AttributeWithType("member_id", int), - AttributeWithType("status", str), AttributeWithType("type", str), - AttributeWithType("task_id", int), + AttributeWithType("status", str), + AttributeWithType("flavor", str), + AttributeWithType("target_id", str), AttributeWithType("creation_time", datetime), AttributeWithType("start_time", datetime), AttributeWithType("end_time", datetime), @@ -173,10 +175,12 @@ class WFDataSpec(WorkflowBaseSpec): # attributes with types attributes_with_types = ( AttributeWithType("data_id", int), + AttributeWithType("name", str), AttributeWithType("workflow_id", int), - AttributeWithType("status", str), AttributeWithType("type", str), - AttributeWithType("lfn", str), + AttributeWithType("status", str), + AttributeWithType("flavor", str), + AttributeWithType("target_id", str), AttributeWithType("creation_time", datetime), AttributeWithType("start_time", datetime), AttributeWithType("end_time", datetime), @@ -215,6 +219,137 @@ def __init__(self, taskbuffer_if, *args, **kwargs): """ self.tbif = taskbuffer_if self.ddm_if = rucioAPI + self.full_pid = f"{socket.getfqdn().split('.')[0]}-{os.getpgrp()}-{os.getpid()}" + + #### Context managers for locking + + @contextmanager + def workflow_lock(self, workflow_id: int, lock_expiration_sec: int = 120): + """ + Context manager to lock a workflow + + Args: + workflow_id (int): ID of the workflow to lock + lock_expiration_sec (int): Time in seconds after which the lock expires + + Yields: + WorkflowSpec | None: The locked workflow specification if the lock was acquired, otherwise None + """ + if self.tbif.lock_workflow(workflow_id, self.full_pid, lock_expiration_sec): + try: + # get the workflow spec locked + locked_spec = self.tbif.get_workflow(workflow_id) + # yield and run wrapped function + 
yield locked_spec + finally: + self.tbif.unlock_workflow(workflow_id, self.full_pid) + else: + # lock not acquired + yield None + + @contextmanager + def workflow_step_lock(self, step_id: int, lock_expiration_sec: int = 120): + """ + Context manager to lock a workflow step + + Args: + step_id (int): ID of the workflow step to lock + lock_expiration_sec (int): Time in seconds after which the lock expires + + Yields: + WFStepSpec | None: The locked workflow step specification if the lock was acquired, otherwise None + """ + if self.tbif.lock_workflow_step(step_id, self.full_pid, lock_expiration_sec): + try: + # get the workflow step spec locked + locked_spec = self.tbif.get_workflow_step(step_id) + # yield and run wrapped function + yield locked_spec + finally: + self.tbif.unlock_workflow_step(step_id, self.full_pid) + else: + # lock not acquired + yield None + + @contextmanager + def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): + """ + Context manager to lock workflow data + + Args: + data_id (int): ID of the workflow data to lock + lock_expiration_sec (int): Time in seconds after which the lock expires + + Yields: + WFDataSpec | None: The locked workflow data specification if the lock was acquired, otherwise None + """ + if self.tbif.lock_workflow_data(data_id, self.full_pid, lock_expiration_sec): + try: + # get the workflow data spec locked + locked_spec = self.tbif.get_workflow_data(data_id) + # yield and run wrapped function + yield locked_spec + finally: + self.tbif.unlock_workflow_data(data_id, self.full_pid) + else: + # lock not acquired + yield None # Add methods for workflow management here - # e.g., create_workflow, update_workflow, delete_workflow, etc. 
+ + def register_workflow(self, prodsourcelabel: str, name: str, workflow_definition_json: str, *args, **kwargs): + """ + Register a new workflow + + Args: + prodsourcelabel (str): Production source label for the workflow + name (str): Name of the workflow + workflow_definition_json (str): JSON string defining the workflow + *args: Additional arguments + **kwargs: Additional keyword arguments + """ + # Implementation of workflow registration logic + ... + workflow_spec = WorkflowSpec() + workflow_spec.prodsourcelabel = prodsourcelabel + workflow_spec.name = name + workflow_spec.definition_json = workflow_definition_json + workflow_spec.creation_time = naive_utcnow() + workflow_spec.status = "registered" + # Update DB + ret_workflow_spec = self.tbif.update_workflow(workflow_spec, *args, **kwargs) + if ret_workflow_spec is None: + logger.error(f"Failed to register workflow prodsourcelabel={prodsourcelabel} name={name}") + return None + logger.info(f"Registered workflow prodsourcelabel={prodsourcelabel} name={name} workflow_id={ret_workflow_spec.workflow_id}") + return ret_workflow_spec + + #### Workflow status transitions + + def process_workflow_registered(self, workflow_spec: WorkflowSpec): + """ + Process a workflow in registered status + Parse the workflow definition, register steps, and update its status + + Args: + workflow_spec (WorkflowSpec): The workflow specification to process + """ + # Parse the workflow definition + try: + workflow_definition_dict = json.loads(workflow_spec.definition_json) + # Register steps based on nodes in the definition + for node in workflow_definition_dict["nodes"]: + step_spec = WFStepSpec() + step_spec.workflow_id = workflow_spec.workflow_id + step_spec.member_id = node["id"] + step_spec.status = "registered" + step_spec.type = node.get("type", "default") + # step_spec.parameters = json.dumps(node.get("parameters", {})) + step_spec.creation_time = naive_utcnow() + except Exception as e: + logger.error(f"Failed to parse workflow 
definition for workflow_id={workflow_spec.workflow_id}: {e}") + + # FIXME: temporary, skip data checking and go to starting directly + workflow_spec.status = "starting" + # Update DB + self.tbif.update_workflow(workflow_spec) From 53d6126e85f515d3e54a5cc9cedbb49bc78b54fb Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 12 Aug 2025 17:35:29 +0200 Subject: [PATCH 005/101] fix --- pandaserver/taskbuffer/OraDBProxy.py | 2 ++ pandaserver/taskbuffer/db_proxy_mods/workflow_module.py | 2 +- pandaserver/{taskbuffer => workflow}/workflow_core.py | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) rename pandaserver/{taskbuffer => workflow}/workflow_core.py (98%) diff --git a/pandaserver/taskbuffer/OraDBProxy.py b/pandaserver/taskbuffer/OraDBProxy.py index 44c4730e5..299f2231d 100644 --- a/pandaserver/taskbuffer/OraDBProxy.py +++ b/pandaserver/taskbuffer/OraDBProxy.py @@ -21,6 +21,7 @@ task_standalone_module, task_utils_module, worker_module, + workflow_module, ) from pandaserver.taskbuffer.WrappedCursor import WrappedCursor @@ -71,6 +72,7 @@ class DBProxy( task_complex_module.TaskComplexModule, task_standalone_module.TaskStandaloneModule, task_utils_module.TaskUtilsModule, + workflow_module.WorkflowModule, ): # constructor def __init__(self, useOtherError=False): diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index 8cc468658..855e09637 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -13,7 +13,7 @@ from pandaserver.taskbuffer.db_proxy_mods.base_module import BaseModule from pandaserver.taskbuffer.db_proxy_mods.entity_module import get_entity_module from pandaserver.taskbuffer.JobSpec import JobSpec -from pandaserver.taskbuffer.workflow_core import WFDataSpec, WFStepSpec, WorkflowSpec +from pandaserver.workflow.workflow_core import WFDataSpec, WFStepSpec, WorkflowSpec # Module class to define methods 
related to workflow diff --git a/pandaserver/taskbuffer/workflow_core.py b/pandaserver/workflow/workflow_core.py similarity index 98% rename from pandaserver/taskbuffer/workflow_core.py rename to pandaserver/workflow/workflow_core.py index 16977bf4a..deb279ab8 100644 --- a/pandaserver/taskbuffer/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -297,12 +297,13 @@ def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): # Add methods for workflow management here - def register_workflow(self, prodsourcelabel: str, name: str, workflow_definition_json: str, *args, **kwargs): + def register_workflow(self, prodsourcelabel: str, username: str, name: str, workflow_definition_json: str, *args, **kwargs): """ Register a new workflow Args: prodsourcelabel (str): Production source label for the workflow + username (str): Username of the person registering the workflow name (str): Name of the workflow workflow_definition_json (str): JSON string defining the workflow *args: Additional arguments @@ -313,6 +314,7 @@ def register_workflow(self, prodsourcelabel: str, name: str, workflow_definition workflow_spec = WorkflowSpec() workflow_spec.prodsourcelabel = prodsourcelabel workflow_spec.name = name + workflow_spec.username = username workflow_spec.definition_json = workflow_definition_json workflow_spec.creation_time = naive_utcnow() workflow_spec.status = "registered" From 25a59dbef47eef49feed3f9fc5512bdb02539d1c Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 13 Aug 2025 13:26:53 +0200 Subject: [PATCH 006/101] workflow: add test register --- .../workflow/test_workflow_core_functions.py | 346 ++++++++++++++++++ pandaserver/workflow/workflow_core.py | 41 ++- 2 files changed, 376 insertions(+), 11 deletions(-) create mode 100644 pandaserver/workflow/test_workflow_core_functions.py diff --git a/pandaserver/workflow/test_workflow_core_functions.py b/pandaserver/workflow/test_workflow_core_functions.py new file mode 100644 index 000000000..d073dd0ad 
--- /dev/null
+++ b/pandaserver/workflow/test_workflow_core_functions.py
@@ -0,0 +1,346 @@
+import sys
+
+from pandacommon.pandautils.thread_utils import GenericThread
+
+from pandaserver.config import panda_config
+from pandaserver.taskbuffer.TaskBuffer import taskBuffer
+from pandaserver.workflow.workflow_core import (
+    WFDataSpec,
+    WFStepSpec,
+    WorkflowInterface,
+    WorkflowSpec,
+)
+
+# parameters for the workflow
+prodsourcelabel = "user"
+username = "testuser"
+workflow_name = "test_workflow_bg_comb_00"
+
+
+# workflow definition json
+wfd_json = """
+{
+    "root_inputs": {
+        "sig_bg_comb.cwl#background": "mc16_5TeV.361238.Pythia8EvtGen_A3NNPDF23LO_minbias_inelastic_low.merge.HITS.e6446_s3238_s3250/",
+        "sig_bg_comb.cwl#signal": "mc16_valid:mc16_valid.900248.PG_singlepion_flatPt2to50.simul.HITS.e8312_s3238_tid26378578_00"
+    },
+    "root_outputs": {"sig_bg_comb.cwl#combine/outDS": {"value": "user.me.my_outDS_005_combine"}},
+    "nodes": [
+        {
+            "condition": null,
+            "data": null,
+            "id": 1,
+            "in_loop": false,
+            "inputs": {
+                "sig_bg_comb.cwl#make_signal/opt_args": {
+                    "default": "--outputs abc.dat,def.zip --nFilesPerJob 5",
+                    "source": null
+                },
+                "sig_bg_comb.cwl#make_signal/opt_containerImage": {
+                    "default": "docker://busybox",
+                    "source": null
+                },
+                "sig_bg_comb.cwl#make_signal/opt_exec": {
+                    "default": "echo %IN > abc.dat; echo 123 > def.zip",
+                    "source": null
+                },
+                "sig_bg_comb.cwl#make_signal/opt_inDS": {
+                    "default": null,
+                    "source": "sig_bg_comb.cwl#signal"
+                }
+            },
+            "is_head": false,
+            "is_leaf": true,
+            "is_tail": false,
+            "is_workflow_output": false,
+            "loop": false,
+            "name": "make_signal",
+            "output_types": [],
+            "outputs": {
+                "sig_bg_comb.cwl#make_signal/outDS": {}
+            },
+            "parents": [],
+            "root_inputs": null,
+            "scatter": null,
+            "sub_nodes": [],
+            "task_params": null,
+            "type": "prun",
+            "upper_root_inputs": null
+        },
+        {
+            "condition": null,
+            "data": null,
+            "id": 2,
+            "in_loop": false,
+            "inputs": {
+                
"sig_bg_comb.cwl#make_background_1/opt_args": { + "default": "--outputs opq.root,xyz.pool --nGBPerJob 10", + "source": null + }, + "sig_bg_comb.cwl#make_background_1/opt_exec": { + "default": "echo %IN > opq.root; echo %IN > xyz.pool", + "source": null + }, + "sig_bg_comb.cwl#make_background_1/opt_inDS": { + "default": null, + "source": "sig_bg_comb.cwl#background" + } + }, + "is_head": false, + "is_leaf": true, + "is_tail": false, + "is_workflow_output": false, + "loop": false, + "name": "make_background_1", + "output_types": [], + "outputs": { + "sig_bg_comb.cwl#make_background_1/outDS": {} + }, + "parents": [], + "root_inputs": null, + "scatter": null, + "sub_nodes": [], + "task_params": null, + "type": "prun", + "upper_root_inputs": null + }, + { + "condition": null, + "data": null, + "id": 3, + "in_loop": false, + "inputs": { + "sig_bg_comb.cwl#premix/opt_args": { + "default": "--outputs klm.root --secondaryDSs IN2:2:%{SECDS1}", + "source": null + }, + "sig_bg_comb.cwl#premix/opt_exec": { + "default": "echo %IN %IN2 > klm.root", + "source": null + }, + "sig_bg_comb.cwl#premix/opt_inDS": { + "default": null, + "parent_id": 1, + "source": "sig_bg_comb.cwl#make_signal/outDS" + }, + "sig_bg_comb.cwl#premix/opt_inDsType": { + "default": "def.zip", + "source": null + }, + "sig_bg_comb.cwl#premix/opt_secondaryDSs": { + "default": null, + "parent_id": [ + 2 + ], + "source": [ + "sig_bg_comb.cwl#make_background_1/outDS" + ] + }, + "sig_bg_comb.cwl#premix/opt_secondaryDsTypes": { + "default": [ + "xyz.pool" + ], + "source": null + } + }, + "is_head": false, + "is_leaf": true, + "is_tail": false, + "is_workflow_output": false, + "loop": false, + "name": "premix", + "output_types": [], + "outputs": { + "sig_bg_comb.cwl#premix/outDS": {} + }, + "parents": [ + 1, + 2 + ], + "root_inputs": null, + "scatter": null, + "sub_nodes": [], + "task_params": null, + "type": "prun", + "upper_root_inputs": null + }, + { + "condition": null, + "data": null, + "id": 4, + "in_loop": 
false, + "inputs": { + "sig_bg_comb.cwl#generate_some/opt_args": { + "default": "--outputs gen.root --nJobs 10", + "source": null + }, + "sig_bg_comb.cwl#generate_some/opt_exec": { + "default": "echo %RNDM:10 > gen.root", + "source": null + } + }, + "is_head": false, + "is_leaf": true, + "is_tail": false, + "is_workflow_output": false, + "loop": false, + "name": "generate_some", + "output_types": [], + "outputs": { + "sig_bg_comb.cwl#generate_some/outDS": {} + }, + "parents": [], + "root_inputs": null, + "scatter": null, + "sub_nodes": [], + "task_params": null, + "type": "prun", + "upper_root_inputs": null + }, + { + "condition": null, + "data": null, + "id": 5, + "in_loop": false, + "inputs": { + "sig_bg_comb.cwl#make_background_2/opt_args": { + "default": "--outputs ooo.root,jjj.txt --secondaryDSs IN2:2:%{SECDS1}", + "source": null + }, + "sig_bg_comb.cwl#make_background_2/opt_containerImage": { + "default": "docker://alpine", + "source": null + }, + "sig_bg_comb.cwl#make_background_2/opt_exec": { + "default": "echo %IN > ooo.root; echo %IN2 > jjj.txt", + "source": null + }, + "sig_bg_comb.cwl#make_background_2/opt_inDS": { + "default": null, + "source": "sig_bg_comb.cwl#background" + }, + "sig_bg_comb.cwl#make_background_2/opt_secondaryDSs": { + "default": null, + "parent_id": [ + 4 + ], + "source": [ + "sig_bg_comb.cwl#generate_some/outDS" + ] + }, + "sig_bg_comb.cwl#make_background_2/opt_secondaryDsTypes": { + "default": [ + "gen.root" + ], + "source": null + } + }, + "is_head": false, + "is_leaf": true, + "is_tail": false, + "is_workflow_output": false, + "loop": false, + "name": "make_background_2", + "output_types": [], + "outputs": { + "sig_bg_comb.cwl#make_background_2/outDS": {} + }, + "parents": [ + 4 + ], + "root_inputs": null, + "scatter": null, + "sub_nodes": [], + "task_params": null, + "type": "prun", + "upper_root_inputs": null + }, + { + "condition": null, + "data": null, + "id": 6, + "in_loop": false, + "inputs": { + 
"sig_bg_comb.cwl#combine/opt_args": { + "default": "--outputs aaa.root --secondaryDSs IN2:2:%{SECDS1},IN3:5:%{SECDS2}", + "source": null + }, + "sig_bg_comb.cwl#combine/opt_exec": { + "default": "echo %IN %IN2 %IN3 > aaa.root", + "source": null + }, + "sig_bg_comb.cwl#combine/opt_inDS": { + "default": null, + "parent_id": 1, + "source": "sig_bg_comb.cwl#make_signal/outDS" + }, + "sig_bg_comb.cwl#combine/opt_inDsType": { + "default": "abc.dat", + "source": null + }, + "sig_bg_comb.cwl#combine/opt_secondaryDSs": { + "default": null, + "parent_id": [ + 3, + 5 + ], + "source": [ + "sig_bg_comb.cwl#premix/outDS", + "sig_bg_comb.cwl#make_background_2/outDS" + ] + }, + "sig_bg_comb.cwl#combine/opt_secondaryDsTypes": { + "default": [ + "klm.root", + "ooo.root" + ], + "source": null + } + }, + "is_head": false, + "is_leaf": true, + "is_tail": true, + "is_workflow_output": false, + "loop": false, + "name": "combine", + "output_types": [], + "outputs": { + "sig_bg_comb.cwl#combine/outDS": {} + }, + "parents": [ + 1, + 3, + 5 + ], + "root_inputs": null, + "scatter": null, + "sub_nodes": [], + "task_params": null, + "type": "prun", + "upper_root_inputs": null + } + ] +} +""" + + +# interface for workflow operations +requester_id = GenericThread().get_full_id(__name__, sys.modules[__name__].__file__) +taskBuffer.init( + panda_config.dbhost, + panda_config.dbpasswd, + nDBConnection=panda_config.nDBConnection, + useTimeout=True, + requester=requester_id, +) + +wfif = WorkflowInterface(taskBuffer) + + +# Test cases for workflow core +wf_spec = wfif.register_workflow( + prodsourcelabel=prodsourcelabel, + username=username, + workflow_name=workflow_name, + workflow_definition_json=wfd_json, +) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index deb279ab8..00bd7c651 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -31,7 +31,26 @@ AttributeWithType = namedtuple("AttributeWithType", 
["attribute", "type"]) -# ============================================================== +# =============================================================== + + +def json_serialize_default(obj): + """ + Default JSON serializer for non-serializable objects + + Args: + obj (Any): Object to serialize + + Returns: + Any: JSON serializable object + """ + # convert set to list + if isinstance(obj, set): + return list(obj) + return obj + + +# =============================================================== class WorkflowBaseSpec(SpecBase): @@ -208,16 +227,16 @@ class WorkflowInterface(object): Interface for workflow management methods """ - def __init__(self, taskbuffer_if, *args, **kwargs): + def __init__(self, task_buffer, *args, **kwargs): """ Constructor Args: - taskbuffer_if (TaskBufferInterface): Interface to the task buffer + task_buffer (TaskBufferInterface): Interface to the task buffer *args: Additional arguments **kwargs: Additional keyword arguments """ - self.tbif = taskbuffer_if + self.tbif = task_buffer self.ddm_if = rucioAPI self.full_pid = f"{socket.getfqdn().split('.')[0]}-{os.getpgrp()}-{os.getpid()}" @@ -297,7 +316,7 @@ def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): # Add methods for workflow management here - def register_workflow(self, prodsourcelabel: str, username: str, name: str, workflow_definition_json: str, *args, **kwargs): + def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: str, workflow_definition_json: str, *args, **kwargs): """ Register a new workflow @@ -313,18 +332,18 @@ def register_workflow(self, prodsourcelabel: str, username: str, name: str, work ... 
workflow_spec = WorkflowSpec() workflow_spec.prodsourcelabel = prodsourcelabel - workflow_spec.name = name + workflow_spec.name = workflow_name workflow_spec.username = username workflow_spec.definition_json = workflow_definition_json workflow_spec.creation_time = naive_utcnow() workflow_spec.status = "registered" # Update DB - ret_workflow_spec = self.tbif.update_workflow(workflow_spec, *args, **kwargs) - if ret_workflow_spec is None: - logger.error(f"Failed to register workflow prodsourcelabel={prodsourcelabel} name={name}") + ret_wf_spec = self.tbif.update_workflow(workflow_spec, *args, **kwargs) + if ret_wf_spec is None: + logger.error(f"Failed to register workflow prodsourcelabel={ret_wf_spec.prodsourcelabel} name={ret_wf_spec.name}") return None - logger.info(f"Registered workflow prodsourcelabel={prodsourcelabel} name={name} workflow_id={ret_workflow_spec.workflow_id}") - return ret_workflow_spec + logger.info(f"Registered workflow prodsourcelabel={ret_wf_spec.prodsourcelabel} name={ret_wf_spec.name} workflow_id={ret_wf_spec.workflow_id}") + return ret_wf_spec #### Workflow status transitions From 730f36c993124fb7783aaa898b8e041a617a08f8 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 13 Aug 2025 13:38:54 +0200 Subject: [PATCH 007/101] workflow: update taskbuffer --- pandaserver/taskbuffer/TaskBuffer.py | 60 ++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 9d4523fad..6ce898cc0 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2708,6 +2708,66 @@ def get_num_jobs_with_status_by_nucleus(self, vo, job_status): with self.proxyPool.get() as proxy: return proxy.get_num_jobs_with_status_by_nucleus(vo, job_status) + # ==== workflow fucntions ========================================== + + def get_workflow(self, workflow_id): + with self.proxyPool.get() as proxy: + return proxy.get_workflow(workflow_id) + + def 
get_workflow_step(self, step_id): + with self.proxyPool.get() as proxy: + return proxy.get_workflow_step(step_id) + + def get_workflow_data(self, data_id): + with self.proxyPool.get() as proxy: + return proxy.get_workflow_data(data_id) + + def get_steps_of_workflow(self, workflow_id): + with self.proxyPool.get() as proxy: + return proxy.get_steps_of_workflow(workflow_id) + + def get_data_of_workflow(self, workflow_id): + with self.proxyPool.get() as proxy: + return proxy.get_data_of_workflow(workflow_id) + + def lock_workflow(self, workflow_id, locked_by, lock_expiration_sec=120): + with self.proxyPool.get() as proxy: + return proxy.lock_workflow(workflow_id, locked_by, lock_expiration_sec) + + def unlock_workflow(self, workflow_id, locked_by): + with self.proxyPool.get() as proxy: + return proxy.unlock_workflow(workflow_id, locked_by) + + def lock_workflow_step(self, step_id, locked_by, lock_expiration_sec=120): + with self.proxyPool.get() as proxy: + return proxy.lock_workflow_step(step_id, locked_by, lock_expiration_sec) + + def unlock_workflow_step(self, step_id, locked_by): + with self.proxyPool.get() as proxy: + return proxy.unlock_workflow_step(step_id, locked_by) + + def lock_workflow_data(self, data_id, locked_by, lock_expiration_sec=120): + with self.proxyPool.get() as proxy: + return proxy.lock_workflow_data(data_id, locked_by, lock_expiration_sec) + + def unlock_workflow_data(self, data_id, locked_by): + with self.proxyPool.get() as proxy: + return proxy.unlock_workflow_data(data_id, locked_by) + + def update_workflow(self, workflow_spec): + with self.proxyPool.get() as proxy: + return proxy.update_workflow(workflow_spec) + + def update_workflow_step(self, wf_step_spec): + with self.proxyPool.get() as proxy: + return proxy.update_workflow_step(wf_step_spec) + + def update_workflow_data(self, wf_data_spec): + with self.proxyPool.get() as proxy: + return proxy.update_workflow_data(wf_data_spec) + + # 
================================================================== + # Singleton taskBuffer = TaskBuffer() From 7bc10bd70e858bef03b4cb6d63d3e47e8b782328 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 13 Aug 2025 14:25:37 +0200 Subject: [PATCH 008/101] workflow: add insert functions --- pandaserver/taskbuffer/TaskBuffer.py | 12 ++ .../db_proxy_mods/workflow_module.py | 94 ++++++++- pandaserver/workflow/workflow_core.py | 190 +----------------- pandaserver/workflow/workflow_specs.py | 185 +++++++++++++++++ 4 files changed, 300 insertions(+), 181 deletions(-) create mode 100644 pandaserver/workflow/workflow_specs.py diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 6ce898cc0..1e6409aa7 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2754,6 +2754,18 @@ def unlock_workflow_data(self, data_id, locked_by): with self.proxyPool.get() as proxy: return proxy.unlock_workflow_data(data_id, locked_by) + def insert_workflow(self, workflow_spec): + with self.proxyPool.get() as proxy: + return proxy.insert_workflow(workflow_spec) + + def insert_workflow_step(self, wf_step_spec): + with self.proxyPool.get() as proxy: + return proxy.insert_workflow_step(wf_step_spec) + + def insert_workflow_data(self, wf_data_spec): + with self.proxyPool.get() as proxy: + return proxy.insert_workflow_data(wf_data_spec) + def update_workflow(self, workflow_spec): with self.proxyPool.get() as proxy: return proxy.update_workflow(workflow_spec) diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index 855e09637..9f41f9b0c 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -10,10 +10,10 @@ from pandaserver.config import panda_config from pandaserver.srvcore import CoreUtils from pandaserver.taskbuffer import ErrorCode, JobUtils -from 
pandaserver.taskbuffer.db_proxy_mods.base_module import BaseModule +from pandaserver.taskbuffer.db_proxy_mods.base_module import BaseModule, varNUMBER from pandaserver.taskbuffer.db_proxy_mods.entity_module import get_entity_module from pandaserver.taskbuffer.JobSpec import JobSpec -from pandaserver.workflow.workflow_core import WFDataSpec, WFStepSpec, WorkflowSpec +from pandaserver.workflow.workflow_specs import WFDataSpec, WFStepSpec, WorkflowSpec # Module class to define methods related to workflow @@ -412,6 +412,96 @@ def unlock_workflow_data(self, data_id: int, locked_by: str) -> bool | None: except Exception as e: tmp_log.error(f"failed to unlock workflow data: {e}") + def insert_workflow(self, workflow_spec: WorkflowSpec) -> int | None: + """ + Insert a new workflow specification into the database + + Args: + workflow_spec (WorkflowSpec): The workflow specification to insert + + Returns: + int | None: The ID of the inserted workflow if successful, otherwise None + """ + comment = " /* DBProxy.insert_workflow */" + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to insert workflow + workflow_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflows ({workflow_spec.columnNames()}) " + f"VALUES ({workflow_spec.bindInsertValuesExpression()}) " + f"RETURNING workflow_id INTO :new_workflow_id " + ) + var_map = workflow_spec.valuesMap(useSeq=True) + var_map[":new_workflow_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + workflow_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_workflow_id"]))) + return workflow_id + except Exception: + return None + + def insert_workflow_step(self, wf_step_spec: WFStepSpec) -> int | None: + """ + Insert a new workflow step specification into the database + + Args: + wf_step_spec (WFStepSpec): The 
workflow step specification to insert + + Returns: + int | None: The ID of the inserted workflow step if successful, otherwise None + """ + comment = " /* DBProxy.insert_workflow_step */" + tmp_log = self.create_tagged_logger(comment, f"step_id={wf_step_spec.step_id}") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to insert workflow step + wf_step_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflow_steps ({wf_step_spec.columnNames()}) " + f"VALUES ({wf_step_spec.bindInsertValuesExpression()}) " + f"RETURNING step_id INTO :new_step_id " + ) + var_map = wf_step_spec.valuesMap(useSeq=True) + var_map[":new_step_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + step_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_step_id"]))) + return step_id + except Exception: + return None + + def insert_workflow_data(self, wf_data_spec: WFDataSpec) -> int | None: + """ + Insert a new workflow data specification into the database + + Args: + wf_data_spec (WFDataSpec): The workflow data specification to insert + + Returns: + int | None: The ID of the inserted workflow data if successful, otherwise None + """ + comment = " /* DBProxy.insert_workflow_data */" + tmp_log = self.create_tagged_logger(comment, f"data_id={wf_data_spec.data_id}") + tmp_log.debug("start") + try: + with self.transaction(tmp_log=tmp_log) as (cur, _): + # sql to insert workflow data + wf_data_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflow_data ({wf_data_spec.columnNames()}) " + f"VALUES ({wf_data_spec.bindInsertValuesExpression()}) " + f"RETURNING data_id INTO :new_data_id " + ) + var_map = wf_data_spec.valuesMap(useSeq=True) + var_map[":new_data_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + data_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_data_id"]))) + 
return data_id + except Exception: + return None + def update_workflow(self, workflow_spec: WorkflowSpec) -> WorkflowSpec | None: """ Update a workflow specification in the database diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 00bd7c651..c23ce3aee 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -9,17 +9,15 @@ import traceback from collections import namedtuple from contextlib import contextmanager -from dataclasses import MISSING, InitVar, asdict, dataclass, field from datetime import datetime, timedelta from typing import Any, Dict, List -from pandacommon.pandalogger.LogWrapper import LogWrapper from pandacommon.pandalogger.PandaLogger import PandaLogger -from pandacommon.pandautils.base import SpecBase from pandacommon.pandautils.PandaUtils import get_sql_IN_bind_variables, naive_utcnow from pandaserver.config import panda_config from pandaserver.dataservice.ddm import rucioAPI +from pandaserver.workflow.workflow_specs import WFDataSpec, WFStepSpec, WorkflowSpec import polars as pl # isort:skip @@ -53,175 +51,6 @@ def json_serialize_default(obj): # =============================================================== -class WorkflowBaseSpec(SpecBase): - """ - Base class for workflow related specifications - """ - - @property - def parameter_map(self) -> dict: - """ - Get the dictionary parsed by the parameters attribute in JSON - Possible parameters: - ... - - Returns: - dict : dict of parameters if it is JSON or empty dict if null - """ - if self.parameters is None: - return {} - else: - return json.loads(self.parameters) - - @parameter_map.setter - def parameter_map(self, value_map: dict): - """ - Set the dictionary and store in parameters attribute in JSON - - Args: - value_map (dict): dict to set the parameter map - """ - self.parameters = json.dumps(value_map) - - def get_parameter(self, param: str) -> Any: - """ - Get the value of one parameter. 
None as default - - Args: - param (str): parameter name - - Returns: - Any : value of the parameter; None if parameter not set - """ - tmp_dict = self.parameter_map - return tmp_dict.get(param) - - def set_parameter(self, param: str, value): - """ - Set the value of one parameter and store in parameters attribute in JSON - - Args: - param (str): parameter name - value (Any): value of the parameter to set; must be JSON-serializable - """ - tmp_dict = self.parameter_map - tmp_dict[param] = value - self.parameter_map = tmp_dict - - def update_parameters(self, params: dict): - """ - Update values of parameters with a dict and store in parameters attribute in JSON - - Args: - params (dict): dict of parameter names and values to set - """ - tmp_dict = self.parameter_map - tmp_dict.update(params) - self.parameter_map = tmp_dict - - -class WorkflowSpec(WorkflowBaseSpec): - """ - Workflow specification - """ - - # attributes with types - attributes_with_types = ( - AttributeWithType("workflow_id", int), - AttributeWithType("name", str), - AttributeWithType("status", str), - AttributeWithType("prodsourcelabel", str), - AttributeWithType("username", str), - AttributeWithType("creation_time", datetime), - AttributeWithType("start_time", datetime), - AttributeWithType("end_time", datetime), - AttributeWithType("modification_time", datetime), - AttributeWithType("check_time", datetime), - AttributeWithType("locked_by", str), - AttributeWithType("lock_time", datetime), - AttributeWithType("definition_json", str), - AttributeWithType("parameters", str), - ) - # attributes - attributes = tuple([attr.attribute for attr in attributes_with_types]) - # attributes which have 0 by default - _zeroAttrs = () - # attributes to force update - _forceUpdateAttrs = () - # mapping between sequence and attr - _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_ID_SEQ.nextval"} - - -class WFStepSpec(WorkflowBaseSpec): - """ - Workflow Step specification - """ - - # attributes with 
types - attributes_with_types = ( - AttributeWithType("step_id", int), - AttributeWithType("name", str), - AttributeWithType("workflow_id", int), - AttributeWithType("member_id", int), - AttributeWithType("type", str), - AttributeWithType("status", str), - AttributeWithType("flavor", str), - AttributeWithType("target_id", str), - AttributeWithType("creation_time", datetime), - AttributeWithType("start_time", datetime), - AttributeWithType("end_time", datetime), - AttributeWithType("modification_time", datetime), - AttributeWithType("check_time", datetime), - AttributeWithType("locked_by", str), - AttributeWithType("lock_time", datetime), - AttributeWithType("parameters", str), - ) - # attributes - attributes = tuple([attr.attribute for attr in attributes_with_types]) - # attributes which have 0 by default - _zeroAttrs = () - # attributes to force update - _forceUpdateAttrs = () - # mapping between sequence and attr - _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_STEP_ID_SEQ.nextval"} - - -class WFDataSpec(WorkflowBaseSpec): - """ - Workflow Data specification - """ - - # attributes with types - attributes_with_types = ( - AttributeWithType("data_id", int), - AttributeWithType("name", str), - AttributeWithType("workflow_id", int), - AttributeWithType("type", str), - AttributeWithType("status", str), - AttributeWithType("flavor", str), - AttributeWithType("target_id", str), - AttributeWithType("creation_time", datetime), - AttributeWithType("start_time", datetime), - AttributeWithType("end_time", datetime), - AttributeWithType("modification_time", datetime), - AttributeWithType("check_time", datetime), - AttributeWithType("locked_by", str), - AttributeWithType("lock_time", datetime), - AttributeWithType("parameters", str), - ) - # attributes - attributes = tuple([attr.attribute for attr in attributes_with_types]) - # attributes which have 0 by default - _zeroAttrs = () - # attributes to force update - _forceUpdateAttrs = () - # mapping between 
sequence and attr - _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_DATA_ID_SEQ.nextval"} - - -# ============================================================== - - class WorkflowInterface(object): """ Interface for workflow management methods @@ -316,7 +145,7 @@ def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): # Add methods for workflow management here - def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: str, workflow_definition_json: str, *args, **kwargs): + def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: str, workflow_definition_json: str, *args, **kwargs) -> int | None: """ Register a new workflow @@ -327,6 +156,9 @@ def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: workflow_definition_json (str): JSON string defining the workflow *args: Additional arguments **kwargs: Additional keyword arguments + + Returns: + int | None: The ID of the registered workflow if successful, otherwise None """ # Implementation of workflow registration logic ... 
@@ -337,13 +169,13 @@ def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: workflow_spec.definition_json = workflow_definition_json workflow_spec.creation_time = naive_utcnow() workflow_spec.status = "registered" - # Update DB - ret_wf_spec = self.tbif.update_workflow(workflow_spec, *args, **kwargs) - if ret_wf_spec is None: - logger.error(f"Failed to register workflow prodsourcelabel={ret_wf_spec.prodsourcelabel} name={ret_wf_spec.name}") + # Insert to DB + ret_workflow_id = self.tbif.insert_workflow(workflow_spec) + if ret_workflow_id is None: + logger.error(f"Failed to register workflow prodsourcelabel={prodsourcelabel} name={workflow_name}") return None - logger.info(f"Registered workflow prodsourcelabel={ret_wf_spec.prodsourcelabel} name={ret_wf_spec.name} workflow_id={ret_wf_spec.workflow_id}") - return ret_wf_spec + logger.info(f"Registered workflow prodsourcelabel={prodsourcelabel} name={workflow_name} workflow_id={ret_workflow_id}") + return ret_workflow_id #### Workflow status transitions diff --git a/pandaserver/workflow/workflow_specs.py b/pandaserver/workflow/workflow_specs.py new file mode 100644 index 000000000..e868bf91e --- /dev/null +++ b/pandaserver/workflow/workflow_specs.py @@ -0,0 +1,185 @@ +import json +from collections import namedtuple +from dataclasses import MISSING, InitVar, asdict, dataclass, field +from datetime import datetime, timedelta +from typing import Any, Dict, List + +from pandacommon.pandalogger.PandaLogger import PandaLogger +from pandacommon.pandautils.base import SpecBase + +from pandaserver.config import panda_config + +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + +# named tuple for attribute with type +AttributeWithType = namedtuple("AttributeWithType", ["attribute", "type"]) + + +# =============================================================== + + +class WorkflowBaseSpec(SpecBase): + """ + Base class for workflow related specifications + """ + + @property + def 
parameter_map(self) -> dict: + """ + Get the dictionary parsed by the parameters attribute in JSON + Possible parameters: + ... + + Returns: + dict : dict of parameters if it is JSON or empty dict if null + """ + if self.parameters is None: + return {} + else: + return json.loads(self.parameters) + + @parameter_map.setter + def parameter_map(self, value_map: dict): + """ + Set the dictionary and store in parameters attribute in JSON + + Args: + value_map (dict): dict to set the parameter map + """ + self.parameters = json.dumps(value_map) + + def get_parameter(self, param: str) -> Any: + """ + Get the value of one parameter. None as default + + Args: + param (str): parameter name + + Returns: + Any : value of the parameter; None if parameter not set + """ + tmp_dict = self.parameter_map + return tmp_dict.get(param) + + def set_parameter(self, param: str, value): + """ + Set the value of one parameter and store in parameters attribute in JSON + + Args: + param (str): parameter name + value (Any): value of the parameter to set; must be JSON-serializable + """ + tmp_dict = self.parameter_map + tmp_dict[param] = value + self.parameter_map = tmp_dict + + def update_parameters(self, params: dict): + """ + Update values of parameters with a dict and store in parameters attribute in JSON + + Args: + params (dict): dict of parameter names and values to set + """ + tmp_dict = self.parameter_map + tmp_dict.update(params) + self.parameter_map = tmp_dict + + +class WorkflowSpec(WorkflowBaseSpec): + """ + Workflow specification + """ + + # attributes with types + attributes_with_types = ( + AttributeWithType("workflow_id", int), + AttributeWithType("name", str), + AttributeWithType("status", str), + AttributeWithType("prodsourcelabel", str), + AttributeWithType("username", str), + AttributeWithType("creation_time", datetime), + AttributeWithType("start_time", datetime), + AttributeWithType("end_time", datetime), + AttributeWithType("modification_time", datetime), + 
AttributeWithType("check_time", datetime), + AttributeWithType("locked_by", str), + AttributeWithType("lock_time", datetime), + AttributeWithType("definition_json", str), + AttributeWithType("parameters", str), + ) + # attributes + attributes = tuple([attr.attribute for attr in attributes_with_types]) + # attributes which have 0 by default + _zeroAttrs = () + # attributes to force update + _forceUpdateAttrs = () + # mapping between sequence and attr + _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_ID_SEQ.nextval"} + + +class WFStepSpec(WorkflowBaseSpec): + """ + Workflow Step specification + """ + + # attributes with types + attributes_with_types = ( + AttributeWithType("step_id", int), + AttributeWithType("name", str), + AttributeWithType("workflow_id", int), + AttributeWithType("member_id", int), + AttributeWithType("type", str), + AttributeWithType("status", str), + AttributeWithType("flavor", str), + AttributeWithType("target_id", str), + AttributeWithType("creation_time", datetime), + AttributeWithType("start_time", datetime), + AttributeWithType("end_time", datetime), + AttributeWithType("modification_time", datetime), + AttributeWithType("check_time", datetime), + AttributeWithType("locked_by", str), + AttributeWithType("lock_time", datetime), + AttributeWithType("parameters", str), + ) + # attributes + attributes = tuple([attr.attribute for attr in attributes_with_types]) + # attributes which have 0 by default + _zeroAttrs = () + # attributes to force update + _forceUpdateAttrs = () + # mapping between sequence and attr + _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_STEP_ID_SEQ.nextval"} + + +class WFDataSpec(WorkflowBaseSpec): + """ + Workflow Data specification + """ + + # attributes with types + attributes_with_types = ( + AttributeWithType("data_id", int), + AttributeWithType("name", str), + AttributeWithType("workflow_id", int), + AttributeWithType("type", str), + AttributeWithType("status", str), + 
AttributeWithType("flavor", str), + AttributeWithType("target_id", str), + AttributeWithType("creation_time", datetime), + AttributeWithType("start_time", datetime), + AttributeWithType("end_time", datetime), + AttributeWithType("modification_time", datetime), + AttributeWithType("check_time", datetime), + AttributeWithType("locked_by", str), + AttributeWithType("lock_time", datetime), + AttributeWithType("parameters", str), + ) + # attributes + attributes = tuple([attr.attribute for attr in attributes_with_types]) + # attributes which have 0 by default + _zeroAttrs = () + # attributes to force update + _forceUpdateAttrs = () + # mapping between sequence and attr + _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_DATA_ID_SEQ.nextval"} From 2976f4ae5bbe731ca2d1c63fd836251f8f3d34b3 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 13 Aug 2025 15:24:01 +0200 Subject: [PATCH 009/101] fixes --- .../taskbuffer/db_proxy_mods/workflow_module.py | 15 +++++++++------ .../workflow/test_workflow_core_functions.py | 11 ++++++++--- pandaserver/workflow/workflow_core.py | 2 +- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index 9f41f9b0c..24af73dfd 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -423,7 +423,7 @@ def insert_workflow(self, workflow_spec: WorkflowSpec) -> int | None: int | None: The ID of the inserted workflow if successful, otherwise None """ comment = " /* DBProxy.insert_workflow */" - tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_spec.workflow_id}") + tmp_log = self.create_tagged_logger(comment, "") tmp_log.debug("start") try: with self.transaction(tmp_log=tmp_log) as (cur, _): @@ -431,13 +431,14 @@ def insert_workflow(self, workflow_spec: WorkflowSpec) -> int | None: workflow_spec.creation_time = 
naive_utcnow() sql_insert = ( f"INSERT INTO {panda_config.schemaJEDI}.workflows ({workflow_spec.columnNames()}) " - f"VALUES ({workflow_spec.bindInsertValuesExpression()}) " + f"{workflow_spec.bindValuesExpression()} " f"RETURNING workflow_id INTO :new_workflow_id " ) var_map = workflow_spec.valuesMap(useSeq=True) var_map[":new_workflow_id"] = self.cur.var(varNUMBER) self.cur.execute(sql_insert + comment, var_map) workflow_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_workflow_id"]))) + tmp_log.debug(f"inserted workflow_id={workflow_id}") return workflow_id except Exception: return None @@ -453,7 +454,7 @@ def insert_workflow_step(self, wf_step_spec: WFStepSpec) -> int | None: int | None: The ID of the inserted workflow step if successful, otherwise None """ comment = " /* DBProxy.insert_workflow_step */" - tmp_log = self.create_tagged_logger(comment, f"step_id={wf_step_spec.step_id}") + tmp_log = self.create_tagged_logger(comment, "") tmp_log.debug("start") try: with self.transaction(tmp_log=tmp_log) as (cur, _): @@ -461,13 +462,14 @@ def insert_workflow_step(self, wf_step_spec: WFStepSpec) -> int | None: wf_step_spec.creation_time = naive_utcnow() sql_insert = ( f"INSERT INTO {panda_config.schemaJEDI}.workflow_steps ({wf_step_spec.columnNames()}) " - f"VALUES ({wf_step_spec.bindInsertValuesExpression()}) " + f"{wf_step_spec.bindValuesExpression()} " f"RETURNING step_id INTO :new_step_id " ) var_map = wf_step_spec.valuesMap(useSeq=True) var_map[":new_step_id"] = self.cur.var(varNUMBER) self.cur.execute(sql_insert + comment, var_map) step_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_step_id"]))) + tmp_log.debug(f"inserted step_id={step_id}") return step_id except Exception: return None @@ -483,7 +485,7 @@ def insert_workflow_data(self, wf_data_spec: WFDataSpec) -> int | None: int | None: The ID of the inserted workflow data if successful, otherwise None """ comment = " /* DBProxy.insert_workflow_data */" - tmp_log = 
self.create_tagged_logger(comment, f"data_id={wf_data_spec.data_id}") + tmp_log = self.create_tagged_logger(comment, "") tmp_log.debug("start") try: with self.transaction(tmp_log=tmp_log) as (cur, _): @@ -491,13 +493,14 @@ def insert_workflow_data(self, wf_data_spec: WFDataSpec) -> int | None: wf_data_spec.creation_time = naive_utcnow() sql_insert = ( f"INSERT INTO {panda_config.schemaJEDI}.workflow_data ({wf_data_spec.columnNames()}) " - f"VALUES ({wf_data_spec.bindInsertValuesExpression()}) " + f"{wf_data_spec.bindValuesExpression()} " f"RETURNING data_id INTO :new_data_id " ) var_map = wf_data_spec.valuesMap(useSeq=True) var_map[":new_data_id"] = self.cur.var(varNUMBER) self.cur.execute(sql_insert + comment, var_map) data_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_data_id"]))) + tmp_log.debug(f"inserted data_id={data_id}") return data_id except Exception: return None diff --git a/pandaserver/workflow/test_workflow_core_functions.py b/pandaserver/workflow/test_workflow_core_functions.py index d073dd0ad..58a7088fa 100644 --- a/pandaserver/workflow/test_workflow_core_functions.py +++ b/pandaserver/workflow/test_workflow_core_functions.py @@ -1,3 +1,4 @@ +import json import sys from pandacommon.pandautils.thread_utils import GenericThread @@ -18,11 +19,13 @@ # workflow definition json -wfd_json = """ +wfd_json = json.dumps( + json.loads( + """ { "root_inputs": { - "sig_bg_comb.cwl#background': 'mc16_5TeV.361238.Pythia8EvtGen_A3NNPDF23LO_minbias_inelastic_low.merge.HITS.e6446_s3238_s3250/", - "sig_bg_comb.cwl#signal': 'mc16_valid:mc16_valid.900248.PG_singlepion_flatPt2to50.simul.HITS.e8312_s3238_tid26378578_00" + "sig_bg_comb.cwl#background": "mc16_5TeV.361238.Pythia8EvtGen_A3NNPDF23LO_minbias_inelastic_low.merge.HITS.e6446_s3238_s3250/", + "sig_bg_comb.cwl#signal": "mc16_valid:mc16_valid.900248.PG_singlepion_flatPt2to50.simul.HITS.e8312_s3238_tid26378578_00" }, "root_outputs": {"sig_bg_comb.cwl#combine/outDS": {"value": 
"user.me.my_outDS_005_combine"}}, "nodes": [ @@ -322,6 +325,8 @@ ] } """ + ) +) # interface for workflow operations diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index c23ce3aee..3d77d95b9 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -174,7 +174,7 @@ def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: if ret_workflow_id is None: logger.error(f"Failed to register workflow prodsourcelabel={prodsourcelabel} name={workflow_name}") return None - logger.info(f"Registered workflow prodsourcelabel={prodsourcelabel} name={workflow_name} workflow_id={ret_workflow_id}") + logger.info(f"Registered workflow prodsourcelabel={prodsourcelabel} username={username} name={workflow_name} workflow_id={ret_workflow_id}") return ret_workflow_id #### Workflow status transitions From 204a61a461372c37932955f43ce67c907e7be791 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 15 Aug 2025 12:28:07 +0200 Subject: [PATCH 010/101] workflow: add process registered workflow --- pandaserver/taskbuffer/TaskBuffer.py | 4 + .../db_proxy_mods/workflow_module.py | 235 ++++++++++++++---- .../workflow/test_workflow_core_functions.py | 21 +- .../{workflow_specs.py => workflow_base.py} | 74 +++++- pandaserver/workflow/workflow_core.py | 86 +++++-- 5 files changed, 348 insertions(+), 72 deletions(-) rename pandaserver/workflow/{workflow_specs.py => workflow_base.py} (80%) diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 1e6409aa7..6fb7da60b 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2778,6 +2778,10 @@ def update_workflow_data(self, wf_data_spec): with self.proxyPool.get() as proxy: return proxy.update_workflow_data(wf_data_spec) + def upsert_workflow_entities(self, workflow_id, actions_dict=None, workflow_spec=None, step_specs=None, data_specs=None): + with self.proxyPool.get() as 
proxy: + return proxy.upsert_workflow_entities(workflow_id, actions_dict, workflow_spec, step_specs, data_specs) + # ================================================================== diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index 24af73dfd..b5477635f 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -13,7 +13,14 @@ from pandaserver.taskbuffer.db_proxy_mods.base_module import BaseModule, varNUMBER from pandaserver.taskbuffer.db_proxy_mods.entity_module import get_entity_module from pandaserver.taskbuffer.JobSpec import JobSpec -from pandaserver.workflow.workflow_specs import WFDataSpec, WFStepSpec, WorkflowSpec +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFStepSpec, + WFStepStatus, + WorkflowSpec, + WorkflowStatus, +) # Module class to define methods related to workflow @@ -71,9 +78,9 @@ def get_workflow_step(self, step_id: int) -> WFStepSpec | None: tmp_log.error("more than one steps; unexpected") else: for res in res_list: - wf_step_spec = WFStepSpec() - wf_step_spec.pack(res) - return wf_step_spec + step_spec = WFStepSpec() + step_spec.pack(res) + return step_spec else: tmp_log.warning("no step found; skipped") return None @@ -99,9 +106,9 @@ def get_workflow_data(self, data_id: int) -> WFDataSpec | None: tmp_log.error("more than one data; unexpected") else: for res in res_list: - wf_data_spec = WFDataSpec() - wf_data_spec.pack(res) - return wf_data_spec + data_spec = WFDataSpec() + data_spec.pack(res) + return data_spec else: tmp_log.warning("no data found; skipped") return None @@ -123,12 +130,12 @@ def get_steps_of_workflow(self, workflow_id: int) -> list[WFStepSpec]: self.cur.execute(sql + comment, var_map) res_list = self.cur.fetchall() if res_list is not None: - wf_step_specs = [] + step_specs = [] for res in res_list: - wf_step_spec = WFStepSpec() - 
wf_step_spec.pack(res) - wf_step_specs.append(wf_step_spec) - return wf_step_specs + step_spec = WFStepSpec() + step_spec.pack(res) + step_specs.append(step_spec) + return step_specs else: tmp_log.warning("no steps found; skipped") return [] @@ -150,12 +157,12 @@ def get_data_of_workflow(self, workflow_id: int) -> list[WFDataSpec]: self.cur.execute(sql + comment, var_map) res_list = self.cur.fetchall() if res_list is not None: - wf_data_specs = [] + data_specs = [] for res in res_list: - wf_data_spec = WFDataSpec() - wf_data_spec.pack(res) - wf_data_specs.append(wf_data_spec) - return wf_data_specs + data_spec = WFDataSpec() + data_spec.pack(res) + data_specs.append(data_spec) + return data_specs else: tmp_log.warning("no data found; skipped") return [] @@ -443,12 +450,12 @@ def insert_workflow(self, workflow_spec: WorkflowSpec) -> int | None: except Exception: return None - def insert_workflow_step(self, wf_step_spec: WFStepSpec) -> int | None: + def insert_workflow_step(self, step_spec: WFStepSpec) -> int | None: """ Insert a new workflow step specification into the database Args: - wf_step_spec (WFStepSpec): The workflow step specification to insert + step_spec (WFStepSpec): The workflow step specification to insert Returns: int | None: The ID of the inserted workflow step if successful, otherwise None @@ -459,13 +466,13 @@ def insert_workflow_step(self, wf_step_spec: WFStepSpec) -> int | None: try: with self.transaction(tmp_log=tmp_log) as (cur, _): # sql to insert workflow step - wf_step_spec.creation_time = naive_utcnow() + step_spec.creation_time = naive_utcnow() sql_insert = ( - f"INSERT INTO {panda_config.schemaJEDI}.workflow_steps ({wf_step_spec.columnNames()}) " - f"{wf_step_spec.bindValuesExpression()} " + f"INSERT INTO {panda_config.schemaJEDI}.workflow_steps ({step_spec.columnNames()}) " + f"{step_spec.bindValuesExpression()} " f"RETURNING step_id INTO :new_step_id " ) - var_map = wf_step_spec.valuesMap(useSeq=True) + var_map = 
step_spec.valuesMap(useSeq=True) var_map[":new_step_id"] = self.cur.var(varNUMBER) self.cur.execute(sql_insert + comment, var_map) step_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_step_id"]))) @@ -474,12 +481,12 @@ def insert_workflow_step(self, wf_step_spec: WFStepSpec) -> int | None: except Exception: return None - def insert_workflow_data(self, wf_data_spec: WFDataSpec) -> int | None: + def insert_workflow_data(self, data_spec: WFDataSpec) -> int | None: """ Insert a new workflow data specification into the database Args: - wf_data_spec (WFDataSpec): The workflow data specification to insert + data_spec (WFDataSpec): The workflow data specification to insert Returns: int | None: The ID of the inserted workflow data if successful, otherwise None @@ -490,13 +497,13 @@ def insert_workflow_data(self, wf_data_spec: WFDataSpec) -> int | None: try: with self.transaction(tmp_log=tmp_log) as (cur, _): # sql to insert workflow data - wf_data_spec.creation_time = naive_utcnow() + data_spec.creation_time = naive_utcnow() sql_insert = ( - f"INSERT INTO {panda_config.schemaJEDI}.workflow_data ({wf_data_spec.columnNames()}) " - f"{wf_data_spec.bindValuesExpression()} " + f"INSERT INTO {panda_config.schemaJEDI}.workflow_data ({data_spec.columnNames()}) " + f"{data_spec.bindValuesExpression()} " f"RETURNING data_id INTO :new_data_id " ) - var_map = wf_data_spec.valuesMap(useSeq=True) + var_map = data_spec.valuesMap(useSeq=True) var_map[":new_data_id"] = self.cur.var(varNUMBER) self.cur.execute(sql_insert + comment, var_map) data_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_data_id"]))) @@ -533,54 +540,188 @@ def update_workflow(self, workflow_spec: WorkflowSpec) -> WorkflowSpec | None: except Exception: return None - def update_workflow_step(self, wf_step_spec: WFStepSpec) -> WFStepSpec | None: + def update_workflow_step(self, step_spec: WFStepSpec) -> WFStepSpec | None: """ Update a workflow step specification in the database Args: - 
wf_step_spec (WFStepSpec): The workflow step specification to update + step_spec (WFStepSpec): The workflow step specification to update Returns: WFStepSpec | None: The updated workflow step specification if successful, otherwise None """ comment = " /* DBProxy.update_workflow_step */" - tmp_log = self.create_tagged_logger(comment, f"step_id={wf_step_spec.step_id}") + tmp_log = self.create_tagged_logger(comment, f"step_id={step_spec.step_id}") tmp_log.debug("start") try: with self.transaction(tmp_log=tmp_log) as (cur, _): # sql to update workflow step - wf_step_spec.modification_time = naive_utcnow() - sql_update = f"UPDATE {panda_config.schemaJEDI}.workflow_steps " f"SET {wf_step_spec.bindUpdateChangesExpression()} " "WHERE step_id=:step_id " - var_map = wf_step_spec.valuesMap(useSeq=False, onlyChanged=True) - var_map[":step_id"] = wf_step_spec.step_id + step_spec.modification_time = naive_utcnow() + sql_update = f"UPDATE {panda_config.schemaJEDI}.workflow_steps " f"SET {step_spec.bindUpdateChangesExpression()} " "WHERE step_id=:step_id " + var_map = step_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":step_id"] = step_spec.step_id cur.execute(sql_update + comment, var_map) - tmp_log.debug(f"updated {wf_step_spec.bindUpdateChangesExpression()}") - return wf_step_spec + tmp_log.debug(f"updated {step_spec.bindUpdateChangesExpression()}") + return step_spec except Exception: return None - def update_workflow_data(self, wf_data_spec: WFDataSpec) -> WFDataSpec | None: + def update_workflow_data(self, data_spec: WFDataSpec) -> WFDataSpec | None: """ Update a workflow data specification in the database Args: - wf_data_spec (WFDataSpec): The workflow data specification to update + data_spec (WFDataSpec): The workflow data specification to update Returns: WFDataSpec | None: The updated workflow data specification if successful, otherwise None """ comment = " /* DBProxy.update_workflow_data */" - tmp_log = self.create_tagged_logger(comment, 
f"data_id={wf_data_spec.data_id}") + tmp_log = self.create_tagged_logger(comment, f"data_id={data_spec.data_id}") tmp_log.debug("start") try: with self.transaction(tmp_log=tmp_log) as (cur, _): # sql to update workflow data - wf_data_spec.modification_time = naive_utcnow() - sql_update = f"UPDATE {panda_config.schemaJEDI}.workflow_data " f"SET {wf_data_spec.bindUpdateChangesExpression()} " "WHERE data_id=:data_id " - var_map = wf_data_spec.valuesMap(useSeq=False, onlyChanged=True) - var_map[":data_id"] = wf_data_spec.data_id + data_spec.modification_time = naive_utcnow() + sql_update = f"UPDATE {panda_config.schemaJEDI}.workflow_data " f"SET {data_spec.bindUpdateChangesExpression()} " "WHERE data_id=:data_id " + var_map = data_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":data_id"] = data_spec.data_id cur.execute(sql_update + comment, var_map) - tmp_log.debug(f"updated {wf_data_spec.bindUpdateChangesExpression()}") - return wf_data_spec + tmp_log.debug(f"updated {data_spec.bindUpdateChangesExpression()}") + return data_spec + except Exception: + return None + + def upsert_workflow_entities( + self, + workflow_id: int | None, + actions_dict: dict | None = None, + workflow_spec: WorkflowSpec | None = None, + step_specs: list[WFStepSpec] | None = None, + data_specs: list[WFDataSpec] | None = None, + ) -> dict | None: + """ + Update or insert (if not existing) steps and data associated with a workflow within a transaction + + Args: + workflow_id (int | None): ID of the workflow to update, or None if to insert + actions_dict (dict | None): Dictionary of actions (insert, update, or None) to perform on the entities (workflow, steps, data), e.g. 
{"workflow": None, "steps": "insert", "data": "update"} + workflow_spec (WorkflowSpec|None): The workflow specification to update or insert + step_specs (list[WFStepSpec]|None): List of workflow step specifications to update or insert + data_specs (list[WFDataSpec]|None): List of workflow data specifications to update or insert + + Returns: + dict | None: Dictionary containing the number of steps and data upserted, or None if an error occurred + """ + comment = " /* DBProxy.upsert_workflow_entities */" + # Determine actions of each entity + action_of_workflow = None + action_of_steps = None + action_of_data = None + if actions_dict: + if (tmp_action_of_workflow := actions_dict.get("workflow")) and workflow_spec: + if tmp_action_of_workflow == "insert" and workflow_id is None: + action_of_workflow = "insert" + elif tmp_action_of_workflow == "update" and workflow_id is not None and workflow_spec.workflow_id == workflow_id: + action_of_workflow = "update" + action_of_steps = actions_dict.get("steps") if (workflow_id and step_specs) else None + action_of_data = actions_dict.get("data") if (workflow_id and data_specs) else None + actions_dict = { + "workflow": action_of_workflow, + "steps": action_of_steps, + "data": action_of_data, + } + # log + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_spec.workflow_id}") + tmp_log.debug(f"start, actions={actions_dict}") + # skip if no action specified + if not any(actions_dict.values()): + self.log.warning("no action specified; skipped") + return None + try: + n_steps_upserted = 0 + n_data_upserted = 0 + with self.transaction(tmp_log=tmp_log) as (cur, _): + # action for data + if action_of_data == "insert": + for data_spec in data_specs: + data_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflow_data ({data_spec.columnNames()}) " + f"{data_spec.bindValuesExpression()} " + f"RETURNING data_id INTO :new_data_id " + ) + var_map = 
data_spec.valuesMap(useSeq=True) + var_map[":new_data_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + data_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_data_id"]))) + data_spec.data_id = data_id + n_data_upserted += 1 + tmp_log.debug(f"inserted a data workflow_id={workflow_id} data_id={data_id}") + elif action_of_data == "update": + for data_spec in data_specs: + data_spec.modification_time = naive_utcnow() + sql_update = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_data " f"SET {data_spec.bindUpdateChangesExpression()} " "WHERE data_id=:data_id " + ) + var_map = data_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":data_id"] = data_spec.data_id + self.cur.execute(sql_update + comment, var_map) + n_data_upserted += 1 + tmp_log.debug(f"updated a data workflow_id={workflow_id} data_id={data_spec.data_id}") + # action for steps + if action_of_steps == "insert": + for step_spec in step_specs: + step_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflow_steps ({step_spec.columnNames()}) " + f"{step_spec.bindValuesExpression()} " + f"RETURNING step_id INTO :new_step_id " + ) + var_map = step_spec.valuesMap(useSeq=True) + var_map[":new_step_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + step_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_step_id"]))) + step_spec.step_id = step_id + n_steps_upserted += 1 + tmp_log.debug(f"inserted a step workflow_id={workflow_id} step_id={step_id}") + elif action_of_steps == "update": + for step_spec in step_specs: + step_spec.modification_time = naive_utcnow() + sql_update = ( + f"UPDATE {panda_config.schemaJEDI}.workflow_steps " f"SET {step_spec.bindUpdateChangesExpression()} " "WHERE step_id=:step_id " + ) + var_map = step_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":step_id"] = step_spec.step_id + self.cur.execute(sql_update + comment, var_map) + 
n_steps_upserted += 1 + tmp_log.debug(f"updated a step workflow_id={workflow_id} step_id={step_spec.step_id}") + # action for workflow + if action_of_workflow == "insert": + workflow_spec.creation_time = naive_utcnow() + sql_insert = ( + f"INSERT INTO {panda_config.schemaJEDI}.workflows ({workflow_spec.columnNames()}) " + f"{workflow_spec.bindValuesExpression()} " + f"RETURNING workflow_id INTO :new_workflow_id " + ) + var_map = workflow_spec.valuesMap(useSeq=True) + var_map[":new_workflow_id"] = self.cur.var(varNUMBER) + self.cur.execute(sql_insert + comment, var_map) + workflow_id = int(self.getvalue_corrector(self.cur.getvalue(var_map[":new_workflow_id"]))) + workflow_spec.workflow_id = workflow_id + tmp_log.debug(f"inserted a workflow workflow_id={workflow_id}") + elif action_of_workflow == "update": + workflow_spec.modification_time = naive_utcnow() + sql_update = ( + f"UPDATE {panda_config.schemaJEDI}.workflows " f"SET {workflow_spec.bindUpdateChangesExpression()} " "WHERE workflow_id=:workflow_id " + ) + var_map = workflow_spec.valuesMap(useSeq=False, onlyChanged=True) + var_map[":workflow_id"] = workflow_spec.workflow_id + self.cur.execute(sql_update + comment, var_map) + tmp_log.debug(f"updated a workflow workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("actions completed") + # Summary + tmp_log.debug(f"done, actions={actions_dict}, upserted workflow_id={workflow_id} with {n_steps_upserted} steps and {n_data_upserted} data") + return {"workflow_id": workflow_id, "steps": n_steps_upserted, "data": n_data_upserted} except Exception: return None diff --git a/pandaserver/workflow/test_workflow_core_functions.py b/pandaserver/workflow/test_workflow_core_functions.py index 58a7088fa..d0f5523a9 100644 --- a/pandaserver/workflow/test_workflow_core_functions.py +++ b/pandaserver/workflow/test_workflow_core_functions.py @@ -343,9 +343,18 @@ # Test cases for workflow core -wf_spec = wfif.register_workflow( - prodsourcelabel=prodsourcelabel, - 
username=username, - workflow_name=workflow_name, - workflow_definition_json=wfd_json, -) + +# Register the workflow +# print("Registering workflow...") +# wf_spec = wfif.register_workflow( +# prodsourcelabel=prodsourcelabel, +# username=username, +# workflow_name=workflow_name, +# workflow_definition_json=wfd_json, +# ) + +wf_spec = taskBuffer.get_workflow(workflow_id=1) + +# Process the registered workflow +print("Processing registered workflow...") +wfif.process_workflow_registered(wf_spec) diff --git a/pandaserver/workflow/workflow_specs.py b/pandaserver/workflow/workflow_base.py similarity index 80% rename from pandaserver/workflow/workflow_specs.py rename to pandaserver/workflow/workflow_base.py index e868bf91e..33ecb3919 100644 --- a/pandaserver/workflow/workflow_specs.py +++ b/pandaserver/workflow/workflow_base.py @@ -16,7 +16,79 @@ AttributeWithType = namedtuple("AttributeWithType", ["attribute", "type"]) -# =============================================================== +# ==== Status of Entities ====================================== + + +class WorkflowStatus(object): + """ + Class to define the status of workflows + """ + + registered = "registered" + checking = "checking" + checked = "checked" + starting = "starting" + running = "running" + done = "done" + failed = "failed" + cancelled = "cancelled" + + +class WFStepStatus(object): + """ + Class to define the status of workflow steps + """ + + registered = "registered" + pending = "pending" + ready = "ready" + submitted = "submitted" + running = "running" + done = "done" + failed = "failed" + cancelled = "cancelled" + + +class WFDataStatus(object): + """ + Class to define the status of workflow data + """ + + registered = "registered" + checking = "checking" + checked_nonex = "checked_nonex" + checked_exist = "checked_exist" + generating_start = "generating_start" + generating_ready = "generating_ready" + done_generated = "done_generated" + done_skipped = "done_skipped" + cancelled = "cancelled" + 
retired = "retired" + + +# ==== Types =================================================== + + +class WFStepType(object): + """ + Class to define the types of workflow steps + """ + + ... + ordinary = "ordinary" + + +class WFDataType(object): + """ + Class to define the types of workflow data + """ + + input = "input" + output = "output" + mid = "mid" + + +# ==== Specifications ========================================== class WorkflowBaseSpec(SpecBase): diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 3d77d95b9..181517ab9 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -12,12 +12,22 @@ from datetime import datetime, timedelta from typing import Any, Dict, List +from pandacommon.pandalogger.LogWrapper import LogWrapper from pandacommon.pandalogger.PandaLogger import PandaLogger from pandacommon.pandautils.PandaUtils import get_sql_IN_bind_variables, naive_utcnow from pandaserver.config import panda_config from pandaserver.dataservice.ddm import rucioAPI -from pandaserver.workflow.workflow_specs import WFDataSpec, WFStepSpec, WorkflowSpec +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFDataType, + WFStepSpec, + WFStepStatus, + WFStepType, + WorkflowSpec, + WorkflowStatus, +) import polars as pl # isort:skip @@ -160,6 +170,8 @@ def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: Returns: int | None: The ID of the registered workflow if successful, otherwise None """ + tmp_log = LogWrapper(logger, f"register_workflow prodsourcelabel={prodsourcelabel} username={username} name={workflow_name}") + tmp_log.debug("start") # Implementation of workflow registration logic ... 
workflow_spec = WorkflowSpec() @@ -172,9 +184,9 @@ def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: # Insert to DB ret_workflow_id = self.tbif.insert_workflow(workflow_spec) if ret_workflow_id is None: - logger.error(f"Failed to register workflow prodsourcelabel={prodsourcelabel} name={workflow_name}") + tmp_log.error(f"Failed to register workflow") return None - logger.info(f"Registered workflow prodsourcelabel={prodsourcelabel} username={username} name={workflow_name} workflow_id={ret_workflow_id}") + tmp_log.info(f"Registered workflow workflow_id={ret_workflow_id}") return ret_workflow_id #### Workflow status transitions @@ -187,22 +199,60 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): Args: workflow_spec (WorkflowSpec): The workflow specification to process """ - # Parse the workflow definition + tmp_log = LogWrapper(logger, f"process_workflow_registered workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("start") try: + # Parse the workflow definition workflow_definition_dict = json.loads(workflow_spec.definition_json) + # initialize + data_specs = [] + step_specs = [] + now_time = naive_utcnow() + # Register root inputs and outputs + for input_name, input_target in workflow_definition_dict["root_inputs"].items(): + data_spec = WFDataSpec() + data_spec.workflow_id = workflow_spec.workflow_id + data_spec.name = input_name + data_spec.target_id = input_target + data_spec.status = WFDataStatus.registered + data_spec.type = WFDataType.input + data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable + data_spec.creation_time = now_time + data_specs.append(data_spec) + for output_name, output_target in workflow_definition_dict["root_outputs"].items(): + data_spec = WFDataSpec() + data_spec.workflow_id = workflow_spec.workflow_id + data_spec.name = output_name + data_spec.target_id = output_target + data_spec.status = WFDataStatus.registered + data_spec.type = WFDataType.output + 
data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable + data_spec.creation_time = now_time + data_specs.append(data_spec) # Register steps based on nodes in the definition for node in workflow_definition_dict["nodes"]: - step_spec = WFStepSpec() - step_spec.workflow_id = workflow_spec.workflow_id - step_spec.member_id = node["id"] - step_spec.status = "registered" - step_spec.type = node.get("type", "default") - # step_spec.parameters = json.dumps(node.get("parameters", {})) - step_spec.creation_time = naive_utcnow() - except Exception as e: - logger.error(f"Failed to parse workflow definition for workflow_id={workflow_spec.workflow_id}: {e}") - - # FIXME: temporary, skip data checking and go to starting directly - workflow_spec.status = "starting" - # Update DB - self.tbif.update_workflow(workflow_spec) + # FIXME: not yet consider scatter, condition, loop, etc. + if not (node.get("condition") or node.get("scatter") or node.get("loop")): + step_spec = WFStepSpec() + step_spec.workflow_id = workflow_spec.workflow_id + step_spec.member_id = node["id"] + step_spec.name = node["name"] + step_spec.status = "registered" + step_spec.type = WFStepType.ordinary + step_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable + step_spec.definition_json = json.dumps(node, default=json_serialize_default) + step_spec.creation_time = now_time + step_specs.append(step_spec) + # FIXME: temporary, skip data checking and go to starting directly + workflow_spec.status = "starting" + # Upsert DB + self.tbif.upsert_workflow_entities( + workflow_spec.workflow_id, + actions_dict={"workflow": "update", "steps": "insert", "data": "insert"}, + workflow_spec=workflow_spec, + step_specs=step_specs, + data_specs=data_specs, + ) + tmp_log.info(f"Processed workflow registered, workflow_id={workflow_spec.workflow_id}, steps={len(step_specs)}, data={len(data_specs)}") + except Exception: + tmp_log.error(f"got error ; {traceback.format_exc()}") From 
da06be0aad2d8587d583ce0d1756cdcd8bbe162b Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 15 Aug 2025 14:11:59 +0200 Subject: [PATCH 011/101] fix --- pandaserver/workflow/workflow_base.py | 5 +++-- pandaserver/workflow/workflow_core.py | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 33ecb3919..0e156150d 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -212,6 +212,7 @@ class WFStepSpec(WorkflowBaseSpec): AttributeWithType("check_time", datetime), AttributeWithType("locked_by", str), AttributeWithType("lock_time", datetime), + AttributeWithType("definition_json", str), AttributeWithType("parameters", str), ) # attributes @@ -221,7 +222,7 @@ class WFStepSpec(WorkflowBaseSpec): # attributes to force update _forceUpdateAttrs = () # mapping between sequence and attr - _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_STEP_ID_SEQ.nextval"} + _seqAttrMap = {"step_id": f"{panda_config.schemaJEDI}.WORKFLOW_STEP_ID_SEQ.nextval"} class WFDataSpec(WorkflowBaseSpec): @@ -254,4 +255,4 @@ class WFDataSpec(WorkflowBaseSpec): # attributes to force update _forceUpdateAttrs = () # mapping between sequence and attr - _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_DATA_ID_SEQ.nextval"} + _seqAttrMap = {"data_id": f"{panda_config.schemaJEDI}.WORKFLOW_DATA_ID_SEQ.nextval"} diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 181517ab9..c367d4060 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -189,7 +189,7 @@ def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: tmp_log.info(f"Registered workflow workflow_id={ret_workflow_id}") return ret_workflow_id - #### Workflow status transitions + # ---- Workflow status transitions ------------------------- def 
process_workflow_registered(self, workflow_spec: WorkflowSpec): """ @@ -219,11 +219,11 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable data_spec.creation_time = now_time data_specs.append(data_spec) - for output_name, output_target in workflow_definition_dict["root_outputs"].items(): + for output_name, output_dict in workflow_definition_dict["root_outputs"].items(): data_spec = WFDataSpec() data_spec.workflow_id = workflow_spec.workflow_id data_spec.name = output_name - data_spec.target_id = output_target + data_spec.target_id = output_dict.get("value") data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.output data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable @@ -256,3 +256,5 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): tmp_log.info(f"Processed workflow registered, workflow_id={workflow_spec.workflow_id}, steps={len(step_specs)}, data={len(data_specs)}") except Exception: tmp_log.error(f"got error ; {traceback.format_exc()}") + + # ---- Data status transitions ----------------------------- From 2cb0860cfcf877486192cf482956b056a8225465 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 25 Aug 2025 08:18:38 +0200 Subject: [PATCH 012/101] workflow: update schema, preliminary handlers --- pandaserver/taskbuffer/TaskBuffer.py | 23 ++- .../data_handler_plugins/base_data_handler.py | 7 + .../step_handler_plugins/base_step_handler.py | 38 +++++ .../panda_task_step_handler.py | 91 ++++++++++++ pandaserver/workflow/workflow_base.py | 2 + pandaserver/workflow/workflow_core.py | 140 +++++++++--------- 6 files changed, 224 insertions(+), 77 deletions(-) create mode 100644 pandaserver/workflow/data_handler_plugins/base_data_handler.py create mode 100644 pandaserver/workflow/step_handler_plugins/base_step_handler.py create mode 100644 pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py diff 
--git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 6fb7da60b..f2b665138 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2643,6 +2643,13 @@ def disable_job_cloning(self, jedi_task_id): ret = proxy.disable_job_cloning(jedi_task_id) return ret + # gets statistics on the number of jobs with a specific status for each nucleus at each site + def get_num_jobs_with_status_by_nucleus(self, vo, job_status): + with self.proxyPool.get() as proxy: + return proxy.get_num_jobs_with_status_by_nucleus(vo, job_status) + + # ==== JEDI taskbuffer functions =========================== + # get JEDI task with jediTaskID def getTaskWithID_JEDI(self, jediTaskID, fullFlag=False, lockTask=False, pid=None, lockInterval=None, clearError=False): with self.proxyPool.get() as proxy: @@ -2653,6 +2660,13 @@ def updateInputFilesStaged_JEDI(self, jeditaskid, scope, filenames_dict, chunk_s with self.proxyPool.get() as proxy: return proxy.updateInputFilesStaged_JEDI(jeditaskid, scope, filenames_dict, chunk_size, by, check_scope) + # insert TaskParams + def insertTaskParams_JEDI(self, vo, prodSourceLabel, userName, taskName, taskParams, parent_tid=None): + with self.proxyPool.get() as proxy: + return proxy.insertTaskParams_JEDI(vo, prodSourceLabel, userName, taskName, taskParams, parent_tid) + + # ==== Data Carousel functions ============================= + # insert data carousel requests def insert_data_carousel_requests_JEDI(self, task_id, dc_req_specs): with self.proxyPool.get() as proxy: @@ -2703,12 +2717,7 @@ def resubmit_data_carousel_request_JEDI(self, request_id, exclude_prev_dst=False with self.proxyPool.get() as proxy: return proxy.resubmit_data_carousel_request_JEDI(request_id, exclude_prev_dst) - # gets statistics on the number of jobs with a specific status for each nucleus at each site - def get_num_jobs_with_status_by_nucleus(self, vo, job_status): - with self.proxyPool.get() as proxy: - return 
proxy.get_num_jobs_with_status_by_nucleus(vo, job_status) - - # ==== workflow fucntions ========================================== + # ==== workflow fucntions ================================== def get_workflow(self, workflow_id): with self.proxyPool.get() as proxy: @@ -2782,7 +2791,7 @@ def upsert_workflow_entities(self, workflow_id, actions_dict=None, workflow_spec with self.proxyPool.get() as proxy: return proxy.upsert_workflow_entities(workflow_id, actions_dict, workflow_spec, step_specs, data_specs) - # ================================================================== + # ========================================================== # Singleton diff --git a/pandaserver/workflow/data_handler_plugins/base_data_handler.py b/pandaserver/workflow/data_handler_plugins/base_data_handler.py new file mode 100644 index 000000000..4a439dac0 --- /dev/null +++ b/pandaserver/workflow/data_handler_plugins/base_data_handler.py @@ -0,0 +1,7 @@ +class BaseDataHandler: + """ + Base class for data handlers in the workflow system. + This class provides a common interface and some utility methods for data handlers. + """ + + def __init__(self, *args, **kwargs): ... diff --git a/pandaserver/workflow/step_handler_plugins/base_step_handler.py b/pandaserver/workflow/step_handler_plugins/base_step_handler.py new file mode 100644 index 000000000..1d0c06914 --- /dev/null +++ b/pandaserver/workflow/step_handler_plugins/base_step_handler.py @@ -0,0 +1,38 @@ +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFDataType, + WFStepSpec, + WFStepStatus, + WFStepType, + WorkflowSpec, + WorkflowStatus, +) + + +class BaseStepHandler: + """ + Base class for step handlers in the workflow. + This class provides a common interface and some utility methods for step handlers. + """ + + def __init__(self, task_buffer, *args, **kwargs): + """ + Initialize the step handler with necessary parameters. 
+ + Args: + task_buffer: The task buffer interface to interact with the task database. + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + """ + self.tbif = task_buffer + + def submit_target(self, step_specs: WFStepSpec, **kwargs): + """ + Submit a target for processing the step. + This method should be implemented by subclasses to handle the specifics of target submission. + + Args: + step_specs (WFStepSpec): Specifications of the workflow step to be submitted. + """ + raise NotImplementedError("Subclasses must implement this method.") diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py new file mode 100644 index 000000000..8247a48d3 --- /dev/null +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -0,0 +1,91 @@ +import json + +from pandaserver.workflow.step_handler_plugins.base_step_handler import BaseStepHandler +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFDataType, + WFStepSpec, + WFStepStatus, + WFStepType, + WorkflowSpec, + WorkflowStatus, +) + + +class PandaTaskStepHandler(BaseStepHandler): + """ + Handler for PanDA task steps in the workflow. + This class is responsible for managing the execution of PanDA tasks within a workflow. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the step handler with necessary parameters. + """ + # Initialize base class or any required modules here + super().__init__(*args, **kwargs) + + def submit_target(self, step_spec: WFStepSpec, workflow_spec: WorkflowSpec, **kwargs): + """ + Submit a target for processing the PanDA task step. + This method should be implemented to handle the specifics of PanDA task submission. + """ + + ... 
+ # task_param_map = {} + # task_param_map["taskName"] = step_spec.name + # task_param_map["userName"] = workflow_spec.username + # task_param_map["vo"] = "atlas" + # task_param_map["taskPriority"] = 1000 + # # task_param_map["architecture"] = "i686-slc5-gcc43-opt" + # # task_param_map["transUses"] = "Atlas-17.2.7" + # task_param_map["transUses"] = None + # # task_param_map["transHome"] = "AtlasProduction-17.2.8.10" + # task_param_map["transHome"] = None + # task_param_map["transPath"] = "runGen-00-00-02" + # task_param_map["processingType"] = "reco" + # task_param_map["prodSourceLabel"] = "user" + # # task_param_map["prodSourceLabel"] = "managed" + # task_param_map["taskType"] = "anal" + # # task_param_map["taskType"] = "prod" + # task_param_map["inputPreStaging"] = True + # # task_param_map["panda_data_carousel"] = True + # task_param_map["remove_rule_when_done"] = True + # # task_param_map["workingGroup"] = "AP_Higgs" + # task_param_map["coreCount"] = 1 + # task_param_map["nFiles"] = 1 + # # task_param_map["cloud"] = "US" + # logDatasetName = f"panda.jeditest.log.{uuid.uuid4()}" + # task_param_map["log"] = { + # "dataset": logDatasetName, + # "type": "template", + # "param_type": "log", + # "token": "ATLASDATADISK", + # "value": f"{logDatasetName}.${{SN}}.log.tgz", + # } + # outDatasetName = f"panda.jeditest.NTUP_EMBLLDN.{uuid.uuid4()}" + # task_param_map["jobParameters"] = [ + # { + # "type": "template", + # "param_type": "input", + # "value": "inputAODFile=${IN}", + # "dataset": "mc23_13p6TeV:mc23_13p6TeV.602027.PhH7EG_NLO_LQ_S43_ResProd_lam22_5000_3p5.merge.AOD.e8531_e8528_s4162_s4114_r14622_r14663_tid34033945_00", + # "expand": True, + # }, + # {"type": "template", "param_type": "pseudo_input", "value": "dummy_value", "dataset": "pseudo_dataset"}, + # {"type": "constant", "value": "AMITag=p1462"}, + # { + # "type": "template", + # "param_type": "output", + # "token": "ATLASDATADISK", + # "value": f"outputNTUP_EMBLLDNFile={outDatasetName}.${{SN}}.pool.root", 
+ # "dataset": outDatasetName, + # }, + # ] + + # task_param_json = json.dumps(task_param_map) + + # self.tbif.insertTaskParams_JEDI( + # task_param_map["vo"], task_param_map["prodSourceLabel"], task_param_map["userName"], task_param_map["taskName"], task_param_json + # ) diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 0e156150d..1da9419b7 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -167,6 +167,8 @@ class WorkflowSpec(WorkflowBaseSpec): attributes_with_types = ( AttributeWithType("workflow_id", int), AttributeWithType("name", str), + AttributeWithType("parent_id", int), + AttributeWithType("loop_count", int), AttributeWithType("status", str), AttributeWithType("prodsourcelabel", str), AttributeWithType("username", str), diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index c367d4060..88c386b0a 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -29,7 +29,7 @@ WorkflowStatus, ) -import polars as pl # isort:skip +# import polars as pl # isort:skip # main logger @@ -174,20 +174,20 @@ def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: tmp_log.debug("start") # Implementation of workflow registration logic ... 
- workflow_spec = WorkflowSpec() - workflow_spec.prodsourcelabel = prodsourcelabel - workflow_spec.name = workflow_name - workflow_spec.username = username - workflow_spec.definition_json = workflow_definition_json - workflow_spec.creation_time = naive_utcnow() - workflow_spec.status = "registered" - # Insert to DB - ret_workflow_id = self.tbif.insert_workflow(workflow_spec) - if ret_workflow_id is None: - tmp_log.error(f"Failed to register workflow") - return None - tmp_log.info(f"Registered workflow workflow_id={ret_workflow_id}") - return ret_workflow_id + # workflow_spec = WorkflowSpec() + # workflow_spec.prodsourcelabel = prodsourcelabel + # workflow_spec.name = workflow_name + # workflow_spec.username = username + # workflow_spec.definition_json = workflow_definition_json + # workflow_spec.creation_time = naive_utcnow() + # workflow_spec.status = "registered" + # # Insert to DB + # ret_workflow_id = self.tbif.insert_workflow(workflow_spec) + # if ret_workflow_id is None: + # tmp_log.error(f"Failed to register workflow") + # return None + # tmp_log.info(f"Registered workflow workflow_id={ret_workflow_id}") + # return ret_workflow_id # ---- Workflow status transitions ------------------------- @@ -201,60 +201,60 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): """ tmp_log = LogWrapper(logger, f"process_workflow_registered workflow_id={workflow_spec.workflow_id}") tmp_log.debug("start") - try: - # Parse the workflow definition - workflow_definition_dict = json.loads(workflow_spec.definition_json) - # initialize - data_specs = [] - step_specs = [] - now_time = naive_utcnow() - # Register root inputs and outputs - for input_name, input_target in workflow_definition_dict["root_inputs"].items(): - data_spec = WFDataSpec() - data_spec.workflow_id = workflow_spec.workflow_id - data_spec.name = input_name - data_spec.target_id = input_target - data_spec.status = WFDataStatus.registered - data_spec.type = WFDataType.input - data_spec.flavor = 
"ddm_ds" # FIXME: hardcoded flavor, should be configurable - data_spec.creation_time = now_time - data_specs.append(data_spec) - for output_name, output_dict in workflow_definition_dict["root_outputs"].items(): - data_spec = WFDataSpec() - data_spec.workflow_id = workflow_spec.workflow_id - data_spec.name = output_name - data_spec.target_id = output_dict.get("value") - data_spec.status = WFDataStatus.registered - data_spec.type = WFDataType.output - data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable - data_spec.creation_time = now_time - data_specs.append(data_spec) - # Register steps based on nodes in the definition - for node in workflow_definition_dict["nodes"]: - # FIXME: not yet consider scatter, condition, loop, etc. - if not (node.get("condition") or node.get("scatter") or node.get("loop")): - step_spec = WFStepSpec() - step_spec.workflow_id = workflow_spec.workflow_id - step_spec.member_id = node["id"] - step_spec.name = node["name"] - step_spec.status = "registered" - step_spec.type = WFStepType.ordinary - step_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable - step_spec.definition_json = json.dumps(node, default=json_serialize_default) - step_spec.creation_time = now_time - step_specs.append(step_spec) - # FIXME: temporary, skip data checking and go to starting directly - workflow_spec.status = "starting" - # Upsert DB - self.tbif.upsert_workflow_entities( - workflow_spec.workflow_id, - actions_dict={"workflow": "update", "steps": "insert", "data": "insert"}, - workflow_spec=workflow_spec, - step_specs=step_specs, - data_specs=data_specs, - ) - tmp_log.info(f"Processed workflow registered, workflow_id={workflow_spec.workflow_id}, steps={len(step_specs)}, data={len(data_specs)}") - except Exception: - tmp_log.error(f"got error ; {traceback.format_exc()}") + # try: + # # Parse the workflow definition + # workflow_definition_dict = json.loads(workflow_spec.definition_json) + # # initialize + # data_specs 
= [] + # step_specs = [] + # now_time = naive_utcnow() + # # Register root inputs and outputs + # for input_name, input_target in workflow_definition_dict["root_inputs"].items(): + # data_spec = WFDataSpec() + # data_spec.workflow_id = workflow_spec.workflow_id + # data_spec.name = input_name + # data_spec.target_id = input_target + # data_spec.status = WFDataStatus.registered + # data_spec.type = WFDataType.input + # data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable + # data_spec.creation_time = now_time + # data_specs.append(data_spec) + # for output_name, output_dict in workflow_definition_dict["root_outputs"].items(): + # data_spec = WFDataSpec() + # data_spec.workflow_id = workflow_spec.workflow_id + # data_spec.name = output_name + # data_spec.target_id = output_dict.get("value") + # data_spec.status = WFDataStatus.registered + # data_spec.type = WFDataType.output + # data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable + # data_spec.creation_time = now_time + # data_specs.append(data_spec) + # # Register steps based on nodes in the definition + # for node in workflow_definition_dict["nodes"]: + # # FIXME: not yet consider scatter, condition, loop, etc. 
+ # if not (node.get("condition") or node.get("scatter") or node.get("loop")): + # step_spec = WFStepSpec() + # step_spec.workflow_id = workflow_spec.workflow_id + # step_spec.member_id = node["id"] + # step_spec.name = node["name"] + # step_spec.status = "registered" + # step_spec.type = WFStepType.ordinary + # step_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable + # step_spec.definition_json = json.dumps(node, default=json_serialize_default) + # step_spec.creation_time = now_time + # step_specs.append(step_spec) + # # FIXME: temporary, skip data checking and go to starting directly + # workflow_spec.status = "starting" + # # Upsert DB + # self.tbif.upsert_workflow_entities( + # workflow_spec.workflow_id, + # actions_dict={"workflow": "update", "steps": "insert", "data": "insert"}, + # workflow_spec=workflow_spec, + # step_specs=step_specs, + # data_specs=data_specs, + # ) + # tmp_log.info(f"Processed workflow registered, workflow_id={workflow_spec.workflow_id}, steps={len(step_specs)}, data={len(data_specs)}") + # except Exception: + # tmp_log.error(f"got error ; {traceback.format_exc()}") # ---- Data status transitions ----------------------------- From 84693b746c6719d5433a882a62322c4629a3f5a9 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 23 Sep 2025 14:24:24 +0200 Subject: [PATCH 013/101] add workflow api --- pandaserver/api/v1/workflow_api.py | 150 ++++++++++++++++++++++++++ pandaserver/server/panda.py | 4 + pandaserver/workflow/workflow_core.py | 32 +++--- 3 files changed, 170 insertions(+), 16 deletions(-) create mode 100644 pandaserver/api/v1/workflow_api.py diff --git a/pandaserver/api/v1/workflow_api.py b/pandaserver/api/v1/workflow_api.py new file mode 100644 index 000000000..56d8736b7 --- /dev/null +++ b/pandaserver/api/v1/workflow_api.py @@ -0,0 +1,150 @@ +import datetime +from concurrent.futures import ThreadPoolExecutor +from threading import Lock +from typing import Any, Dict, List + +from 
pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger +from pandacommon.pandautils.PandaUtils import naive_utcnow + +from pandaserver.api.v1.common import ( + MESSAGE_DATABASE, + TIME_OUT, + TimedMethod, + generate_response, + get_dn, + has_production_role, + request_validation, +) +from pandaserver.srvcore.panda_request import PandaRequest +from pandaserver.taskbuffer.TaskBuffer import TaskBuffer +from pandaserver.workflow.workflow_core import WorkflowInterface + +_logger = PandaLogger().getLogger("api_workflow") + +# These global variables are initialized in the init_task_buffer method +global_task_buffer = None +global_wfif = None + +# These global variables don't depend on DB access and can be initialized here +# global_proxy_cache = panda_proxy_cache.MyProxyInterface() +# global_token_cache = token_cache.TokenCache() + + +def init_task_buffer(task_buffer: TaskBuffer) -> None: + """ + Initialize the task buffer and other interfaces. This method needs to be called before any other method in this module. + """ + global global_task_buffer + global_task_buffer = task_buffer + + global global_wfif + global_wfif = WorkflowInterface(global_task_buffer) + + +@request_validation(_logger, secure=True, production=True, request_method="POST") +def submit_workflow(req: PandaRequest, workflow_definition: dict) -> dict: + """ + Submit a PanDA native workflow. 
+ + API details: + HTTP Method: POST + Path: /v1/workflow/submit_workflow + + Args: + req(PandaRequest): internally generated request object containing the env variables + workflow_definition (dict): dictionary of workflow definition + + Returns: + dict: dictionary `{'success': True/False, 'message': 'Description of error', 'data': }` + """ + + username = get_dn(req) + prodsourcelabel = "user" + if has_production_role(req): + prodsourcelabel = "managed" + workflow_name = workflow_definition.get("workflow_name", None) + + tmp_logger = LogWrapper(_logger, f'submit_workflow prodsourcelabel={prodsourcelabel} username="{username}" workflow_name={workflow_name}') + tmp_logger.debug("Start") + success, message, data = False, "", None + time_start = naive_utcnow() + + workflow_id = global_wfif.register_workflow(prodsourcelabel, username, workflow_name, workflow_definition) + + if workflow_id is not None: + success = True + data = {"workflow_id": workflow_id} + else: + message = "Failed to submit workflow request" + + time_delta = naive_utcnow() - time_start + tmp_logger.debug(f"Done. Took {time_delta.seconds}.{time_delta.microseconds // 1000:03d} sec") + + return generate_response(success, message, data) + + +# def put_workflow_request(panda_request: PandaRequest, data: str, check: bool = False, sync: bool = False) -> str: +# """ +# Upload workflow request to the server. +# Args: +# panda_request (PandaRequest): PanDA request object. +# data (string): workflow request data. +# check (bool): check flag. +# sync (bool): synchronous processing. 
+# Returns: +# string: String in json format with (boolean, message) +# """ + +# if not Protocol.isSecure(panda_request): +# return json.dumps((False, ERROR_NOT_SECURE)) + +# user_name = panda_request.subprocess_env["SSL_CLIENT_S_DN"] +# creation_time = naive_utcnow().strftime("%Y-%m-%d %H:%M:%S") + +# tmp_log = LogWrapper(_logger, "put_workflow_request") + +# tmp_log.debug(f"start user={user_name} check={check}") + +# if check in ("True", True): +# check = True +# elif sync in ("True", True): +# sync = True + +# try: +# # generate the filename +# file_name = f"{panda_config.cache_dir}/workflow.{str(uuid.uuid4())}" +# tmp_log.debug(f"file={file_name}") + +# # write +# with open(file_name, "w") as file_object: +# data_dict = { +# "userName": user_name, +# "creationTime": creation_time, +# "data": json.loads(data), +# } +# json.dump(data_dict, file_object) + +# if sync or check: +# from pandaserver.taskbuffer.workflow_processor import WorkflowProcessor + +# processor = WorkflowProcessor(log_stream=_logger) +# if check: +# ret = processor.process(file_name, True, True, True, True) +# else: +# ret = processor.process(file_name, True, False, True, False) +# if os.path.exists(file_name): +# try: +# os.remove(file_name) +# except Exception: +# pass +# tmp_log.debug("done") +# return json.dumps((True, ret)) + +# except Exception as exc: +# error_message = f"cannot put request due to {str(exc)} " +# tmp_log.error(error_message + traceback.format_exc()) +# return json.dumps((False, error_message)) + +# tmp_log.debug("done") +# return json.dumps((True, "request was accepted and will be processed in a few minutes")) diff --git a/pandaserver/server/panda.py b/pandaserver/server/panda.py index b6f922d33..8a710adcd 100755 --- a/pandaserver/server/panda.py +++ b/pandaserver/server/panda.py @@ -37,6 +37,7 @@ from pandaserver.api.v1 import statistics_api as statistics_api_v1 from pandaserver.api.v1 import system_api as system_api_v1 from pandaserver.api.v1 import task_api as 
task_api_v1 +from pandaserver.api.v1 import workflow_api as workflow_api_v1 from pandaserver.api.v1.common import extract_allowed_methods from pandaserver.config import panda_config @@ -170,6 +171,7 @@ statistics_api_v1_methods = extract_allowed_methods(statistics_api_v1) system_api_v1_methods = extract_allowed_methods(system_api_v1) task_api_v1_methods = extract_allowed_methods(task_api_v1) +workflow_api_v1_methods = extract_allowed_methods(workflow_api_v1) # initialize oracledb using dummy connection initializer.init() @@ -198,6 +200,7 @@ statistics_api_v1.init_task_buffer(taskBuffer) # System API does not need to be initialized. system_api_v1.init_task_buffer(taskBuffer) task_api_v1.init_task_buffer(taskBuffer) + workflow_api_v1.init_task_buffer(taskBuffer) # initialize JobDispatcher jobDispatcher.init(taskBuffer) @@ -367,6 +370,7 @@ def module_mapping(version, api_module): "statistics": {"module": statistics_api_v1, "allowed_methods": statistics_api_v1_methods}, "system": {"module": system_api_v1, "allowed_methods": system_api_v1_methods}, "task": {"module": task_api_v1, "allowed_methods": task_api_v1_methods}, + "workflow": {"module": workflow_api_v1, "allowed_methods": workflow_api_v1_methods}, }, } try: diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 88c386b0a..594380f48 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -155,7 +155,7 @@ def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): # Add methods for workflow management here - def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: str, workflow_definition_json: str, *args, **kwargs) -> int | None: + def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: str, workflow_definition: dict, *args, **kwargs) -> int | None: """ Register a new workflow @@ -163,7 +163,7 @@ def register_workflow(self, prodsourcelabel: str, username: str, 
workflow_name: prodsourcelabel (str): Production source label for the workflow username (str): Username of the person registering the workflow name (str): Name of the workflow - workflow_definition_json (str): JSON string defining the workflow + workflow_definition (dict): Dictionary of workflow definition *args: Additional arguments **kwargs: Additional keyword arguments @@ -174,20 +174,20 @@ def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: tmp_log.debug("start") # Implementation of workflow registration logic ... - # workflow_spec = WorkflowSpec() - # workflow_spec.prodsourcelabel = prodsourcelabel - # workflow_spec.name = workflow_name - # workflow_spec.username = username - # workflow_spec.definition_json = workflow_definition_json - # workflow_spec.creation_time = naive_utcnow() - # workflow_spec.status = "registered" - # # Insert to DB - # ret_workflow_id = self.tbif.insert_workflow(workflow_spec) - # if ret_workflow_id is None: - # tmp_log.error(f"Failed to register workflow") - # return None - # tmp_log.info(f"Registered workflow workflow_id={ret_workflow_id}") - # return ret_workflow_id + workflow_spec = WorkflowSpec() + workflow_spec.prodsourcelabel = prodsourcelabel + workflow_spec.name = workflow_name + workflow_spec.username = username + workflow_spec.definition_json = json.dumps(workflow_definition, default=json_serialize_default) + workflow_spec.creation_time = naive_utcnow() + workflow_spec.status = "registered" + # Insert to DB + ret_workflow_id = self.tbif.insert_workflow(workflow_spec) + if ret_workflow_id is None: + tmp_log.error(f"Failed to register workflow") + return None + tmp_log.info(f"Registered workflow workflow_id={ret_workflow_id}") + return ret_workflow_id # ---- Workflow status transitions ------------------------- From b958b5395fb04eb1f80ce40f26f7b53ee75d88e3 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 29 Sep 2025 17:09:19 +0200 Subject: [PATCH 014/101] workflows4: allow to submit raw request 
--- pandaserver/api/v1/workflow_api.py | 43 +++++++++++++++- pandaserver/workflow/workflow_base.py | 72 +++++++++++++++++++++++++++ pandaserver/workflow/workflow_core.py | 18 +++++-- 3 files changed, 128 insertions(+), 5 deletions(-) diff --git a/pandaserver/api/v1/workflow_api.py b/pandaserver/api/v1/workflow_api.py index 56d8736b7..a77358abf 100644 --- a/pandaserver/api/v1/workflow_api.py +++ b/pandaserver/api/v1/workflow_api.py @@ -42,6 +42,47 @@ def init_task_buffer(task_buffer: TaskBuffer) -> None: global_wfif = WorkflowInterface(global_task_buffer) +@request_validation(_logger, secure=True, production=True, request_method="POST") +def submit_workflow_raw_request(req: PandaRequest, params: dict) -> dict: + """ + Submit raw request of PanDA native workflow. + + API details: + HTTP Method: POST + Path: /v1/workflow/submit_workflow_raw_request + + Args: + req(PandaRequest): internally generated request object containing the env variables + params (dict): dictionary of parameters of the raw request + + Returns: + dict: dictionary `{'success': True/False, 'message': 'Description of error', 'data': }` + """ + + username = get_dn(req) + prodsourcelabel = "user" + if has_production_role(req): + prodsourcelabel = "managed" + + tmp_logger = LogWrapper(_logger, f'submit_workflow_raw_request prodsourcelabel={prodsourcelabel} username="{username}" ') + tmp_logger.debug("Start") + success, message, data = False, "", None + time_start = naive_utcnow() + + workflow_id = global_wfif.register_workflow(prodsourcelabel, username, raw_request_params=params) + + if workflow_id is not None: + success = True + data = {"workflow_id": workflow_id} + else: + message = "Failed to submit raw workflow request" + + time_delta = naive_utcnow() - time_start + tmp_logger.debug(f"Done. 
Took {time_delta.seconds}.{time_delta.microseconds // 1000:03d} sec") + + return generate_response(success, message, data) + + @request_validation(_logger, secure=True, production=True, request_method="POST") def submit_workflow(req: PandaRequest, workflow_definition: dict) -> dict: """ @@ -76,7 +117,7 @@ def submit_workflow(req: PandaRequest, workflow_definition: dict) -> dict: success = True data = {"workflow_id": workflow_id} else: - message = "Failed to submit workflow request" + message = "Failed to submit workflow" time_delta = naive_utcnow() - time_start tmp_logger.debug(f"Done. Took {time_delta.seconds}.{time_delta.microseconds // 1000:03d} sec") diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 1da9419b7..e3a51d2e3 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -25,6 +25,8 @@ class WorkflowStatus(object): """ registered = "registered" + parsing = "parsing" + parsed = "parsed" checking = "checking" checked = "checked" starting = "starting" @@ -179,6 +181,7 @@ class WorkflowSpec(WorkflowBaseSpec): AttributeWithType("check_time", datetime), AttributeWithType("locked_by", str), AttributeWithType("lock_time", datetime), + AttributeWithType("raw_request_json", str), AttributeWithType("definition_json", str), AttributeWithType("parameters", str), ) @@ -191,6 +194,52 @@ class WorkflowSpec(WorkflowBaseSpec): # mapping between sequence and attr _seqAttrMap = {"workflow_id": f"{panda_config.schemaJEDI}.WORKFLOW_ID_SEQ.nextval"} + @property + def raw_request_json_map(self) -> dict: + """ + Get the dictionary parsed by raw_request_json attribute in JSON + + Returns: + dict : dict of raw_request_json if it is JSON or empty dict if null + """ + if self.raw_request_json is None: + return {} + else: + return json.loads(self.raw_request_json) + + @raw_request_json_map.setter + def raw_request_json_map(self, value_map: dict): + """ + Set the dictionary and store in 
raw_request_json attribute in JSON + + Args: + value_map (dict): dict to set the raw_request_json map + """ + self.raw_request_json = json.dumps(value_map) + + @property + def definition_json_map(self) -> dict: + """ + Get the dictionary parsed by definition_json attribute in JSON + + Returns: + dict : dict of definition_json if it is JSON or empty dict if null + """ + if self.definition_json is None: + return {} + else: + return json.loads(self.definition_json) + + @definition_json_map.setter + def definition_json_map(self, value_map: dict): + """ + Set the dictionary and store in definition_json attribute in JSON + + Args: + value_map (dict): dict to set the definition_json map + """ + self.definition_json = json.dumps(value_map) + class WFStepSpec(WorkflowBaseSpec): """ @@ -226,6 +275,29 @@ class WFStepSpec(WorkflowBaseSpec): # mapping between sequence and attr _seqAttrMap = {"step_id": f"{panda_config.schemaJEDI}.WORKFLOW_STEP_ID_SEQ.nextval"} + @property + def definition_json_map(self) -> dict: + """ + Get the dictionary parsed by definition_json attribute in JSON + + Returns: + dict : dict of definition_json if it is JSON or empty dict if null + """ + if self.definition_json is None: + return {} + else: + return json.loads(self.definition_json) + + @definition_json_map.setter + def definition_json_map(self, value_map: dict): + """ + Set the dictionary and store in definition_json attribute in JSON + + Args: + value_map (dict): dict to set the definition_json map + """ + self.definition_json = json.dumps(value_map) + class WFDataSpec(WorkflowBaseSpec): """ diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 594380f48..14894cf36 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -155,15 +155,18 @@ def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): # Add methods for workflow management here - def register_workflow(self, prodsourcelabel: str, 
username: str, workflow_name: str, workflow_definition: dict, *args, **kwargs) -> int | None: + def register_workflow( + self, prodsourcelabel: str, username: str, workflow_name: str = None, workflow_definition: dict = None, raw_request_params: dict = None, *args, **kwargs + ) -> int | None: """ Register a new workflow Args: prodsourcelabel (str): Production source label for the workflow username (str): Username of the person registering the workflow - name (str): Name of the workflow + workflow_name (str): Name of the workflow workflow_definition (dict): Dictionary of workflow definition + raw_request_params (dict): Dictionary of parameters of the raw request *args: Additional arguments **kwargs: Additional keyword arguments @@ -176,9 +179,16 @@ def register_workflow(self, prodsourcelabel: str, username: str, workflow_name: ... workflow_spec = WorkflowSpec() workflow_spec.prodsourcelabel = prodsourcelabel - workflow_spec.name = workflow_name workflow_spec.username = username - workflow_spec.definition_json = json.dumps(workflow_definition, default=json_serialize_default) + if workflow_name is not None: + workflow_spec.name = workflow_name + if workflow_definition is not None: + workflow_spec.definition_json = json.dumps(workflow_definition, default=json_serialize_default) + elif raw_request_params is not None: + workflow_spec.raw_request_json = json.dumps(raw_request_params, default=json_serialize_default) + else: + tmp_log.error(f"Either workflow_definition or raw_request_params must be provided") + return None workflow_spec.creation_time = naive_utcnow() workflow_spec.status = "registered" # Insert to DB From deefcc3e6366c9a485e5fca9d5bc040056c79442 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 1 Oct 2025 15:44:30 +0200 Subject: [PATCH 015/101] workflows4: parse workflow --- pandaserver/api/v1/workflow_api.py | 13 +- pandaserver/workflow/workflow_base.py | 1 - pandaserver/workflow/workflow_core.py | 73 ++++++--- pandaserver/workflow/workflow_parser.py | 180 
+++++++++++++++++++++ pandaserver/workflow/workflow_utils.py | 202 ------------------------ 5 files changed, 245 insertions(+), 224 deletions(-) create mode 100644 pandaserver/workflow/workflow_parser.py diff --git a/pandaserver/api/v1/workflow_api.py b/pandaserver/api/v1/workflow_api.py index a77358abf..7f0f90865 100644 --- a/pandaserver/api/v1/workflow_api.py +++ b/pandaserver/api/v1/workflow_api.py @@ -1,4 +1,5 @@ import datetime +import json from concurrent.futures import ThreadPoolExecutor from threading import Lock from typing import Any, Dict, List @@ -43,7 +44,7 @@ def init_task_buffer(task_buffer: TaskBuffer) -> None: @request_validation(_logger, secure=True, production=True, request_method="POST") -def submit_workflow_raw_request(req: PandaRequest, params: dict) -> dict: +def submit_workflow_raw_request(req: PandaRequest, params: dict | str) -> dict: """ Submit raw request of PanDA native workflow. @@ -53,7 +54,7 @@ def submit_workflow_raw_request(req: PandaRequest, params: dict) -> dict: Args: req(PandaRequest): internally generated request object containing the env variables - params (dict): dictionary of parameters of the raw request + params (dict|str): dictionary or JSON of parameters of the raw request Returns: dict: dictionary `{'success': True/False, 'message': 'Description of error', 'data': }` @@ -69,6 +70,14 @@ def submit_workflow_raw_request(req: PandaRequest, params: dict) -> dict: success, message, data = False, "", None time_start = naive_utcnow() + if isinstance(params, str): + try: + params = json.loads(params) + except Exception as exc: + message = f"Failed to parse params: {params} {str(exc)}" + tmp_logger.error(message) + return generate_response(success, message, data) + workflow_id = global_wfif.register_workflow(prodsourcelabel, username, raw_request_params=params) if workflow_id is not None: diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index e3a51d2e3..86802ae43 100644 --- 
a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -25,7 +25,6 @@ class WorkflowStatus(object): """ registered = "registered" - parsing = "parsing" parsed = "parsed" checking = "checking" checked = "checked" diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 14894cf36..3bad0893a 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -18,6 +18,7 @@ from pandaserver.config import panda_config from pandaserver.dataservice.ddm import rucioAPI +from pandaserver.srvcore.CoreUtils import clean_user_id from pandaserver.workflow.workflow_base import ( WFDataSpec, WFDataStatus, @@ -28,6 +29,10 @@ WorkflowSpec, WorkflowStatus, ) +from pandaserver.workflow.workflow_parser import ( + json_serialize_default, + parse_raw_request, +) # import polars as pl # isort:skip @@ -41,23 +46,6 @@ # =============================================================== - -def json_serialize_default(obj): - """ - Default JSON serializer for non-serializable objects - - Args: - obj (Any): Object to serialize - - Returns: - Any: JSON serializable object - """ - # convert set to list - if isinstance(obj, set): - return list(obj) - return obj - - # =============================================================== @@ -179,7 +167,7 @@ def register_workflow( ... 
workflow_spec = WorkflowSpec() workflow_spec.prodsourcelabel = prodsourcelabel - workflow_spec.username = username + workflow_spec.username = clean_user_id(username) if workflow_name is not None: workflow_spec.name = workflow_name if workflow_definition is not None: @@ -204,7 +192,54 @@ def register_workflow( def process_workflow_registered(self, workflow_spec: WorkflowSpec): """ Process a workflow in registered status - Parse the workflow definition, register steps, and update its status + To parse to get workflow definition from raw request + + Args: + workflow_spec (WorkflowSpec): The workflow specification to process + """ + tmp_log = LogWrapper(logger, f"process_workflow_registered workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("start") + try: + if workflow_spec.definition_json is not None: + # Already has definition, skip parsing + tmp_log.debug(f"Workflow already has definition, skipping parsing") + else: + # Parse the workflow definition from raw request + raw_request_dict = workflow_spec.raw_request_json_map() + sandbox_url = os.path.join(raw_request_dict["sourceURL"], "cache", raw_request_dict["sandbox"]) + log_token = f'< user="{workflow_spec.username}" outDS={raw_request_dict["outDS"]}>' + is_ok, is_fatal, workflow_definition_dict = parse_raw_request( + sandbox_url=raw_request_dict.get("sandbox_url"), + log_token=raw_request_dict.get("log_token"), + user_name=workflow_spec.username, + ops_file=raw_request_dict.get("ops_file"), + ) + # Failure handling + if is_fatal: + tmp_log.error(f"Fatal error in parsing raw request; cancelled the workflow") + workflow_spec.status = WorkflowStatus.cancelled + workflow_spec.set_parameter("cancel_reason", "Fatal error in parsing raw request") + self.tbif.update_workflow(workflow_spec) + return + if not is_ok: + tmp_log.warning(f"Failed to parse raw request; skipped") + return + # Parsed successfully, update definition + workflow_spec.definition_json = json.dumps(workflow_definition_dict, 
default=json_serialize_default) + tmp_log.debug(f"Parsed raw request into definition") + # Update status to parsed + workflow_spec.status = WorkflowStatus.parsed + # Update DB + self.tbif.update_workflow(workflow_spec) + tmp_log.info(f"Done, status={workflow_spec.status}") + except Exception: + tmp_log.error(f"Got error ; {traceback.format_exc()}") + + def process_workflow_parsed(self, workflow_spec: WorkflowSpec): + """ + Process a workflow in parsed status + Register steps, and update its status + Parse raw request into workflow definition, register steps, and update its status Args: workflow_spec (WorkflowSpec): The workflow specification to process diff --git a/pandaserver/workflow/workflow_parser.py b/pandaserver/workflow/workflow_parser.py new file mode 100644 index 000000000..f5a2cc13c --- /dev/null +++ b/pandaserver/workflow/workflow_parser.py @@ -0,0 +1,180 @@ +import copy +import json +import os +import re +import shlex +import sys +import tempfile +import traceback + +import requests +from idds.atlas.workflowv2.atlaslocalpandawork import ATLASLocalPandaWork +from idds.atlas.workflowv2.atlaspandawork import ATLASPandaWork +from idds.workflowv2.workflow import AndCondition, Condition, OrCondition, Workflow +from pandaclient import PhpoScript, PrunScript +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger +from ruamel.yaml import YAML + +from pandaserver.srvcore.CoreUtils import clean_user_id, commands_get_status_output +from pandaserver.workflow import pcwl_utils, workflow_utils +from pandaserver.workflow.snakeparser import Parser + +# supported workflow description languages +SUPPORTED_WORKFLOW_LANGUAGES = ["cwl", "snakemake"] + +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + + +# ============================================================================== +# Native PanDA workflow functions +# 
============================================================================== + + +def json_serialize_default(obj): + """ + Default JSON serializer for non-serializable objects of Node object + + Args: + obj (Any): Object to serialize + + Returns: + Any: JSON serializable object + """ + # convert set to list + if isinstance(obj, set): + return list(obj) + elif isinstance(obj, workflow_utils.Node): + return obj.id + return obj + + +def parse_raw_request(sandbox_url, log_token, user_name, ops_file=None) -> tuple[bool, bool, dict]: + """ + Parse raw request with files in sandbox into workflow definition + + Args: + sandbox_url (str): URL to download sandbox + log_token (str): Log token + user_name (str): User name + ops_file (str | None): File containing operations + + Returns: + bool: Whether the parsing is successful + bool: Whether the failure is fatal + dict: Workflow definition dictionary + """ + tmp_log = LogWrapper(logger, log_token) + is_ok = True + is_fatal = False + # request_id = None + workflow_definition_dict = dict() + try: + if ops_file is not None: + # read ops file + with open(ops_file) as f: + ops = json.load(f) + try: + os.remove(ops_file) + except Exception: + pass + # go to temp dir + cur_dir = os.getcwd() + with tempfile.TemporaryDirectory() as tmp_dirname: + os.chdir(tmp_dirname) + # download sandbox + tmp_log.info(f"downloading sandbox from {sandbox_url}") + with requests.get(sandbox_url, allow_redirects=True, verify=False, stream=True) as r: + if r.status_code == 400: + tmp_log.error("not found") + is_fatal = True + is_ok = False + elif r.status_code != 200: + tmp_log.error(f"bad HTTP response {r.status_code}") + is_ok = False + # extract sandbox + if is_ok: + with open(ops["data"]["sandbox"], "wb") as fs: + for chunk in r.raw.stream(1024, decode_content=False): + if chunk: + fs.write(chunk) + fs.close() + tmp_stat, tmp_out = commands_get_status_output(f"tar xvfz {ops['data']['sandbox']}") + if tmp_stat != 0: + tmp_log.error(tmp_out) + 
dump_str = f"failed to extract {ops['data']['sandbox']}" + tmp_log.error(dump_str) + is_fatal = True + is_ok = False + # parse workflow files + if is_ok: + tmp_log.info("parse workflow") + workflow_name = None + if (wf_lang := ops["data"]["language"]) in SUPPORTED_WORKFLOW_LANGUAGES: + if wf_lang == "cwl": + workflow_name = ops["data"].get("workflow_name") + nodes, root_in = pcwl_utils.parse_workflow_file(ops["data"]["workflowSpecFile"], tmp_log) + with open(ops["data"]["workflowInputFile"]) as workflow_input: + yaml = YAML(typ="safe", pure=True) + data = yaml.load(workflow_input) + elif wf_lang == "snakemake": + parser = Parser(ops["data"]["workflowSpecFile"], logger=tmp_log) + nodes, root_in = parser.parse_nodes() + data = dict() + # resolve nodes + s_id, t_nodes, nodes = workflow_utils.resolve_nodes(nodes, root_in, data, 0, set(), ops["data"]["outDS"], tmp_log) + workflow_utils.set_workflow_outputs(nodes) + id_node_map = workflow_utils.get_node_id_map(nodes) + [node.resolve_params(ops["data"]["taskParams"], id_node_map) for node in nodes] + dump_str = "the description was internally converted as follows\n" + workflow_utils.dump_nodes(nodes) + tmp_log.info(dump_str) + for node in nodes: + s_check, o_check = node.verify() + tmp_str = f"Verification failure in ID:{node.id} {o_check}" + if not s_check: + tmp_log.error(tmp_str) + dump_str += tmp_str + dump_str += "\n" + is_fatal = True + is_ok = False + else: + dump_str = "{} is not supported to describe the workflow" + tmp_log.error(dump_str) + is_fatal = True + is_ok = False + # genertate workflow definition + if is_ok: + # root inputs + root_inputs_dict = dict() + for k in root_in: + kk = k.split("#")[-1] + if kk in data: + root_inputs_dict[k] = data[kk] + # root outputs + root_outputs_dict = dict() + nodes_list = [] + # nodes + for node in nodes: + nodes_list.append(vars(node)) + if node.is_tail: + root_outputs_dict.update(node.outputs) + # workflow definition + workflow_definition_dict = { + "workflow_name": 
workflow_name, + "user_name": user_name, + "root_inputs": root_inputs_dict, + "root_outputs": root_outputs_dict, + "nodes": nodes_list, + } + os.chdir(cur_dir) + except Exception as e: + is_ok = False + is_fatal = True + tmp_log.error(f"failed to run with {str(e)} {traceback.format_exc()}") + + # with tempfile.NamedTemporaryFile(delete=False, mode="w") as tmp_json: + # json.dump([is_ok, is_fatal, request_id, tmp_log.dumpToString()], tmp_json) + # print(tmp_json.name) + + return is_ok, is_fatal, workflow_definition_dict diff --git a/pandaserver/workflow/workflow_utils.py b/pandaserver/workflow/workflow_utils.py index 7a1600a63..dce483be5 100644 --- a/pandaserver/workflow/workflow_utils.py +++ b/pandaserver/workflow/workflow_utils.py @@ -1025,205 +1025,3 @@ def convert_nodes_to_workflow(nodes, workflow_node=None, workflow=None, workflow if not is_top: return id_work_map, dump_str_list return workflow, dump_str_list - - -# register panda workflow -def register_panda_workflow(nodes, workflow_node=None, workflow=None, workflow_name=None): - """ - Register nodes as PanDA workflow - """ - # if workflow is None: - # is_top = True - # workflow = Workflow() - # workflow.name = workflow_name - # else: - # is_top = False - # id_work_map = {} - # all_sub_id_work_map = {} - # sub_to_id_map = {} - # cond_dump_str = " Conditions\n" - # class_dump_str = f"===== Workflow ID:{workflow_node.id if workflow_node else workflow_name} ====\n" - # class_dump_str += " Works\n" - # dump_str_list = [] - # # create works or workflows - # for node in nodes: - # if node.is_leaf: - # # work - # if node.type == "junction": - # work = ATLASLocalPandaWork(task_parameters=node.task_params) - # work.add_custom_condition("to_exit", True) - # else: - # work = ATLASPandaWork(task_parameters=node.task_params) - # workflow.add_work(work) - # id_work_map[node.id] = work - # class_dump_str += f" {node.short_desc()} Class:{work.__class__.__name__}\n" - # else: - # # sub workflow - # sub_workflow = Workflow() - 
# id_work_map[node.id] = sub_workflow - # class_dump_str += f" {node.short_desc()} Class:{sub_workflow.__class__.__name__}\n" - # sub_id_work_map, tmp_dump_str_list = convert_nodes_to_workflow(node.sub_nodes, node, sub_workflow) - # dump_str_list += tmp_dump_str_list - # for sub_id in node.get_all_sub_node_ids(): - # all_sub_id_work_map[sub_id] = sub_workflow - # sub_to_id_map[sub_id] = node.id - # # add loop condition - # if node.loop: - # for sub_node in node.sub_nodes: - # if sub_node.type == "junction": - # # use to_continue for loop termination - # j_work = sub_id_work_map[sub_node.id] - # j_work.add_custom_condition(key="to_continue", value=True) - # cond = Condition(cond=j_work.get_custom_condition_status) - # sub_workflow.add_loop_condition(cond) - # cond_dump_str += f" Loop in ID:{node.id} with terminator ID:{sub_node.id}\n" - # break - # workflow.add_work(sub_workflow) - # # add conditions - # for node in nodes: - # if node.parents: - # c_work = id_work_map[node.id] - # if not node.condition: - # # default conditions if unspecified - # cond_func_list = [] - # for p_id in node.parents: - # if p_id in id_work_map: - # p_work = id_work_map[p_id] - # str_p_id = p_id - # elif p_id in all_sub_id_work_map: - # p_work = all_sub_id_work_map[p_id] - # str_p_id = sub_to_id_map[p_id] - # else: - # # head node - # continue - # if len(node.parents) > 1 or isinstance(p_work, Workflow) or node.type in ["junction", "reana", "gitlab"]: - # cond_function = p_work.is_processed - # else: - # cond_function = p_work.is_started - # if cond_function not in cond_func_list: - # cond_func_list.append(cond_function) - # cond_dump_str += f" Default Link ID:{str_p_id} {cond_function.__name__} -> ID:{node.id}\n" - # cond = AndCondition(true_works=[c_work], conditions=cond_func_list) - # workflow.add_condition(cond) - # else: - # # convert conditions - # cond_list = node.condition.get_dict_form() - # base_cond_map = {} - # str_cond_map = {} - # root_condition = None - # for tmp_idx, 
base_cond in cond_list: - # # leaf condition - # if base_cond["right"] is None: - # # condition based on works - # cond_func_list = [] - # str_func_list = [] - # for p_id in base_cond["left"]: - # if p_id in id_work_map: - # p_work = id_work_map[p_id] - # str_p_id = p_id - # else: - # p_work = all_sub_id_work_map[p_id] - # str_p_id = sub_to_id_map[p_id] - # # finished or failed - # if base_cond["operator"] is None: - # cond_function = p_work.is_processed - # else: - # cond_function = p_work.is_failed - # cond_func_list.append(cond_function) - # str_func_list.append(f"ID:{str_p_id} {cond_function.__name__}") - # cond = AndCondition(conditions=cond_func_list) - # base_cond_map[tmp_idx] = cond - # str_func = "AND ".join(str_func_list) - # str_cond_map[tmp_idx] = str_func - # cond_dump_str += f" Unary Ops {cond.__class__.__name__}({str_func}) -> ID:{node.id}\n" - # root_condition = cond - # else: - # # composite condition - # l_str_func_list = [] - # r_str_func_list = [] - # if isinstance(base_cond["left"], set): - # cond_func_list = [] - # for p_id in base_cond["left"]: - # if p_id in id_work_map: - # p_work = id_work_map[p_id] - # str_p_id = p_id - # else: - # p_work = all_sub_id_work_map[p_id] - # str_p_id = sub_to_id_map[p_id] - # cond_function = p_work.is_processed - # cond_func_list.append(cond_function) - # l_str_func_list.append(f"ID:{str_p_id} {cond_function.__name__}") - # l_cond = AndCondition(conditions=cond_func_list) - # l_str_func = "AND ".join(l_str_func_list) - # str_cond_map[base_cond["left"]] = l_str_func - # else: - # l_cond = base_cond_map[base_cond["left"]] - # l_str_func = str_cond_map[base_cond["left"]] - # if isinstance(base_cond["right"], set): - # cond_func_list = [] - # for p_id in base_cond["right"]: - # if p_id in id_work_map: - # p_work = id_work_map[p_id] - # str_p_id = p_id - # else: - # p_work = all_sub_id_work_map[p_id] - # str_p_id = sub_to_id_map[p_id] - # cond_function = p_work.is_processed - # cond_func_list.append(cond_function) 
- # r_str_func_list.append(f"ID:{str_p_id} {cond_function.__name__}") - # r_cond = AndCondition(conditions=cond_func_list) - # r_str_func = "AND ".join(r_str_func_list) - # str_cond_map[base_cond["right"]] = r_str_func - # else: - # r_cond = base_cond_map[base_cond["right"]] - # r_str_func = str_cond_map[base_cond["right"]] - # if base_cond["operator"] == "and": - # cond = AndCondition( - # conditions=[ - # l_cond.is_condition_true, - # r_cond.is_condition_true, - # ] - # ) - # else: - # cond = OrCondition( - # conditions=[ - # l_cond.is_condition_true, - # r_cond.is_condition_true, - # ] - # ) - # base_cond_map[tmp_idx] = cond - # cond_dump_str += f" Binary Ops {cond.__class__.__name__}({l_str_func}, {r_str_func}) for ID:{node.id}\n" - # root_condition = cond - # # set root condition - # if root_condition: - # root_condition.true_works = [c_work] - # workflow.add_condition(root_condition) - # # global parameters - # if workflow_node: - # tmp_global, tmp_workflow_global = workflow_node.get_global_parameters() - # if tmp_global: - # loop_locals = {} - # loop_slices = [] - # for k, v in tmp_global.items(): - # if not isinstance(v, dict): - # # normal looping locals - # loop_locals["user_" + k] = tmp_global[k] - # else: - # # sliced locals - # v["src"] = "user_" + v["src"] - # loop_slices.append([k, v]) - # if loop_locals: - # workflow.set_global_parameters(loop_locals) - # for k, v in loop_slices: - # workflow.set_sliced_global_parameters(source=v["src"], index=v["idx"], name="user_" + k) - # cond_dump_str += "\n Looping local variables\n" - # cond_dump_str += f" {tmp_global}\n" - # if tmp_workflow_global: - # cond_dump_str += "\n Workflow local variable\n" - # cond_dump_str += f" {tmp_workflow_global}\n" - # # dump strings - # dump_str_list.insert(0, class_dump_str + "\n" + cond_dump_str + "\n\n") - # # return - # if not is_top: - # return id_work_map, dump_str_list - # return workflow, dump_str_list From 4c253af36c531b4d915f285c3f06d6686b5728cf Mon Sep 17 00:00:00 
2001 From: mightqxc Date: Thu, 2 Oct 2025 12:28:38 +0200 Subject: [PATCH 016/101] workflows4: fix --- pandaserver/workflow/workflow_core.py | 8 +++---- pandaserver/workflow/workflow_parser.py | 32 ++++++++++--------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 3bad0893a..fc1e43b87 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -205,14 +205,14 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): tmp_log.debug(f"Workflow already has definition, skipping parsing") else: # Parse the workflow definition from raw request - raw_request_dict = workflow_spec.raw_request_json_map() + raw_request_dict = workflow_spec.raw_request_json_map sandbox_url = os.path.join(raw_request_dict["sourceURL"], "cache", raw_request_dict["sandbox"]) log_token = f'< user="{workflow_spec.username}" outDS={raw_request_dict["outDS"]}>' is_ok, is_fatal, workflow_definition_dict = parse_raw_request( - sandbox_url=raw_request_dict.get("sandbox_url"), - log_token=raw_request_dict.get("log_token"), + sandbox_url=sandbox_url, + log_token=log_token, user_name=workflow_spec.username, - ops_file=raw_request_dict.get("ops_file"), + raw_request_dict=raw_request_dict, ) # Failure handling if is_fatal: diff --git a/pandaserver/workflow/workflow_parser.py b/pandaserver/workflow/workflow_parser.py index f5a2cc13c..8281ed5b9 100644 --- a/pandaserver/workflow/workflow_parser.py +++ b/pandaserver/workflow/workflow_parser.py @@ -50,7 +50,7 @@ def json_serialize_default(obj): return obj -def parse_raw_request(sandbox_url, log_token, user_name, ops_file=None) -> tuple[bool, bool, dict]: +def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tuple[bool, bool, dict]: """ Parse raw request with files in sandbox into workflow definition @@ -58,7 +58,7 @@ def parse_raw_request(sandbox_url, log_token, user_name, 
ops_file=None) -> tuple sandbox_url (str): URL to download sandbox log_token (str): Log token user_name (str): User name - ops_file (str | None): File containing operations + raw_request_dict (dict): Raw request dictionary Returns: bool: Whether the parsing is successful @@ -71,14 +71,6 @@ def parse_raw_request(sandbox_url, log_token, user_name, ops_file=None) -> tuple # request_id = None workflow_definition_dict = dict() try: - if ops_file is not None: - # read ops file - with open(ops_file) as f: - ops = json.load(f) - try: - os.remove(ops_file) - except Exception: - pass # go to temp dir cur_dir = os.getcwd() with tempfile.TemporaryDirectory() as tmp_dirname: @@ -95,15 +87,15 @@ def parse_raw_request(sandbox_url, log_token, user_name, ops_file=None) -> tuple is_ok = False # extract sandbox if is_ok: - with open(ops["data"]["sandbox"], "wb") as fs: + with open(raw_request_dict["sandbox"], "wb") as fs: for chunk in r.raw.stream(1024, decode_content=False): if chunk: fs.write(chunk) fs.close() - tmp_stat, tmp_out = commands_get_status_output(f"tar xvfz {ops['data']['sandbox']}") + tmp_stat, tmp_out = commands_get_status_output(f"tar xvfz {raw_request_dict['sandbox']}") if tmp_stat != 0: tmp_log.error(tmp_out) - dump_str = f"failed to extract {ops['data']['sandbox']}" + dump_str = f"failed to extract {raw_request_dict['sandbox']}" tmp_log.error(dump_str) is_fatal = True is_ok = False @@ -111,22 +103,22 @@ def parse_raw_request(sandbox_url, log_token, user_name, ops_file=None) -> tuple if is_ok: tmp_log.info("parse workflow") workflow_name = None - if (wf_lang := ops["data"]["language"]) in SUPPORTED_WORKFLOW_LANGUAGES: + if (wf_lang := raw_request_dict["language"]) in SUPPORTED_WORKFLOW_LANGUAGES: if wf_lang == "cwl": - workflow_name = ops["data"].get("workflow_name") - nodes, root_in = pcwl_utils.parse_workflow_file(ops["data"]["workflowSpecFile"], tmp_log) - with open(ops["data"]["workflowInputFile"]) as workflow_input: + workflow_name = 
raw_request_dict.get("workflow_name") + nodes, root_in = pcwl_utils.parse_workflow_file(raw_request_dict["workflowSpecFile"], tmp_log) + with open(raw_request_dict["workflowInputFile"]) as workflow_input: yaml = YAML(typ="safe", pure=True) data = yaml.load(workflow_input) elif wf_lang == "snakemake": - parser = Parser(ops["data"]["workflowSpecFile"], logger=tmp_log) + parser = Parser(raw_request_dict["workflowSpecFile"], logger=tmp_log) nodes, root_in = parser.parse_nodes() data = dict() # resolve nodes - s_id, t_nodes, nodes = workflow_utils.resolve_nodes(nodes, root_in, data, 0, set(), ops["data"]["outDS"], tmp_log) + s_id, t_nodes, nodes = workflow_utils.resolve_nodes(nodes, root_in, data, 0, set(), raw_request_dict["outDS"], tmp_log) workflow_utils.set_workflow_outputs(nodes) id_node_map = workflow_utils.get_node_id_map(nodes) - [node.resolve_params(ops["data"]["taskParams"], id_node_map) for node in nodes] + [node.resolve_params(raw_request_dict["taskParams"], id_node_map) for node in nodes] dump_str = "the description was internally converted as follows\n" + workflow_utils.dump_nodes(nodes) tmp_log.info(dump_str) for node in nodes: From e013b08aad0c77ae6b1a06c9798d92db58ab97a6 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 7 Oct 2025 13:42:38 +0200 Subject: [PATCH 017/101] workflows4: parsed to starting --- .../workflow/test_workflow_core_functions.py | 623 +++++++++--------- pandaserver/workflow/workflow_core.py | 126 ++-- 2 files changed, 380 insertions(+), 369 deletions(-) diff --git a/pandaserver/workflow/test_workflow_core_functions.py b/pandaserver/workflow/test_workflow_core_functions.py index d0f5523a9..cc62d14ad 100644 --- a/pandaserver/workflow/test_workflow_core_functions.py +++ b/pandaserver/workflow/test_workflow_core_functions.py @@ -17,316 +17,317 @@ username = "testuser" workflow_name = "test_workflow_bg_comb_00" +WFID = 1 # workflow ID to be used in this test # workflow definition json -wfd_json = json.dumps( - json.loads( - """ -{ - 
"root_inputs": { - "sig_bg_comb.cwl#background": "mc16_5TeV.361238.Pythia8EvtGen_A3NNPDF23LO_minbias_inelastic_low.merge.HITS.e6446_s3238_s3250/", - "sig_bg_comb.cwl#signal": "mc16_valid:mc16_valid.900248.PG_singlepion_flatPt2to50.simul.HITS.e8312_s3238_tid26378578_00" - }, - "root_outputs": {"sig_bg_comb.cwl#combine/outDS": {"value": "user.me.my_outDS_005_combine"}}, - "nodes": [ - { - "condition": null, - "data": null, - "id": 1, - "in_loop": false, - "inputs": { - "sig_bg_comb.cwl#make_signal/opt_args": { - "default": "--outputs abc.dat,def.zip --nFilesPerJob 5", - "source": null - }, - "sig_bg_comb.cwl#make_signal/opt_containerImage": { - "default": "docker://busybox", - "source": null - }, - "sig_bg_comb.cwl#make_signal/opt_exec": { - "default": "echo %IN > abc.dat; echo 123 > def.zip", - "source": null - }, - "sig_bg_comb.cwl#make_signal/opt_inDS": { - "default": null, - "source": "sig_bg_comb.cwl#signal" - } - }, - "is_head": false, - "is_leaf": true, - "is_tail": false, - "is_workflow_output": false, - "loop": false, - "name": "make_signal", - "output_types": [], - "outputs": { - "sig_bg_comb.cwl#make_signal/outDS": {} - }, - "parents": [], - "root_inputs": null, - "scatter": null, - "sub_nodes": [], - "task_params": null, - "type": "prun", - "upper_root_inputs": null - }, - { - "condition": null, - "data": null, - "id": 2, - "in_loop": false, - "inputs": { - "sig_bg_comb.cwl#make_background_1/opt_args": { - "default": "--outputs opq.root,xyz.pool --nGBPerJob 10", - "source": null - }, - "sig_bg_comb.cwl#make_background_1/opt_exec": { - "default": "echo %IN > opq.root; echo %IN > xyz.pool", - "source": null - }, - "sig_bg_comb.cwl#make_background_1/opt_inDS": { - "default": null, - "source": "sig_bg_comb.cwl#background" - } - }, - "is_head": false, - "is_leaf": true, - "is_tail": false, - "is_workflow_output": false, - "loop": false, - "name": "make_background_1", - "output_types": [], - "outputs": { - "sig_bg_comb.cwl#make_background_1/outDS": {} - }, - 
"parents": [], - "root_inputs": null, - "scatter": null, - "sub_nodes": [], - "task_params": null, - "type": "prun", - "upper_root_inputs": null - }, - { - "condition": null, - "data": null, - "id": 3, - "in_loop": false, - "inputs": { - "sig_bg_comb.cwl#premix/opt_args": { - "default": "--outputs klm.root --secondaryDSs IN2:2:%{SECDS1}", - "source": null - }, - "sig_bg_comb.cwl#premix/opt_exec": { - "default": "echo %IN %IN2 > klm.root", - "source": null - }, - "sig_bg_comb.cwl#premix/opt_inDS": { - "default": null, - "parent_id": 1, - "source": "sig_bg_comb.cwl#make_signal/outDS" - }, - "sig_bg_comb.cwl#premix/opt_inDsType": { - "default": "def.zip", - "source": null - }, - "sig_bg_comb.cwl#premix/opt_secondaryDSs": { - "default": null, - "parent_id": [ - 2 - ], - "source": [ - "sig_bg_comb.cwl#make_background_1/outDS" - ] - }, - "sig_bg_comb.cwl#premix/opt_secondaryDsTypes": { - "default": [ - "xyz.pool" - ], - "source": null - } - }, - "is_head": false, - "is_leaf": true, - "is_tail": false, - "is_workflow_output": false, - "loop": false, - "name": "premix", - "output_types": [], - "outputs": { - "sig_bg_comb.cwl#premix/outDS": {} - }, - "parents": [ - 1, - 2 - ], - "root_inputs": null, - "scatter": null, - "sub_nodes": [], - "task_params": null, - "type": "prun", - "upper_root_inputs": null - }, - { - "condition": null, - "data": null, - "id": 4, - "in_loop": false, - "inputs": { - "sig_bg_comb.cwl#generate_some/opt_args": { - "default": "--outputs gen.root --nJobs 10", - "source": null - }, - "sig_bg_comb.cwl#generate_some/opt_exec": { - "default": "echo %RNDM:10 > gen.root", - "source": null - } - }, - "is_head": false, - "is_leaf": true, - "is_tail": false, - "is_workflow_output": false, - "loop": false, - "name": "generate_some", - "output_types": [], - "outputs": { - "sig_bg_comb.cwl#generate_some/outDS": {} - }, - "parents": [], - "root_inputs": null, - "scatter": null, - "sub_nodes": [], - "task_params": null, - "type": "prun", - "upper_root_inputs": 
null - }, - { - "condition": null, - "data": null, - "id": 5, - "in_loop": false, - "inputs": { - "sig_bg_comb.cwl#make_background_2/opt_args": { - "default": "--outputs ooo.root,jjj.txt --secondaryDSs IN2:2:%{SECDS1}", - "source": null - }, - "sig_bg_comb.cwl#make_background_2/opt_containerImage": { - "default": "docker://alpine", - "source": null - }, - "sig_bg_comb.cwl#make_background_2/opt_exec": { - "default": "echo %IN > ooo.root; echo %IN2 > jjj.txt", - "source": null - }, - "sig_bg_comb.cwl#make_background_2/opt_inDS": { - "default": null, - "source": "sig_bg_comb.cwl#background" - }, - "sig_bg_comb.cwl#make_background_2/opt_secondaryDSs": { - "default": null, - "parent_id": [ - 4 - ], - "source": [ - "sig_bg_comb.cwl#generate_some/outDS" - ] - }, - "sig_bg_comb.cwl#make_background_2/opt_secondaryDsTypes": { - "default": [ - "gen.root" - ], - "source": null - } - }, - "is_head": false, - "is_leaf": true, - "is_tail": false, - "is_workflow_output": false, - "loop": false, - "name": "make_background_2", - "output_types": [], - "outputs": { - "sig_bg_comb.cwl#make_background_2/outDS": {} - }, - "parents": [ - 4 - ], - "root_inputs": null, - "scatter": null, - "sub_nodes": [], - "task_params": null, - "type": "prun", - "upper_root_inputs": null - }, - { - "condition": null, - "data": null, - "id": 6, - "in_loop": false, - "inputs": { - "sig_bg_comb.cwl#combine/opt_args": { - "default": "--outputs aaa.root --secondaryDSs IN2:2:%{SECDS1},IN3:5:%{SECDS2}", - "source": null - }, - "sig_bg_comb.cwl#combine/opt_exec": { - "default": "echo %IN %IN2 %IN3 > aaa.root", - "source": null - }, - "sig_bg_comb.cwl#combine/opt_inDS": { - "default": null, - "parent_id": 1, - "source": "sig_bg_comb.cwl#make_signal/outDS" - }, - "sig_bg_comb.cwl#combine/opt_inDsType": { - "default": "abc.dat", - "source": null - }, - "sig_bg_comb.cwl#combine/opt_secondaryDSs": { - "default": null, - "parent_id": [ - 3, - 5 - ], - "source": [ - "sig_bg_comb.cwl#premix/outDS", - 
"sig_bg_comb.cwl#make_background_2/outDS" - ] - }, - "sig_bg_comb.cwl#combine/opt_secondaryDsTypes": { - "default": [ - "klm.root", - "ooo.root" - ], - "source": null - } - }, - "is_head": false, - "is_leaf": true, - "is_tail": true, - "is_workflow_output": false, - "loop": false, - "name": "combine", - "output_types": [], - "outputs": { - "sig_bg_comb.cwl#combine/outDS": {} - }, - "parents": [ - 1, - 3, - 5 - ], - "root_inputs": null, - "scatter": null, - "sub_nodes": [], - "task_params": null, - "type": "prun", - "upper_root_inputs": null - } - ] -} -""" - ) -) +# wfd_json = json.dumps( +# json.loads( +# """ +# { +# "root_inputs": { +# "sig_bg_comb.cwl#background": "mc16_5TeV.361238.Pythia8EvtGen_A3NNPDF23LO_minbias_inelastic_low.merge.HITS.e6446_s3238_s3250/", +# "sig_bg_comb.cwl#signal": "mc16_valid:mc16_valid.900248.PG_singlepion_flatPt2to50.simul.HITS.e8312_s3238_tid26378578_00" +# }, +# "root_outputs": {"sig_bg_comb.cwl#combine/outDS": {"value": "user.me.my_outDS_005_combine"}}, +# "nodes": [ +# { +# "condition": null, +# "data": null, +# "id": 1, +# "in_loop": false, +# "inputs": { +# "sig_bg_comb.cwl#make_signal/opt_args": { +# "default": "--outputs abc.dat,def.zip --nFilesPerJob 5", +# "source": null +# }, +# "sig_bg_comb.cwl#make_signal/opt_containerImage": { +# "default": "docker://busybox", +# "source": null +# }, +# "sig_bg_comb.cwl#make_signal/opt_exec": { +# "default": "echo %IN > abc.dat; echo 123 > def.zip", +# "source": null +# }, +# "sig_bg_comb.cwl#make_signal/opt_inDS": { +# "default": null, +# "source": "sig_bg_comb.cwl#signal" +# } +# }, +# "is_head": false, +# "is_leaf": true, +# "is_tail": false, +# "is_workflow_output": false, +# "loop": false, +# "name": "make_signal", +# "output_types": [], +# "outputs": { +# "sig_bg_comb.cwl#make_signal/outDS": {} +# }, +# "parents": [], +# "root_inputs": null, +# "scatter": null, +# "sub_nodes": [], +# "task_params": null, +# "type": "prun", +# "upper_root_inputs": null +# }, +# { +# "condition": 
null, +# "data": null, +# "id": 2, +# "in_loop": false, +# "inputs": { +# "sig_bg_comb.cwl#make_background_1/opt_args": { +# "default": "--outputs opq.root,xyz.pool --nGBPerJob 10", +# "source": null +# }, +# "sig_bg_comb.cwl#make_background_1/opt_exec": { +# "default": "echo %IN > opq.root; echo %IN > xyz.pool", +# "source": null +# }, +# "sig_bg_comb.cwl#make_background_1/opt_inDS": { +# "default": null, +# "source": "sig_bg_comb.cwl#background" +# } +# }, +# "is_head": false, +# "is_leaf": true, +# "is_tail": false, +# "is_workflow_output": false, +# "loop": false, +# "name": "make_background_1", +# "output_types": [], +# "outputs": { +# "sig_bg_comb.cwl#make_background_1/outDS": {} +# }, +# "parents": [], +# "root_inputs": null, +# "scatter": null, +# "sub_nodes": [], +# "task_params": null, +# "type": "prun", +# "upper_root_inputs": null +# }, +# { +# "condition": null, +# "data": null, +# "id": 3, +# "in_loop": false, +# "inputs": { +# "sig_bg_comb.cwl#premix/opt_args": { +# "default": "--outputs klm.root --secondaryDSs IN2:2:%{SECDS1}", +# "source": null +# }, +# "sig_bg_comb.cwl#premix/opt_exec": { +# "default": "echo %IN %IN2 > klm.root", +# "source": null +# }, +# "sig_bg_comb.cwl#premix/opt_inDS": { +# "default": null, +# "parent_id": 1, +# "source": "sig_bg_comb.cwl#make_signal/outDS" +# }, +# "sig_bg_comb.cwl#premix/opt_inDsType": { +# "default": "def.zip", +# "source": null +# }, +# "sig_bg_comb.cwl#premix/opt_secondaryDSs": { +# "default": null, +# "parent_id": [ +# 2 +# ], +# "source": [ +# "sig_bg_comb.cwl#make_background_1/outDS" +# ] +# }, +# "sig_bg_comb.cwl#premix/opt_secondaryDsTypes": { +# "default": [ +# "xyz.pool" +# ], +# "source": null +# } +# }, +# "is_head": false, +# "is_leaf": true, +# "is_tail": false, +# "is_workflow_output": false, +# "loop": false, +# "name": "premix", +# "output_types": [], +# "outputs": { +# "sig_bg_comb.cwl#premix/outDS": {} +# }, +# "parents": [ +# 1, +# 2 +# ], +# "root_inputs": null, +# "scatter": null, +# 
"sub_nodes": [], +# "task_params": null, +# "type": "prun", +# "upper_root_inputs": null +# }, +# { +# "condition": null, +# "data": null, +# "id": 4, +# "in_loop": false, +# "inputs": { +# "sig_bg_comb.cwl#generate_some/opt_args": { +# "default": "--outputs gen.root --nJobs 10", +# "source": null +# }, +# "sig_bg_comb.cwl#generate_some/opt_exec": { +# "default": "echo %RNDM:10 > gen.root", +# "source": null +# } +# }, +# "is_head": false, +# "is_leaf": true, +# "is_tail": false, +# "is_workflow_output": false, +# "loop": false, +# "name": "generate_some", +# "output_types": [], +# "outputs": { +# "sig_bg_comb.cwl#generate_some/outDS": {} +# }, +# "parents": [], +# "root_inputs": null, +# "scatter": null, +# "sub_nodes": [], +# "task_params": null, +# "type": "prun", +# "upper_root_inputs": null +# }, +# { +# "condition": null, +# "data": null, +# "id": 5, +# "in_loop": false, +# "inputs": { +# "sig_bg_comb.cwl#make_background_2/opt_args": { +# "default": "--outputs ooo.root,jjj.txt --secondaryDSs IN2:2:%{SECDS1}", +# "source": null +# }, +# "sig_bg_comb.cwl#make_background_2/opt_containerImage": { +# "default": "docker://alpine", +# "source": null +# }, +# "sig_bg_comb.cwl#make_background_2/opt_exec": { +# "default": "echo %IN > ooo.root; echo %IN2 > jjj.txt", +# "source": null +# }, +# "sig_bg_comb.cwl#make_background_2/opt_inDS": { +# "default": null, +# "source": "sig_bg_comb.cwl#background" +# }, +# "sig_bg_comb.cwl#make_background_2/opt_secondaryDSs": { +# "default": null, +# "parent_id": [ +# 4 +# ], +# "source": [ +# "sig_bg_comb.cwl#generate_some/outDS" +# ] +# }, +# "sig_bg_comb.cwl#make_background_2/opt_secondaryDsTypes": { +# "default": [ +# "gen.root" +# ], +# "source": null +# } +# }, +# "is_head": false, +# "is_leaf": true, +# "is_tail": false, +# "is_workflow_output": false, +# "loop": false, +# "name": "make_background_2", +# "output_types": [], +# "outputs": { +# "sig_bg_comb.cwl#make_background_2/outDS": {} +# }, +# "parents": [ +# 4 +# ], +# 
"root_inputs": null, +# "scatter": null, +# "sub_nodes": [], +# "task_params": null, +# "type": "prun", +# "upper_root_inputs": null +# }, +# { +# "condition": null, +# "data": null, +# "id": 6, +# "in_loop": false, +# "inputs": { +# "sig_bg_comb.cwl#combine/opt_args": { +# "default": "--outputs aaa.root --secondaryDSs IN2:2:%{SECDS1},IN3:5:%{SECDS2}", +# "source": null +# }, +# "sig_bg_comb.cwl#combine/opt_exec": { +# "default": "echo %IN %IN2 %IN3 > aaa.root", +# "source": null +# }, +# "sig_bg_comb.cwl#combine/opt_inDS": { +# "default": null, +# "parent_id": 1, +# "source": "sig_bg_comb.cwl#make_signal/outDS" +# }, +# "sig_bg_comb.cwl#combine/opt_inDsType": { +# "default": "abc.dat", +# "source": null +# }, +# "sig_bg_comb.cwl#combine/opt_secondaryDSs": { +# "default": null, +# "parent_id": [ +# 3, +# 5 +# ], +# "source": [ +# "sig_bg_comb.cwl#premix/outDS", +# "sig_bg_comb.cwl#make_background_2/outDS" +# ] +# }, +# "sig_bg_comb.cwl#combine/opt_secondaryDsTypes": { +# "default": [ +# "klm.root", +# "ooo.root" +# ], +# "source": null +# } +# }, +# "is_head": false, +# "is_leaf": true, +# "is_tail": true, +# "is_workflow_output": false, +# "loop": false, +# "name": "combine", +# "output_types": [], +# "outputs": { +# "sig_bg_comb.cwl#combine/outDS": {} +# }, +# "parents": [ +# 1, +# 3, +# 5 +# ], +# "root_inputs": null, +# "scatter": null, +# "sub_nodes": [], +# "task_params": null, +# "type": "prun", +# "upper_root_inputs": null +# } +# ] +# } +# """ +# ) +# ) # interface for workflow operations @@ -353,8 +354,12 @@ # workflow_definition_json=wfd_json, # ) -wf_spec = taskBuffer.get_workflow(workflow_id=1) # Process the registered workflow +wf_spec = taskBuffer.get_workflow(workflow_id=WFID) print("Processing registered workflow...") wfif.process_workflow_registered(wf_spec) + +wf_spec = taskBuffer.get_workflow(workflow_id=WFID) +print("Processing parsed workflow...") +wfif.process_workflow_parsed(wf_spec) diff --git a/pandaserver/workflow/workflow_core.py 
b/pandaserver/workflow/workflow_core.py index fc1e43b87..77e63574c 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -162,7 +162,7 @@ def register_workflow( int | None: The ID of the registered workflow if successful, otherwise None """ tmp_log = LogWrapper(logger, f"register_workflow prodsourcelabel={prodsourcelabel} username={username} name={workflow_name}") - tmp_log.debug("start") + tmp_log.debug("Start") # Implementation of workflow registration logic ... workflow_spec = WorkflowSpec() @@ -178,7 +178,7 @@ def register_workflow( tmp_log.error(f"Either workflow_definition or raw_request_params must be provided") return None workflow_spec.creation_time = naive_utcnow() - workflow_spec.status = "registered" + workflow_spec.status = WorkflowStatus.registered # Insert to DB ret_workflow_id = self.tbif.insert_workflow(workflow_spec) if ret_workflow_id is None: @@ -198,7 +198,7 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): workflow_spec (WorkflowSpec): The workflow specification to process """ tmp_log = LogWrapper(logger, f"process_workflow_registered workflow_id={workflow_spec.workflow_id}") - tmp_log.debug("start") + tmp_log.debug("Start") try: if workflow_spec.definition_json is not None: # Already has definition, skip parsing @@ -244,62 +244,68 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): Args: workflow_spec (WorkflowSpec): The workflow specification to process """ - tmp_log = LogWrapper(logger, f"process_workflow_registered workflow_id={workflow_spec.workflow_id}") - tmp_log.debug("start") - # try: - # # Parse the workflow definition - # workflow_definition_dict = json.loads(workflow_spec.definition_json) - # # initialize - # data_specs = [] - # step_specs = [] - # now_time = naive_utcnow() - # # Register root inputs and outputs - # for input_name, input_target in workflow_definition_dict["root_inputs"].items(): - # data_spec = WFDataSpec() - # data_spec.workflow_id = 
workflow_spec.workflow_id - # data_spec.name = input_name - # data_spec.target_id = input_target - # data_spec.status = WFDataStatus.registered - # data_spec.type = WFDataType.input - # data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable - # data_spec.creation_time = now_time - # data_specs.append(data_spec) - # for output_name, output_dict in workflow_definition_dict["root_outputs"].items(): - # data_spec = WFDataSpec() - # data_spec.workflow_id = workflow_spec.workflow_id - # data_spec.name = output_name - # data_spec.target_id = output_dict.get("value") - # data_spec.status = WFDataStatus.registered - # data_spec.type = WFDataType.output - # data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable - # data_spec.creation_time = now_time - # data_specs.append(data_spec) - # # Register steps based on nodes in the definition - # for node in workflow_definition_dict["nodes"]: - # # FIXME: not yet consider scatter, condition, loop, etc. - # if not (node.get("condition") or node.get("scatter") or node.get("loop")): - # step_spec = WFStepSpec() - # step_spec.workflow_id = workflow_spec.workflow_id - # step_spec.member_id = node["id"] - # step_spec.name = node["name"] - # step_spec.status = "registered" - # step_spec.type = WFStepType.ordinary - # step_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable - # step_spec.definition_json = json.dumps(node, default=json_serialize_default) - # step_spec.creation_time = now_time - # step_specs.append(step_spec) - # # FIXME: temporary, skip data checking and go to starting directly - # workflow_spec.status = "starting" - # # Upsert DB - # self.tbif.upsert_workflow_entities( - # workflow_spec.workflow_id, - # actions_dict={"workflow": "update", "steps": "insert", "data": "insert"}, - # workflow_spec=workflow_spec, - # step_specs=step_specs, - # data_specs=data_specs, - # ) - # tmp_log.info(f"Processed workflow registered, 
workflow_id={workflow_spec.workflow_id}, steps={len(step_specs)}, data={len(data_specs)}") - # except Exception: - # tmp_log.error(f"got error ; {traceback.format_exc()}") + tmp_log = LogWrapper(logger, f"process_workflow_parsed workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("Start") + try: + # Parse the workflow definition + workflow_definition_dict = workflow_spec.definition_json_map + if workflow_definition_dict is None: + tmp_log.error(f"Workflow definition is None; cancelled the workflow") + workflow_spec.status = WorkflowStatus.cancelled + workflow_spec.set_parameter("cancel_reason", "Workflow definition is None") + self.tbif.update_workflow(workflow_spec) + return + # initialize + data_specs = [] + step_specs = [] + now_time = naive_utcnow() + # Register root inputs and outputs + for input_name, input_target in workflow_definition_dict["root_inputs"].items(): + data_spec = WFDataSpec() + data_spec.workflow_id = workflow_spec.workflow_id + data_spec.name = input_name + data_spec.target_id = input_target + data_spec.status = WFDataStatus.registered + data_spec.type = WFDataType.input + data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable + data_spec.creation_time = now_time + data_specs.append(data_spec) + for output_name, output_dict in workflow_definition_dict["root_outputs"].items(): + data_spec = WFDataSpec() + data_spec.workflow_id = workflow_spec.workflow_id + data_spec.name = output_name + data_spec.target_id = output_dict.get("value") + data_spec.status = WFDataStatus.registered + data_spec.type = WFDataType.output + data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable + data_spec.creation_time = now_time + data_specs.append(data_spec) + # Register steps based on nodes in the definition + for node in workflow_definition_dict["nodes"]: + # FIXME: not yet consider scatter, condition, loop, etc. 
+ if not (node.get("condition") or node.get("scatter") or node.get("loop")): + step_spec = WFStepSpec() + step_spec.workflow_id = workflow_spec.workflow_id + step_spec.member_id = node["id"] + step_spec.name = node["name"] + step_spec.status = WFStepStatus.registered + step_spec.type = WFStepType.ordinary + step_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable + step_spec.definition_json = json.dumps(node, default=json_serialize_default) + step_spec.creation_time = now_time + step_specs.append(step_spec) + # FIXME: temporary, skip data checking and go to starting directly + workflow_spec.status = WorkflowStatus.starting + # Upsert DB + self.tbif.upsert_workflow_entities( + workflow_spec.workflow_id, + actions_dict={"workflow": "update", "steps": "insert", "data": "insert"}, + workflow_spec=workflow_spec, + step_specs=step_specs, + data_specs=data_specs, + ) + tmp_log.info(f"Done, inserted {len(step_specs)} steps and {len(data_specs)} data, status={workflow_spec.status}") + except Exception: + tmp_log.error(f"Got error ; {traceback.format_exc()}") # ---- Data status transitions ----------------------------- From 05b68ee58293a659390f29ca711da96c0259e417 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 8 Oct 2025 17:27:28 +0200 Subject: [PATCH 018/101] workflows4: workflow starting --- pandaserver/workflow/workflow_base.py | 6 + pandaserver/workflow/workflow_core.py | 263 ++++++++++++++++++++++++-- 2 files changed, 258 insertions(+), 11 deletions(-) diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 86802ae43..582061b4c 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -41,6 +41,9 @@ class WFStepStatus(object): """ registered = "registered" + checking = "checking" + checked_true = "checked_true" + checked_false = "checked_false" pending = "pending" ready = "ready" submitted = "submitted" @@ -66,6 +69,9 @@ class WFDataStatus(object): cancelled = 
"cancelled" retired = "retired" + good_input_statuses = (generating_ready, done_generated, done_skipped) + good_output_statuses = (done_generated, done_skipped) + # ==== Types =================================================== diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 77e63574c..a37c58a36 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -1,5 +1,6 @@ import copy import functools +import importlib import json import os import random @@ -44,9 +45,52 @@ AttributeWithType = namedtuple("AttributeWithType", ["attribute", "type"]) -# =============================================================== +# ==== Plugin Map ============================================== + +PLUGIN_RAW_MAP = { + "step_handler": { + "panda_task": ("panda_task_step_handler", "PandaTaskStepHandler"), + # Add more step handler plugins here + }, + # "data_handler": { + # "example_data": ("example_data_handler", "ExampleDataHandler"), + # }, + # Add more plugin types here +} + +# map of flovar to plugin classes +flavor_plugin_map = {} +for plugin_type, plugins in PLUGIN_RAW_MAP.items(): + flavor_plugin_map[plugin_type] = {} + for flavor, (module_name, class_name) in plugins.items(): + try: + full_module_name = f"pandaserver.workflow.{plugin_type}_plugins.{module_name}" + module = importlib.import_module(full_module_name) + cls = getattr(module, class_name) + flavor_plugin_map[plugin_type][flavor] = cls + logger.debug(f"Imported {plugin_type} plugin {flavor} from {module_name}.{class_name}") + except Exception as e: + logger.error(f"Failed to import {plugin_type} plugin {flavor} from {module_name}.{class_name}: {e}") + -# =============================================================== +# ==== Functions =============================================== + + +def get_plugin(plugin_type: str, flavor: str): + """ + Get the plugin class for the given type and flavor + + Args: + plugin_type (str): Type of the 
plugin (e.g., "step_handler", "data_handler") + flavor (str): Flavor of the plugin (e.g., "panda_task") + + Returns: + class: The plugin class if found, otherwise None + """ + return flavor_plugin_map.get(plugin_type, {}).get(flavor) + + +# ==== Workflow Interface ====================================== class WorkflowInterface(object): @@ -144,7 +188,14 @@ def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): # Add methods for workflow management here def register_workflow( - self, prodsourcelabel: str, username: str, workflow_name: str = None, workflow_definition: dict = None, raw_request_params: dict = None, *args, **kwargs + self, + prodsourcelabel: str, + username: str, + workflow_name: str | None = None, + workflow_definition: dict | None = None, + raw_request_params: dict | None = None, + *args, + **kwargs, ) -> int | None: """ Register a new workflow @@ -152,9 +203,9 @@ def register_workflow( Args: prodsourcelabel (str): Production source label for the workflow username (str): Username of the person registering the workflow - workflow_name (str): Name of the workflow - workflow_definition (dict): Dictionary of workflow definition - raw_request_params (dict): Dictionary of parameters of the raw request + workflow_name (str | None): Name of the workflow + workflow_definition (dict | None): Dictionary of workflow definition + raw_request_params (dict | None): Dictionary of parameters of the raw request *args: Additional arguments **kwargs: Additional keyword arguments @@ -187,6 +238,123 @@ def register_workflow( tmp_log.info(f"Registered workflow workflow_id={ret_workflow_id}") return ret_workflow_id + # ---- Data status transitions ----------------------------- + + def process_data_registered(self, data_spec: WFDataSpec): + """ + Process data in registered status + To prepare for checking the data + + Args: + data_spec (WFDataSpec): The workflow data specification to process + """ + tmp_log = LogWrapper(logger, f"process_data_registered 
data_id={data_spec.data_id}") + tmp_log.debug("Start") + # Check status + if data_spec.status != WFDataStatus.registered: + tmp_log.warning(f"Data status changed unexpectedly from {WFDataStatus.registered} to {data_spec.status}; skipped") + return + # Process + try: + # For now, just update status to checking + data_spec.status = WFDataStatus.checking + data_spec.modification_time = naive_utcnow() + self.tbif.update_workflow_data(data_spec) + tmp_log.info(f"Done, status={data_spec.status}") + except Exception: + tmp_log.error(f"Got error ; {traceback.format_exc()}") + + # ---- Step status transitions ----------------------------- + + def process_step_registered(self, step_spec: WFStepSpec): + """ + Process a step in registered status + To prepare for checking the step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + """ + tmp_log = LogWrapper(logger, f"process_step_registered step_id={step_spec.step_id}") + tmp_log.debug("Start") + # Check status + if step_spec.status != WFStepStatus.registered: + tmp_log.warning(f"Step status changed unexpectedly from {WFStepStatus.registered} to {step_spec.status}; skipped") + return + # Process + try: + # For now, just update status to pending + step_spec.status = WFStepStatus.pending + step_spec.modification_time = naive_utcnow() + self.tbif.update_workflow_step(step_spec) + tmp_log.info(f"Done, status={step_spec.status}") + except Exception: + tmp_log.error(f"Got error ; {traceback.format_exc()}") + + def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None): + """ + Process a step in pending status + To check the inputs of the step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data name to WFDataSpec for the workflow + """ + tmp_log = LogWrapper(logger, f"process_step_pending workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + 
tmp_log.debug("Start") + # Check status + if step_spec.status != WFStepStatus.pending: + tmp_log.warning(f"Step status changed unexpectedly from {WFStepStatus.pending} to {step_spec.status}; skipped") + return + # Process + try: + # Get data spec map of the workflow + if data_spec_map is None: + data_specs = self.tbif.get_workflow_data(workflow_id=step_spec.workflow_id) + data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} + # Input data list of the step + step_spec_definition = step_spec.definition_json_map + input_data_list = step_spec_definition.get("input_data_list", []) + # Check if all input data are good, aka ready as input + all_inputs_good = True + for input_data_name in input_data_list: + data_spec = data_spec_map.get(input_data_name) + if data_spec is None: + tmp_log.warning(f"Input data {input_data_name} not found in workflow data") + all_inputs_good = False + break + elif data_spec.status not in WFDataStatus.good_input_statuses: + tmp_log.debug(f"Input data {input_data_name} status {data_spec.status} is not ready for input") + all_inputs_good = False + break + # If not all inputs are good, just return and wait for next round + if not all_inputs_good: + tmp_log.debug(f"Some input data are not good; skipped") + return + # All inputs are good, register outputs of the step and update step status to ready + tmp_log.debug(f"All input data are good; proceeding") + output_data_type = WFDataType.mid + if not step_spec_definition.get("is_tail"): + # is intermediate step, register their outputs as mid type + output_data_list = step_spec_definition.get("output_data_list", []) + now_time = naive_utcnow() + for output_data_name in output_data_list: + data_spec = WFDataSpec() + data_spec.workflow_id = step_spec.workflow_id + data_spec.name = output_data_name + data_spec.target_id = None # to be filled later + data_spec.status = WFDataStatus.registered + data_spec.type = WFDataType.mid + data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should 
be configurable + data_spec.creation_time = now_time + self.tbif.insert_workflow_data(data_spec) + tmp_log.debug(f"Registered mid data {output_data_name} of step_id={step_spec.step_id}") + step_spec.status = WFStepStatus.ready + step_spec.modification_time = naive_utcnow() + self.tbif.update_workflow_step(step_spec) + tmp_log.info(f"Done, status={step_spec.status}") + except Exception: + tmp_log.error(f"Got error ; {traceback.format_exc()}") + # ---- Workflow status transitions ------------------------- def process_workflow_registered(self, workflow_spec: WorkflowSpec): @@ -199,10 +367,15 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): """ tmp_log = LogWrapper(logger, f"process_workflow_registered workflow_id={workflow_spec.workflow_id}") tmp_log.debug("Start") + # Check status + if workflow_spec.status != WorkflowStatus.registered: + tmp_log.warning(f"Workflow status changed unexpectedly from {WorkflowStatus.registered} to {workflow_spec.status}; skipped") + return + # Process try: if workflow_spec.definition_json is not None: # Already has definition, skip parsing - tmp_log.debug(f"Workflow already has definition, skipping parsing") + tmp_log.debug(f"Workflow already has definition; skipped parsing") else: # Parse the workflow definition from raw request raw_request_dict = workflow_spec.raw_request_json_map @@ -246,6 +419,11 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): """ tmp_log = LogWrapper(logger, f"process_workflow_parsed workflow_id={workflow_spec.workflow_id}") tmp_log.debug("Start") + # Check status + if workflow_spec.status != WorkflowStatus.parsed: + tmp_log.warning(f"Workflow status changed unexpectedly from {WorkflowStatus.parsed} to {workflow_spec.status}; skipped") + return + # Process try: # Parse the workflow definition workflow_definition_dict = workflow_spec.definition_json_map @@ -291,11 +469,29 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): step_spec.status = 
WFStepStatus.registered step_spec.type = WFStepType.ordinary step_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable - step_spec.definition_json = json.dumps(node, default=json_serialize_default) + # step definition + step_definition = copy.deepcopy(node) + # resolve inputs and outputs + input_data_set = set() + output_data_set = set() + for input_target in step_definition.get("inputs", {}).values(): + if not input_target.get("source"): + continue + sources = [] + if isinstance(input_target["source"], list): + sources = copy.deepcopy(input_target["source"]) + else: + sources = [input_target["source"]] + input_data_set.update(sources) + for output_name in step_definition.get("outputs", {}).keys(): + output_data_set.add(output_name) + step_definition["input_data_list"] = list(input_data_set) + step_definition["output_data_list"] = list(output_data_set) + step_spec.definition_json_map = step_definition step_spec.creation_time = now_time step_specs.append(step_spec) - # FIXME: temporary, skip data checking and go to starting directly - workflow_spec.status = WorkflowStatus.starting + # Update status to checking + workflow_spec.status = WorkflowStatus.checking # Upsert DB self.tbif.upsert_workflow_entities( workflow_spec.workflow_id, @@ -308,4 +504,49 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): except Exception: tmp_log.error(f"Got error ; {traceback.format_exc()}") - # ---- Data status transitions ----------------------------- + def process_workflow_starting(self, workflow_spec: WorkflowSpec): + """ + Process a workflow in starting status + To start the steps in the workflow + + Args: + workflow_spec (WorkflowSpec): The workflow specification to process + """ + tmp_log = LogWrapper(logger, f"process_workflow_starting workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("Start") + # Check status + if workflow_spec.status != WorkflowStatus.starting: + tmp_log.warning(f"Workflow status changed unexpectedly from 
{WorkflowStatus.starting} to {workflow_spec.status}; skipped") + return + # Process + try: + # Get steps in registered status + step_specs = self.tbif.get_workflow_steps(workflow_id=workflow_spec.workflow_id, status_list=[WFStepStatus.registered]) + if not step_specs: + tmp_log.warning(f"No steps in {WFStepStatus.registered} status; skipped") + return + # Get data spec map of the workflow + data_specs = self.tbif.get_workflow_data(workflow_id=workflow_spec.workflow_id) + data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} + # Process each step + for step_spec in step_specs: + with self.workflow_step_lock(step_spec.step_id) as locked_step_spec: + if locked_step_spec is None: + tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped") + continue + if locked_step_spec.status != WFStepStatus.registered: + tmp_log.warning(f"Step status changed unexpectely from {WFStepStatus.registered} to {locked_step_spec.status}; skipped") + continue + step_spec = locked_step_spec + # Process the step + match step_spec.status: + case WFStepStatus.registered: + self.process_step_registered(step_spec) + case WFStepStatus.pending: + self.process_step_pending(step_spec, data_spec_map=data_spec_map) + case _: + # tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") + continue + tmp_log.info(f"Done processing steps in {WFStepStatus.registered} status") + except Exception: + tmp_log.error(f"Got error ; {traceback.format_exc()}") From 26c9fe676e9a459bd3fe0fdedf6ae7dd5f4e812d Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 9 Oct 2025 16:32:33 +0200 Subject: [PATCH 019/101] fix --- pandaserver/taskbuffer/TaskBuffer.py | 10 ++++---- .../db_proxy_mods/workflow_module.py | 24 +++++++++++++++++-- .../workflow/test_workflow_core_functions.py | 4 ++++ pandaserver/workflow/workflow_core.py | 4 ++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/pandaserver/taskbuffer/TaskBuffer.py 
b/pandaserver/taskbuffer/TaskBuffer.py index e7103dbe3..4caafb262 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2737,7 +2737,7 @@ def resubmit_data_carousel_request_JEDI(self, request_id, exclude_prev_dst=False with self.proxyPool.get() as proxy: return proxy.resubmit_data_carousel_request_JEDI(request_id, exclude_prev_dst) - # ==== workflow fucntions ================================== + # ==== Workflow fucntions ================================== def get_workflow(self, workflow_id): with self.proxyPool.get() as proxy: @@ -2751,13 +2751,13 @@ def get_workflow_data(self, data_id): with self.proxyPool.get() as proxy: return proxy.get_workflow_data(data_id) - def get_steps_of_workflow(self, workflow_id): + def get_steps_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None): with self.proxyPool.get() as proxy: - return proxy.get_steps_of_workflow(workflow_id) + return proxy.get_steps_of_workflow(workflow_id, status_filter_list, status_exclusion_list) - def get_data_of_workflow(self, workflow_id): + def get_data_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None): with self.proxyPool.get() as proxy: - return proxy.get_data_of_workflow(workflow_id) + return proxy.get_data_of_workflow(workflow_id, status_filter_list, status_exclusion_list) def lock_workflow(self, workflow_id, locked_by, lock_expiration_sec=120): with self.proxyPool.get() as proxy: diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index b5477635f..3538a4f24 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -113,12 +113,14 @@ def get_workflow_data(self, data_id: int) -> WFDataSpec | None: tmp_log.warning("no data found; skipped") return None - def get_steps_of_workflow(self, workflow_id: int) -> list[WFStepSpec]: + def 
get_steps_of_workflow(self, workflow_id: int, status_filter_list: list | None = None, status_exclusion_list: list | None = None) -> list[WFStepSpec]: """ Retrieve all workflow steps for a given workflow ID Args: workflow_id (int): ID of the workflow to retrieve steps for + status_filter_list (list | None): List of statuses to filter the steps by (optional) + status_exclusion_list (list | None): List of statuses to exclude the steps by (optional) Returns: list[WFStepSpec]: List of workflow step specifications @@ -127,6 +129,14 @@ def get_steps_of_workflow(self, workflow_id: int) -> list[WFStepSpec]: tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") sql = f"SELECT {WFStepSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_steps " f"WHERE workflow_id=:workflow_id " var_map = {":workflow_id": workflow_id} + if status_filter_list: + status_var_names_str, status_var_map = get_sql_IN_bind_variables(status_filter_list, prefix=":status") + sql += f"AND status IN ({status_var_names_str}) " + var_map.update(status_var_map) + if status_exclusion_list: + antistatus_var_names_str, antistatus_var_map = get_sql_IN_bind_variables(status_exclusion_list, prefix=":antistatus") + sql += f"AND status NOT IN ({antistatus_var_names_str}) " + var_map.update(antistatus_var_map) self.cur.execute(sql + comment, var_map) res_list = self.cur.fetchall() if res_list is not None: @@ -140,12 +150,14 @@ def get_steps_of_workflow(self, workflow_id: int) -> list[WFStepSpec]: tmp_log.warning("no steps found; skipped") return [] - def get_data_of_workflow(self, workflow_id: int) -> list[WFDataSpec]: + def get_data_of_workflow(self, workflow_id: int, status_filter_list: list | None = None, status_exclusion_list: list | None = None) -> list[WFDataSpec]: """ Retrieve all workflow data for a given workflow ID Args: workflow_id (int): ID of the workflow to retrieve data for + status_filter_list (list | None): List of statuses to filter the data by (optional) + 
status_exclusion_list (list | None): List of statuses to exclude the data by (optional) Returns: list[WFDataSpec]: List of workflow data specifications @@ -154,6 +166,14 @@ def get_data_of_workflow(self, workflow_id: int) -> list[WFDataSpec]: tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") sql = f"SELECT {WFDataSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_data " f"WHERE workflow_id=:workflow_id " var_map = {":workflow_id": workflow_id} + if status_filter_list: + status_var_names_str, status_var_map = get_sql_IN_bind_variables(status_filter_list, prefix=":status") + sql += f"AND status IN ({status_var_names_str}) " + var_map.update(status_var_map) + if status_exclusion_list: + antistatus_var_names_str, antistatus_var_map = get_sql_IN_bind_variables(status_exclusion_list, prefix=":antistatus") + sql += f"AND status NOT IN ({antistatus_var_names_str}) " + var_map.update(antistatus_var_map) self.cur.execute(sql + comment, var_map) res_list = self.cur.fetchall() if res_list is not None: diff --git a/pandaserver/workflow/test_workflow_core_functions.py b/pandaserver/workflow/test_workflow_core_functions.py index cc62d14ad..0937c9f33 100644 --- a/pandaserver/workflow/test_workflow_core_functions.py +++ b/pandaserver/workflow/test_workflow_core_functions.py @@ -363,3 +363,7 @@ wf_spec = taskBuffer.get_workflow(workflow_id=WFID) print("Processing parsed workflow...") wfif.process_workflow_parsed(wf_spec) + +wf_spec = taskBuffer.get_workflow(workflow_id=WFID) +print("Processing starting workflow...") +wfif.process_workflow_starting(wf_spec) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index a37c58a36..d6d9a3238 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -521,12 +521,12 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec): # Process try: # Get steps in registered status - step_specs = 
self.tbif.get_workflow_steps(workflow_id=workflow_spec.workflow_id, status_list=[WFStepStatus.registered]) + step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=[WFStepStatus.registered]) if not step_specs: tmp_log.warning(f"No steps in {WFStepStatus.registered} status; skipped") return # Get data spec map of the workflow - data_specs = self.tbif.get_workflow_data(workflow_id=workflow_spec.workflow_id) + data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} # Process each step for step_spec in step_specs: From 3616531a4c4782684c46e8ec8fb86b0d10515db6 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 9 Oct 2025 17:12:54 +0200 Subject: [PATCH 020/101] fix --- .../db_proxy_mods/workflow_module.py | 6 +++--- .../workflow/test_workflow_core_functions.py | 20 +++++++++---------- pandaserver/workflow/workflow_core.py | 8 +++++--- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index 3538a4f24..e23e8bbba 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -211,7 +211,7 @@ def lock_workflow(self, workflow_id: int, locked_by: str, lock_expiration_sec: i "AND (locked_by IS NULL OR locked_by=:locked_by OR lock_time<:min_lock_time)" ) var_map = { - ":locked_by": self.full_pid, + ":locked_by": locked_by, ":lock_time": now_time, ":workflow_id": workflow_id, ":min_lock_time": now_time - timedelta(seconds=lock_expiration_sec), @@ -295,7 +295,7 @@ def lock_workflow_step(self, step_id: int, locked_by: str, lock_expiration_sec: "AND (locked_by IS NULL OR locked_by=:locked_by OR lock_time<:min_lock_time)" ) var_map = { - ":locked_by": self.full_pid, + ":locked_by": locked_by, ":lock_time": now_time, ":step_id": step_id, 
":min_lock_time": now_time - timedelta(seconds=lock_expiration_sec), @@ -379,7 +379,7 @@ def lock_workflow_data(self, data_id: int, locked_by: str, lock_expiration_sec: "AND (locked_by IS NULL OR locked_by=:locked_by OR lock_time<:min_lock_time)" ) var_map = { - ":locked_by": self.full_pid, + ":locked_by": locked_by, ":lock_time": now_time, ":data_id": data_id, ":min_lock_time": now_time - timedelta(seconds=lock_expiration_sec), diff --git a/pandaserver/workflow/test_workflow_core_functions.py b/pandaserver/workflow/test_workflow_core_functions.py index 0937c9f33..759c6e825 100644 --- a/pandaserver/workflow/test_workflow_core_functions.py +++ b/pandaserver/workflow/test_workflow_core_functions.py @@ -17,7 +17,7 @@ username = "testuser" workflow_name = "test_workflow_bg_comb_00" -WFID = 1 # workflow ID to be used in this test +WFID = sys.argv[1] # workflow ID to be used in this test # workflow definition json # wfd_json = json.dumps( @@ -356,14 +356,14 @@ # Process the registered workflow -wf_spec = taskBuffer.get_workflow(workflow_id=WFID) -print("Processing registered workflow...") -wfif.process_workflow_registered(wf_spec) +# wf_spec = taskBuffer.get_workflow(workflow_id=WFID) +# print("Processing registered workflow...") +# wfif.process_workflow_registered(wf_spec) -wf_spec = taskBuffer.get_workflow(workflow_id=WFID) -print("Processing parsed workflow...") -wfif.process_workflow_parsed(wf_spec) +# wf_spec = taskBuffer.get_workflow(workflow_id=WFID) +# print("Processing parsed workflow...") +# wfif.process_workflow_parsed(wf_spec) -wf_spec = taskBuffer.get_workflow(workflow_id=WFID) -print("Processing starting workflow...") -wfif.process_workflow_starting(wf_spec) +# wf_spec = taskBuffer.get_workflow(workflow_id=WFID) +# print("Processing starting workflow...") +# wfif.process_workflow_starting(wf_spec) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index d6d9a3238..09b0320ed 100644 --- 
a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -521,7 +521,9 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec): # Process try: # Get steps in registered status - step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=[WFStepStatus.registered]) + step_specs = self.tbif.get_steps_of_workflow( + workflow_id=workflow_spec.workflow_id, status_filter_list=[WFStepStatus.registered, WFStepStatus.pending] + ) if not step_specs: tmp_log.warning(f"No steps in {WFStepStatus.registered} status; skipped") return @@ -534,8 +536,8 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec): if locked_step_spec is None: tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped") continue - if locked_step_spec.status != WFStepStatus.registered: - tmp_log.warning(f"Step status changed unexpectely from {WFStepStatus.registered} to {locked_step_spec.status}; skipped") + if locked_step_spec.status not in [WFStepStatus.registered, WFStepStatus.pending]: + tmp_log.warning(f"Step status changed unexpectely to {locked_step_spec.status}; skipped") continue step_spec = locked_step_spec # Process the step From 16bccf74e5cc6bf98040aa5dadd9000695262f13 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 13 Oct 2025 16:22:27 +0200 Subject: [PATCH 021/101] workflows4: step submit target --- pandaserver/api/v1/workflow_api.py | 18 ++--- .../step_handler_plugins/base_step_handler.py | 52 +++++++++++++- .../panda_task_step_handler.py | 52 +++++++++++--- pandaserver/workflow/workflow_core.py | 72 +++++++++++++++---- 4 files changed, 161 insertions(+), 33 deletions(-) diff --git a/pandaserver/api/v1/workflow_api.py b/pandaserver/api/v1/workflow_api.py index 7f0f90865..df5435d4b 100644 --- a/pandaserver/api/v1/workflow_api.py +++ b/pandaserver/api/v1/workflow_api.py @@ -60,12 +60,14 @@ def submit_workflow_raw_request(req: PandaRequest, params: dict | str) -> dict: 
dict: dictionary `{'success': True/False, 'message': 'Description of error', 'data': }` """ - username = get_dn(req) + user_dn = get_dn(req) prodsourcelabel = "user" - if has_production_role(req): - prodsourcelabel = "managed" - tmp_logger = LogWrapper(_logger, f'submit_workflow_raw_request prodsourcelabel={prodsourcelabel} username="{username}" ') + # FIXME: only for analysis temporarily + # if has_production_role(req): + # prodsourcelabel = "managed" + + tmp_logger = LogWrapper(_logger, f'submit_workflow_raw_request prodsourcelabel={prodsourcelabel} user_dn="{user_dn}" ') tmp_logger.debug("Start") success, message, data = False, "", None time_start = naive_utcnow() @@ -78,7 +80,7 @@ def submit_workflow_raw_request(req: PandaRequest, params: dict | str) -> dict: tmp_logger.error(message) return generate_response(success, message, data) - workflow_id = global_wfif.register_workflow(prodsourcelabel, username, raw_request_params=params) + workflow_id = global_wfif.register_workflow(prodsourcelabel, user_dn, raw_request_params=params) if workflow_id is not None: success = True @@ -109,18 +111,18 @@ def submit_workflow(req: PandaRequest, workflow_definition: dict) -> dict: dict: dictionary `{'success': True/False, 'message': 'Description of error', 'data': }` """ - username = get_dn(req) + user_dn = get_dn(req) prodsourcelabel = "user" if has_production_role(req): prodsourcelabel = "managed" workflow_name = workflow_definition.get("workflow_name", None) - tmp_logger = LogWrapper(_logger, f'submit_workflow prodsourcelabel={prodsourcelabel} username="{username}" workflow_name={workflow_name}') + tmp_logger = LogWrapper(_logger, f'submit_workflow prodsourcelabel={prodsourcelabel} user_dn="{user_dn}" workflow_name={workflow_name}') tmp_logger.debug("Start") success, message, data = False, "", None time_start = naive_utcnow() - workflow_id = global_wfif.register_workflow(prodsourcelabel, username, workflow_name, workflow_definition) + workflow_id = 
global_wfif.register_workflow(prodsourcelabel, user_dn, workflow_name, workflow_definition) if workflow_id is not None: success = True diff --git a/pandaserver/workflow/step_handler_plugins/base_step_handler.py b/pandaserver/workflow/step_handler_plugins/base_step_handler.py index 1d0c06914..4928ef2b4 100644 --- a/pandaserver/workflow/step_handler_plugins/base_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/base_step_handler.py @@ -1,3 +1,5 @@ +import dataclasses + from pandaserver.workflow.workflow_base import ( WFDataSpec, WFDataStatus, @@ -9,6 +11,45 @@ WorkflowStatus, ) +# === Dataclasses of return objects of step handler methods ===== + + +@dataclasses.dataclass +class SubmitResult: + """ + Result of submitting a target for processing a step. + + Fields: + success (bool | None): Indicates if the submission was successful. + target_id (str | None): The ID of the submitted target (e.g., task ID). + message (str): A message providing additional information about the submission result. + """ + + success: bool | None = None + target_id: str | None = None + message: str = "" + + +@dataclasses.dataclass +class CheckResult: + """ + Result of checking the status of a submitted target. + + Fields: + success (bool | None): Indicates if the status check was successful. + status (WFStepStatus | None): The status of the step to move to. + native_status (str | None): The native status string from the target system. + message (str): A message providing additional information about the status check result. 
+ """ + + success: bool | None = None + status: WFStepStatus | None = None + native_status: str | None = None + message: str = "" + + +# ================================================================= + class BaseStepHandler: """ @@ -27,12 +68,19 @@ def __init__(self, task_buffer, *args, **kwargs): """ self.tbif = task_buffer - def submit_target(self, step_specs: WFStepSpec, **kwargs): + def submit_target(self, step_spec: WFStepSpec, **kwargs) -> SubmitResult: """ Submit a target for processing the step. This method should be implemented by subclasses to handle the specifics of target submission. + This method should NOT modify step_spec. Any update information should be stored in the SubmitResult returned instead. Args: - step_specs (WFStepSpec): Specifications of the workflow step to be submitted. + step_spec (WFStepSpec): Specifications of the workflow step to be submitted. + + Returns: + SubmitResult: An object containing the result of the submission, including success status, target ID, and message. 
+ """ raise NotImplementedError("Subclasses must implement this method.") + + # def check_status(self, target_id: str, **kwargs) -> tuple[bool | None, str | None, str]: diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 8247a48d3..2d7dccb56 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -1,6 +1,15 @@ import json +import traceback +import uuid -from pandaserver.workflow.step_handler_plugins.base_step_handler import BaseStepHandler +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger + +from pandaserver.workflow.step_handler_plugins.base_step_handler import ( + BaseStepHandler, + CheckResult, + SubmitResult, +) from pandaserver.workflow.workflow_base import ( WFDataSpec, WFDataStatus, @@ -12,6 +21,9 @@ WorkflowStatus, ) +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + class PandaTaskStepHandler(BaseStepHandler): """ @@ -26,11 +38,21 @@ def __init__(self, *args, **kwargs): # Initialize base class or any required modules here super().__init__(*args, **kwargs) - def submit_target(self, step_spec: WFStepSpec, workflow_spec: WorkflowSpec, **kwargs): + def submit_target(self, step_spec: WFStepSpec, **kwargs) -> SubmitResult: """ Submit a target for processing the PanDA task step. This method should be implemented to handle the specifics of PanDA task submission. + + Args: + step_spec (WFStepSpec): The workflow step specification containing details about the step to be processed. + **kwargs: Additional keyword arguments that may be required for submission. + + Returns: + SubmitResult: An object containing the result of the submission, including success status, target ID (task ID), and message. 
""" + tmp_log = LogWrapper(logger, f"submit_target workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + # Initialize + submit_result = SubmitResult() ... # task_param_map = {} @@ -83,9 +105,23 @@ def submit_target(self, step_spec: WFStepSpec, workflow_spec: WorkflowSpec, **kw # "dataset": outDatasetName, # }, # ] - - # task_param_json = json.dumps(task_param_map) - - # self.tbif.insertTaskParams_JEDI( - # task_param_map["vo"], task_param_map["prodSourceLabel"], task_param_map["userName"], task_param_map["taskName"], task_param_json - # ) + try: + # Get step definition + step_definition = step_spec.definition_json_map + user_name = step_definition.get("user_name") + user_dn = step_definition.get("user_dn") + task_param_map = step_definition.get("task_params", {}) + # task_param_map["userName"] = user_name + # Submit task + tmp_ret_flag, temp_ret_val = self.tbif.insertTaskParamsPanda(task_param_map, user_dn, False) + if tmp_ret_flag: + submit_result.success = True + submit_result.target_id = temp_ret_val + tmp_log.info(f"submitted task target_id={submit_result.target_id}") + else: + submit_result.message = temp_ret_val + tmp_log.error(f"failed to submit task: {submit_result.message}") + except Exception as e: + submit_result.message = f"exception {str(e)}" + tmp_log.error(f"failed to submit task: {traceback.format_exc()}") + return submit_result diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 09b0320ed..15905c5b7 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -190,7 +190,7 @@ def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): def register_workflow( self, prodsourcelabel: str, - username: str, + user_dn: str, workflow_name: str | None = None, workflow_definition: dict | None = None, raw_request_params: dict | None = None, @@ -202,7 +202,7 @@ def register_workflow( Args: prodsourcelabel (str): Production source label for the 
workflow - username (str): Username of the person registering the workflow + user_dn (str): Distinguished name of the user submitting the workflow workflow_name (str | None): Name of the workflow workflow_definition (dict | None): Dictionary of workflow definition raw_request_params (dict | None): Dictionary of parameters of the raw request @@ -212,16 +212,18 @@ def register_workflow( Returns: int | None: The ID of the registered workflow if successful, otherwise None """ - tmp_log = LogWrapper(logger, f"register_workflow prodsourcelabel={prodsourcelabel} username={username} name={workflow_name}") + tmp_log = LogWrapper(logger, f"register_workflow prodsourcelabel={prodsourcelabel} user_dn={user_dn} name={workflow_name}") tmp_log.debug("Start") # Implementation of workflow registration logic ... workflow_spec = WorkflowSpec() workflow_spec.prodsourcelabel = prodsourcelabel - workflow_spec.username = clean_user_id(username) + workflow_spec.username = clean_user_id(user_dn) if workflow_name is not None: workflow_spec.name = workflow_name if workflow_definition is not None: + # insert extra info into definition + workflow_definition["user_dn"] = user_dn workflow_spec.definition_json = json.dumps(workflow_definition, default=json_serialize_default) elif raw_request_params is not None: workflow_spec.raw_request_json = json.dumps(raw_request_params, default=json_serialize_default) @@ -258,7 +260,6 @@ def process_data_registered(self, data_spec: WFDataSpec): try: # For now, just update status to checking data_spec.status = WFDataStatus.checking - data_spec.modification_time = naive_utcnow() self.tbif.update_workflow_data(data_spec) tmp_log.info(f"Done, status={data_spec.status}") except Exception: @@ -284,7 +285,6 @@ def process_step_registered(self, step_spec: WFStepSpec): try: # For now, just update status to pending step_spec.status = WFStepStatus.pending - step_spec.modification_time = naive_utcnow() self.tbif.update_workflow_step(step_spec) tmp_log.info(f"Done, 
status={step_spec.status}") except Exception: @@ -313,7 +313,10 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} # Input data list of the step step_spec_definition = step_spec.definition_json_map - input_data_list = step_spec_definition.get("input_data_list", []) + input_data_list = step_spec_definition.get("input_data_list") + if input_data_list is None: + tmp_log.warning(f"Step definition does not have input_data_list; skipped") + return # Check if all input data are good, aka ready as input all_inputs_good = True for input_data_name in input_data_list: @@ -348,13 +351,49 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W data_spec.creation_time = now_time self.tbif.insert_workflow_data(data_spec) tmp_log.debug(f"Registered mid data {output_data_name} of step_id={step_spec.step_id}") + # update data_spec_map + data_spec_map[output_data_name] = data_spec step_spec.status = WFStepStatus.ready - step_spec.modification_time = naive_utcnow() self.tbif.update_workflow_step(step_spec) tmp_log.info(f"Done, status={step_spec.status}") except Exception: tmp_log.error(f"Got error ; {traceback.format_exc()}") + def process_step_ready(self, step_spec: WFStepSpec): + """ + Process a step in ready status + To submit the step for execution + + Args: + step_spec (WFStepSpec): The workflow step specification to process + """ + tmp_log = LogWrapper(logger, f"process_step_ready workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log.debug("Start") + # Check status + if step_spec.status != WFStepStatus.ready: + tmp_log.warning(f"Step status changed unexpectedly from {WFStepStatus.ready} to {step_spec.status}; skipped") + return + # Process + try: + # Get the step handler plugin + step_handler_cls = get_plugin("step_handler", step_spec.flavor) + if step_handler_cls is None: + tmp_log.error(f"No step handler plugin found for 
flavor={step_spec.flavor}; skipped") + return + step_handler = step_handler_cls(self.tbif) + # Submit the step + success, target_id, message = step_handler.submit_target(step_spec, self.tbif.get_workflow(step_spec.workflow_id)) + if not success or target_id is None: + tmp_log.error(f"Failed to submit step; {message}") + return + # Update step status to submitted + step_spec.target_id = target_id + step_spec.status = WFStepStatus.submitted + self.tbif.update_workflow_step(step_spec) + tmp_log.info(f"Submitted step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}") + except Exception: + tmp_log.error(f"Got error ; {traceback.format_exc()}") + # ---- Workflow status transitions ------------------------- def process_workflow_registered(self, workflow_spec: WorkflowSpec): @@ -381,7 +420,7 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): raw_request_dict = workflow_spec.raw_request_json_map sandbox_url = os.path.join(raw_request_dict["sourceURL"], "cache", raw_request_dict["sandbox"]) log_token = f'< user="{workflow_spec.username}" outDS={raw_request_dict["outDS"]}>' - is_ok, is_fatal, workflow_definition_dict = parse_raw_request( + is_ok, is_fatal, workflow_definition = parse_raw_request( sandbox_url=sandbox_url, log_token=log_token, user_name=workflow_spec.username, @@ -398,7 +437,7 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): tmp_log.warning(f"Failed to parse raw request; skipped") return # Parsed successfully, update definition - workflow_spec.definition_json = json.dumps(workflow_definition_dict, default=json_serialize_default) + workflow_spec.definition_json = json.dumps(workflow_definition, default=json_serialize_default) tmp_log.debug(f"Parsed raw request into definition") # Update status to parsed workflow_spec.status = WorkflowStatus.parsed @@ -426,8 +465,8 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): # Process try: # Parse the workflow definition - 
workflow_definition_dict = workflow_spec.definition_json_map - if workflow_definition_dict is None: + workflow_definition = workflow_spec.definition_json_map + if workflow_definition is None: tmp_log.error(f"Workflow definition is None; cancelled the workflow") workflow_spec.status = WorkflowStatus.cancelled workflow_spec.set_parameter("cancel_reason", "Workflow definition is None") @@ -438,7 +477,7 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): step_specs = [] now_time = naive_utcnow() # Register root inputs and outputs - for input_name, input_target in workflow_definition_dict["root_inputs"].items(): + for input_name, input_target in workflow_definition["root_inputs"].items(): data_spec = WFDataSpec() data_spec.workflow_id = workflow_spec.workflow_id data_spec.name = input_name @@ -448,7 +487,7 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable data_spec.creation_time = now_time data_specs.append(data_spec) - for output_name, output_dict in workflow_definition_dict["root_outputs"].items(): + for output_name, output_dict in workflow_definition["root_outputs"].items(): data_spec = WFDataSpec() data_spec.workflow_id = workflow_spec.workflow_id data_spec.name = output_name @@ -459,7 +498,7 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): data_spec.creation_time = now_time data_specs.append(data_spec) # Register steps based on nodes in the definition - for node in workflow_definition_dict["nodes"]: + for node in workflow_definition["nodes"]: # FIXME: not yet consider scatter, condition, loop, etc. 
if not (node.get("condition") or node.get("scatter") or node.get("loop")): step_spec = WFStepSpec() @@ -471,6 +510,9 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): step_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable # step definition step_definition = copy.deepcopy(node) + # propagate user name and DN from workflow to step + step_definition["user_name"] = workflow_spec.username + step_definition["user_dn"] = workflow_definition.get("user_dn") # resolve inputs and outputs input_data_set = set() output_data_set = set() From 24d9029344e46ef08493b6b3ec90dbb8374bcf1c Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 14 Oct 2025 10:54:33 +0200 Subject: [PATCH 022/101] workflows4: fix submit task --- .../panda_task_step_handler.py | 2 +- .../workflow/test_workflow_core_functions.py | 4 +- pandaserver/workflow/workflow_core.py | 48 +++++++++++-------- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 2d7dccb56..d3a552518 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -113,7 +113,7 @@ def submit_target(self, step_spec: WFStepSpec, **kwargs) -> SubmitResult: task_param_map = step_definition.get("task_params", {}) # task_param_map["userName"] = user_name # Submit task - tmp_ret_flag, temp_ret_val = self.tbif.insertTaskParamsPanda(task_param_map, user_dn, False) + tmp_ret_flag, temp_ret_val = self.tbif.insertTaskParamsPanda(task_param_map, user_dn, False, decode=False) if tmp_ret_flag: submit_result.success = True submit_result.target_id = temp_ret_val diff --git a/pandaserver/workflow/test_workflow_core_functions.py b/pandaserver/workflow/test_workflow_core_functions.py index 759c6e825..496f711be 100644 --- 
a/pandaserver/workflow/test_workflow_core_functions.py +++ b/pandaserver/workflow/test_workflow_core_functions.py @@ -361,8 +361,8 @@ # wfif.process_workflow_registered(wf_spec) # wf_spec = taskBuffer.get_workflow(workflow_id=WFID) -# print("Processing parsed workflow...") -# wfif.process_workflow_parsed(wf_spec) +# print("Processing checked workflow...") +# wfif.process_workflow_checked(wf_spec) # wf_spec = taskBuffer.get_workflow(workflow_id=WFID) # print("Processing starting workflow...") diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 15905c5b7..17cdeaedb 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -212,20 +212,21 @@ def register_workflow( Returns: int | None: The ID of the registered workflow if successful, otherwise None """ - tmp_log = LogWrapper(logger, f"register_workflow prodsourcelabel={prodsourcelabel} user_dn={user_dn} name={workflow_name}") - tmp_log.debug("Start") + username = clean_user_id(user_dn) + tmp_log = LogWrapper(logger, f"register_workflow prodsourcelabel={prodsourcelabel} username={username} name={workflow_name}") + tmp_log.debug(f'Start, user_dn is "{user_dn}"') # Implementation of workflow registration logic ... 
workflow_spec = WorkflowSpec() workflow_spec.prodsourcelabel = prodsourcelabel - workflow_spec.username = clean_user_id(user_dn) + workflow_spec.username = username if workflow_name is not None: workflow_spec.name = workflow_name if workflow_definition is not None: - # insert extra info into definition workflow_definition["user_dn"] = user_dn workflow_spec.definition_json = json.dumps(workflow_definition, default=json_serialize_default) elif raw_request_params is not None: + raw_request_params["user_dn"] = user_dn workflow_spec.raw_request_json = json.dumps(raw_request_params, default=json_serialize_default) else: tmp_log.error(f"Either workflow_definition or raw_request_params must be provided") @@ -382,12 +383,12 @@ def process_step_ready(self, step_spec: WFStepSpec): return step_handler = step_handler_cls(self.tbif) # Submit the step - success, target_id, message = step_handler.submit_target(step_spec, self.tbif.get_workflow(step_spec.workflow_id)) - if not success or target_id is None: - tmp_log.error(f"Failed to submit step; {message}") + submit_result = step_handler.submit_target(step_spec) + if not submit_result.success or submit_result.target_id is None: + tmp_log.error(f"Failed to submit step; {submit_result.message}") return # Update step status to submitted - step_spec.target_id = target_id + step_spec.target_id = submit_result.target_id step_spec.status = WFStepStatus.submitted self.tbif.update_workflow_step(step_spec) tmp_log.info(f"Submitted step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}") @@ -427,7 +428,8 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): raw_request_dict=raw_request_dict, ) # Failure handling - if is_fatal: + # if is_fatal: + if False: # disable fatal for now tmp_log.error(f"Fatal error in parsing raw request; cancelled the workflow") workflow_spec.status = WorkflowStatus.cancelled workflow_spec.set_parameter("cancel_reason", "Fatal error in parsing raw request") @@ 
-436,31 +438,34 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): if not is_ok: tmp_log.warning(f"Failed to parse raw request; skipped") return + # extra info from raw request + workflow_definition["user_dn"] = raw_request_dict.get("user_dn") # Parsed successfully, update definition workflow_spec.definition_json = json.dumps(workflow_definition, default=json_serialize_default) tmp_log.debug(f"Parsed raw request into definition") # Update status to parsed - workflow_spec.status = WorkflowStatus.parsed + # workflow_spec.status = WorkflowStatus.parsed + workflow_spec.status = WorkflowStatus.checked # skip parsed for now # Update DB self.tbif.update_workflow(workflow_spec) tmp_log.info(f"Done, status={workflow_spec.status}") except Exception: tmp_log.error(f"Got error ; {traceback.format_exc()}") - def process_workflow_parsed(self, workflow_spec: WorkflowSpec): + def process_workflow_checked(self, workflow_spec: WorkflowSpec): """ - Process a workflow in parsed status + Process a workflow in checked status Register steps, and update its status Parse raw request into workflow definition, register steps, and update its status Args: workflow_spec (WorkflowSpec): The workflow specification to process """ - tmp_log = LogWrapper(logger, f"process_workflow_parsed workflow_id={workflow_spec.workflow_id}") + tmp_log = LogWrapper(logger, f"process_workflow_checked workflow_id={workflow_spec.workflow_id}") tmp_log.debug("Start") # Check status - if workflow_spec.status != WorkflowStatus.parsed: - tmp_log.warning(f"Workflow status changed unexpectedly from {WorkflowStatus.parsed} to {workflow_spec.status}; skipped") + if workflow_spec.status != WorkflowStatus.checked: + tmp_log.warning(f"Workflow status changed unexpectedly from {WorkflowStatus.checked} to {workflow_spec.status}; skipped") return # Process try: @@ -532,8 +537,8 @@ def process_workflow_parsed(self, workflow_spec: WorkflowSpec): step_spec.definition_json_map = step_definition 
step_spec.creation_time = now_time step_specs.append(step_spec) - # Update status to checking - workflow_spec.status = WorkflowStatus.checking + # Update status to starting + workflow_spec.status = WorkflowStatus.starting # Upsert DB self.tbif.upsert_workflow_entities( workflow_spec.workflow_id, @@ -563,9 +568,8 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec): # Process try: # Get steps in registered status - step_specs = self.tbif.get_steps_of_workflow( - workflow_id=workflow_spec.workflow_id, status_filter_list=[WFStepStatus.registered, WFStepStatus.pending] - ) + required_step_statuses = [WFStepStatus.registered, WFStepStatus.pending, WFStepStatus.ready] + step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=required_step_statuses) if not step_specs: tmp_log.warning(f"No steps in {WFStepStatus.registered} status; skipped") return @@ -578,7 +582,7 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec): if locked_step_spec is None: tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped") continue - if locked_step_spec.status not in [WFStepStatus.registered, WFStepStatus.pending]: + if locked_step_spec.status not in required_step_statuses: tmp_log.warning(f"Step status changed unexpectely to {locked_step_spec.status}; skipped") continue step_spec = locked_step_spec @@ -588,6 +592,8 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec): self.process_step_registered(step_spec) case WFStepStatus.pending: self.process_step_pending(step_spec, data_spec_map=data_spec_map) + case WFStepStatus.ready: + self.process_step_ready(step_spec) case _: # tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") continue From 937fb57e178cf1b3772c1970e85f02e9a54ab937 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 14 Oct 2025 11:37:47 +0200 Subject: [PATCH 023/101] workflows4: reuse plugin instance --- 
pandaserver/taskbuffer/TaskBuffer.py | 5 --- pandaserver/workflow/workflow_core.py | 45 +++++++++++++++++++-------- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 4caafb262..2beb61fe1 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2680,11 +2680,6 @@ def updateInputFilesStaged_JEDI(self, jeditaskid, scope, filenames_dict, chunk_s with self.proxyPool.get() as proxy: return proxy.updateInputFilesStaged_JEDI(jeditaskid, scope, filenames_dict, chunk_size, by, check_scope) - # insert TaskParams - def insertTaskParams_JEDI(self, vo, prodSourceLabel, userName, taskName, taskParams, parent_tid=None): - with self.proxyPool.get() as proxy: - return proxy.insertTaskParams_JEDI(vo, prodSourceLabel, userName, taskName, taskParams, parent_tid) - # ==== Data Carousel functions ============================= # insert data carousel requests diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 17cdeaedb..44b5f0a30 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -58,16 +58,16 @@ # Add more plugin types here } -# map of flovar to plugin classes -flavor_plugin_map = {} +# map of flavor to plugin classes +flavor_plugin_class_map = {} for plugin_type, plugins in PLUGIN_RAW_MAP.items(): - flavor_plugin_map[plugin_type] = {} + flavor_plugin_class_map[plugin_type] = {} for flavor, (module_name, class_name) in plugins.items(): try: full_module_name = f"pandaserver.workflow.{plugin_type}_plugins.{module_name}" module = importlib.import_module(full_module_name) cls = getattr(module, class_name) - flavor_plugin_map[plugin_type][flavor] = cls + flavor_plugin_class_map[plugin_type][flavor] = cls logger.debug(f"Imported {plugin_type} plugin {flavor} from {module_name}.{class_name}") except Exception as e: logger.error(f"Failed to import {plugin_type} plugin 
{flavor} from {module_name}.{class_name}: {e}") @@ -76,7 +76,7 @@ # ==== Functions =============================================== -def get_plugin(plugin_type: str, flavor: str): +def get_plugin_class(plugin_type: str, flavor: str): """ Get the plugin class for the given type and flavor @@ -87,7 +87,7 @@ def get_plugin(plugin_type: str, flavor: str): Returns: class: The plugin class if found, otherwise None """ - return flavor_plugin_map.get(plugin_type, {}).get(flavor) + return flavor_plugin_class_map.get(plugin_type, {}).get(flavor) # ==== Workflow Interface ====================================== @@ -110,8 +110,31 @@ def __init__(self, task_buffer, *args, **kwargs): self.tbif = task_buffer self.ddm_if = rucioAPI self.full_pid = f"{socket.getfqdn().split('.')[0]}-{os.getpgrp()}-{os.getpid()}" + self.plugin_map = {} - #### Context managers for locking + def get_plugin(self, plugin_type: str, flavor: str): + """ + Get the plugin instance for the given type and flavor + + Args: + plugin_type (str): Type of the plugin (e.g., "step_handler", "data_handler") + flavor (str): Flavor of the plugin (e.g., "panda_task") + + Returns: + Any: The plugin instance if found, otherwise None + """ + plugin = self.plugin_map.get(plugin_type, {}).get(flavor) + if plugin is not None: + return plugin + else: + # not yet loaded, try to load + cls = get_plugin_class(plugin_type, flavor) + if cls is not None: + self.plugin_map.setdefault(plugin_type, {})[flavor] = cls(task_buffer=self.tbif, ddm_if=self.ddm_if) + plugin = self.plugin_map[plugin_type][flavor] + return plugin + + # --- Context managers for locking ------------------------- @contextmanager def workflow_lock(self, workflow_id: int, lock_expiration_sec: int = 120): @@ -185,7 +208,7 @@ def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): # lock not acquired yield None - # Add methods for workflow management here + # --- Registration ----------------------------------------- def register_workflow( self, @@ 
-377,11 +400,7 @@ def process_step_ready(self, step_spec: WFStepSpec): # Process try: # Get the step handler plugin - step_handler_cls = get_plugin("step_handler", step_spec.flavor) - if step_handler_cls is None: - tmp_log.error(f"No step handler plugin found for flavor={step_spec.flavor}; skipped") - return - step_handler = step_handler_cls(self.tbif) + step_handler = self.get_plugin("step_handler", step_spec.flavor) # Submit the step submit_result = step_handler.submit_target(step_spec) if not submit_result.success or submit_result.target_id is None: From f23c17bd5aa80fa870fc11cc0c014a5c425c3b0d Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 14 Oct 2025 16:58:27 +0200 Subject: [PATCH 024/101] workflows4: add check step and result obj --- .../step_handler_plugins/base_step_handler.py | 61 ++-- .../panda_task_step_handler.py | 85 +++++- pandaserver/workflow/workflow_base.py | 78 +++++ pandaserver/workflow/workflow_core.py | 286 +++++++++++++++--- 4 files changed, 414 insertions(+), 96 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/base_step_handler.py b/pandaserver/workflow/step_handler_plugins/base_step_handler.py index 4928ef2b4..96f9f6cea 100644 --- a/pandaserver/workflow/step_handler_plugins/base_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/base_step_handler.py @@ -6,50 +6,13 @@ WFDataType, WFStepSpec, WFStepStatus, + WFStepTargetCheckResult, + WFStepTargetSubmitResult, WFStepType, WorkflowSpec, WorkflowStatus, ) -# === Dataclasses of return objects of step handler methods ===== - - -@dataclasses.dataclass -class SubmitResult: - """ - Result of submitting a target for processing a step. - - Fields: - success (bool | None): Indicates if the submission was successful. - target_id (str | None): The ID of the submitted target (e.g., task ID). - message (str): A message providing additional information about the submission result. 
- """ - - success: bool | None = None - target_id: str | None = None - message: str = "" - - -@dataclasses.dataclass -class CheckResult: - """ - Result of checking the status of a submitted target. - - Fields: - success (bool | None): Indicates if the status check was successful. - status (WFStepStatus | None): The status of the step to move to. - native_status (str | None): The native status string from the target system. - message (str): A message providing additional information about the status check result. - """ - - success: bool | None = None - status: WFStepStatus | None = None - native_status: str | None = None - message: str = "" - - -# ================================================================= - class BaseStepHandler: """ @@ -68,19 +31,31 @@ def __init__(self, task_buffer, *args, **kwargs): """ self.tbif = task_buffer - def submit_target(self, step_spec: WFStepSpec, **kwargs) -> SubmitResult: + def submit_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetSubmitResult: """ Submit a target for processing the step. This method should be implemented by subclasses to handle the specifics of target submission. - This method should NOT modify step_spec. Any update information should be stored in the SubmitResult returned instead. + This method should NOT modify step_spec. Any update information should be stored in the WFStepTargetSubmitResult returned instead. Args: step_spec (WFStepSpec): Specifications of the workflow step to be submitted. Returns: - SubmitResult: An object containing the result of the submission, including success status, target ID, and message. + WFStepTargetSubmitResult: An object containing the result of the submission, including success status, target ID, and message. 
""" raise NotImplementedError("Subclasses must implement this method.") - # def check_status(self, target_id: str, **kwargs) -> tuple[bool | None, str | None, str]: + def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResult: + """ + Check the status of the submitted target. + This method should be implemented by subclasses to handle the specifics of target status checking. + This method should NOT modify step_spec. Any update information should be stored in the WFStepTargetCheckResult returned instead. + + Args: + step_spec (WFStepSpec): Specifications of the workflow step to be checked. + + Returns: + WFStepTargetCheckResult: An object containing the result of the check, including success status, current step status, and message. + """ + raise NotImplementedError("Subclasses must implement this method.") diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index d3a552518..bcd2e92dd 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -7,8 +7,8 @@ from pandaserver.workflow.step_handler_plugins.base_step_handler import ( BaseStepHandler, - CheckResult, - SubmitResult, + WFStepTargetCheckResult, + WFStepTargetSubmitResult, ) from pandaserver.workflow.workflow_base import ( WFDataSpec, @@ -38,7 +38,7 @@ def __init__(self, *args, **kwargs): # Initialize base class or any required modules here super().__init__(*args, **kwargs) - def submit_target(self, step_spec: WFStepSpec, **kwargs) -> SubmitResult: + def submit_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetSubmitResult: """ Submit a target for processing the PanDA task step. This method should be implemented to handle the specifics of PanDA task submission. 
@@ -48,12 +48,16 @@ def submit_target(self, step_spec: WFStepSpec, **kwargs) -> SubmitResult: **kwargs: Additional keyword arguments that may be required for submission. Returns: - SubmitResult: An object containing the result of the submission, including success status, target ID (task ID), and message. + WFStepTargetSubmitResult: An object containing the result of the submission, including success status, target ID (task ID), and message. """ tmp_log = LogWrapper(logger, f"submit_target workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") # Initialize - submit_result = SubmitResult() - + submit_result = WFStepTargetSubmitResult() + # Check step type + if step_spec.type != WFStepType.panda_task: + tmp_log.warning(f"type={step_spec.type} not panda_task; skipped") + submit_result.message = f"type not panda_task; skipped" + return submit_result ... # task_param_map = {} # task_param_map["taskName"] = step_spec.name @@ -116,12 +120,73 @@ def submit_target(self, step_spec: WFStepSpec, **kwargs) -> SubmitResult: tmp_ret_flag, temp_ret_val = self.tbif.insertTaskParamsPanda(task_param_map, user_dn, False, decode=False) if tmp_ret_flag: submit_result.success = True - submit_result.target_id = temp_ret_val - tmp_log.info(f"submitted task target_id={submit_result.target_id}") + submit_result.target_id = str(temp_ret_val) + tmp_log.info(f"Submitted task target_id={submit_result.target_id}") else: submit_result.message = temp_ret_val - tmp_log.error(f"failed to submit task: {submit_result.message}") + tmp_log.error(f"Failed to submit task: {submit_result.message}") except Exception as e: submit_result.message = f"exception {str(e)}" - tmp_log.error(f"failed to submit task: {traceback.format_exc()}") + tmp_log.error(f"Failed to submit task: {traceback.format_exc()}") return submit_result + + def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResult: + """ + Check the status of a submitted target for the given step. 
+ This method should be implemented to handle the specifics of status checking. + + Args: + step_spec (WFStepSpec): The workflow step specification containing details about the step to be processed. + **kwargs: Additional keyword arguments that may be required for status checking. + + Returns: + WFStepTargetCheckResult: An object containing the result of the status check, including success status, step status, native status, and message. + """ + tmp_log = LogWrapper(logger, f"check_target workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + allowed_step_statuses = [WFStepStatus.submitted, WFStepStatus.running] + try: + # Initialize + check_result = WFStepTargetCheckResult() + # Check preconditions + if step_spec.status not in allowed_step_statuses: + check_result.message = f"not in status to check; skipped" + tmp_log.warning(f"status={step_spec.status} not in status to check; skipped") + return check_result + if step_spec.type != WFStepType.panda_task: + check_result.message = f"type not panda_task; skipped" + tmp_log.warning(f"type={step_spec.type} not panda_task; skipped") + return check_result + if step_spec.target_id is None: + check_result.message = f"target_id is None; skipped" + tmp_log.warning(f"target_id is None; skipped") + return check_result + # Get task ID and status + task_id = int(step_spec.target_id) + res = self.tbif.getTaskStatus(task_id) + if not res: + check_result.message = f"task_id={task_id} not found" + tmp_log.error(f"{check_result.message}") + return check_result + # Interpret status + task_status = res[0] + check_result.success = True + check_result.native_status = task_status + if task_status in ["running", "transferring", "transferred", "merging"]: + check_result.status = WFStepStatus.running + elif task_status in ["defined", "assigned", "activated", "starting", "ready"]: + check_result.status = WFStepStatus.submitted + elif task_status in ["done", "finished"]: + check_result.status = WFStepStatus.finished + elif 
task_status in ["failed", "exhausted", "aborted", "toabort", "aborting", "broken", "tobroken"]: + check_result.status = WFStepStatus.failed + else: + check_result.success = False + check_result.message = f"unknown task_status {task_status}" + tmp_log.error(f"{check_result.message}") + return check_result + tmp_log.info(f"Got task_id={task_id} task_status={task_status}") + except Exception as e: + check_result.success = False + check_result.message = f"exception {str(e)}" + tmp_log.error(f"Failed to check status: {traceback.format_exc()}") + return check_result diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 582061b4c..703ece6e4 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -52,6 +52,9 @@ class WFStepStatus(object): failed = "failed" cancelled = "cancelled" + after_submitted_statuses = (running, done, failed, cancelled) + after_running_statuses = (done, failed, cancelled) + class WFDataStatus(object): """ @@ -335,3 +338,78 @@ class WFDataSpec(WorkflowBaseSpec): _forceUpdateAttrs = () # mapping between sequence and attr _seqAttrMap = {"data_id": f"{panda_config.schemaJEDI}.WORKFLOW_DATA_ID_SEQ.nextval"} + + +# === Return objects of core methods which process status ====== + + +@dataclass(slots=True) +class WFStepProcessResult: + """ + Result of processing a step. + + Fields: + success (bool | None): Indicates if the processing was successful. + new_status (WFStepStatus | None): The new status of the step after processing, None if no change. + message (str): A message providing additional information about the processing result. + """ + + success: bool | None = None + new_status: WFStepStatus | None = None + message: str = "" + + +@dataclass(slots=True) +class WorkflowProcessResult: + """ + Result of processing a workflow. + + Fields: + success (bool | None): Indicates if the processing was successful. 
+ new_status (WorkflowStatus | None): The new status of the workflow after processing, None if no change. + message (str): A message providing additional information about the processing result. + """ + + success: bool | None = None + new_status: WorkflowStatus | None = None + message: str = "" + + +# === Return objects of step handler methods =================== + + +@dataclass(slots=True) +class WFStepTargetSubmitResult: + """ + Result of submitting a target of a step. + + Fields: + success (bool | None): Indicates if the submission was successful. + target_id (str | None): The ID of the submitted target (e.g., task ID). + message (str): A message providing additional information about the submission result. + """ + + success: bool | None = None + target_id: str | None = None + message: str = "" + + +@dataclass(slots=True) +class WFStepTargetCheckResult: + """ + Result of checking the status of a submitted target. + + Fields: + success (bool | None): Indicates if the status check was successful. + status (WFStepStatus | None): The status of the step to move to. + native_status (str | None): The native status string from the target system. + message (str): A message providing additional information about the status check result. 
+ """ + + success: bool | None = None + status: WFStepStatus | None = None + native_status: str | None = None + message: str = "" + + +# ============================================================== diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 44b5f0a30..9187d547f 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -24,9 +24,13 @@ WFDataSpec, WFDataStatus, WFDataType, + WFStepProcessResult, WFStepSpec, WFStepStatus, + WFStepTargetCheckResult, + WFStepTargetSubmitResult, WFStepType, + WorkflowProcessResult, WorkflowSpec, WorkflowStatus, ) @@ -291,30 +295,37 @@ def process_data_registered(self, data_spec: WFDataSpec): # ---- Step status transitions ----------------------------- - def process_step_registered(self, step_spec: WFStepSpec): + def process_step_registered(self, step_spec: WFStepSpec) -> WFStepProcessResult: """ Process a step in registered status To prepare for checking the step Args: step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step """ tmp_log = LogWrapper(logger, f"process_step_registered step_id={step_spec.step_id}") tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() # Check status if step_spec.status != WFStepStatus.registered: - tmp_log.warning(f"Step status changed unexpectedly from {WFStepStatus.registered} to {step_spec.status}; skipped") - return + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.registered} to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result # Process try: # For now, just update status to pending step_spec.status = WFStepStatus.pending self.tbif.update_workflow_step(step_spec) tmp_log.info(f"Done, status={step_spec.status}") - except Exception: - tmp_log.error(f"Got error ; {traceback.format_exc()}") + except Exception as e: + 
process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") - def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None): + def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None) -> WFStepProcessResult: """ Process a step in pending status To check the inputs of the step @@ -322,13 +333,19 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W Args: step_spec (WFStepSpec): The workflow step specification to process data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data name to WFDataSpec for the workflow + + Returns: + WFStepProcessResult: The result of processing the step """ tmp_log = LogWrapper(logger, f"process_step_pending workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() # Check status if step_spec.status != WFStepStatus.pending: - tmp_log.warning(f"Step status changed unexpectedly from {WFStepStatus.pending} to {step_spec.status}; skipped") - return + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.pending} to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result # Process try: # Get data spec map of the workflow @@ -339,8 +356,9 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W step_spec_definition = step_spec.definition_json_map input_data_list = step_spec_definition.get("input_data_list") if input_data_list is None: - tmp_log.warning(f"Step definition does not have input_data_list; skipped") - return + process_result.message = f"Step definition does not have input_data_list; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result # Check if all input data are good, aka ready as input all_inputs_good = True for input_data_name in input_data_list: @@ -356,10 
+374,10 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W # If not all inputs are good, just return and wait for next round if not all_inputs_good: tmp_log.debug(f"Some input data are not good; skipped") - return + process_result.success = True + return process_result # All inputs are good, register outputs of the step and update step status to ready tmp_log.debug(f"All input data are good; proceeding") - output_data_type = WFDataType.mid if not step_spec_definition.get("is_tail"): # is intermediate step, register their outputs as mid type output_data_list = step_spec_definition.get("output_data_list", []) @@ -379,24 +397,34 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W data_spec_map[output_data_name] = data_spec step_spec.status = WFStepStatus.ready self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = step_spec.status tmp_log.info(f"Done, status={step_spec.status}") - except Exception: + except Exception as e: + process_result.message = f"Got error {str(e)}" tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result - def process_step_ready(self, step_spec: WFStepSpec): + def process_step_ready(self, step_spec: WFStepSpec) -> WFStepProcessResult: """ Process a step in ready status To submit the step for execution Args: step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step """ tmp_log = LogWrapper(logger, f"process_step_ready workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() # Check status if step_spec.status != WFStepStatus.ready: - tmp_log.warning(f"Step status changed unexpectedly from {WFStepStatus.ready} to {step_spec.status}; skipped") - return + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.ready} to 
{step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result # Process try: # Get the step handler plugin @@ -404,32 +432,141 @@ def process_step_ready(self, step_spec: WFStepSpec): # Submit the step submit_result = step_handler.submit_target(step_spec) if not submit_result.success or submit_result.target_id is None: - tmp_log.error(f"Failed to submit step; {submit_result.message}") - return + process_result.message = f"Failed to submit step; {submit_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result # Update step status to submitted step_spec.target_id = submit_result.target_id step_spec.status = WFStepStatus.submitted self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = step_spec.status tmp_log.info(f"Submitted step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result + + def process_step_submitted(self, step_spec: WFStepSpec) -> WFStepProcessResult: + """ + Process a step in submitted status + To check the status of the submitted step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper(logger, f"process_step_submitted workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() + # Check status + if step_spec.status != WFStepStatus.submitted: + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.submitted} to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # Get the step handler plugin + step_handler = self.get_plugin("step_handler", 
step_spec.flavor) + # Check the step status + check_result = step_handler.check_target(step_spec) + if not check_result.success or check_result.step_status is None: + process_result.message = f"Failed to check step; {check_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result + # Update step status + if check_result.step_status in WorkflowInterface.after_submitted_statuses: + # Step status advanced + step_spec.status = check_result.step_status + process_result.new_status = step_spec.status + elif check_result.step_status == WFStepStatus.submitted: + # Still in submitted, do nothing + pass + else: + tmp_log.warning(f"Invalid step_status {check_result.step_status} from target check result; skipped") + now_time = naive_utcnow() + step_spec.check_time = now_time + self.tbif.update_workflow_step(step_spec) + process_result.success = True + tmp_log.info(f"Checked step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result + + def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: + """ + Process a step in running status + To check the status of the running step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper(logger, f"process_step_running workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() + # Check status + if step_spec.status != WFStepStatus.running: + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.running} to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # Get the step handler plugin + step_handler = 
self.get_plugin("step_handler", step_spec.flavor) + # Check the step status + check_result = step_handler.check_target(step_spec) + if not check_result.success or check_result.step_status is None: + process_result.message = f"Failed to check step; {check_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result + # Update step status + if check_result.step_status in WorkflowInterface.after_running_statuses: + # Step status advanced + step_spec.status = check_result.step_status + process_result.new_status = step_spec.status + elif check_result.step_status == WFStepStatus.running: + # Still in running, do nothing + pass + else: + tmp_log.warning(f"Invalid step_status {check_result.step_status} from target check result; skipped") + now_time = naive_utcnow() + step_spec.check_time = now_time + self.tbif.update_workflow_step(step_spec) + process_result.success = True + tmp_log.info(f"Checked step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}") except Exception: tmp_log.error(f"Got error ; {traceback.format_exc()}") # ---- Workflow status transitions ------------------------- - def process_workflow_registered(self, workflow_spec: WorkflowSpec): + def process_workflow_registered(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult: """ Process a workflow in registered status To parse to get workflow definition from raw request Args: workflow_spec (WorkflowSpec): The workflow specification to process + + Returns: + WorkflowProcessResult: The result of processing the workflow """ tmp_log = LogWrapper(logger, f"process_workflow_registered workflow_id={workflow_spec.workflow_id}") tmp_log.debug("Start") + # Initialize + process_result = WorkflowProcessResult() # Check status if workflow_spec.status != WorkflowStatus.registered: - tmp_log.warning(f"Workflow status changed unexpectedly from {WorkflowStatus.registered} to {workflow_spec.status}; skipped") - return + process_result.message = f"Workflow 
status changed unexpectedly from {WorkflowStatus.registered} to {workflow_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result # Process try: if workflow_spec.definition_json is not None: @@ -449,14 +586,16 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): # Failure handling # if is_fatal: if False: # disable fatal for now - tmp_log.error(f"Fatal error in parsing raw request; cancelled the workflow") + process_result.message = f"Fatal error in parsing raw request; cancelled the workflow" + tmp_log.error(f"{process_result.message}") workflow_spec.status = WorkflowStatus.cancelled workflow_spec.set_parameter("cancel_reason", "Fatal error in parsing raw request") self.tbif.update_workflow(workflow_spec) - return + return process_result if not is_ok: - tmp_log.warning(f"Failed to parse raw request; skipped") - return + process_result.message = f"Failed to parse raw request; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result # extra info from raw request workflow_definition["user_dn"] = raw_request_dict.get("user_dn") # Parsed successfully, update definition @@ -467,11 +606,15 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec): workflow_spec.status = WorkflowStatus.checked # skip parsed for now # Update DB self.tbif.update_workflow(workflow_spec) + process_result.success = True + process_result.new_status = workflow_spec.status tmp_log.info(f"Done, status={workflow_spec.status}") - except Exception: + except Exception as e: + process_result.message = f"Got error {str(e)}" tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result - def process_workflow_checked(self, workflow_spec: WorkflowSpec): + def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult: """ Process a workflow in checked status Register steps, and update its status @@ -479,23 +622,30 @@ def process_workflow_checked(self, workflow_spec: 
WorkflowSpec): Args: workflow_spec (WorkflowSpec): The workflow specification to process + + Returns: + WorkflowProcessResult: The result of processing the workflow """ tmp_log = LogWrapper(logger, f"process_workflow_checked workflow_id={workflow_spec.workflow_id}") tmp_log.debug("Start") + # Initialize + process_result = WorkflowProcessResult() # Check status if workflow_spec.status != WorkflowStatus.checked: - tmp_log.warning(f"Workflow status changed unexpectedly from {WorkflowStatus.checked} to {workflow_spec.status}; skipped") - return + process_result.message = f"Workflow status changed unexpectedly from {WorkflowStatus.checked} to {workflow_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result # Process try: # Parse the workflow definition workflow_definition = workflow_spec.definition_json_map if workflow_definition is None: - tmp_log.error(f"Workflow definition is None; cancelled the workflow") + process_result.message = f"Workflow definition is None; cancelled the workflow" + tmp_log.error(f"{process_result.message}") workflow_spec.status = WorkflowStatus.cancelled workflow_spec.set_parameter("cancel_reason", "Workflow definition is None") self.tbif.update_workflow(workflow_spec) - return + return process_result # initialize data_specs = [] step_specs = [] @@ -566,32 +716,57 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec): step_specs=step_specs, data_specs=data_specs, ) + process_result.success = True + process_result.new_status = workflow_spec.status tmp_log.info(f"Done, inserted {len(step_specs)} steps and {len(data_specs)} data, status={workflow_spec.status}") - except Exception: + except Exception as e: + process_result.message = f"Got error {str(e)}" tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result - def process_workflow_starting(self, workflow_spec: WorkflowSpec): + def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult: """ 
Process a workflow in starting status To start the steps in the workflow Args: workflow_spec (WorkflowSpec): The workflow specification to process + + Returns: + WorkflowProcessResult: The result of processing the workflow """ tmp_log = LogWrapper(logger, f"process_workflow_starting workflow_id={workflow_spec.workflow_id}") tmp_log.debug("Start") + # Initialize + process_result = WorkflowProcessResult() + steps_status_stats = {"changed": {}, "unchanged": {}, "total": 0} # Check status if workflow_spec.status != WorkflowStatus.starting: - tmp_log.warning(f"Workflow status changed unexpectedly from {WorkflowStatus.starting} to {workflow_spec.status}; skipped") - return + process_result.message = f"Workflow status changed unexpectedly from {WorkflowStatus.starting} to {workflow_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result # Process try: # Get steps in registered status - required_step_statuses = [WFStepStatus.registered, WFStepStatus.pending, WFStepStatus.ready] + required_step_statuses = [WFStepStatus.registered, WFStepStatus.pending, WFStepStatus.ready, WFStepStatus.submitted] + over_advanced_step_statuses = [WFStepStatus.running, WFStepStatus.done, WFStepStatus.failed] step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=required_step_statuses) + over_advanced_step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=over_advanced_step_statuses) if not step_specs: - tmp_log.warning(f"No steps in {WFStepStatus.registered} status; skipped") - return + process_result.message = f"No step in required status; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + if over_advanced_step_specs: + process_result.message = f"Some steps are not in required status; force to advance the workflow" + tmp_log.warning(f"{process_result.message}") + # Advance the workflow to running directly + workflow_spec.status = 
WorkflowStatus.running + workflow_spec.start_time = naive_utcnow() + self.tbif.update_workflow(workflow_spec) + process_result.success = True + process_result.new_status = workflow_spec.status + tmp_log.info(f"Done, forced advanced to status={workflow_spec.status}") + return process_result # Get data spec map of the workflow data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} @@ -606,16 +781,41 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec): continue step_spec = locked_step_spec # Process the step + tmp_res = None match step_spec.status: case WFStepStatus.registered: - self.process_step_registered(step_spec) + tmp_res = self.process_step_registered(step_spec) case WFStepStatus.pending: - self.process_step_pending(step_spec, data_spec_map=data_spec_map) + tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map) case WFStepStatus.ready: - self.process_step_ready(step_spec) + tmp_res = self.process_step_ready(step_spec) + case WFStepStatus.submitted: + tmp_res = self.process_step_submitted(step_spec) case _: # tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") continue - tmp_log.info(f"Done processing steps in {WFStepStatus.registered} status") - except Exception: + if tmp_res and tmp_res.success: + # update stats + if tmp_res.new_status and tmp_res.new_status != step_spec.status: + steps_status_stats["changed"].setdefault(step_spec.status, 0) + steps_status_stats["changed"][step_spec.status] += 1 + else: + steps_status_stats["unchanged"].setdefault(step_spec.status, 0) + steps_status_stats["unchanged"][step_spec.status] += 1 + steps_status_stats["total"] += 1 + tmp_log.info(f"Processed steps: {steps_status_stats}") + # Update workflow status to running if any of step is submitted + if steps_status_stats["changed"].get(WFStepStatus.submitted) or 
steps_status_stats["unchanged"].get(WFStepStatus.submitted): + workflow_spec.status = WorkflowStatus.running + workflow_spec.start_time = naive_utcnow() + self.tbif.update_workflow(workflow_spec) + process_result.success = True + process_result.new_status = workflow_spec.status + tmp_log.info(f"Done, advanced to status={workflow_spec.status}") + else: + process_result.success = True + tmp_log.info(f"Done, status remains {workflow_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result From bf695654fde511d5b6c359c361bb46fe37b7bde6 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 16 Oct 2025 16:49:31 +0200 Subject: [PATCH 025/101] workflows4: process running workflow --- pandaserver/taskbuffer/TaskBuffer.py | 4 +- .../db_proxy_mods/workflow_module.py | 9 +- pandaserver/workflow/workflow_core.py | 169 ++++++++++++++---- 3 files changed, 143 insertions(+), 39 deletions(-) diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 694bd5a6e..94cb48a6e 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2752,9 +2752,9 @@ def get_steps_of_workflow(self, workflow_id, status_filter_list=None, status_exc with self.proxyPool.get() as proxy: return proxy.get_steps_of_workflow(workflow_id, status_filter_list, status_exclusion_list) - def get_data_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None): + def get_data_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None, type_filter_list=None): with self.proxyPool.get() as proxy: - return proxy.get_data_of_workflow(workflow_id, status_filter_list, status_exclusion_list) + return proxy.get_data_of_workflow(workflow_id, status_filter_list, status_exclusion_list, type_filter_list) def lock_workflow(self, workflow_id, locked_by, lock_expiration_sec=120): with self.proxyPool.get() as 
proxy: diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index e23e8bbba..00e69865d 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -150,7 +150,9 @@ def get_steps_of_workflow(self, workflow_id: int, status_filter_list: list | Non tmp_log.warning("no steps found; skipped") return [] - def get_data_of_workflow(self, workflow_id: int, status_filter_list: list | None = None, status_exclusion_list: list | None = None) -> list[WFDataSpec]: + def get_data_of_workflow( + self, workflow_id: int, status_filter_list: list | None = None, status_exclusion_list: list | None = None, type_filter_list: list | None = None + ) -> list[WFDataSpec]: """ Retrieve all workflow data for a given workflow ID @@ -158,6 +160,7 @@ def get_data_of_workflow(self, workflow_id: int, status_filter_list: list | None workflow_id (int): ID of the workflow to retrieve data for status_filter_list (list | None): List of statuses to filter the data by (optional) status_exclusion_list (list | None): List of statuses to exclude the data by (optional) + type_filter_list (list | None): List of types to filter the data by (optional) Returns: list[WFDataSpec]: List of workflow data specifications @@ -174,6 +177,10 @@ def get_data_of_workflow(self, workflow_id: int, status_filter_list: list | None antistatus_var_names_str, antistatus_var_map = get_sql_IN_bind_variables(status_exclusion_list, prefix=":antistatus") sql += f"AND status NOT IN ({antistatus_var_names_str}) " var_map.update(antistatus_var_map) + if type_filter_list: + type_var_names_str, type_var_map = get_sql_IN_bind_variables(type_filter_list, prefix=":type") + sql += f"AND type IN ({type_var_names_str}) " + var_map.update(type_var_map) self.cur.execute(sql + comment, var_map) res_list = self.cur.fetchall() if res_list is not None: diff --git a/pandaserver/workflow/workflow_core.py 
b/pandaserver/workflow/workflow_core.py index 9187d547f..61c431783 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -212,7 +212,7 @@ def workflow_data_lock(self, data_id: int, lock_expiration_sec: int = 120): # lock not acquired yield None - # --- Registration ----------------------------------------- + # --- Workflow operation ----------------------------------- def register_workflow( self, @@ -268,6 +268,12 @@ def register_workflow( tmp_log.info(f"Registered workflow workflow_id={ret_workflow_id}") return ret_workflow_id + def cancel_workflow(self, workflow_id: int) -> bool: ... + + # --- Step operation --------------------------------------- + + def cancel_step(self, step_id: int) -> bool: ... + # ---- Data status transitions ----------------------------- def process_data_registered(self, data_spec: WFDataSpec): @@ -545,6 +551,56 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: except Exception: tmp_log.error(f"Got error ; {traceback.format_exc()}") + def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None) -> Dict: + """ + Process a list of workflow steps + + Args: + step_specs (List[WFStepSpec]): List of workflow step specifications to process + data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data name to WFDataSpec for the workflow + + Returns: + Dict: Statistics of the processing results + """ + tmp_log = LogWrapper(logger, f"process_steps workflow_id={step_spec.workflow_id}") + n_steps = len(step_specs) + tmp_log.debug(f"Start, processing {n_steps} steps") + steps_status_stats = {"n_steps": n_steps, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} + for step_spec in step_specs: + with self.workflow_step_lock(step_spec.step_id) as locked_step_spec: + if locked_step_spec is None: + tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped") + continue + step_spec = 
locked_step_spec + # Process the step + tmp_res = None + match step_spec.status: + case WFStepStatus.registered: + tmp_res = self.process_step_registered(step_spec) + case WFStepStatus.pending: + tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map) + case WFStepStatus.ready: + tmp_res = self.process_step_ready(step_spec) + case WFStepStatus.submitted: + tmp_res = self.process_step_submitted(step_spec) + case WFStepStatus.running: + tmp_res = self.process_step_running(step_spec) + case _: + tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") + continue + if tmp_res and tmp_res.success: + # update stats + if tmp_res.new_status and tmp_res.new_status != step_spec.status: + steps_status_stats["changed"].setdefault(step_spec.status, 0) + steps_status_stats["changed"][step_spec.status] += 1 + else: + steps_status_stats["unchanged"].setdefault(step_spec.status, 0) + steps_status_stats["unchanged"][step_spec.status] += 1 + steps_status_stats["processed"].setdefault(step_spec.status, 0) + steps_status_stats["processed"][step_spec.status] += 1 + steps_status_stats["n_processed"] += 1 + tmp_log.info(f"Done, processed steps: {steps_status_stats}") + # ---- Workflow status transitions ------------------------- def process_workflow_registered(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult: @@ -739,7 +795,6 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc tmp_log.debug("Start") # Initialize process_result = WorkflowProcessResult() - steps_status_stats = {"changed": {}, "unchanged": {}, "total": 0} # Check status if workflow_spec.status != WorkflowStatus.starting: process_result.message = f"Workflow status changed unexpectedly from {WorkflowStatus.starting} to {workflow_spec.status}; skipped" @@ -771,41 +826,9 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) 
data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} # Process each step - for step_spec in step_specs: - with self.workflow_step_lock(step_spec.step_id) as locked_step_spec: - if locked_step_spec is None: - tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped") - continue - if locked_step_spec.status not in required_step_statuses: - tmp_log.warning(f"Step status changed unexpectely to {locked_step_spec.status}; skipped") - continue - step_spec = locked_step_spec - # Process the step - tmp_res = None - match step_spec.status: - case WFStepStatus.registered: - tmp_res = self.process_step_registered(step_spec) - case WFStepStatus.pending: - tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map) - case WFStepStatus.ready: - tmp_res = self.process_step_ready(step_spec) - case WFStepStatus.submitted: - tmp_res = self.process_step_submitted(step_spec) - case _: - # tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") - continue - if tmp_res and tmp_res.success: - # update stats - if tmp_res.new_status and tmp_res.new_status != step_spec.status: - steps_status_stats["changed"].setdefault(step_spec.status, 0) - steps_status_stats["changed"][step_spec.status] += 1 - else: - steps_status_stats["unchanged"].setdefault(step_spec.status, 0) - steps_status_stats["unchanged"][step_spec.status] += 1 - steps_status_stats["total"] += 1 - tmp_log.info(f"Processed steps: {steps_status_stats}") + steps_status_stats = self.process_steps(step_specs, data_spec_map=data_spec_map) # Update workflow status to running if any of step is submitted - if steps_status_stats["changed"].get(WFStepStatus.submitted) or steps_status_stats["unchanged"].get(WFStepStatus.submitted): + if steps_status_stats["processed"].get(WFStepStatus.submitted): workflow_spec.status = WorkflowStatus.running workflow_spec.start_time = naive_utcnow() self.tbif.update_workflow(workflow_spec) @@ -819,3 +842,77 @@ def 
process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc process_result.message = f"Got error {str(e)}" tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result + + def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult: + """ + Process a workflow in running status + To monitor the steps in the workflow + + Args: + workflow_spec (WorkflowSpec): The workflow specification to process + + Returns: + WorkflowProcessResult: The result of processing the workflow + """ + tmp_log = LogWrapper(logger, f"process_workflow_running workflow_id={workflow_spec.workflow_id}") + tmp_log.debug("Start") + # Initialize + process_result = WorkflowProcessResult() + # Check status + if workflow_spec.status != WorkflowStatus.running: + process_result.message = f"Workflow status changed unexpectedly from {WorkflowStatus.running} to {workflow_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # Get steps + step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id) + if not step_specs: + process_result.message = f"No step in required status; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Get data spec map of the workflow + data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) + data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} + output_data_spec_map = {data_spec.name: data_spec for data_spec in data_specs if data_spec.type == WFDataType.output} + # Check if all output data are good + all_outputs_good = None + for output_data_name, output_data_spec in output_data_spec_map.items(): + if output_data_spec.status in WFDataStatus.good_output_statuses: + if all_outputs_good is None: + all_outputs_good = True + else: + all_outputs_good = False + break + if all_outputs_good is True: + # All outputs are good, mark the workflow as done + 
workflow_spec.status = WorkflowStatus.done + workflow_spec.end_time = naive_utcnow() + self.tbif.update_workflow(workflow_spec) + process_result.success = True + process_result.new_status = workflow_spec.status + tmp_log.info(f"Done, all output data are good; advanced to status={workflow_spec.status}") + return process_result + # Process each step + steps_status_stats = self.process_steps(step_specs, data_spec_map=data_spec_map) + # Update workflow status by steps + if (processed_steps_stats := steps_status_stats["processed"]) and ( + processed_steps_stats.get(WFStepStatus.failed) or processed_steps_stats.get(WFStepStatus.cancelled) + ): + # TODO: cancel all unfinished steps + # self.cancel_step(...) + # mark workflow as failed + tmp_log.warning(f"workflow failed due to some steps failed or cancelled") + workflow_spec.status = WorkflowStatus.failed + workflow_spec.start_time = naive_utcnow() + self.tbif.update_workflow(workflow_spec) + process_result.success = True + process_result.new_status = workflow_spec.status + tmp_log.info(f"Done, advanced to status={workflow_spec.status}") + else: + process_result.success = True + tmp_log.info(f"Done, status remains {workflow_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result From 858fa64132b7481c26d78f47bfa8fa70b17aca9c Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 24 Oct 2025 00:49:01 +0200 Subject: [PATCH 026/101] workflows4: more status transition, add ddm data handler --- .../data_handler_plugins/base_data_handler.py | 41 ++++- .../ddm_collection_data_handler.py | 59 ++++++ .../panda_task_step_handler.py | 8 +- pandaserver/workflow/workflow_base.py | 47 ++++- pandaserver/workflow/workflow_core.py | 169 ++++++++++++++++-- 5 files changed, 306 insertions(+), 18 deletions(-) create mode 100644 pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py diff --git 
class BaseDataHandler:
    """
    Base class for data handlers in the workflow system.
    This class provides a common interface and some utility methods for data handlers.
    """

    def __init__(self, task_buffer, *args, **kwargs):
        """
        Initialize the data handler with necessary parameters.

        Args:
            task_buffer: The task buffer interface to interact with the task database.
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.
        """
        self.tbif = task_buffer

    def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult:
        """
        Check the status of the data target.
        This method should be implemented by subclasses to handle the specifics of data target status checking.
        This method should NOT modify data_spec. Any update information should be stored in the WFDataTargetCheckResult returned instead.

        Args:
            data_spec (WFDataSpec): The data specification to check.
            **kwargs: Additional keyword arguments.

        Returns:
            WFDataTargetCheckResult: The result of the target check.
        """
        raise NotImplementedError("Subclasses must implement this method.")
+ """ + raise NotImplementedError("Subclasses must implement this method.") diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py new file mode 100644 index 000000000..651fef425 --- /dev/null +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -0,0 +1,59 @@ +import json +import traceback +import uuid + +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger + +from pandaserver.workflow.data_handler_plugins.base_data_handler import BaseStepHandler +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFDataTargetCheckResult, + WFDataType, + WFStepSpec, + WFStepStatus, + WFStepType, + WorkflowSpec, + WorkflowStatus, +) + +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + + +class DDMCollectionDataHandler(BaseStepHandler): + """ + Handler for DDM collection data in the workflow. + This class is responsible for managing the DDM collection data within a workflow. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the data handler with necessary parameters. + """ + # Initialize base class or any required modules here + super().__init__(*args, **kwargs) + + def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: + """ + Check the status of the DDM collection data target. + This method should be implemented to handle the specifics of DDM collection data status checking. + + Args: + data_spec (WFDataSpec): The data specification containing details about the data to be checked. + **kwargs: Additional keyword arguments that may be required for checking. + + Returns: + WFDataTargetCheckResult: An object containing the result of the check, including success status, current data status, and message. 
+ """ + tmp_log = LogWrapper(logger, f"check_target workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + # Initialize + check_result = WFDataTargetCheckResult() + # Check data type + if data_spec.type != WFDataType.ddm_collection: + tmp_log.warning(f"type={data_spec.type} not ddm_collection; skipped") + check_result.message = f"type not ddm_collection; skipped" + return check_result + # TODO: Implement the actual checking logic here + ... diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index bcd2e92dd..23b89d454 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -5,17 +5,15 @@ from pandacommon.pandalogger.LogWrapper import LogWrapper from pandacommon.pandalogger.PandaLogger import PandaLogger -from pandaserver.workflow.step_handler_plugins.base_step_handler import ( - BaseStepHandler, - WFStepTargetCheckResult, - WFStepTargetSubmitResult, -) +from pandaserver.workflow.step_handler_plugins.base_step_handler import BaseStepHandler from pandaserver.workflow.workflow_base import ( WFDataSpec, WFDataStatus, WFDataType, WFStepSpec, WFStepStatus, + WFStepTargetCheckResult, + WFStepTargetSubmitResult, WFStepType, WorkflowSpec, WorkflowStatus, diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 703ece6e4..c6741fa97 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -50,10 +50,13 @@ class WFStepStatus(object): running = "running" done = "done" failed = "failed" + closed = "closed" cancelled = "cancelled" + checked_statuses = (checked_true, checked_false) after_submitted_statuses = (running, done, failed, cancelled) after_running_statuses = (done, failed, cancelled) + final_statuses = (done, failed, closed, cancelled) class WFDataStatus(object): @@ -64,16 
@dataclass(slots=True)
class WFDataProcessResult:
    """
    Result of processing data.

    Fields:
        success (bool | None): Indicates if the processing was successful.
        new_status (WFDataStatus | None): The new status of the data after processing, None if no change.
        message (str): A message providing additional information about the processing result.
    """

    # None until processing reaches a conclusion; True only on a successful pass
    success: bool | None = None
    # populated only when the data actually advanced to a different status
    new_status: WFDataStatus | None = None
    # human-readable detail, mainly filled for warnings and errors
    message: str = ""
+ message (str): A message providing additional information about the status check result. + """ + + success: bool | None = None + status: WFDataStatus | None = None + native_metadata: dict | None = None + message: str = "" + + # ============================================================== diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 61c431783..c6cbb3404 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -21,6 +21,7 @@ from pandaserver.dataservice.ddm import rucioAPI from pandaserver.srvcore.CoreUtils import clean_user_id from pandaserver.workflow.workflow_base import ( + WFDataProcessResult, WFDataSpec, WFDataStatus, WFDataType, @@ -56,9 +57,9 @@ "panda_task": ("panda_task_step_handler", "PandaTaskStepHandler"), # Add more step handler plugins here }, - # "data_handler": { - # "example_data": ("example_data_handler", "ExampleDataHandler"), - # }, + "data_handler": { + "ddm_collection": ("ddm_collection_data_handler", "DDMCollectionDataHandler"), + }, # Add more plugin types here } @@ -286,18 +287,91 @@ def process_data_registered(self, data_spec: WFDataSpec): """ tmp_log = LogWrapper(logger, f"process_data_registered data_id={data_spec.data_id}") tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() # Check status if data_spec.status != WFDataStatus.registered: - tmp_log.warning(f"Data status changed unexpectedly from {WFDataStatus.registered} to {data_spec.status}; skipped") - return + process_result.message = f"Data status changed unexpectedly from {WFDataStatus.registered} to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result # Process try: # For now, just update status to checking data_spec.status = WFDataStatus.checking self.tbif.update_workflow_data(data_spec) tmp_log.info(f"Done, status={data_spec.status}") - except Exception: - tmp_log.error(f"Got error ; {traceback.format_exc()}") + 
except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return process_result + + def process_data_checking(self, data_spec: WFDataSpec): + """ + Process data in checking status + """ + tmp_log = LogWrapper(logger, f"process_data_checking data_id={data_spec.data_id}") + tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status != WFDataStatus.checking: + process_result.message = f"Data status changed unexpectedly from {WFDataStatus.checking} to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # Check data availability + # FIXME: For now, always advance to checked_nonex + data_spec.status = WFDataStatus.checked_nonex + data_spec.check_time = naive_utcnow() + self.tbif.update_workflow_data(data_spec) + tmp_log.info(f"Done, status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return process_result + + def process_data_checked(self, data_spec: WFDataSpec): + """ + Process data in checked status + """ + tmp_log = LogWrapper(logger, f"process_data_checked data_id={data_spec.data_id}") + tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status not in (WFDataStatus.checked_nonex, WFDataStatus.checked_partex, WFDataStatus.checked_exist): + process_result.message = f"Data status changed unexpectedly from checked_* to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + original_status = data_spec.status + # Update data status based on check result + now_time = naive_utcnow() + match data_spec.status: + case WFDataStatus.checked_nonex: + # Data does not exist, advance to generating_start + data_spec.status = WFDataStatus.generating_start + data_spec.check_time = 
now_time + data_spec.start_time = now_time + self.tbif.update_workflow_data(data_spec) + case WFDataStatus.checked_partex: + # Data partially exist, advance to waiting_ready + data_spec.status = WFDataStatus.waiting_ready + data_spec.check_time = now_time + self.tbif.update_workflow_data(data_spec) + case WFDataStatus.checked_exist: + # Data already fully exist, advance to done_exist + data_spec.status = WFDataStatus.done_exist + data_spec.check_time = now_time + data_spec.end_time = now_time + self.tbif.update_workflow_data(data_spec) + tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return process_result # ---- Step status transitions ----------------------------- @@ -323,13 +397,86 @@ def process_step_registered(self, step_spec: WFStepSpec) -> WFStepProcessResult: return process_result # Process try: - # For now, just update status to pending - step_spec.status = WFStepStatus.pending + step_spec.status = WFStepStatus.checking self.tbif.update_workflow_step(step_spec) tmp_log.info(f"Done, status={step_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" tmp_log.error(f"{process_result.message}") + return process_result + + def process_step_checking(self, step_spec: WFStepSpec) -> WFStepProcessResult: + """ + Process a step in checking status + To check the conditions about whether to process the step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper(logger, f"process_step_checking workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() + # Check status + if step_spec.status != WFStepStatus.checking: + process_result.message = f"Step status changed unexpectedly from 
{WFStepStatus.checking} to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # FIXME: For now, always advance to checked_true + if True: + step_spec.status = WFStepStatus.checked_true + self.tbif.update_workflow_step(step_spec) + tmp_log.info(f"Done, status={step_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return process_result + + def process_step_checked(self, step_spec: WFStepSpec) -> WFStepProcessResult: + """ + Process a step in checked status + To advance to pending or closed based on check result + + Args: + step_spec (WFStepSpec): The workflow step specification to process + + Returns: + WFStepProcessResult: The result of processing the step + """ + tmp_log = LogWrapper(logger, f"process_step_checked workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log.debug("Start") + # Initialize + process_result = WFStepProcessResult() + # Check status + if step_spec.status not in WFStepStatus.checked_statuses: + process_result.message = f"Step status changed unexpectedly from checked_* to {step_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + original_status = step_spec.status + try: + now_time = naive_utcnow() + match step_spec.status: + case WFStepStatus.checked_true: + # Conditions met, advance to pending + step_spec.status = WFStepStatus.pending + step_spec.check_time = now_time + self.tbif.update_workflow_step(step_spec) + case WFStepStatus.checked_false: + # Conditions not met, advanced to closed + step_spec.status = WFStepStatus.closed + step_spec.check_time = now_time + self.tbif.update_workflow_step(step_spec) + tmp_log.info(f"Done, from {original_status} to status={step_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return 
process_result def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None) -> WFStepProcessResult: """ @@ -548,8 +695,10 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: self.tbif.update_workflow_step(step_spec) process_result.success = True tmp_log.info(f"Checked step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}") - except Exception: + except Exception as e: + process_result.message = f"Got error {str(e)}" tmp_log.error(f"Got error ; {traceback.format_exc()}") + return process_result def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None) -> Dict: """ From 09ff0fbd99c0bd7ed260e7ad4275cecb7be75205 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 27 Oct 2025 11:34:17 +0100 Subject: [PATCH 027/101] workflows4: add data process methods and ddm_collection data handler --- .../ddm_collection_data_handler.py | 37 +++- pandaserver/workflow/workflow_base.py | 34 ++- pandaserver/workflow/workflow_core.py | 199 +++++++++++++++++- 3 files changed, 260 insertions(+), 10 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py index 651fef425..c1d5b8cdd 100644 --- a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -5,6 +5,7 @@ from pandacommon.pandalogger.LogWrapper import LogWrapper from pandacommon.pandalogger.PandaLogger import PandaLogger +from pandaserver.dataservice.ddm import rucioAPI from pandaserver.workflow.data_handler_plugins.base_data_handler import BaseStepHandler from pandaserver.workflow.workflow_base import ( WFDataSpec, @@ -22,6 +23,25 @@ logger = PandaLogger().getLogger(__name__.split(".")[-1]) +class DDMCollectionDIDType: + """ + Data Identifier Types for DDM Collections 
+ """ + + DATASET = "DATASET" + CONTAINER = "CONTAINER" + + +class DDMCollectionState: + """ + States for DDM Collections + """ + + open = "open" + closed = "closed" + missing = "missing" + + class DDMCollectionDataHandler(BaseStepHandler): """ Handler for DDM collection data in the workflow. @@ -34,6 +54,7 @@ def __init__(self, *args, **kwargs): """ # Initialize base class or any required modules here super().__init__(*args, **kwargs) + self.ddmIF = rucioAPI def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: """ @@ -56,4 +77,18 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu check_result.message = f"type not ddm_collection; skipped" return check_result # TODO: Implement the actual checking logic here - ... + collection = data_spec.target_id + collection_meta = self.ddmIF.get_dataset_metadata(collection, ignore_missing=True) + if collection_meta is None: + check_result.success = False + check_result.message = f"Failed to get metadata for collection {collection}" + tmp_log.error(f"{check_result.message}") + return check_result + match collection_meta.get("state"): + case DDMCollectionState.missing: + check_result.status = WFDataStatus.generating_start + case DDMCollectionState.open: + check_result.status = WFDataStatus.generating_ready + case DDMCollectionState.closed: + check_result.status = WFDataStatus.done_generated + check_result.metadata = collection_meta diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index c6741fa97..0ed4528eb 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -78,8 +78,14 @@ class WFDataStatus(object): cancelled = "cancelled" retired = "retired" + checked_statuses = (checked_nonex, checked_partex, checked_exist) + generating_statues = (generating_start, generating_ready) + done_statuses = (done_generated, done_waited, done_skipped) good_input_statuses = (generating_ready, 
waiting_ready, done_generated, done_waited, done_skipped) good_output_statuses = (done_generated, done_waited, done_skipped) + after_generating_start_statuses = (generating_ready, done_generated, cancelled) + after_generating_ready_statuses = (done_generated, cancelled) + after_waiting_ready_statuses = (done_waited, cancelled) # ==== Types =================================================== @@ -334,6 +340,7 @@ class WFDataSpec(WorkflowBaseSpec): AttributeWithType("check_time", datetime), AttributeWithType("locked_by", str), AttributeWithType("lock_time", datetime), + AttributeWithType("metadata", str), AttributeWithType("parameters", str), ) # attributes @@ -345,6 +352,29 @@ class WFDataSpec(WorkflowBaseSpec): # mapping between sequence and attr _seqAttrMap = {"data_id": f"{panda_config.schemaJEDI}.WORKFLOW_DATA_ID_SEQ.nextval"} + @property + def metadata_map(self) -> dict: + """ + Get the dictionary parsed by metadata attribute in JSON + + Returns: + dict : dict of metadata if it is JSON or empty dict if null + """ + if self.metadata is None: + return {} + else: + return json.loads(self.metadata) + + @metadata_map.setter + def metadata_map(self, value_map: dict): + """ + Set the dictionary and store in metadata attribute in JSON + + Args: + value_map (dict): dict to set the metadata map + """ + self.metadata = json.dumps(value_map) + # === Return objects of core methods which process status ====== @@ -445,13 +475,13 @@ class WFDataTargetCheckResult: Fields: success (bool | None): Indicates if the status check was successful. status (WFDataStatus | None): The status of the data to move to. - native_metadata (dict | None): The native metadata from the target system. + metadata (dict | None): The native metadata from the target system. message (str): A message providing additional information about the status check result. 
""" success: bool | None = None status: WFDataStatus | None = None - native_metadata: dict | None = None + metadata: dict | None = None message: str = "" diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index c6cbb3404..dc4f647d8 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -277,13 +277,16 @@ def cancel_step(self, step_id: int) -> bool: ... # ---- Data status transitions ----------------------------- - def process_data_registered(self, data_spec: WFDataSpec): + def process_data_registered(self, data_spec: WFDataSpec) -> WFDataProcessResult: """ Process data in registered status To prepare for checking the data Args: data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data """ tmp_log = LogWrapper(logger, f"process_data_registered data_id={data_spec.data_id}") tmp_log.debug("Start") @@ -305,9 +308,16 @@ def process_data_registered(self, data_spec: WFDataSpec): tmp_log.error(f"{process_result.message}") return process_result - def process_data_checking(self, data_spec: WFDataSpec): + def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: """ Process data in checking status + To check the conditions about whether the data is available + + Args: + data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data """ tmp_log = LogWrapper(logger, f"process_data_checking data_id={data_spec.data_id}") tmp_log.debug("Start") @@ -331,9 +341,16 @@ def process_data_checking(self, data_spec: WFDataSpec): tmp_log.error(f"{process_result.message}") return process_result - def process_data_checked(self, data_spec: WFDataSpec): + def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: """ Process data in checked status + To advance to next status based on check result + + Args: + data_spec 
(WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data """ tmp_log = LogWrapper(logger, f"process_data_checked data_id={data_spec.data_id}") tmp_log.debug("Start") @@ -373,6 +390,172 @@ def process_data_checked(self, data_spec: WFDataSpec): tmp_log.error(f"{process_result.message}") return process_result + def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: + """ + Process data in generating status + To check the status of the data being generated + + Args: + data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data + """ + tmp_log = LogWrapper(logger, f"process_data_generating data_id={data_spec.data_id}") + tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status not in (WFDataStatus.generating_start, WFDataStatus.generating_ready): + process_result.message = f"Data status changed unexpectedly from generating_* to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + original_status = data_spec.status + # Get the data handler plugin + data_handler = self.get_plugin("data_handler", data_spec.flavor) + # Check the data status + check_result = data_handler.check_target(data_spec) + if not check_result.success or check_result.data_status is None: + process_result.message = f"Failed to check data; {check_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result + # Update data status + now_time = naive_utcnow() + match original_status: + case WFDataStatus.generating_start: + if check_result.data_status in WFDataStatus.after_generating_start_statuses: + # Data status advanced + data_spec.status = check_result.data_status + process_result.new_status = data_spec.status + elif check_result.data_status == WFDataStatus.generating_start: + # 
Still in generating_start, do nothing + pass + else: + tmp_log.warning(f"Invalid data_status {check_result.data_status} from target check result; skipped") + self.tbif.update_workflow_data(data_spec) + case WFDataStatus.generating_ready: + if check_result.data_status in WFDataStatus.after_generating_ready_statuses: + # Data status advanced to terminal + data_spec.status = check_result.data_status + process_result.new_status = data_spec.status + data_spec.end_time = now_time + elif check_result.data_status == WFDataStatus.generating_ready: + # Still in generating_ready, do nothing + pass + else: + tmp_log.warning(f"Invalid data_status {check_result.data_status} from target check result; skipped") + data_spec.check_time = now_time + self.tbif.update_workflow_data(data_spec) + tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return process_result + + def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: + """ + Process data in waiting status + To check the status of the data being waited for, probably generating by other workflow steps or external sources + + Args: + data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data + """ + tmp_log = LogWrapper(logger, f"process_data_waiting data_id={data_spec.data_id}") + tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status + if data_spec.status not in (WFDataStatus.waiting_ready,): + process_result.message = f"Data status changed unexpectedly from waiting_* to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + original_status = data_spec.status + # Get the data handler plugin + data_handler = self.get_plugin("data_handler", data_spec.flavor) + # Check the data status + 
check_result = data_handler.check_target(data_spec) + if not check_result.success or check_result.data_status is None: + process_result.message = f"Failed to check data; {check_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result + # Update data status + now_time = naive_utcnow() + match original_status: + case WFDataStatus.waiting_ready: + if check_result.data_status in WFDataStatus.after_waiting_ready_statuses: + # Data status advanced to terminal + data_spec.status = check_result.data_status + process_result.new_status = data_spec.status + data_spec.end_time = now_time + elif check_result.data_status == WFDataStatus.waiting_ready: + # Still in waiting_ready, do nothing + pass + else: + tmp_log.warning(f"Invalid data_status {check_result.data_status} from target check result; skipped") + data_spec.check_time = now_time + self.tbif.update_workflow_data(data_spec) + tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{process_result.message}") + return process_result + + def process_data(self, data_specs: List[WFDataSpec]) -> Dict: + """ + Process a list of workflow data specifications + + Args: + data_specs (List[WFDataSpec]): List of workflow data specifications to process + + Returns: + Dict: Statistics of the processing results + """ + tmp_log = LogWrapper(logger, f"process_data workflow_id={data_specs[0].workflow_id}") + n_data = len(data_specs) + tmp_log.debug(f"Start, processing {n_data} data specs") + data_status_stats = {"n_data": n_data, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} + for data_spec in data_specs: + with self.workflow_data_lock(data_spec.data_id) as locked_data_spec: + if locked_data_spec is None: + tmp_log.warning(f"Failed to acquire lock for data_id={data_spec.data_id}; skipped") + continue + data_spec = locked_data_spec + # Process the data + tmp_res = None + match 
data_spec.status: + case WFDataStatus.registered: + tmp_res = self.process_data_registered(data_spec) + case WFDataStatus.checking: + tmp_res = self.process_data_checking(data_spec) + case WFDataStatus.checked_nonex | WFDataStatus.checked_partex | WFDataStatus.checked_exist: + tmp_res = self.process_data_checked(data_spec) + case WFDataStatus.generating_start | WFDataStatus.generating_ready: + tmp_res = self.process_data_generating(data_spec) + case WFDataStatus.waiting_ready: + tmp_res = self.process_data_waiting(data_spec) + case _: + tmp_log.debug(f"Data status {data_spec.status} is not handled in this context; skipped") + continue + if tmp_res and tmp_res.success: + # update stats + if tmp_res.new_status and tmp_res.new_status != data_spec.status: + data_status_stats["changed"].setdefault(data_spec.status, 0) + data_status_stats["changed"][data_spec.status] += 1 + else: + data_status_stats["unchanged"].setdefault(data_spec.status, 0) + data_status_stats["unchanged"][data_spec.status] += 1 + data_status_stats["processed"].setdefault(data_spec.status, 0) + data_status_stats["processed"][data_spec.status] += 1 + data_status_stats["n_processed"] += 1 + tmp_log.info(f"Done, processed data specs: {data_status_stats}") + # ---- Step status transitions ----------------------------- def process_step_registered(self, step_spec: WFStepSpec) -> WFStepProcessResult: @@ -542,7 +725,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W data_spec.target_id = None # to be filled later data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.mid - data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable + data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable data_spec.creation_time = now_time self.tbif.insert_workflow_data(data_spec) tmp_log.debug(f"Registered mid data {output_data_name} of step_id={step_spec.step_id}") @@ -863,7 +1046,7 @@ def process_workflow_checked(self, 
workflow_spec: WorkflowSpec) -> WorkflowProce data_spec.target_id = input_target data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.input - data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable + data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable data_spec.creation_time = now_time data_specs.append(data_spec) for output_name, output_dict in workflow_definition["root_outputs"].items(): @@ -873,7 +1056,7 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce data_spec.target_id = output_dict.get("value") data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.output - data_spec.flavor = "ddm_ds" # FIXME: hardcoded flavor, should be configurable + data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable data_spec.creation_time = now_time data_specs.append(data_spec) # Register steps based on nodes in the definition @@ -951,6 +1134,8 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc return process_result # Process try: + # Get data + data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) # Get steps in registered status required_step_statuses = [WFStepStatus.registered, WFStepStatus.pending, WFStepStatus.ready, WFStepStatus.submitted] over_advanced_step_statuses = [WFStepStatus.running, WFStepStatus.done, WFStepStatus.failed] @@ -974,7 +1159,7 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc # Get data spec map of the workflow data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} - # Process each step + # Process steps steps_status_stats = self.process_steps(step_specs, data_spec_map=data_spec_map) # Update workflow status to running if any of step is submitted if steps_status_stats["processed"].get(WFStepStatus.submitted): 
From e25637f3a7bdce99d6aa86fe6d733a212c5a947a Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 27 Oct 2025 13:41:43 +0100 Subject: [PATCH 028/101] workflows4: add native workflow watchdog --- .../jedidog/AtlasWorkflowProcessorWatchDog.py | 69 +++++++++++++++ pandaserver/workflow/workflow_base.py | 3 + pandaserver/workflow/workflow_core.py | 85 ++++++++++++++++++- 3 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py diff --git a/pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py b/pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py new file mode 100644 index 000000000..e8d001e7d --- /dev/null +++ b/pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py @@ -0,0 +1,69 @@ +import datetime +import os +import re +import socket +import sys +import traceback + +# logger +from pandacommon.pandalogger.PandaLogger import PandaLogger +from pandacommon.pandautils.PandaUtils import naive_utcnow + +from pandajedi.jedicore.MsgWrapper import MsgWrapper +from pandajedi.jedicore.ThreadUtils import ListWithLock, ThreadPool, WorkerThread +from pandaserver.workflow.workflow_core import WorkflowInterface + +from .WatchDogBase import WatchDogBase + +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + + +class AtlasWorkflowProcessorWatchDog(WatchDogBase): + """ + Workflow processor watchdog for ATLAS + """ + + # constructor + def __init__(self, taskBufferIF, ddmIF): + WatchDogBase.__init__(self, taskBufferIF, ddmIF) + self.vo = "atlas" + self.workflow_interface = WorkflowInterface(taskBufferIF) + + def doProcessWorkflows(self): + """ + Action to process active workflows + """ + tmpLog = MsgWrapper(logger, " #ATM #KV doProcessWorkflows") + tmpLog.debug("start") + try: + # watchdog lock + got_lock = self.get_process_lock("AtlasWorkflowProcessorWatchDog.doProcessWorkflows", timeLimit=1) + if not got_lock: + tmpLog.debug("locked by another watchdog process. 
Skipped") + return + tmpLog.debug("got watchdog lock") + # process active workflows + stats = self.workflow_interface.process_active_workflows() + tmpLog.info(f"processed workflows: {stats}") + # done + tmpLog.debug("done") + except Exception: + errtype, errvalue = sys.exc_info()[:2] + tmpLog.error(f"failed with {errtype} {errvalue} {traceback.format_exc()}") + + # main + def doAction(self): + try: + # get logger + origTmpLog = MsgWrapper(logger) + origTmpLog.debug("start") + # clean up + # check + # process workflows + self.doProcessWorkflows() + except Exception: + errtype, errvalue = sys.exc_info()[:2] + origTmpLog.error(f"failed with {errtype} {errvalue}") + # return + origTmpLog.debug("done") + return self.SC_SUCCEEDED diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 0ed4528eb..f4a4d4625 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -34,6 +34,9 @@ class WorkflowStatus(object): failed = "failed" cancelled = "cancelled" + active_statuses = (registered, parsed, checking, checked, starting, running) + final_statuses = (done, failed, cancelled) + class WFStepStatus(object): """ diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index dc4f647d8..9c575bb84 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -49,6 +49,11 @@ # named tuple for attribute with type AttributeWithType = namedtuple("AttributeWithType", ["attribute", "type"]) +# ==== Global Parameters ======================================= + +WORKFLOW_CHECK_INTERVAL_SEC = 300 +STEP_CHECK_INTERVAL_SEC = 300 +DATA_CHECK_INTERVAL_SEC = 300 # ==== Plugin Map ============================================== @@ -814,7 +819,7 @@ def process_step_submitted(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.error(f"{process_result.message}") return process_result # Update step status - if check_result.step_status in 
WorkflowInterface.after_submitted_statuses: + if check_result.step_status in WFStepStatus.after_submitted_statuses: # Step status advanced step_spec.status = check_result.step_status process_result.new_status = step_spec.status @@ -864,7 +869,7 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.error(f"{process_result.message}") return process_result # Update step status - if check_result.step_status in WorkflowInterface.after_running_statuses: + if check_result.step_status in WFStepStatus.after_running_statuses: # Step status advanced step_spec.status = check_result.step_status process_result.new_status = step_spec.status @@ -1250,3 +1255,79 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce process_result.message = f"Got error {str(e)}" tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result + + def process_workflow(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult: + """ + Process a workflow based on its current status + + Args: + workflow_spec (WorkflowSpec): The workflow specification to process + + Returns: + WorkflowProcessResult: The result of processing the workflow + """ + tmp_log = LogWrapper(logger, f"process_workflow workflow_id={workflow_spec.workflow_id}") + tmp_log.debug(f"Start, current status={workflow_spec.status}") + # Initialize + process_result = WorkflowProcessResult() + # Process based on status + match workflow_spec.status: + case WorkflowStatus.registered: + process_result = self.process_workflow_registered(workflow_spec) + case WorkflowStatus.checked: + process_result = self.process_workflow_checked(workflow_spec) + case WorkflowStatus.starting: + process_result = self.process_workflow_starting(workflow_spec) + case WorkflowStatus.running: + process_result = self.process_workflow_running(workflow_spec) + case _: + process_result.message = f"Workflow status {workflow_spec.status} is not handled in this context; skipped" + 
tmp_log.warning(f"{process_result.message}") + return process_result + + # ---- Process all workflows ------------------------------------- + + def process_active_workflows(self) -> Dict: + """ + Process all active workflows in the system + + Returns: + Dict: Statistics of the processing results + """ + tmp_log = LogWrapper(logger, "process_active_workflows") + tmp_log.debug("Start") + # Initialize + workflows_status_stats = {"n_workflows": 0, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} + try: + # Get workflows + workflow_specs = self.tbif.get_workflows(status_filter_list=WorkflowStatus.active_statuses) + n_workflows = len(workflow_specs) + tmp_log.debug(f"Got {n_workflows} workflows to process") + if n_workflows == 0: + tmp_log.info("Done, no workflow to process") + return workflows_status_stats + # Process each workflow + for workflow_spec in workflow_specs: + with self.workflow_lock(workflow_spec.workflow_id) as locked_workflow_spec: + if locked_workflow_spec is None: + tmp_log.warning(f"Failed to acquire lock for workflow_id={workflow_spec.workflow_id}; skipped") + continue + workflow_spec = locked_workflow_spec + # Process the workflow + tmp_res = self.process_workflow(workflow_spec) + if tmp_res and tmp_res.success: + # update stats + if tmp_res.new_status and tmp_res.new_status != workflow_spec.status: + workflows_status_stats["changed"].setdefault(workflow_spec.status, 0) + workflows_status_stats["changed"][workflow_spec.status] += 1 + else: + workflows_status_stats["unchanged"].setdefault(workflow_spec.status, 0) + workflows_status_stats["unchanged"][workflow_spec.status] += 1 + workflows_status_stats["processed"].setdefault(workflow_spec.status, 0) + workflows_status_stats["processed"][workflow_spec.status] += 1 + workflows_status_stats["n_processed"] += 1 + workflows_status_stats["n_workflows"] = n_workflows + tmp_log.info(f"Done, processed workflows: {workflows_status_stats}") + except Exception as e: + tmp_log.error(f"Got error ; 
{traceback.format_exc()}") + return workflows_status_stats From fa60a58c6dc88790e3f50ff12b2c37d5bc9ffaa8 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 27 Oct 2025 16:17:51 +0100 Subject: [PATCH 029/101] workflows4: fix --- pandajedi/jedicore/JediTaskBuffer.py | 78 +++++++++++++++++++ .../ddm_collection_data_handler.py | 4 +- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/pandajedi/jedicore/JediTaskBuffer.py b/pandajedi/jedicore/JediTaskBuffer.py index 23c7132b2..05a621354 100644 --- a/pandajedi/jedicore/JediTaskBuffer.py +++ b/pandajedi/jedicore/JediTaskBuffer.py @@ -968,6 +968,8 @@ def get_max_events_in_dataset(self, jedi_task_id, dataset_id): with self.proxyPool.get() as proxy: return proxy.get_max_events_in_dataset(jedi_task_id, dataset_id) + # ==== Data Carousel functions ============================= + # insert data carousel requests def insert_data_carousel_requests_JEDI(self, task_id, dc_req_specs): with self.proxyPool.get() as proxy: @@ -1017,3 +1019,79 @@ def retire_data_carousel_request_JEDI(self, request_id): def resubmit_data_carousel_request_JEDI(self, request_id, exclude_prev_dst=False): with self.proxyPool.get() as proxy: return proxy.resubmit_data_carousel_request_JEDI(request_id, exclude_prev_dst) + + # ==== Workflow fucntions ================================== + + def get_workflow(self, workflow_id): + with self.proxyPool.get() as proxy: + return proxy.get_workflow(workflow_id) + + def get_workflow_step(self, step_id): + with self.proxyPool.get() as proxy: + return proxy.get_workflow_step(step_id) + + def get_workflow_data(self, data_id): + with self.proxyPool.get() as proxy: + return proxy.get_workflow_data(data_id) + + def get_steps_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None): + with self.proxyPool.get() as proxy: + return proxy.get_steps_of_workflow(workflow_id, status_filter_list, status_exclusion_list) + + def get_data_of_workflow(self, workflow_id, status_filter_list=None, 
status_exclusion_list=None, type_filter_list=None): + with self.proxyPool.get() as proxy: + return proxy.get_data_of_workflow(workflow_id, status_filter_list, status_exclusion_list, type_filter_list) + + def lock_workflow(self, workflow_id, locked_by, lock_expiration_sec=120): + with self.proxyPool.get() as proxy: + return proxy.lock_workflow(workflow_id, locked_by, lock_expiration_sec) + + def unlock_workflow(self, workflow_id, locked_by): + with self.proxyPool.get() as proxy: + return proxy.unlock_workflow(workflow_id, locked_by) + + def lock_workflow_step(self, step_id, locked_by, lock_expiration_sec=120): + with self.proxyPool.get() as proxy: + return proxy.lock_workflow_step(step_id, locked_by, lock_expiration_sec) + + def unlock_workflow_step(self, step_id, locked_by): + with self.proxyPool.get() as proxy: + return proxy.unlock_workflow_step(step_id, locked_by) + + def lock_workflow_data(self, data_id, locked_by, lock_expiration_sec=120): + with self.proxyPool.get() as proxy: + return proxy.lock_workflow_data(data_id, locked_by, lock_expiration_sec) + + def unlock_workflow_data(self, data_id, locked_by): + with self.proxyPool.get() as proxy: + return proxy.unlock_workflow_data(data_id, locked_by) + + def insert_workflow(self, workflow_spec): + with self.proxyPool.get() as proxy: + return proxy.insert_workflow(workflow_spec) + + def insert_workflow_step(self, wf_step_spec): + with self.proxyPool.get() as proxy: + return proxy.insert_workflow_step(wf_step_spec) + + def insert_workflow_data(self, wf_data_spec): + with self.proxyPool.get() as proxy: + return proxy.insert_workflow_data(wf_data_spec) + + def update_workflow(self, workflow_spec): + with self.proxyPool.get() as proxy: + return proxy.update_workflow(workflow_spec) + + def update_workflow_step(self, wf_step_spec): + with self.proxyPool.get() as proxy: + return proxy.update_workflow_step(wf_step_spec) + + def update_workflow_data(self, wf_data_spec): + with self.proxyPool.get() as proxy: + return 
proxy.update_workflow_data(wf_data_spec) + + def upsert_workflow_entities(self, workflow_id, actions_dict=None, workflow_spec=None, step_specs=None, data_specs=None): + with self.proxyPool.get() as proxy: + return proxy.upsert_workflow_entities(workflow_id, actions_dict, workflow_spec, step_specs, data_specs) + + # ========================================================== diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py index c1d5b8cdd..1e48faa53 100644 --- a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -6,7 +6,7 @@ from pandacommon.pandalogger.PandaLogger import PandaLogger from pandaserver.dataservice.ddm import rucioAPI -from pandaserver.workflow.data_handler_plugins.base_data_handler import BaseStepHandler +from pandaserver.workflow.data_handler_plugins.base_data_handler import BaseDataHandler from pandaserver.workflow.workflow_base import ( WFDataSpec, WFDataStatus, @@ -42,7 +42,7 @@ class DDMCollectionState: missing = "missing" -class DDMCollectionDataHandler(BaseStepHandler): +class DDMCollectionDataHandler(BaseDataHandler): """ Handler for DDM collection data in the workflow. This class is responsible for managing the DDM collection data within a workflow. 
From 9070f7b9bcab9cb46800872cf28a1f5692826130 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 27 Oct 2025 16:32:47 +0100 Subject: [PATCH 030/101] workflows4: fix --- pandajedi/jedicore/JediTaskBuffer.py | 176 +++++------------- pandaserver/taskbuffer/TaskBuffer.py | 4 + .../db_proxy_mods/workflow_module.py | 42 +++++ pandaserver/workflow/workflow_core.py | 2 +- 4 files changed, 96 insertions(+), 128 deletions(-) diff --git a/pandajedi/jedicore/JediTaskBuffer.py b/pandajedi/jedicore/JediTaskBuffer.py index 05a621354..0326f8e5e 100644 --- a/pandajedi/jedicore/JediTaskBuffer.py +++ b/pandajedi/jedicore/JediTaskBuffer.py @@ -968,130 +968,52 @@ def get_max_events_in_dataset(self, jedi_task_id, dataset_id): with self.proxyPool.get() as proxy: return proxy.get_max_events_in_dataset(jedi_task_id, dataset_id) - # ==== Data Carousel functions ============================= - - # insert data carousel requests - def insert_data_carousel_requests_JEDI(self, task_id, dc_req_specs): - with self.proxyPool.get() as proxy: - return proxy.insert_data_carousel_requests_JEDI(task_id, dc_req_specs) - - # update a data carousel request - def update_data_carousel_request_JEDI(self, dc_req_spec): - with self.proxyPool.get() as proxy: - return proxy.update_data_carousel_request_JEDI(dc_req_spec) - - # get data carousel queued requests and info of their related tasks - def get_data_carousel_queued_requests_JEDI(self): - with self.proxyPool.get() as proxy: - return proxy.get_data_carousel_queued_requests_JEDI() - - # get data carousel requests of tasks by task status - def get_data_carousel_requests_by_task_status_JEDI(self, status_filter_list=None, status_exclusion_list=None): - with self.proxyPool.get() as proxy: - return proxy.get_data_carousel_requests_by_task_status_JEDI(status_filter_list=status_filter_list, status_exclusion_list=status_exclusion_list) - - # get data carousel staging requests - def get_data_carousel_staging_requests_JEDI(self): - with self.proxyPool.get() as proxy: - 
return proxy.get_data_carousel_staging_requests_JEDI() - - # delete data carousel requests - def delete_data_carousel_requests_JEDI(self, request_id_list): - with self.proxyPool.get() as proxy: - return proxy.delete_data_carousel_requests_JEDI(request_id_list) - - # clean up data carousel requests - def clean_up_data_carousel_requests_JEDI(self, time_limit_days=30): - with self.proxyPool.get() as proxy: - return proxy.clean_up_data_carousel_requests_JEDI(time_limit_days) - - # cancel a data carousel request - def cancel_data_carousel_request_JEDI(self, request_id): - with self.proxyPool.get() as proxy: - return proxy.cancel_data_carousel_request_JEDI(request_id) - - # retire a data carousel request - def retire_data_carousel_request_JEDI(self, request_id): - with self.proxyPool.get() as proxy: - return proxy.retire_data_carousel_request_JEDI(request_id) - - # resubmit a data carousel request - def resubmit_data_carousel_request_JEDI(self, request_id, exclude_prev_dst=False): - with self.proxyPool.get() as proxy: - return proxy.resubmit_data_carousel_request_JEDI(request_id, exclude_prev_dst) - - # ==== Workflow fucntions ================================== - - def get_workflow(self, workflow_id): - with self.proxyPool.get() as proxy: - return proxy.get_workflow(workflow_id) - - def get_workflow_step(self, step_id): - with self.proxyPool.get() as proxy: - return proxy.get_workflow_step(step_id) - - def get_workflow_data(self, data_id): - with self.proxyPool.get() as proxy: - return proxy.get_workflow_data(data_id) - - def get_steps_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None): - with self.proxyPool.get() as proxy: - return proxy.get_steps_of_workflow(workflow_id, status_filter_list, status_exclusion_list) - - def get_data_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None, type_filter_list=None): - with self.proxyPool.get() as proxy: - return proxy.get_data_of_workflow(workflow_id, 
status_filter_list, status_exclusion_list, type_filter_list) - - def lock_workflow(self, workflow_id, locked_by, lock_expiration_sec=120): - with self.proxyPool.get() as proxy: - return proxy.lock_workflow(workflow_id, locked_by, lock_expiration_sec) - - def unlock_workflow(self, workflow_id, locked_by): - with self.proxyPool.get() as proxy: - return proxy.unlock_workflow(workflow_id, locked_by) - - def lock_workflow_step(self, step_id, locked_by, lock_expiration_sec=120): - with self.proxyPool.get() as proxy: - return proxy.lock_workflow_step(step_id, locked_by, lock_expiration_sec) - - def unlock_workflow_step(self, step_id, locked_by): - with self.proxyPool.get() as proxy: - return proxy.unlock_workflow_step(step_id, locked_by) - - def lock_workflow_data(self, data_id, locked_by, lock_expiration_sec=120): - with self.proxyPool.get() as proxy: - return proxy.lock_workflow_data(data_id, locked_by, lock_expiration_sec) - - def unlock_workflow_data(self, data_id, locked_by): - with self.proxyPool.get() as proxy: - return proxy.unlock_workflow_data(data_id, locked_by) - - def insert_workflow(self, workflow_spec): - with self.proxyPool.get() as proxy: - return proxy.insert_workflow(workflow_spec) - - def insert_workflow_step(self, wf_step_spec): - with self.proxyPool.get() as proxy: - return proxy.insert_workflow_step(wf_step_spec) - - def insert_workflow_data(self, wf_data_spec): - with self.proxyPool.get() as proxy: - return proxy.insert_workflow_data(wf_data_spec) - - def update_workflow(self, workflow_spec): - with self.proxyPool.get() as proxy: - return proxy.update_workflow(workflow_spec) - - def update_workflow_step(self, wf_step_spec): - with self.proxyPool.get() as proxy: - return proxy.update_workflow_step(wf_step_spec) - - def update_workflow_data(self, wf_data_spec): - with self.proxyPool.get() as proxy: - return proxy.update_workflow_data(wf_data_spec) - - def upsert_workflow_entities(self, workflow_id, actions_dict=None, workflow_spec=None, 
step_specs=None, data_specs=None): - with self.proxyPool.get() as proxy: - return proxy.upsert_workflow_entities(workflow_id, actions_dict, workflow_spec, step_specs, data_specs) - - # ========================================================== + # # insert data carousel requests + # def insert_data_carousel_requests_JEDI(self, task_id, dc_req_specs): + # with self.proxyPool.get() as proxy: + # return proxy.insert_data_carousel_requests_JEDI(task_id, dc_req_specs) + + # # update a data carousel request + # def update_data_carousel_request_JEDI(self, dc_req_spec): + # with self.proxyPool.get() as proxy: + # return proxy.update_data_carousel_request_JEDI(dc_req_spec) + + # # get data carousel queued requests and info of their related tasks + # def get_data_carousel_queued_requests_JEDI(self): + # with self.proxyPool.get() as proxy: + # return proxy.get_data_carousel_queued_requests_JEDI() + + # # get data carousel requests of tasks by task status + # def get_data_carousel_requests_by_task_status_JEDI(self, status_filter_list=None, status_exclusion_list=None): + # with self.proxyPool.get() as proxy: + # return proxy.get_data_carousel_requests_by_task_status_JEDI(status_filter_list=status_filter_list, status_exclusion_list=status_exclusion_list) + + # # get data carousel staging requests + # def get_data_carousel_staging_requests_JEDI(self): + # with self.proxyPool.get() as proxy: + # return proxy.get_data_carousel_staging_requests_JEDI() + + # # delete data carousel requests + # def delete_data_carousel_requests_JEDI(self, request_id_list): + # with self.proxyPool.get() as proxy: + # return proxy.delete_data_carousel_requests_JEDI(request_id_list) + + # # clean up data carousel requests + # def clean_up_data_carousel_requests_JEDI(self, time_limit_days=30): + # with self.proxyPool.get() as proxy: + # return proxy.clean_up_data_carousel_requests_JEDI(time_limit_days) + + # # cancel a data carousel request + # def cancel_data_carousel_request_JEDI(self, request_id): + # 
with self.proxyPool.get() as proxy: + # return proxy.cancel_data_carousel_request_JEDI(request_id) + + # # retire a data carousel request + # def retire_data_carousel_request_JEDI(self, request_id): + # with self.proxyPool.get() as proxy: + # return proxy.retire_data_carousel_request_JEDI(request_id) + + # # resubmit a data carousel request + # def resubmit_data_carousel_request_JEDI(self, request_id, exclude_prev_dst=False): + # with self.proxyPool.get() as proxy: + # return proxy.resubmit_data_carousel_request_JEDI(request_id, exclude_prev_dst) diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 94cb48a6e..306e1d785 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2756,6 +2756,10 @@ def get_data_of_workflow(self, workflow_id, status_filter_list=None, status_excl with self.proxyPool.get() as proxy: return proxy.get_data_of_workflow(workflow_id, status_filter_list, status_exclusion_list, type_filter_list) + def query_workflows(self, status_filter_list=None, status_exclusion_list=None, check_interval_sec=300): + with self.proxyPool.get() as proxy: + return proxy.query_workflows(status_filter_list, status_exclusion_list, check_interval_sec) + def lock_workflow(self, workflow_id, locked_by, lock_expiration_sec=120): with self.proxyPool.get() as proxy: return proxy.lock_workflow(workflow_id, locked_by, lock_expiration_sec) diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index 00e69865d..e4afecf6a 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -194,6 +194,48 @@ def get_data_of_workflow( tmp_log.warning("no data found; skipped") return [] + def query_workflows( + self, status_filter_list: list | None = None, status_exclusion_list: list | None = None, check_interval_sec: int = 300 + ) -> list[WorkflowSpec]: + """ + 
Retrieve list of workflows with optional status filtering + + Args: + status_filter_list (list | None): List of statuses to filter the workflows by (optional) + status_exclusion_list (list | None): List of statuses to exclude the workflows by (optional) + check_interval_sec (int): Time in seconds to wait between checks (default: 300) + + Returns: + list[WorkflowSpec]: List of workflow specifications + """ + comment = " /* DBProxy.query_workflows */" + tmp_log = self.create_tagged_logger(comment, "query_workflows") + tmp_log.debug(f"start, status_filter_list={status_filter_list} status_exclusion_list={status_exclusion_list} check_interval_sec={check_interval_sec}") + sql = f"SELECT {WorkflowSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflows " f"WHERE (check_time IS NULL OR check_time<:check_time) " + now_time = naive_utcnow() + var_map = {":check_time": now_time - timedelta(seconds=check_interval_sec)} + if status_filter_list: + status_var_names_str, status_var_map = get_sql_IN_bind_variables(status_filter_list, prefix=":status") + sql += f"AND status IN ({status_var_names_str}) " + var_map.update(status_var_map) + if status_exclusion_list: + antistatus_var_names_str, antistatus_var_map = get_sql_IN_bind_variables(status_exclusion_list, prefix=":antistatus") + sql += f"AND status NOT IN ({antistatus_var_names_str}) " + var_map.update(antistatus_var_map) + self.cur.execute(sql + comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + workflow_specs = [] + for res in res_list: + workflow_spec = WorkflowSpec() + workflow_spec.pack(res) + workflow_specs.append(workflow_spec) + tmp_log.debug(f"got {len(workflow_specs)} workflows") + return workflow_specs + else: + tmp_log.warning("no workflows found; skipped") + return [] + def lock_workflow(self, workflow_id: int, locked_by: str, lock_expiration_sec: int = 120) -> bool | None: """ Lock a workflow to prevent concurrent modifications diff --git a/pandaserver/workflow/workflow_core.py 
b/pandaserver/workflow/workflow_core.py index 9c575bb84..68153be8e 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -1300,7 +1300,7 @@ def process_active_workflows(self) -> Dict: workflows_status_stats = {"n_workflows": 0, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} try: # Get workflows - workflow_specs = self.tbif.get_workflows(status_filter_list=WorkflowStatus.active_statuses) + workflow_specs = self.tbif.get_workflows(status_filter_list=WorkflowStatus.active_statuses, check_interval=WORKFLOW_CHECK_INTERVAL_SEC) n_workflows = len(workflow_specs) tmp_log.debug(f"Got {n_workflows} workflows to process") if n_workflows == 0: From 189f08a37ccf2128d9ae193bec0cf5f01d52f9bd Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 27 Oct 2025 17:13:13 +0100 Subject: [PATCH 031/101] workflows4: fix --- .../ddm_collection_data_handler.py | 15 ++++++----- .../panda_task_step_handler.py | 26 ++++++++++--------- pandaserver/workflow/workflow_base.py | 4 +-- pandaserver/workflow/workflow_core.py | 12 +++++---- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py index 1e48faa53..854c6ddd8 100644 --- a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -55,6 +55,7 @@ def __init__(self, *args, **kwargs): # Initialize base class or any required modules here super().__init__(*args, **kwargs) self.ddmIF = rucioAPI + self.plugin_flavor = "ddm_collection" def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: """ @@ -71,10 +72,10 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu tmp_log = LogWrapper(logger, f"check_target workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") # Initialize 
check_result = WFDataTargetCheckResult() - # Check data type - if data_spec.type != WFDataType.ddm_collection: - tmp_log.warning(f"type={data_spec.type} not ddm_collection; skipped") - check_result.message = f"type not ddm_collection; skipped" + # Check data flavor + if data_spec.flavor != self.plugin_flavor: + tmp_log.warning(f"flavor={data_spec.flavor} not {self.plugin_flavor}; skipped") + check_result.message = f"flavor not {self.plugin_flavor}; skipped" return check_result # TODO: Implement the actual checking logic here collection = data_spec.target_id @@ -86,9 +87,9 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu return check_result match collection_meta.get("state"): case DDMCollectionState.missing: - check_result.status = WFDataStatus.generating_start + check_result.data_status = WFDataStatus.generating_start case DDMCollectionState.open: - check_result.status = WFDataStatus.generating_ready + check_result.data_status = WFDataStatus.generating_ready case DDMCollectionState.closed: - check_result.status = WFDataStatus.done_generated + check_result.data_status = WFDataStatus.done_generated check_result.metadata = collection_meta diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 23b89d454..0bde16ddc 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -35,6 +35,8 @@ def __init__(self, *args, **kwargs): """ # Initialize base class or any required modules here super().__init__(*args, **kwargs) + # plugin flavor + self.plugin_flavor = "panda_task" def submit_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetSubmitResult: """ @@ -51,10 +53,10 @@ def submit_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetSubmitRe tmp_log = LogWrapper(logger, f"submit_target workflow_id={step_spec.workflow_id} 
step_id={step_spec.step_id}") # Initialize submit_result = WFStepTargetSubmitResult() - # Check step type - if step_spec.type != WFStepType.panda_task: - tmp_log.warning(f"type={step_spec.type} not panda_task; skipped") - submit_result.message = f"type not panda_task; skipped" + # Check step flavor + if step_spec.flavor != self.plugin_flavor: + tmp_log.warning(f"flavor={step_spec.flavor} not {self.plugin_flavor}; skipped") + submit_result.message = f"flavor not {self.plugin_flavor}; skipped" return submit_result ... # task_param_map = {} @@ -150,9 +152,9 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu check_result.message = f"not in status to check; skipped" tmp_log.warning(f"status={step_spec.status} not in status to check; skipped") return check_result - if step_spec.type != WFStepType.panda_task: - check_result.message = f"type not panda_task; skipped" - tmp_log.warning(f"type={step_spec.type} not panda_task; skipped") + if step_spec.flavor != self.plugin_flavor: + check_result.message = f"flavor not {self.plugin_flavor}; skipped" + tmp_log.warning(f"flavor={step_spec.flavor} not {self.plugin_flavor}; skipped") return check_result if step_spec.target_id is None: check_result.message = f"target_id is None; skipped" @@ -170,13 +172,13 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu check_result.success = True check_result.native_status = task_status if task_status in ["running", "transferring", "transferred", "merging"]: - check_result.status = WFStepStatus.running - elif task_status in ["defined", "assigned", "activated", "starting", "ready"]: - check_result.status = WFStepStatus.submitted + check_result.step_status = WFStepStatus.running + elif task_status in ["defined", "assigned", "activated", "starting", "ready", "pending"]: + check_result.step_status = WFStepStatus.submitted elif task_status in ["done", "finished"]: - check_result.status = WFStepStatus.finished + check_result.step_status 
= WFStepStatus.finished elif task_status in ["failed", "exhausted", "aborted", "toabort", "aborting", "broken", "tobroken"]: - check_result.status = WFStepStatus.failed + check_result.step_status = WFStepStatus.failed else: check_result.success = False check_result.message = f"unknown task_status {task_status}" diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index f4a4d4625..2ed175bfa 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -462,7 +462,7 @@ class WFStepTargetCheckResult: """ success: bool | None = None - status: WFStepStatus | None = None + step_status: WFStepStatus | None = None native_status: str | None = None message: str = "" @@ -483,7 +483,7 @@ class WFDataTargetCheckResult: """ success: bool | None = None - status: WFDataStatus | None = None + data_status: WFDataStatus | None = None metadata: dict | None = None message: str = "" diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 68153be8e..deffb5bf2 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -512,7 +512,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.error(f"{process_result.message}") return process_result - def process_data(self, data_specs: List[WFDataSpec]) -> Dict: + def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: """ Process a list of workflow data specifications @@ -522,7 +522,7 @@ def process_data(self, data_specs: List[WFDataSpec]) -> Dict: Returns: Dict: Statistics of the processing results """ - tmp_log = LogWrapper(logger, f"process_data workflow_id={data_specs[0].workflow_id}") + tmp_log = LogWrapper(logger, f"process_data_specs workflow_id={data_specs[0].workflow_id}") n_data = len(data_specs) tmp_log.debug(f"Start, processing {n_data} data specs") data_status_stats = {"n_data": n_data, "changed": {}, "unchanged": {}, "processed": {}, 
"n_processed": 0} @@ -560,6 +560,7 @@ def process_data(self, data_specs: List[WFDataSpec]) -> Dict: data_status_stats["processed"][data_spec.status] += 1 data_status_stats["n_processed"] += 1 tmp_log.info(f"Done, processed data specs: {data_status_stats}") + return data_status_stats # ---- Step status transitions ----------------------------- @@ -899,7 +900,7 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W Returns: Dict: Statistics of the processing results """ - tmp_log = LogWrapper(logger, f"process_steps workflow_id={step_spec.workflow_id}") + tmp_log = LogWrapper(logger, f"process_steps workflow_id={step_specs[0].workflow_id}") n_steps = len(step_specs) tmp_log.debug(f"Start, processing {n_steps} steps") steps_status_stats = {"n_steps": n_steps, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} @@ -937,6 +938,7 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W steps_status_stats["processed"][step_spec.status] += 1 steps_status_stats["n_processed"] += 1 tmp_log.info(f"Done, processed steps: {steps_status_stats}") + return steps_status_stats # ---- Workflow status transitions ------------------------- @@ -1299,8 +1301,8 @@ def process_active_workflows(self) -> Dict: # Initialize workflows_status_stats = {"n_workflows": 0, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} try: - # Get workflows - workflow_specs = self.tbif.get_workflows(status_filter_list=WorkflowStatus.active_statuses, check_interval=WORKFLOW_CHECK_INTERVAL_SEC) + # Query active workflows to process + workflow_specs = self.tbif.query_workflows(status_filter_list=WorkflowStatus.active_statuses, check_interval_sec=WORKFLOW_CHECK_INTERVAL_SEC) n_workflows = len(workflow_specs) tmp_log.debug(f"Got {n_workflows} workflows to process") if n_workflows == 0: From 306511938091b0b2581d4f0c01af13220fd20389 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 28 Oct 2025 10:20:03 +0100 Subject: [PATCH 
032/101] workflows4: fix data handler and data check status --- .../ddm_collection_data_handler.py | 9 +- pandaserver/workflow/workflow_base.py | 14 ++- pandaserver/workflow/workflow_core.py | 95 ++++++++++++------- 3 files changed, 79 insertions(+), 39 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py index 854c6ddd8..7df20084c 100644 --- a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -11,6 +11,7 @@ WFDataSpec, WFDataStatus, WFDataTargetCheckResult, + WFDataTargetCheckStatus, WFDataType, WFStepSpec, WFStepStatus, @@ -77,7 +78,7 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu tmp_log.warning(f"flavor={data_spec.flavor} not {self.plugin_flavor}; skipped") check_result.message = f"flavor not {self.plugin_flavor}; skipped" return check_result - # TODO: Implement the actual checking logic here + # Check DDM collection status collection = data_spec.target_id collection_meta = self.ddmIF.get_dataset_metadata(collection, ignore_missing=True) if collection_meta is None: @@ -87,9 +88,9 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu return check_result match collection_meta.get("state"): case DDMCollectionState.missing: - check_result.data_status = WFDataStatus.generating_start + check_result.check_status = WFDataTargetCheckStatus.nonex case DDMCollectionState.open: - check_result.data_status = WFDataStatus.generating_ready + check_result.check_status = WFDataTargetCheckStatus.partex case DDMCollectionState.closed: - check_result.data_status = WFDataStatus.done_generated + check_result.check_status = WFDataTargetCheckStatus.exist check_result.metadata = collection_meta diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 
2ed175bfa..f688bfb66 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -470,6 +470,16 @@ class WFStepTargetCheckResult: # ==== Return objects of data handler methods ================== +class WFDataTargetCheckStatus: + """ + Possible statuses returned by data target check + """ + + exist = "exist" # data fully exists + partex = "partex" # data partially exists + nonex = "nonex" # data does not exist + + @dataclass(slots=True) class WFDataTargetCheckResult: """ @@ -477,13 +487,13 @@ class WFDataTargetCheckResult: Fields: success (bool | None): Indicates if the status check was successful. - status (WFDataStatus | None): The status of the data to move to. + check_status (WFDataTargetCheckStatus | None): The status of the data target. metadata (dict | None): The native metadata from the target system. message (str): A message providing additional information about the status check result. """ success: bool | None = None - data_status: WFDataStatus | None = None + check_status: WFDataTargetCheckStatus | None = None metadata: dict | None = None message: str = "" diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index deffb5bf2..dfa8b6627 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -24,6 +24,7 @@ WFDataProcessResult, WFDataSpec, WFDataStatus, + WFDataTargetCheckStatus, WFDataType, WFStepProcessResult, WFStepSpec, @@ -336,9 +337,25 @@ def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: # Process try: # Check data availability - # FIXME: For now, always advance to checked_nonex - data_spec.status = WFDataStatus.checked_nonex - data_spec.check_time = naive_utcnow() + original_status = data_spec.status + # Get the data handler plugin + data_handler = self.get_plugin("data_handler", data_spec.flavor) + # Check the data status + check_result = data_handler.check_target(data_spec) + if not check_result.success 
or check_result.check_status is None: + process_result.message = f"Failed to check data; {check_result.message}" + tmp_log.error(f"{process_result.message}") + return process_result + # Update data status + now_time = naive_utcnow() + match check_result.check_status: + case WFDataTargetCheckStatus.nonex: + data_spec.status = WFDataStatus.checked_nonex + case WFDataTargetCheckStatus.partex: + data_spec.status = WFDataStatus.checked_partex + case WFDataTargetCheckStatus.exist: + data_spec.status = WFDataStatus.checked_exist + data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) tmp_log.info(f"Done, status={data_spec.status}") except Exception as e: @@ -422,35 +439,40 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: data_handler = self.get_plugin("data_handler", data_spec.flavor) # Check the data status check_result = data_handler.check_target(data_spec) - if not check_result.success or check_result.data_status is None: + if not check_result.success or check_result.check_status is None: process_result.message = f"Failed to check data; {check_result.message}" tmp_log.error(f"{process_result.message}") return process_result # Update data status now_time = naive_utcnow() - match original_status: - case WFDataStatus.generating_start: - if check_result.data_status in WFDataStatus.after_generating_start_statuses: - # Data status advanced - data_spec.status = check_result.data_status + if original_status == WFDataStatus.generating_start: + match check_result.check_status: + case WFDataTargetCheckStatus.partex | WFDataTargetCheckStatus.exist: + # Data exist, advance to generating_ready + data_spec.status = WFDataStatus.generating_ready process_result.new_status = data_spec.status - elif check_result.data_status == WFDataStatus.generating_start: - # Still in generating_start, do nothing + case WFDataTargetCheckStatus.nonex: + # Data not yet exist, stay in generating_start pass - else: - tmp_log.warning(f"Invalid 
data_status {check_result.data_status} from target check result; skipped") - self.tbif.update_workflow_data(data_spec) - case WFDataStatus.generating_ready: - if check_result.data_status in WFDataStatus.after_generating_ready_statuses: - # Data status advanced to terminal - data_spec.status = check_result.data_status + case _: + # Unexpected status, log and skip + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") + elif original_status == WFDataStatus.generating_ready: + match check_result.check_status: + case WFDataTargetCheckStatus.exist: + # Data fully exist, advance to final status done_exist + data_spec.status = WFDataStatus.done_exist process_result.new_status = data_spec.status data_spec.end_time = now_time - elif check_result.data_status == WFDataStatus.generating_ready: - # Still in generating_ready, do nothing + case WFDataTargetCheckStatus.partex: + # Data still partially exist, stay in generating_ready pass - else: - tmp_log.warning(f"Invalid data_status {check_result.data_status} from target check result; skipped") + case WFDataTargetCheckStatus.nonex: + # Data not exist anymore, unexpected, log and skip + tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") + case _: + # Unexpected status, log and skip + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") @@ -486,24 +508,27 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: data_handler = self.get_plugin("data_handler", data_spec.flavor) # Check the data status check_result = data_handler.check_target(data_spec) - if not check_result.success or check_result.data_status is None: + if not check_result.success or check_result.check_status is None: process_result.message = f"Failed to check data; 
{check_result.message}" tmp_log.error(f"{process_result.message}") return process_result # Update data status now_time = naive_utcnow() - match original_status: - case WFDataStatus.waiting_ready: - if check_result.data_status in WFDataStatus.after_waiting_ready_statuses: - # Data status advanced to terminal - data_spec.status = check_result.data_status + if original_status == WFDataStatus.waiting_ready: + match check_result.check_status: + case WFDataTargetCheckStatus.exist: + # Data fully exist, advance to final status done_waited + data_spec.status = WFDataStatus.done_waited process_result.new_status = data_spec.status data_spec.end_time = now_time - elif check_result.data_status == WFDataStatus.waiting_ready: - # Still in waiting_ready, do nothing + case WFDataTargetCheckStatus.partex: + # Data still partially exist, stay in waiting_ready pass - else: - tmp_log.warning(f"Invalid data_status {check_result.data_status} from target check result; skipped") + case WFDataTargetCheckStatus.nonex: + # Data not exist anymore, unexpected, log and skip + tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") + case _: + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") @@ -1141,8 +1166,9 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc return process_result # Process try: - # Get data + # Process data specs first data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) + data_status_stats = self.process_data_specs(data_specs) # Get steps in registered status required_step_statuses = [WFStepStatus.registered, WFStepStatus.pending, WFStepStatus.ready, WFStepStatus.submitted] over_advanced_step_statuses = [WFStepStatus.running, WFStepStatus.done, WFStepStatus.failed] @@ -1206,6 +1232,9 @@ def 
process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce return process_result # Process try: + # Process data specs first + data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) + data_status_stats = self.process_data_specs(data_specs) # Get steps step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id) if not step_specs: From ec247eeab7c2a6ff1d2be9bea226291cfc7004bc Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 28 Oct 2025 10:47:42 +0100 Subject: [PATCH 033/101] workflows4: fix mid data --- .../data_handler_plugins/ddm_collection_data_handler.py | 2 ++ pandaserver/workflow/workflow_core.py | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py index 7df20084c..5d141ae03 100644 --- a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -94,3 +94,5 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu case DDMCollectionState.closed: check_result.check_status = WFDataTargetCheckStatus.exist check_result.metadata = collection_meta + check_result.success = True + return check_result diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index dfa8b6627..ad3c1465d 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -475,7 +475,10 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) - tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") + if data_spec.status == original_status: + 
tmp_log.info(f"Done, status stays {data_spec.status}") + else: + tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" tmp_log.error(f"{process_result.message}") @@ -748,12 +751,13 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W if not step_spec_definition.get("is_tail"): # is intermediate step, register their outputs as mid type output_data_list = step_spec_definition.get("output_data_list", []) + outputs_raw_dict = step_spec_definition.get("outputs", {}) now_time = naive_utcnow() for output_data_name in output_data_list: data_spec = WFDataSpec() data_spec.workflow_id = step_spec.workflow_id data_spec.name = output_data_name - data_spec.target_id = None # to be filled later + data_spec.target_id = outputs_raw_dict.get(output_data_name, {}).get("value") # caution: may be None data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.mid data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable From 119b01f24e2db999beffa236bf8380fb41348b65 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 28 Oct 2025 12:36:20 +0100 Subject: [PATCH 034/101] workflows4: fix --- .../ddm_collection_data_handler.py | 1 + pandaserver/workflow/workflow_core.py | 20 +++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py index 5d141ae03..755ee0724 100644 --- a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -95,4 +95,5 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu check_result.check_status = WFDataTargetCheckStatus.exist check_result.metadata = collection_meta check_result.success = True + tmp_log.info(f"Got 
collection {collection} check_status={check_result.check_status}") return check_result diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index ad3c1465d..6cfa59f91 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -294,7 +294,7 @@ def process_data_registered(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_registered data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_registered workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -374,7 +374,7 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_checked data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_checked workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -423,7 +423,7 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_generating data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_generating workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -495,7 +495,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_waiting data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_waiting workflow_id={data_spec.workflow_id} 
data_id={data_spec.data_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -603,7 +603,7 @@ def process_step_registered(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_registered step_id={step_spec.step_id}") + tmp_log = LogWrapper(logger, f"process_step_registered workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1174,7 +1174,15 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) data_status_stats = self.process_data_specs(data_specs) # Get steps in registered status - required_step_statuses = [WFStepStatus.registered, WFStepStatus.pending, WFStepStatus.ready, WFStepStatus.submitted] + required_step_statuses = [ + WFStepStatus.registered, + WFStepStatus.checking, + WFStepStatus.checked_true, + WFStepStatus.checked_false, + WFStepStatus.pending, + WFStepStatus.ready, + WFStepStatus.submitted, + ] over_advanced_step_statuses = [WFStepStatus.running, WFStepStatus.done, WFStepStatus.failed] step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=required_step_statuses) over_advanced_step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=over_advanced_step_statuses) From 0796dc90d09298794476861efc0d9b0b00b2f2c3 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 28 Oct 2025 14:30:19 +0100 Subject: [PATCH 035/101] workflows4: fix, add data step_source_id --- pandaserver/taskbuffer/TaskBuffer.py | 4 ++ .../db_proxy_mods/workflow_module.py | 33 +++++++++++++++ pandaserver/workflow/workflow_base.py | 3 ++ pandaserver/workflow/workflow_core.py | 42 +++++++++++-------- 4 files changed, 64 insertions(+), 18 deletions(-) diff 
--git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 306e1d785..59920b019 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2748,6 +2748,10 @@ def get_workflow_data(self, data_id): with self.proxyPool.get() as proxy: return proxy.get_workflow_data(data_id) + def get_workflow_data_by_name(self, name, workflow_id=None): + with self.proxyPool.get() as proxy: + return proxy.get_workflow_data_by_name(name, workflow_id) + def get_steps_of_workflow(self, workflow_id, status_filter_list=None, status_exclusion_list=None): with self.proxyPool.get() as proxy: return proxy.get_steps_of_workflow(workflow_id, status_filter_list, status_exclusion_list) diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index e4afecf6a..b164e8b0a 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -113,6 +113,39 @@ def get_workflow_data(self, data_id: int) -> WFDataSpec | None: tmp_log.warning("no data found; skipped") return None + def get_workflow_data_by_name(self, name: str, workflow_id: int | None) -> WFDataSpec | None: + """ + Retrieve a workflow data specification by its name and workflow ID + + Args: + name (str): Name of the workflow data to retrieve + workflow_id (int | None): ID of the workflow to which the data belongs (optional) + + Returns: + WFDataSpec | None: The workflow data specification if found, otherwise None + """ + comment = " /* DBProxy.get_workflow_data_by_name */" + tmp_log = self.create_tagged_logger(comment, f"name={name}, workflow_id={workflow_id}") + sql = f"SELECT {WFDataSpec.columnNames()} " f"FROM {panda_config.schemaJEDI}.workflow_data " f"WHERE name=:name " + var_map = {":name": name} + if workflow_id is not None: + sql += "AND workflow_id=:workflow_id " + var_map[":workflow_id"] = workflow_id + self.cur.execute(sql + 
comment, var_map) + res_list = self.cur.fetchall() + if res_list is not None: + if len(res_list) > 1: + tmp_log.error("more than one data; unexpected") + return None + else: + for res in res_list: + data_spec = WFDataSpec() + data_spec.pack(res) + return data_spec + else: + tmp_log.warning("no data found; skipped") + return None + def get_steps_of_workflow(self, workflow_id: int, status_filter_list: list | None = None, status_exclusion_list: list | None = None) -> list[WFStepSpec]: """ Retrieve all workflow steps for a given workflow ID diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index f688bfb66..7572a15bc 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -57,7 +57,9 @@ class WFStepStatus(object): cancelled = "cancelled" checked_statuses = (checked_true, checked_false) + to_advance_step_statuses = (registered, checking, checked_true, checked_false, pending, ready, submitted) after_submitted_statuses = (running, done, failed, cancelled) + after_submitted_uninterrupted_statuses = (running, done, failed) after_running_statuses = (done, failed, cancelled) final_statuses = (done, failed, closed, cancelled) @@ -332,6 +334,7 @@ class WFDataSpec(WorkflowBaseSpec): AttributeWithType("data_id", int), AttributeWithType("name", str), AttributeWithType("workflow_id", int), + AttributeWithType("source_step_id", int), AttributeWithType("type", str), AttributeWithType("status", str), AttributeWithType("flavor", str), diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 6cfa59f91..fa9d213f9 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -52,9 +52,7 @@ # ==== Global Parameters ======================================= -WORKFLOW_CHECK_INTERVAL_SEC = 300 -STEP_CHECK_INTERVAL_SEC = 300 -DATA_CHECK_INTERVAL_SEC = 300 +WORKFLOW_CHECK_INTERVAL_SEC = 60 # ==== Plugin Map 
============================================== @@ -746,14 +746,25 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W return process_result # All inputs are good, register outputs of the step and update step status to ready tmp_log.debug(f"All input data are good; proceeding") - if not step_spec_definition.get("is_tail"): - # is intermediate step, register their outputs as mid type - output_data_list = step_spec_definition.get("output_data_list", []) - outputs_raw_dict = step_spec_definition.get("outputs", {}) - now_time = naive_utcnow() + output_data_list = step_spec_definition.get("output_data_list", []) + outputs_raw_dict = step_spec_definition.get("outputs", {}) + now_time = naive_utcnow() + if step_spec_definition.get("is_tail"): + # Tail step, set root output source_step_id + for output_data_name in output_data_list: + data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) + if data_spec is not None: + data_spec.source_step_id = step_spec.step_id + self.tbif.update_workflow_data(data_spec) + tmp_log.debug(f"Updated output data_id={data_spec.data_id} name={output_data_name} of source_step_id={step_spec.step_id}") + else: + tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") + else: + # Intermediate step, register their outputs as mid type for output_data_name in output_data_list: data_spec = WFDataSpec() data_spec.workflow_id = step_spec.workflow_id + data_spec.source_step_id = step_spec.step_id data_spec.name = output_data_name data_spec.target_id = outputs_raw_dict.get(output_data_name, {}).get("value") # caution: may be None data_spec.status = WFDataStatus.registered @@ -944,6 +953,10 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W match step_spec.status: case WFStepStatus.registered: tmp_res = self.process_step_registered(step_spec) + case WFStepStatus.checking: + tmp_res = self.process_step_checking(step_spec) + case 
WFStepStatus.checked_true | WFStepStatus.checked_false: + tmp_res = self.process_step_checked(step_spec) case WFStepStatus.pending: tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map) case WFStepStatus.ready: @@ -1088,6 +1101,7 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce for output_name, output_dict in workflow_definition["root_outputs"].items(): data_spec = WFDataSpec() data_spec.workflow_id = workflow_spec.workflow_id + data_spec.source_step_id = None # root output data_spec.name = output_name data_spec.target_id = output_dict.get("value") data_spec.status = WFDataStatus.registered @@ -1174,16 +1188,8 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) data_status_stats = self.process_data_specs(data_specs) # Get steps in registered status - required_step_statuses = [ - WFStepStatus.registered, - WFStepStatus.checking, - WFStepStatus.checked_true, - WFStepStatus.checked_false, - WFStepStatus.pending, - WFStepStatus.ready, - WFStepStatus.submitted, - ] - over_advanced_step_statuses = [WFStepStatus.running, WFStepStatus.done, WFStepStatus.failed] + required_step_statuses = list(WFStepStatus.to_advance_step_statuses) + over_advanced_step_statuses = list(WFStepStatus.after_submitted_uninterrupted_statuses) step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=required_step_statuses) over_advanced_step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=over_advanced_step_statuses) if not step_specs: From 2f9423c03cbbcc51847b038d478250beef0865bc Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 30 Oct 2025 14:11:04 +0100 Subject: [PATCH 036/101] workflows4: pretty --- .../taskbuffer/db_proxy_mods/workflow_module.py | 3 +++ pandaserver/workflow/workflow_core.py | 16 ++++++++-------- 2 files changed, 
11 insertions(+), 8 deletions(-) diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index b164e8b0a..409474f9b 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -170,6 +170,7 @@ def get_steps_of_workflow(self, workflow_id: int, status_filter_list: list | Non antistatus_var_names_str, antistatus_var_map = get_sql_IN_bind_variables(status_exclusion_list, prefix=":antistatus") sql += f"AND status NOT IN ({antistatus_var_names_str}) " var_map.update(antistatus_var_map) + sql += "ORDER BY step_id " self.cur.execute(sql + comment, var_map) res_list = self.cur.fetchall() if res_list is not None: @@ -214,6 +215,7 @@ def get_data_of_workflow( type_var_names_str, type_var_map = get_sql_IN_bind_variables(type_filter_list, prefix=":type") sql += f"AND type IN ({type_var_names_str}) " var_map.update(type_var_map) + sql += "ORDER BY data_id " self.cur.execute(sql + comment, var_map) res_list = self.cur.fetchall() if res_list is not None: @@ -255,6 +257,7 @@ def query_workflows( antistatus_var_names_str, antistatus_var_map = get_sql_IN_bind_variables(status_exclusion_list, prefix=":antistatus") sql += f"AND status NOT IN ({antistatus_var_names_str}) " var_map.update(antistatus_var_map) + sql += "ORDER BY check_time, creation_time " self.cur.execute(sql + comment, var_map) res_list = self.cur.fetchall() if res_list is not None: diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index fa9d213f9..6e68cbade 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -601,7 +601,7 @@ def process_step_registered(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_registered workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") 
+ tmp_log = LogWrapper(logger, f"process_step_registered workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -631,7 +631,7 @@ def process_step_checking(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_checking workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log = LogWrapper(logger, f"process_step_checking workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -663,7 +663,7 @@ def process_step_checked(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_checked workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log = LogWrapper(logger, f"process_step_checked workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -705,7 +705,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_pending workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log = LogWrapper(logger, f"process_step_pending workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -756,7 +756,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W if data_spec is not None: data_spec.source_step_id = step_spec.step_id self.tbif.update_workflow_data(data_spec) - 
tmp_log.debug(f"Updated output data_id={data_spec.id} name={output_data_name} of source_step_id={step_spec.step_id}") + tmp_log.debug(f"Updated output data_id={data_spec.id} name={output_data_name} about source_step_id") else: tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") else: @@ -796,7 +796,7 @@ def process_step_ready(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_ready workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log = LogWrapper(logger, f"process_step_ready workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -838,7 +838,7 @@ def process_step_submitted(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_submitted workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log = LogWrapper(logger, f"process_step_submitted workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -888,7 +888,7 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_running workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + tmp_log = LogWrapper(logger, f"process_step_running workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() From 5de09329cfffee361e4448daf69d86e6473c03df Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 5 Nov 2025 15:42:42 +0100 Subject: [PATCH 
037/101] workflows4: fix by test --- .../jedidog/AtlasWorkflowProcessorWatchDog.py | 2 +- pandaserver/api/v1/workflow_api.py | 4 ++-- .../panda_task_step_handler.py | 2 +- pandaserver/workflow/workflow_core.py | 23 ++++++++++++++++--- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py b/pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py index e8d001e7d..5d25fe497 100644 --- a/pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py +++ b/pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py @@ -37,7 +37,7 @@ def doProcessWorkflows(self): tmpLog.debug("start") try: # watchdog lock - got_lock = self.get_process_lock("AtlasWorkflowProcessorWatchDog.doProcessWorkflows", timeLimit=1) + got_lock = self.get_process_lock("AtlasWFProcDog.doProcessWorkflows", timeLimit=1) if not got_lock: tmpLog.debug("locked by another watchdog process. Skipped") return diff --git a/pandaserver/api/v1/workflow_api.py b/pandaserver/api/v1/workflow_api.py index df5435d4b..32d38b7a8 100644 --- a/pandaserver/api/v1/workflow_api.py +++ b/pandaserver/api/v1/workflow_api.py @@ -43,7 +43,7 @@ def init_task_buffer(task_buffer: TaskBuffer) -> None: global_wfif = WorkflowInterface(global_task_buffer) -@request_validation(_logger, secure=True, production=True, request_method="POST") +@request_validation(_logger, secure=True, production=False, request_method="POST") def submit_workflow_raw_request(req: PandaRequest, params: dict | str) -> dict: """ Submit raw request of PanDA native workflow. @@ -94,7 +94,7 @@ def submit_workflow_raw_request(req: PandaRequest, params: dict | str) -> dict: return generate_response(success, message, data) -@request_validation(_logger, secure=True, production=True, request_method="POST") +@request_validation(_logger, secure=True, production=False, request_method="POST") def submit_workflow(req: PandaRequest, workflow_definition: dict) -> dict: """ Submit a PanDA native workflow. 
diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 0bde16ddc..03e08dfd7 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -176,7 +176,7 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu elif task_status in ["defined", "assigned", "activated", "starting", "ready", "pending"]: check_result.step_status = WFStepStatus.submitted elif task_status in ["done", "finished"]: - check_result.step_status = WFStepStatus.finished + check_result.step_status = WFStepStatus.done elif task_status in ["failed", "exhausted", "aborted", "toabort", "aborting", "broken", "tobroken"]: check_result.step_status = WFStepStatus.failed else: diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 6e68cbade..027ceebac 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -306,6 +306,8 @@ def process_data_registered(self, data_spec: WFDataSpec) -> WFDataProcessResult: # For now, just update status to checking data_spec.status = WFDataStatus.checking self.tbif.update_workflow_data(data_spec) + process_result.success = True + process_result.new_status = data_spec.status tmp_log.info(f"Done, status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" @@ -355,6 +357,8 @@ def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: data_spec.status = WFDataStatus.checked_exist data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) + process_result.new_status = data_spec.status + process_result.success = True tmp_log.info(f"Done, status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" @@ -404,6 +408,8 @@ def process_data_checked(self, data_spec: WFDataSpec) 
-> WFDataProcessResult: data_spec.check_time = now_time data_spec.end_time = now_time self.tbif.update_workflow_data(data_spec) + process_result.success = True + process_result.new_status = data_spec.status tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" @@ -473,6 +479,7 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) + process_result.success = True if data_spec.status == original_status: tmp_log.info(f"Done, status stays {data_spec.status}") else: @@ -532,6 +539,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) + process_result.success = True tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" @@ -558,6 +566,7 @@ def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: tmp_log.warning(f"Failed to acquire lock for data_id={data_spec.data_id}; skipped") continue data_spec = locked_data_spec + orig_status = data_spec.status # Process the data tmp_res = None match data_spec.status: @@ -576,7 +585,7 @@ def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: continue if tmp_res and tmp_res.success: # update stats - if tmp_res.new_status and tmp_res.new_status != data_spec.status: + if tmp_res.new_status and data_spec.status != orig_status: data_status_stats["changed"].setdefault(data_spec.status, 0) data_status_stats["changed"][data_spec.status] += 1 else: @@ -614,6 +623,8 @@ def process_step_registered(self, step_spec: WFStepSpec) -> 
WFStepProcessResult: try: step_spec.status = WFStepStatus.checking self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = step_spec.status tmp_log.info(f"Done, status={step_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" @@ -646,6 +657,8 @@ def process_step_checking(self, step_spec: WFStepSpec) -> WFStepProcessResult: if True: step_spec.status = WFStepStatus.checked_true self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = step_spec.status tmp_log.info(f"Done, status={step_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" @@ -687,6 +700,8 @@ def process_step_checked(self, step_spec: WFStepSpec) -> WFStepProcessResult: step_spec.status = WFStepStatus.closed step_spec.check_time = now_time self.tbif.update_workflow_step(step_spec) + process_result.success = True + process_result.new_status = step_spec.status tmp_log.info(f"Done, from {original_status} to status={step_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" @@ -948,6 +963,7 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped") continue step_spec = locked_step_spec + orig_status = step_spec.status # Process the step tmp_res = None match step_spec.status: @@ -970,7 +986,7 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W continue if tmp_res and tmp_res.success: # update stats - if tmp_res.new_status and tmp_res.new_status != step_spec.status: + if tmp_res.new_status and step_spec.status != orig_status: steps_status_stats["changed"].setdefault(step_spec.status, 0) steps_status_stats["changed"][step_spec.status] += 1 else: @@ -1362,11 +1378,12 @@ def process_active_workflows(self) -> Dict: tmp_log.warning(f"Failed to acquire lock for 
workflow_id={workflow_spec.workflow_id}; skipped") continue workflow_spec = locked_workflow_spec + orig_status = workflow_spec.status # Process the workflow tmp_res = self.process_workflow(workflow_spec) if tmp_res and tmp_res.success: # update stats - if tmp_res.new_status and tmp_res.new_status != workflow_spec.status: + if tmp_res.new_status and workflow_spec.status != orig_status: workflows_status_stats["changed"].setdefault(workflow_spec.status, 0) workflows_status_stats["changed"][workflow_spec.status] += 1 else: From afc6e93adb9bca0f4af4acbeba937d5aaded1f0d Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 5 Nov 2025 17:36:42 +0100 Subject: [PATCH 038/101] workflows4: data target suffix about output_types --- pandaserver/workflow/workflow_core.py | 5 ++++- pandaserver/workflow/workflow_parser.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 027ceebac..df707a035 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -763,6 +763,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W tmp_log.debug(f"All input data are good; proceeding") output_data_list = step_spec_definition.get("output_data_list", []) outputs_raw_dict = step_spec_definition.get("outputs", {}) + output_types = step_spec_definition.get("output_types", []) now_time = naive_utcnow() if step_spec_definition.get("is_tail"): # Tail step, set root output source_step_id @@ -771,7 +772,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W if data_spec is not None: data_spec.source_step_id = step_spec.step_id self.tbif.update_workflow_data(data_spec) - tmp_log.debug(f"Updated output data_id={data_spec.id} name={output_data_name} about source_step_id") + tmp_log.debug(f"Updated output data_id={data_spec.data_id} name={output_data_name} about source_step_id") else: tmp_log.warning(f"Output data 
{output_data_name} not found in workflow data; skipped") else: @@ -782,6 +783,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W data_spec.source_step_id = step_spec.step_id data_spec.name = output_data_name data_spec.target_id = outputs_raw_dict.get(output_data_name, {}).get("value") # caution: may be None + data_spec.set_parameter("output_types", output_types) data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.mid data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable @@ -1120,6 +1122,7 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce data_spec.source_step_id = None # root output data_spec.name = output_name data_spec.target_id = output_dict.get("value") + data_spec.set_parameter("output_types", output_dict.get("output_types")) data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.output data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable diff --git a/pandaserver/workflow/workflow_parser.py b/pandaserver/workflow/workflow_parser.py index 8281ed5b9..1452f9c83 100644 --- a/pandaserver/workflow/workflow_parser.py +++ b/pandaserver/workflow/workflow_parser.py @@ -151,6 +151,8 @@ def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tu nodes_list.append(vars(node)) if node.is_tail: root_outputs_dict.update(node.outputs) + for out_val in root_outputs_dict.values(): + out_val["output_types"] = node.output_types # workflow definition workflow_definition_dict = { "workflow_name": workflow_name, From d59339b6f28c42b91f85d5f1407d9c88fc3754bc Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 7 Nov 2025 10:32:26 +0100 Subject: [PATCH 039/101] workflows4: fix data status --- pandaserver/workflow/workflow_core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 
df707a035..b2eb3272e 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -403,8 +403,8 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) case WFDataStatus.checked_exist: - # Data already fully exist, advance to done_exist - data_spec.status = WFDataStatus.done_exist + # Data already fully exist, advance to done_skipped + data_spec.status = WFDataStatus.done_skipped data_spec.check_time = now_time data_spec.end_time = now_time self.tbif.update_workflow_data(data_spec) @@ -464,8 +464,8 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: elif original_status == WFDataStatus.generating_ready: match check_result.check_status: case WFDataTargetCheckStatus.exist: - # Data fully exist, advance to final status done_exist - data_spec.status = WFDataStatus.done_exist + # Data fully exist, advance to final status done_generated + data_spec.status = WFDataStatus.done_generated process_result.new_status = data_spec.status data_spec.end_time = now_time case WFDataTargetCheckStatus.partex: From c45228da874a2d48da36e133950cb06cb1ac215f Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 10 Nov 2025 10:48:49 +0100 Subject: [PATCH 040/101] workflows4: add panda task data handler, rename data status --- pandaserver/dataservice/ddm.py | 4 +- .../ddm_collection_data_handler.py | 9 +- .../panda_task_data_handler.py | 127 ++++++++++++++++++ pandaserver/workflow/workflow_base.py | 18 ++- pandaserver/workflow/workflow_core.py | 73 +++++++--- 5 files changed, 197 insertions(+), 34 deletions(-) create mode 100644 pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py diff --git a/pandaserver/dataservice/ddm.py b/pandaserver/dataservice/ddm.py index 7578bf228..b400dd70b 100755 --- a/pandaserver/dataservice/ddm.py +++ b/pandaserver/dataservice/ddm.py @@ -1093,7 +1093,7 @@ def get_dataset_metadata(self, 
dataset_name, ignore_missing=False): return None # get files in dataset - def get_files_in_dataset(self, dataset_name, ski_duplicate=True, ignore_unknown=False, long_format=False, lfn_only=False): + def get_files_in_dataset(self, dataset_name, skip_duplicate=True, ignore_unknown=False, long_format=False, lfn_only=False): method_name = "get_files_in_dataset" method_name += f" " tmp_log = LogWrapper(_logger, method_name) @@ -1131,7 +1131,7 @@ def get_files_in_dataset(self, dataset_name, ski_duplicate=True, ignore_unknown= guid = str(f"{x['guid'][0:8]}-{x['guid'][8:12]}-{x['guid'][12:16]}-{x['guid'][16:20]}-{x['guid'][20:32]}") attrs["guid"] = guid # skip duplicated files - if ski_duplicate: + if skip_duplicate: # extract base LFN and attempt number baseLFN = re.sub("(\.(\d+))$", "", lfn) attNr = re.sub(baseLFN + "\.*", "", lfn) diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py index 755ee0724..f640bab70 100644 --- a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -88,11 +88,14 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu return check_result match collection_meta.get("state"): case DDMCollectionState.missing: - check_result.check_status = WFDataTargetCheckStatus.nonex + check_result.check_status = WFDataTargetCheckStatus.nonexist case DDMCollectionState.open: - check_result.check_status = WFDataTargetCheckStatus.partex + if collection_meta.get("length", 0) == 0: + check_result.check_status = WFDataTargetCheckStatus.insuff + else: + check_result.check_status = WFDataTargetCheckStatus.partial case DDMCollectionState.closed: - check_result.check_status = WFDataTargetCheckStatus.exist + check_result.check_status = WFDataTargetCheckStatus.complete check_result.metadata = collection_meta check_result.success = True 
tmp_log.info(f"Got collection {collection} check_status={check_result.check_status}") diff --git a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py new file mode 100644 index 000000000..3a223398f --- /dev/null +++ b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py @@ -0,0 +1,127 @@ +import json +import traceback +import uuid + +from pandacommon.pandalogger.LogWrapper import LogWrapper +from pandacommon.pandalogger.PandaLogger import PandaLogger + +from pandaserver.dataservice.ddm import rucioAPI +from pandaserver.workflow.data_handler_plugins.base_data_handler import BaseDataHandler +from pandaserver.workflow.workflow_base import ( + WFDataSpec, + WFDataStatus, + WFDataTargetCheckResult, + WFDataTargetCheckStatus, + WFDataType, + WFStepSpec, + WFStepStatus, + WFStepType, + WorkflowSpec, + WorkflowStatus, +) + +# main logger +logger = PandaLogger().getLogger(__name__.split(".")[-1]) + + +class DDMCollectionDIDType: + """ + Data Identifier Types for DDM Collections + """ + + DATASET = "DATASET" + CONTAINER = "CONTAINER" + + +class DDMCollectionState: + """ + States for DDM Collections + """ + + open = "open" + closed = "closed" + missing = "missing" + + +class PandaTaskDataHandler(BaseDataHandler): + """ + Handler for PanDA task intermediate/output data in the workflow. + This class is responsible for managing the data generated by PanDA task within a workflow. + The output data from a PanDA task is usually a DDM container, which remains open even after the task completion. + Thus, the handler not only checks the status of the DDM collection to determine if there are files available, but also verifies the step status of source workflow steps to ensure that the data generation process has been completed successfully. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the data handler with necessary parameters. 
+ """ + # Initialize base class or any required modules here + super().__init__(*args, **kwargs) + self.ddmIF = rucioAPI + self.plugin_flavor = "panda_task" + + def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: + """ + Check the status of the PanDA task data target. + This method should be implemented to handle the specifics of PanDA task data status checking. + + Args: + data_spec (WFDataSpec): The data specification containing details about the data to be checked. + **kwargs: Additional keyword arguments that may be required for checking. + + Returns: + WFDataTargetCheckResult: An object containing the result of the check, including success status, current data status, and message. + """ + tmp_log = LogWrapper(logger, f"check_target workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + # Initialize + check_result = WFDataTargetCheckResult() + # Check data flavor + if data_spec.flavor != self.plugin_flavor: + tmp_log.warning(f"flavor={data_spec.flavor} not {self.plugin_flavor}; skipped") + check_result.message = f"flavor not {self.plugin_flavor}; skipped" + return check_result + # Check source step status + source_step_spec = self.tbif.get_workflow_step_spec(data_spec.source_step_id) + if source_step_spec is None: + check_result.success = False + check_result.message = f"Failed to get source step spec for step_id={data_spec.source_step_id}" + tmp_log.error(f"{check_result.message}") + return check_result + if source_step_spec.status == WFStepStatus.done: + # Source step done; consider data fully available + check_result.success = True + check_result.check_status = WFDataTargetCheckStatus.complete + tmp_log.info(f"Source step step_id={source_step_spec.step_id} done, data considered fully available; check_status={check_result.check_status}") + return check_result + elif source_step_spec.status in WFStepStatus.final_statuses: + # Source step in final status but not done; skip data availability + check_result.success = True +
check_result.message = f"Source step step_id={source_step_spec.step_id} {source_step_spec.status}; skip data availability check" + tmp_log.warning(f"{check_result.message}") + return check_result + else: + # Source step not terminated; check number of files in DDM collection + collection = data_spec.target_id + tmp_stat, tmp_res = self.ddmIF.get_number_of_files(collection) + if tmp_stat is None: + # Collection does not exist + check_result.success = True + check_result.check_status = WFDataTargetCheckStatus.nonexist + tmp_log.info(f"Collection {collection} does not exist; check_status={check_result.check_status}") + return check_result + elif not tmp_stat: + # Error in getting number of files + check_result.success = False + check_result.message = f"Failed to get number of files for collection {collection}: {tmp_res}" + tmp_log.error(f"{check_result.message}") + return check_result + # Check number of files + n_files = tmp_res + if n_files == 0: + check_result.check_status = WFDataTargetCheckStatus.insuff + else: + # At least 1 file is sufficient for step input + check_result.check_status = WFDataTargetCheckStatus.partial + check_result.success = True + tmp_log.info(f"Got collection {collection} n_files={n_files}; check_status={check_result.check_status}") + return check_result diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 7572a15bc..2937584b5 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -71,11 +71,13 @@ class WFDataStatus(object): registered = "registered" checking = "checking" - checked_nonex = "checked_nonex" - checked_partex = "checked_partex" - checked_exist = "checked_exist" + checked_nonexist = "checked_nonexist" # data does not exist + checked_insuff = "checked_insuff" # data available but insufficient to be step input + checked_partial = "checked_partial" # data partially available and sufficient to be step input + checked_complete = "checked_complete" # data
completely available generating_start = "generating_start" generating_ready = "generating_ready" + waiting_unready = "waiting_unready" waiting_ready = "waiting_ready" done_generated = "done_generated" done_waited = "done_waited" @@ -83,7 +85,8 @@ class WFDataStatus(object): cancelled = "cancelled" retired = "retired" - checked_statuses = (checked_nonex, checked_partex, checked_exist) + checked_statuses = (checked_nonexist, checked_partial, checked_complete) + waiting_statuses = (waiting_unready, waiting_ready) generating_statues = (generating_start, generating_ready) done_statuses = (done_generated, done_waited, done_skipped) good_input_statuses = (generating_ready, waiting_ready, done_generated, done_waited, done_skipped) @@ -478,9 +481,10 @@ class WFDataTargetCheckStatus: Possible statuses returned by data target check """ - exist = "exist" # data fully exists - partex = "partex" # data partially exists - nonex = "nonex" # data does not exist + complete = "complete" # data completely exists + partial = "partial" # data partially exists + insuff = "insuff" # data exists but insufficient to be step input + nonexist = "nonexist" # data does not exist @dataclass(slots=True) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index b2eb3272e..8e7bb54de 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -63,6 +63,8 @@ }, "data_handler": { "ddm_collection": ("ddm_collection_data_handler", "DDMCollectionDataHandler"), + "panda_task": ("panda_task_data_handler", "PandaTaskDataHandler"), + # Add more data handler plugins here }, # Add more plugin types here } @@ -349,12 +351,12 @@ def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: # Update data status now_time = naive_utcnow() match check_result.check_status: - case WFDataTargetCheckStatus.nonex: - data_spec.status = WFDataStatus.checked_nonex - case WFDataTargetCheckStatus.partex: - data_spec.status = 
WFDataStatus.checked_partex - case WFDataTargetCheckStatus.exist: - data_spec.status = WFDataStatus.checked_exist + case WFDataTargetCheckStatus.nonexist: + data_spec.status = WFDataStatus.checked_nonexist + case WFDataTargetCheckStatus.partial: + data_spec.status = WFDataStatus.checked_partial + case WFDataTargetCheckStatus.complete: + data_spec.status = WFDataStatus.checked_complete data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) process_result.new_status = data_spec.status @@ -381,7 +383,7 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: # Initialize process_result = WFDataProcessResult() # Check status - if data_spec.status not in (WFDataStatus.checked_nonex, WFDataStatus.checked_partex, WFDataStatus.checked_exist): + if data_spec.status not in (WFDataStatus.checked_nonexist, WFDataStatus.checked_partial, WFDataStatus.checked_complete): process_result.message = f"Data status changed unexpectedly from checked_* to {data_spec.status}; skipped" tmp_log.warning(f"{process_result.message}") return process_result @@ -391,18 +393,23 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: # Update data status based on check result now_time = naive_utcnow() match data_spec.status: - case WFDataStatus.checked_nonex: + case WFDataStatus.checked_nonexist: # Data does not exist, advance to generating_start data_spec.status = WFDataStatus.generating_start data_spec.check_time = now_time data_spec.start_time = now_time self.tbif.update_workflow_data(data_spec) - case WFDataStatus.checked_partex: + case WFDataStatus.checked_insuff: + # Data insufficient, advance to waiting_unready + data_spec.status = WFDataStatus.waiting_unready + data_spec.check_time = now_time + self.tbif.update_workflow_data(data_spec) + case WFDataStatus.checked_partial: # Data partially exist, advance to waiting_ready data_spec.status = WFDataStatus.waiting_ready data_spec.check_time = now_time 
self.tbif.update_workflow_data(data_spec) - case WFDataStatus.checked_exist: + case WFDataStatus.checked_complete: # Data already fully exist, advance to done_skipped data_spec.status = WFDataStatus.done_skipped data_spec.check_time = now_time @@ -451,11 +458,11 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: now_time = naive_utcnow() if original_status == WFDataStatus.generating_start: match check_result.check_status: - case WFDataTargetCheckStatus.partex | WFDataTargetCheckStatus.exist: + case WFDataTargetCheckStatus.partial | WFDataTargetCheckStatus.complete: # Data exist, advance to generating_ready data_spec.status = WFDataStatus.generating_ready process_result.new_status = data_spec.status - case WFDataTargetCheckStatus.nonex: + case WFDataTargetCheckStatus.nonexist: # Data not yet exist, stay in generating_start pass case _: @@ -463,15 +470,15 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") elif original_status == WFDataStatus.generating_ready: match check_result.check_status: - case WFDataTargetCheckStatus.exist: + case WFDataTargetCheckStatus.complete: # Data fully exist, advance to final status done_generated data_spec.status = WFDataStatus.done_generated process_result.new_status = data_spec.status data_spec.end_time = now_time - case WFDataTargetCheckStatus.partex: + case WFDataTargetCheckStatus.partial: # Data still partially exist, stay in generating_ready pass - case WFDataTargetCheckStatus.nonex: + case WFDataTargetCheckStatus.nonexist: # Data not exist anymore, unexpected, log and skip tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") case _: @@ -505,7 +512,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: # Initialize process_result = WFDataProcessResult() # Check status - if data_spec.status not in 
(WFDataStatus.waiting_ready,): + if data_spec.status not in WFDataStatus.waiting_statuses: process_result.message = f"Data status changed unexpectedly from waiting_* to {data_spec.status}; skipped" tmp_log.warning(f"{process_result.message}") return process_result @@ -524,15 +531,37 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: now_time = naive_utcnow() if original_status == WFDataStatus.waiting_ready: match check_result.check_status: - case WFDataTargetCheckStatus.exist: + case WFDataTargetCheckStatus.complete: # Data fully exist, advance to final status done_waited data_spec.status = WFDataStatus.done_waited process_result.new_status = data_spec.status data_spec.end_time = now_time - case WFDataTargetCheckStatus.partex: + case WFDataTargetCheckStatus.partial: # Data still partially exist, stay in waiting_ready pass - case WFDataTargetCheckStatus.nonex: + case WFDataTargetCheckStatus.insuff: + # Data not sufficient anymore, unexpected, log and skip + tmp_log.warning(f"Data are not sufficient anymore, unexpected; skipped") + case WFDataTargetCheckStatus.nonexist: + # Data not exist anymore, unexpected, log and skip + tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") + case _: + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") + elif original_status == WFDataStatus.waiting_unready: + match check_result.check_status: + case WFDataTargetCheckStatus.partial: + # Data partially exist, advance to waiting_ready + data_spec.status = WFDataStatus.waiting_ready + process_result.new_status = data_spec.status + case WFDataTargetCheckStatus.complete: + # Data fully exist, advance to final status done_waited + data_spec.status = WFDataStatus.done_waited + process_result.new_status = data_spec.status + data_spec.end_time = now_time + case WFDataTargetCheckStatus.insuff: + # Data still insufficient, stay in waiting_unready + pass + case WFDataTargetCheckStatus.nonexist: # Data 
not exist anymore, unexpected, log and skip tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") case _: @@ -574,7 +603,7 @@ def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: tmp_res = self.process_data_registered(data_spec) case WFDataStatus.checking: tmp_res = self.process_data_checking(data_spec) - case WFDataStatus.checked_nonex | WFDataStatus.checked_partex | WFDataStatus.checked_exist: + case WFDataStatus.checked_nonexist | WFDataStatus.checked_partial | WFDataStatus.checked_complete: tmp_res = self.process_data_checked(data_spec) case WFDataStatus.generating_start | WFDataStatus.generating_ready: tmp_res = self.process_data_generating(data_spec) @@ -786,7 +815,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W data_spec.set_parameter("output_types", output_types) data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.mid - data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable + data_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable data_spec.creation_time = now_time self.tbif.insert_workflow_data(data_spec) tmp_log.debug(f"Registered mid data {output_data_name} of step_id={step_spec.step_id}") @@ -1125,7 +1154,7 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce data_spec.set_parameter("output_types", output_dict.get("output_types")) data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.output - data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable + data_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable data_spec.creation_time = now_time data_specs.append(data_spec) # Register steps based on nodes in the definition From 7fe46da66cc98ff90cfa738e564b051e56723a93 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 10 Nov 2025 23:29:58 +0100 Subject: [PATCH 041/101] workflows4: fixes --- 
.../panda_task_data_handler.py | 54 +++++++++++-------- pandaserver/workflow/workflow_core.py | 10 ++-- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py index 3a223398f..336987b13 100644 --- a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py @@ -81,43 +81,55 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu check_result.message = f"flavor not {self.plugin_flavor}; skipped" return check_result # Check source step status - source_step_spec = self.tbif.get_workflow_step_spec(data_spec.source_step_id) + if data_spec.source_step_id is None: + check_result.success = True + check_result.message = "No source step yet; skipped" + tmp_log.warning(f"{check_result.message}") + return check_result + source_step_spec = self.tbif.get_workflow_step(data_spec.source_step_id) if source_step_spec is None: check_result.success = False - check_result.message = f"Failed to get source step spec for step_id={data_spec.source_step_id}" + check_result.message = f"Failed to get source step step_id={data_spec.source_step_id}; skipped" tmp_log.error(f"{check_result.message}") return check_result if source_step_spec.status == WFStepStatus.done: # Source step done; consider data fully available check_result.success = True check_result.check_status = WFDataTargetCheckStatus.complete - tmp_log.info(f"Source step step_id={source_step_spec.id} done, data considered fully available; check_status={check_result.check_status}") + tmp_log.info(f"Source step step_id={source_step_spec.step_id} done, data considered fully available; check_status={check_result.check_status}") return check_result elif source_step_spec.status in WFStepStatus.final_statuses: # Source step in final status but not done; skip data availability 
check_result.success = True - check_result.message = f"Source step step_id={source_step_spec.id} {source_step_spec.status}; skip data availability check" + check_result.message = f"Source step step_id={source_step_spec.step_id} {source_step_spec.status}; skip data availability check" tmp_log.warning(f"{check_result.message}") return check_result else: - # Source step not terminated; check number of files in DDM collection - collection = data_spec.target_id - tmp_stat, tmp_res = self.ddmIF.get_number_of_files(collection) - if tmp_stat is None: - # Collection does not exist - check_result.success = True - check_result.check_status = WFDataTargetCheckStatus.nonexist - tmp_log.info(f"Collection {collection} does not exist; check_status={check_result.check_status}") - return check_result - elif not tmp_stat: - # Error in getting number of files - check_result.success = False - check_result.message = f"Failed to get number of files for collection {collection}: {tmp_res}" - tmp_log.error(f"{check_result.message}") - return check_result + # Source step not terminated; check number of files in DDM collections + total_n_files = 0 + none_exist = True + output_types = data_spec.get_parameter("output_types") + if output_types is None: + output_types = [] + for output_type in output_types: + collection = f"{data_spec.target_id}_{output_type}" + tmp_stat, tmp_res = self.ddmIF.get_number_of_files(collection) + if tmp_stat is None: + tmp_log.debug(f"Collection {collection} does not exist") + elif not tmp_stat: + # Error in getting number of files + check_result.success = False + check_result.message = f"Failed to get number of files for collection {collection}: {tmp_res}" + tmp_log.error(f"{check_result.message}") + return check_result + else: + none_exist = False + n_files = tmp_res + total_n_files += n_files # Check number of files - n_files = tmp_res - if n_files == 0: + if none_exist: + check_result.check_status = WFDataTargetCheckStatus.nonexist + elif total_n_files == 0: 
check_result.check_status = WFDataTargetCheckStatus.insuff else: # At least 1 file is sufficient for step input diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 8e7bb54de..1c45b7945 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -313,7 +313,7 @@ def process_data_registered(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.info(f"Done, status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" - tmp_log.error(f"{process_result.message}") + tmp_log.error(f"{traceback.format_exc()}") return process_result def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: @@ -364,7 +364,7 @@ def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.info(f"Done, status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" - tmp_log.error(f"{process_result.message}") + tmp_log.error(f"{traceback.format_exc()}") return process_result def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: @@ -420,7 +420,7 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" - tmp_log.error(f"{process_result.message}") + tmp_log.error(f"{traceback.format_exc()}") return process_result def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: @@ -493,7 +493,7 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" - tmp_log.error(f"{process_result.message}") + tmp_log.error(f"{traceback.format_exc()}") return process_result def process_data_waiting(self, data_spec: WFDataSpec) -> 
WFDataProcessResult: @@ -572,7 +572,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" - tmp_log.error(f"{process_result.message}") + tmp_log.error(f"{traceback.format_exc()}") return process_result def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: From 1f29d5ab75009424a27a92fece5a6dfc29ec9e96 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 11 Nov 2025 16:12:35 +0100 Subject: [PATCH 042/101] workflows4: fix --- .../data_handler_plugins/panda_task_data_handler.py | 3 ++- pandaserver/workflow/workflow_base.py | 1 + pandaserver/workflow/workflow_core.py | 6 +++--- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py index 336987b13..88bc380a0 100644 --- a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py @@ -126,6 +126,7 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu none_exist = False n_files = tmp_res total_n_files += n_files + tmp_log.debug(f"Got collection {collection} n_files={n_files}") # Check number of files if none_exist: check_result.check_status = WFDataTargetCheckStatus.nonexist @@ -135,5 +136,5 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu # At least 1 file is sufficient for step input check_result.check_status = WFDataTargetCheckStatus.partial check_result.success = True - tmp_log.info(f"Got collection {collection} n_files={n_files}; check_status={check_result.check_status}") + tmp_log.info(f"Got total_n_files={total_n_files}; check_status={check_result.check_status}") return check_result diff --git a/pandaserver/workflow/workflow_base.py 
b/pandaserver/workflow/workflow_base.py index 2937584b5..db8f907f4 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -94,6 +94,7 @@ class WFDataStatus(object): after_generating_start_statuses = (generating_ready, done_generated, cancelled) after_generating_ready_statuses = (done_generated, cancelled) after_waiting_ready_statuses = (done_waited, cancelled) + terminated_statuses = (done_generated, done_waited, done_skipped, cancelled, retired) # ==== Types =================================================== diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 1c45b7945..a5df8d1d6 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -1233,7 +1233,7 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc # Process try: # Process data specs first - data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) + data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFDataStatus.terminated_statuses)) data_status_stats = self.process_data_specs(data_specs) # Get steps in registered status required_step_statuses = list(WFStepStatus.to_advance_step_statuses) @@ -1299,10 +1299,10 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce # Process try: # Process data specs first - data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) + data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFDataStatus.terminated_statuses)) data_status_stats = self.process_data_specs(data_specs) # Get steps - step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id) + step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFStepStatus.final_statuses)) if not 
step_specs: process_result.message = f"No step in required status; skipped" tmp_log.warning(f"{process_result.message}") From 7b056f46c86081ad9cf81be11aa488689928ce78 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 11 Nov 2025 17:42:24 +0100 Subject: [PATCH 043/101] workflows4: fixes --- pandaserver/dataservice/ddm.py | 11 +++++++++-- .../step_handler_plugins/panda_task_step_handler.py | 2 +- pandaserver/workflow/workflow_core.py | 8 +++++--- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pandaserver/dataservice/ddm.py b/pandaserver/dataservice/ddm.py index b400dd70b..b3c4282b3 100755 --- a/pandaserver/dataservice/ddm.py +++ b/pandaserver/dataservice/ddm.py @@ -791,21 +791,28 @@ def get_number_of_files(self, dataset_name: str, preset_scope: str = None): Tuple[bool, Union[int, str]]: A tuple containing a boolean indicating the success of the operation and the number of files or an error message. If an exception occurs, the boolean is False and the string contains the error message. 
""" + # make logger + method_name = "get_number_of_files" + method_name = f"{method_name} dataset_name={dataset_name}" + tmp_log = LogWrapper(_logger, method_name) + tmp_log.debug("start") # extract scope from dataset scope, dataset_name = self.extract_scope(dataset_name) if preset_scope is not None: scope = preset_scope - client = RucioClient() - n_files = 0 try: + client = RucioClient() + n_files = 0 for _ in client.list_files(scope, dataset_name): n_files += 1 return True, n_files except DataIdentifierNotFound: + tmp_log.debug("dataset not found") return None, "dataset not found" except Exception: err_type, err_value = sys.exc_info()[:2] err_msg = f"{err_type.__name__} {err_value}" + tmp_log.error(f"got error ; {traceback.format_exc()}") return False, err_msg # list datasets with GUIDs diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 03e08dfd7..26ff4a788 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -171,7 +171,7 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu task_status = res[0] check_result.success = True check_result.native_status = task_status - if task_status in ["running", "transferring", "transferred", "merging"]: + if task_status in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: check_result.step_status = WFStepStatus.running elif task_status in ["defined", "assigned", "activated", "starting", "ready", "pending"]: check_result.step_status = WFStepStatus.submitted diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index a5df8d1d6..67c6b4110 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -1234,7 +1234,8 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> 
WorkflowProc try: # Process data specs first data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFDataStatus.terminated_statuses)) - data_status_stats = self.process_data_specs(data_specs) + if data_specs: + data_status_stats = self.process_data_specs(data_specs) # Get steps in registered status required_step_statuses = list(WFStepStatus.to_advance_step_statuses) over_advanced_step_statuses = list(WFStepStatus.after_submitted_uninterrupted_statuses) @@ -1300,9 +1301,10 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce try: # Process data specs first data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFDataStatus.terminated_statuses)) - data_status_stats = self.process_data_specs(data_specs) + if data_specs: + data_status_stats = self.process_data_specs(data_specs) # Get steps - step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFStepStatus.final_statuses)) + step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id) if not step_specs: process_result.message = f"No step in required status; skipped" tmp_log.warning(f"{process_result.message}") From 9bd6f8eb90fde25e2ca2ba421b114026377a8992 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 12 Nov 2025 15:40:59 +0100 Subject: [PATCH 044/101] workflows4: fix about data status --- pandaserver/workflow/workflow_base.py | 5 +- pandaserver/workflow/workflow_core.py | 85 ++++++++++++++----- templates/sysconfig/panda_jedi.template | 4 +- .../panda_server.sysconfig.rpmnew.template | 2 +- .../panda_server_env.systemd.rpmnew.template | 2 +- 5 files changed, 73 insertions(+), 25 deletions(-) diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index db8f907f4..a4043d966 100644 --- a/pandaserver/workflow/workflow_base.py +++ 
b/pandaserver/workflow/workflow_base.py @@ -76,6 +76,7 @@ class WFDataStatus(object): checked_partial = "checked_partial" # data partially available and sufficient to be step input checked_complete = "checked_complete" # data completely available generating_start = "generating_start" + generating_unready = "generating_unready" generating_ready = "generating_ready" waiting_unready = "waiting_unready" waiting_ready = "waiting_ready" @@ -85,9 +86,9 @@ class WFDataStatus(object): cancelled = "cancelled" retired = "retired" - checked_statuses = (checked_nonexist, checked_partial, checked_complete) + checked_statuses = (checked_nonexist, checked_insuff, checked_partial, checked_complete) + generating_statuses = (generating_start, generating_unready, generating_ready) waiting_statuses = (waiting_unready, waiting_ready) - generating_statues = (generating_start, generating_ready) done_statuses = (done_generated, done_waited, done_skipped) good_input_statuses = (generating_ready, waiting_ready, done_generated, done_waited, done_skipped) good_output_statuses = (done_generated, done_waited, done_skipped) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 67c6b4110..cf1644ded 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -344,7 +344,13 @@ def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: data_handler = self.get_plugin("data_handler", data_spec.flavor) # Check the data status check_result = data_handler.check_target(data_spec) - if not check_result.success or check_result.check_status is None: + if check_result.success and check_result.check_status is None: + # No status change + process_result.message = f"Skipped; {check_result.message}" + tmp_log.debug(f"{process_result.message}") + process_result.success = True + return process_result + elif not check_result.success or check_result.check_status is None: process_result.message = f"Failed to check data; 
{check_result.message}" tmp_log.error(f"{process_result.message}") return process_result @@ -353,6 +359,8 @@ def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: match check_result.check_status: case WFDataTargetCheckStatus.nonexist: data_spec.status = WFDataStatus.checked_nonexist + case WFDataTargetCheckStatus.insuff: + data_spec.status = WFDataStatus.checked_insuff case WFDataTargetCheckStatus.partial: data_spec.status = WFDataStatus.checked_partial case WFDataTargetCheckStatus.complete: @@ -383,7 +391,7 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: # Initialize process_result = WFDataProcessResult() # Check status - if data_spec.status not in (WFDataStatus.checked_nonexist, WFDataStatus.checked_partial, WFDataStatus.checked_complete): + if data_spec.status not in WFDataStatus.checked_statuses: process_result.message = f"Data status changed unexpectedly from checked_* to {data_spec.status}; skipped" tmp_log.warning(f"{process_result.message}") return process_result @@ -439,7 +447,7 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: # Initialize process_result = WFDataProcessResult() # Check status - if data_spec.status not in (WFDataStatus.generating_start, WFDataStatus.generating_ready): + if data_spec.status not in WFDataStatus.generating_statuses: process_result.message = f"Data status changed unexpectedly from generating_* to {data_spec.status}; skipped" tmp_log.warning(f"{process_result.message}") return process_result @@ -450,7 +458,13 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: data_handler = self.get_plugin("data_handler", data_spec.flavor) # Check the data status check_result = data_handler.check_target(data_spec) - if not check_result.success or check_result.check_status is None: + if check_result.success and check_result.check_status is None: + # No status change + process_result.message = f"Skipped; 
{check_result.message}" + tmp_log.debug(f"{process_result.message}") + process_result.success = True + return process_result + elif not check_result.success or check_result.check_status is None: process_result.message = f"Failed to check data; {check_result.message}" tmp_log.error(f"{process_result.message}") return process_result @@ -462,12 +476,31 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: # Data exist, advance to generating_ready data_spec.status = WFDataStatus.generating_ready process_result.new_status = data_spec.status + case WFDataTargetCheckStatus.insuff: + # Data insufficient, move to generating_unready + data_spec.status = WFDataStatus.generating_unready + process_result.new_status = data_spec.status case WFDataTargetCheckStatus.nonexist: # Data not yet exist, stay in generating_start pass case _: # Unexpected status, log and skip tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") + elif original_status == WFDataStatus.generating_unready: + match check_result.check_status: + case WFDataTargetCheckStatus.partial | WFDataTargetCheckStatus.complete: + # Data now exist, advance to generating_ready + data_spec.status = WFDataStatus.generating_ready + process_result.new_status = data_spec.status + case WFDataTargetCheckStatus.insuff: + # Data still insufficient, stay in generating_unready + pass + case WFDataTargetCheckStatus.nonexist: + # Data not exist anymore, unexpected, log and skip + tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") + case _: + # Unexpected status, log and skip + tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") elif original_status == WFDataStatus.generating_ready: match check_result.check_status: case WFDataTargetCheckStatus.complete: @@ -478,6 +511,9 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: case WFDataTargetCheckStatus.partial: # 
Data still partially exist, stay in generating_ready pass + case WFDataTargetCheckStatus.insuff: + # Data not sufficient anymore, unexpected, log and skip + tmp_log.warning(f"Data are not sufficient anymore, unexpected; skipped") case WFDataTargetCheckStatus.nonexist: # Data not exist anymore, unexpected, log and skip tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") @@ -523,7 +559,13 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: data_handler = self.get_plugin("data_handler", data_spec.flavor) # Check the data status check_result = data_handler.check_target(data_spec) - if not check_result.success or check_result.check_status is None: + if check_result.success and check_result.check_status is None: + # No status change + process_result.message = f"Skipped; {check_result.message}" + tmp_log.debug(f"{process_result.message}") + process_result.success = True + return process_result + elif not check_result.success or check_result.check_status is None: process_result.message = f"Failed to check data; {check_result.message}" tmp_log.error(f"{process_result.message}") return process_result @@ -569,7 +611,10 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) process_result.success = True - tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") + if data_spec.status == original_status: + tmp_log.info(f"Done, status stays {data_spec.status}") + else: + tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" tmp_log.error(f"{traceback.format_exc()}") @@ -598,20 +643,20 @@ def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: orig_status = data_spec.status # Process the data tmp_res = None - match data_spec.status: - case WFDataStatus.registered: - tmp_res = self.process_data_registered(data_spec) - 
case WFDataStatus.checking: - tmp_res = self.process_data_checking(data_spec) - case WFDataStatus.checked_nonexist | WFDataStatus.checked_partial | WFDataStatus.checked_complete: - tmp_res = self.process_data_checked(data_spec) - case WFDataStatus.generating_start | WFDataStatus.generating_ready: - tmp_res = self.process_data_generating(data_spec) - case WFDataStatus.waiting_ready: - tmp_res = self.process_data_waiting(data_spec) - case _: - tmp_log.debug(f"Data status {data_spec.status} is not handled in this context; skipped") - continue + status = data_spec.status + if status == WFDataStatus.registered: + tmp_res = self.process_data_registered(data_spec) + elif status == WFDataStatus.checking: + tmp_res = self.process_data_checking(data_spec) + elif status in WFDataStatus.checked_statuses: + tmp_res = self.process_data_checked(data_spec) + elif status in WFDataStatus.generating_statuses: + tmp_res = self.process_data_generating(data_spec) + elif status in WFDataStatus.waiting_statuses: + tmp_res = self.process_data_waiting(data_spec) + else: + tmp_log.debug(f"Data status {data_spec.status} is not handled in this context; skipped") + continue if tmp_res and tmp_res.success: # update stats if tmp_res.new_status and data_spec.status != orig_status: diff --git a/templates/sysconfig/panda_jedi.template b/templates/sysconfig/panda_jedi.template index 61f830267..2eb87f227 100644 --- a/templates/sysconfig/panda_jedi.template +++ b/templates/sysconfig/panda_jedi.template @@ -10,9 +10,11 @@ if [[ -n "${VIRTUAL_ENV}" ]]; then PATH=${VIRTUAL_ENV}/bin:${VIRTUAL_ENV}/usr/local/bin:${VIRTUAL_ENV}/usr/bin:${VIRTUAL_ENV}:${PATH} fi -# for DQ2 +# for Rucio export X509_CERT_DIR=/etc/grid-security/certificates export X509_USER_PROXY=/data/atlpan/x509up_u25606 +export RUCIO_ACCOUNT=panda +export RUCIO_APPID=pandasrv # panda home if [[ -n "${VIRTUAL_ENV}" ]]; then diff --git a/templates/sysconfig/panda_server.sysconfig.rpmnew.template 
b/templates/sysconfig/panda_server.sysconfig.rpmnew.template index 5d0aa25c7..e1e589fa4 100644 --- a/templates/sysconfig/panda_server.sysconfig.rpmnew.template +++ b/templates/sysconfig/panda_server.sysconfig.rpmnew.template @@ -18,7 +18,7 @@ if [[ -n "${VIRTUAL_ENV}" ]]; then fi unset LD_LIBRARY_PATH -# for DQ2 +# for Rucio export X509_CERT_DIR=/etc/grid-security/certificates if [[ -z "${PANDA_RUCIO_ACCOUNT}" ]]; then export RUCIO_ACCOUNT=panda diff --git a/templates/sysconfig/panda_server_env.systemd.rpmnew.template b/templates/sysconfig/panda_server_env.systemd.rpmnew.template index af31a47c2..61dbe6a87 100644 --- a/templates/sysconfig/panda_server_env.systemd.rpmnew.template +++ b/templates/sysconfig/panda_server_env.systemd.rpmnew.template @@ -7,7 +7,7 @@ OPTIONS="-f @@virtual_env@@/etc/panda/panda_server-httpd.conf" PATH=@@virtual_env@@/bin:/bin:/usr/local/bin:/usr/bin LD_LIBRARY_PATH= -# for DQ2 +# for Rucio X509_CERT_DIR=/etc/grid-security/certificates RUCIO_ACCOUNT=panda RUCIO_APPID=pandasrv From 393c9c3120785e31e60851197fa90adec00ff7a6 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 12 Nov 2025 16:24:10 +0100 Subject: [PATCH 045/101] workflows4: fix step start_time and end_time --- pandaserver/workflow/workflow_core.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index cf1644ded..2110521a0 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -960,6 +960,12 @@ def process_step_submitted(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.warning(f"Invalid step_status {check_result.step_status} from target check result; skipped") now_time = naive_utcnow() step_spec.check_time = now_time + if step_spec.status in WFStepStatus.after_submitted_uninterrupted_statuses and step_spec.start_time is None: + # step has run, set start_time if not yet set + step_spec.start_time = now_time + if step_spec.status in 
WFStepStatus.final_statuses and step_spec.start_time is not None and step_spec.end_time is None: + # step has ended, set end_time if not yet set + step_spec.end_time = now_time self.tbif.update_workflow_step(step_spec) process_result.success = True tmp_log.info(f"Checked step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}") @@ -1010,6 +1016,12 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.warning(f"Invalid step_status {check_result.step_status} from target check result; skipped") now_time = naive_utcnow() step_spec.check_time = now_time + if step_spec.status in WFStepStatus.after_submitted_uninterrupted_statuses and step_spec.start_time is None: + # step has run, set start_time if not yet set + step_spec.start_time = now_time + if step_spec.status in WFStepStatus.final_statuses and step_spec.start_time is not None and step_spec.end_time is None: + # step has ended, set end_time if not yet set + step_spec.end_time = now_time self.tbif.update_workflow_step(step_spec) process_result.success = True tmp_log.info(f"Checked step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}") From 4ee9eaf0ed7ad6c91c9e4a41c922095d0d84884e Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 13 Nov 2025 09:05:53 +0100 Subject: [PATCH 046/101] workflows4: rename step status submitted to starting --- .../step_handler_plugins/base_step_handler.py | 2 +- .../panda_task_step_handler.py | 4 +- pandaserver/workflow/workflow_base.py | 8 ++-- pandaserver/workflow/workflow_core.py | 44 +++++++++---------- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/base_step_handler.py b/pandaserver/workflow/step_handler_plugins/base_step_handler.py index 96f9f6cea..f42210302 100644 --- a/pandaserver/workflow/step_handler_plugins/base_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/base_step_handler.py @@ -38,7 +38,7 @@ 
def submit_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetSubmitRe This method should NOT modify step_spec. Any update information should be stored in the WFStepTargetSubmitResult returned instead. Args: - step_spec (WFStepSpec): Specifications of the workflow step to be submitted. + step_spec (WFStepSpec): Specifications of the workflow step whose target is to be submitted. Returns: WFStepTargetSubmitResult: An object containing the result of the submission, including success status, target ID, and message. diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 26ff4a788..628ef8c7f 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -143,7 +143,7 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu WFStepTargetCheckResult: An object containing the result of the status check, including success status, step status, native status, and message. 
""" tmp_log = LogWrapper(logger, f"check_target workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") - allowed_step_statuses = [WFStepStatus.submitted, WFStepStatus.running] + allowed_step_statuses = [WFStepStatus.starting, WFStepStatus.running] try: # Initialize check_result = WFStepTargetCheckResult() @@ -174,7 +174,7 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu if task_status in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: check_result.step_status = WFStepStatus.running elif task_status in ["defined", "assigned", "activated", "starting", "ready", "pending"]: - check_result.step_status = WFStepStatus.submitted + check_result.step_status = WFStepStatus.starting elif task_status in ["done", "finished"]: check_result.step_status = WFStepStatus.done elif task_status in ["failed", "exhausted", "aborted", "toabort", "aborting", "broken", "tobroken"]: diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index a4043d966..913e1789e 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -49,7 +49,7 @@ class WFStepStatus(object): checked_false = "checked_false" pending = "pending" ready = "ready" - submitted = "submitted" + starting = "starting" running = "running" done = "done" failed = "failed" @@ -57,9 +57,9 @@ class WFStepStatus(object): cancelled = "cancelled" checked_statuses = (checked_true, checked_false) - to_advance_step_statuses = (registered, checking, checked_true, checked_false, pending, ready, submitted) - after_submitted_statuses = (running, done, failed, cancelled) - after_submitted_uninterrupted_statuses = (running, done, failed) + to_advance_step_statuses = (registered, checking, checked_true, checked_false, pending, ready, starting) + after_starting_statuses = (running, done, failed, cancelled) + after_starting_uninterrupted_statuses = (running, done, failed) after_running_statuses = 
(done, failed, cancelled) final_statuses = (done, failed, closed, cancelled) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 2110521a0..444522578 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -879,7 +879,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W def process_step_ready(self, step_spec: WFStepSpec) -> WFStepProcessResult: """ Process a step in ready status - To submit the step for execution + To start the step by submitting its target Args: step_spec (WFStepSpec): The workflow step specification to process @@ -900,28 +900,28 @@ def process_step_ready(self, step_spec: WFStepSpec) -> WFStepProcessResult: try: # Get the step handler plugin step_handler = self.get_plugin("step_handler", step_spec.flavor) - # Submit the step + # Submit the step target submit_result = step_handler.submit_target(step_spec) if not submit_result.success or submit_result.target_id is None: - process_result.message = f"Failed to submit step; {submit_result.message}" + process_result.message = f"Failed to submit step target; {submit_result.message}" tmp_log.error(f"{process_result.message}") return process_result - # Update step status to submitted + # Update step status to starting step_spec.target_id = submit_result.target_id - step_spec.status = WFStepStatus.submitted + step_spec.status = WFStepStatus.starting self.tbif.update_workflow_step(step_spec) process_result.success = True process_result.new_status = step_spec.status - tmp_log.info(f"Submitted step, flavor={step_spec.flavor}, target_id={step_spec.target_id}, status={step_spec.status}") + tmp_log.info(f"Done, submitted target flavor={step_spec.flavor} target_id={step_spec.target_id}, status={step_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result - def process_step_submitted(self, 
step_spec: WFStepSpec) -> WFStepProcessResult: + def process_step_starting(self, step_spec: WFStepSpec) -> WFStepProcessResult: """ - Process a step in submitted status - To check the status of the submitted step + Process a step in starting status + To check the status of the starting step Args: step_spec (WFStepSpec): The workflow step specification to process @@ -929,13 +929,13 @@ def process_step_submitted(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_submitted workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") + tmp_log = LogWrapper(logger, f"process_step_starting workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() # Check status - if step_spec.status != WFStepStatus.submitted: - process_result.message = f"Step status changed unexpectedly from {WFStepStatus.submitted} to {step_spec.status}; skipped" + if step_spec.status != WFStepStatus.starting: + process_result.message = f"Step status changed unexpectedly from {WFStepStatus.starting} to {step_spec.status}; skipped" tmp_log.warning(f"{process_result.message}") return process_result # Process @@ -949,18 +949,18 @@ def process_step_submitted(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.error(f"{process_result.message}") return process_result # Update step status - if check_result.step_status in WFStepStatus.after_submitted_statuses: + if check_result.step_status in WFStepStatus.after_starting_statuses: # Step status advanced step_spec.status = check_result.step_status process_result.new_status = step_spec.status - elif check_result.step_status == WFStepStatus.submitted: - # Still in submitted, do nothing + elif check_result.step_status == WFStepStatus.starting: + # Still in starting, do nothing pass else: 
tmp_log.warning(f"Invalid step_status {check_result.step_status} from target check result; skipped") now_time = naive_utcnow() step_spec.check_time = now_time - if step_spec.status in WFStepStatus.after_submitted_uninterrupted_statuses and step_spec.start_time is None: + if step_spec.status in WFStepStatus.after_starting_uninterrupted_statuses and step_spec.start_time is None: # step has run, set start_time if not yet set step_spec.start_time = now_time if step_spec.status in WFStepStatus.final_statuses and step_spec.start_time is not None and step_spec.end_time is None: @@ -1016,7 +1016,7 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.warning(f"Invalid step_status {check_result.step_status} from target check result; skipped") now_time = naive_utcnow() step_spec.check_time = now_time - if step_spec.status in WFStepStatus.after_submitted_uninterrupted_statuses and step_spec.start_time is None: + if step_spec.status in WFStepStatus.after_starting_uninterrupted_statuses and step_spec.start_time is None: # step has run, set start_time if not yet set step_spec.start_time = now_time if step_spec.status in WFStepStatus.final_statuses and step_spec.start_time is not None and step_spec.end_time is None: @@ -1065,8 +1065,8 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map) case WFStepStatus.ready: tmp_res = self.process_step_ready(step_spec) - case WFStepStatus.submitted: - tmp_res = self.process_step_submitted(step_spec) + case WFStepStatus.starting: + tmp_res = self.process_step_starting(step_spec) case WFStepStatus.running: tmp_res = self.process_step_running(step_spec) case _: @@ -1295,7 +1295,7 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc data_status_stats = self.process_data_specs(data_specs) # Get steps in registered status required_step_statuses = 
list(WFStepStatus.to_advance_step_statuses) - over_advanced_step_statuses = list(WFStepStatus.after_submitted_uninterrupted_statuses) + over_advanced_step_statuses = list(WFStepStatus.after_starting_uninterrupted_statuses) step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=required_step_statuses) over_advanced_step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id, status_filter_list=over_advanced_step_statuses) if not step_specs: @@ -1318,8 +1318,8 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} # Process steps steps_status_stats = self.process_steps(step_specs, data_spec_map=data_spec_map) - # Update workflow status to running if any of step is submitted - if steps_status_stats["processed"].get(WFStepStatus.submitted): + # Update workflow status to running if any of step is starting + if steps_status_stats["processed"].get(WFStepStatus.starting): workflow_spec.status = WorkflowStatus.running workflow_spec.start_time = naive_utcnow() self.tbif.update_workflow(workflow_spec) From 421900fec7011ed61bf585b5cff474b510d61228 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Sat, 15 Nov 2025 14:48:02 +0100 Subject: [PATCH 047/101] workflows4: log --- pandaserver/workflow/workflow_core.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 444522578..53d634235 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -668,7 +668,9 @@ def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: data_status_stats["processed"].setdefault(data_spec.status, 0) data_status_stats["processed"][data_spec.status] += 1 data_status_stats["n_processed"] += 1 - tmp_log.info(f"Done, processed data specs: {data_status_stats}") + tmp_log.info( + f"Done, 
processed {data_status_stats['n_processed']}/{n_data} data specs, unchanged: {data_status_stats['unchanged']}, changed: {data_status_stats['changed']}" + ) return data_status_stats # ---- Step status transitions ----------------------------- @@ -1083,7 +1085,9 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W steps_status_stats["processed"].setdefault(step_spec.status, 0) steps_status_stats["processed"][step_spec.status] += 1 steps_status_stats["n_processed"] += 1 - tmp_log.info(f"Done, processed steps: {steps_status_stats}") + tmp_log.info( + f"Done, processed {steps_status_stats['n_processed']}/{n_steps} steps, unchanged: {steps_status_stats['unchanged']}, changed: {steps_status_stats['changed']}" + ) return steps_status_stats # ---- Workflow status transitions ------------------------- @@ -1484,7 +1488,9 @@ def process_active_workflows(self) -> Dict: workflows_status_stats["processed"][workflow_spec.status] += 1 workflows_status_stats["n_processed"] += 1 workflows_status_stats["n_workflows"] = n_workflows - tmp_log.info(f"Done, processed workflows: {workflows_status_stats}") + tmp_log.info( + f"Done, processed {workflows_status_stats['n_processed']}/{n_workflows} workflows, unchanged: {workflows_status_stats['unchanged']}, changed: {workflows_status_stats['changed']}" + ) except Exception as e: tmp_log.error(f"Got error ; {traceback.format_exc()}") return workflows_status_stats From 088d504f9f586620fb46a7efd4df740f1a25338d Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 17 Nov 2025 14:16:12 +0100 Subject: [PATCH 048/101] workflows4: check step output during checking --- pandaserver/workflow/workflow_base.py | 1 + pandaserver/workflow/workflow_core.py | 92 ++++++++++++++++++++------- 2 files changed, 71 insertions(+), 22 deletions(-) diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 913e1789e..3596856e3 100644 --- a/pandaserver/workflow/workflow_base.py +++ 
b/pandaserver/workflow/workflow_base.py @@ -96,6 +96,7 @@ class WFDataStatus(object): after_generating_ready_statuses = (done_generated, cancelled) after_waiting_ready_statuses = (done_waited, cancelled) terminated_statuses = (done_generated, done_waited, done_skipped, cancelled, retired) + nonreusable_statuses = (cancelled, retired) # ==== Types =================================================== diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 53d634235..340030846 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -404,23 +404,19 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: case WFDataStatus.checked_nonexist: # Data does not exist, advance to generating_start data_spec.status = WFDataStatus.generating_start - data_spec.check_time = now_time data_spec.start_time = now_time self.tbif.update_workflow_data(data_spec) case WFDataStatus.checked_insuff: # Data insufficient, advance to waiting_unready data_spec.status = WFDataStatus.waiting_unready - data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) case WFDataStatus.checked_partial: # Data partially exist, advance to waiting_ready data_spec.status = WFDataStatus.waiting_ready - data_spec.check_time = now_time self.tbif.update_workflow_data(data_spec) case WFDataStatus.checked_complete: # Data already fully exist, advance to done_skipped data_spec.status = WFDataStatus.done_skipped - data_spec.check_time = now_time data_spec.end_time = now_time self.tbif.update_workflow_data(data_spec) process_result.success = True @@ -524,7 +520,7 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: self.tbif.update_workflow_data(data_spec) process_result.success = True if data_spec.status == original_status: - tmp_log.info(f"Done, status stays {data_spec.status}") + tmp_log.info(f"Done, status stays in {data_spec.status}") else: tmp_log.info(f"Done, 
from {original_status} to status={data_spec.status}") except Exception as e: @@ -612,7 +608,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: self.tbif.update_workflow_data(data_spec) process_result.success = True if data_spec.status == original_status: - tmp_log.info(f"Done, status stays {data_spec.status}") + tmp_log.info(f"Done, status stays in {data_spec.status}") else: tmp_log.info(f"Done, from {original_status} to status={data_spec.status}") except Exception as e: @@ -729,9 +725,46 @@ def process_step_checking(self, step_spec: WFStepSpec) -> WFStepProcessResult: return process_result # Process try: - # FIXME: For now, always advance to checked_true - if True: - step_spec.status = WFStepStatus.checked_true + # Decide whether to run the step: True = must run, False = can skip, None = undecided yet and must check later + to_run_step = False + # FIXME: For now, always check outputs, not customizable + check_outputs = True + if check_outputs and to_run_step is False: + to_generate_output = False + output_data_names = step_spec.definition_json_map.get("output_data_list", []) + for output_data_name in output_data_names: + data_spec = self.tbif.get_workflow_data_by_name(step_spec.workflow_id, output_data_name) + if data_spec is None: + tmp_log.warning(f"Output {output_data_name} not found in workflow data; skipped") + to_run_step = None + break + if data_spec.status == WFDataStatus.generating_start: + tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} needs generation") + to_generate_output = True + break + elif data_spec.status in (WFDataStatus.registered, WFDataStatus.checking) or data_spec.status in WFDataStatus.checked_statuses: + tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} is not after checked; skipped") + to_run_step = None + break + else: + tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} does not need generation") + continue + if to_run_step is not None 
and to_generate_output: + # Outputs are not all good; need to run the step + to_run_step = True + # Update step status + now_time = naive_utcnow() + if to_run_step is None: + step_spec.check_time = now_time + self.tbif.update_workflow_step(step_spec) + process_result.success = True + tmp_log.info(f"Done, status stays in {step_spec.status}") + else: + if to_run_step is True: + step_spec.status = WFStepStatus.checked_true + elif to_run_step is False: + step_spec.status = WFStepStatus.checked_false + step_spec.check_time = now_time self.tbif.update_workflow_step(step_spec) process_result.success = True process_result.new_status = step_spec.status @@ -1195,21 +1228,11 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce data_specs = [] step_specs = [] now_time = naive_utcnow() - # Register root inputs and outputs - for input_name, input_target in workflow_definition["root_inputs"].items(): - data_spec = WFDataSpec() - data_spec.workflow_id = workflow_spec.workflow_id - data_spec.name = input_name - data_spec.target_id = input_target - data_spec.status = WFDataStatus.registered - data_spec.type = WFDataType.input - data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable - data_spec.creation_time = now_time - data_specs.append(data_spec) + # Register root outputs for output_name, output_dict in workflow_definition["root_outputs"].items(): data_spec = WFDataSpec() data_spec.workflow_id = workflow_spec.workflow_id - data_spec.source_step_id = None # root output + data_spec.source_step_id = None # to be set when the step producing it starts data_spec.name = output_name data_spec.target_id = output_dict.get("value") data_spec.set_parameter("output_types", output_dict.get("output_types")) @@ -1218,7 +1241,18 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce data_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable data_spec.creation_time = now_time 
data_specs.append(data_spec) - # Register steps based on nodes in the definition + # Register root inputs + for input_name, input_target in workflow_definition["root_inputs"].items(): + data_spec = WFDataSpec() + data_spec.workflow_id = workflow_spec.workflow_id + data_spec.name = input_name + data_spec.target_id = input_target + data_spec.status = WFDataStatus.registered + data_spec.type = WFDataType.input + data_spec.flavor = "ddm_collection" # FIXME: hardcoded flavor, should be configurable + data_spec.creation_time = now_time + data_specs.append(data_spec) + # Register steps and their intermediate outputs based on nodes in the definition for node in workflow_definition["nodes"]: # FIXME: not yet consider scatter, condition, loop, etc. if not (node.get("condition") or node.get("scatter") or node.get("loop")): @@ -1253,6 +1287,20 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce step_spec.definition_json_map = step_definition step_spec.creation_time = now_time step_specs.append(step_spec) + # intermediate outputs of the step + for output_data_name in output_data_set: + if output_data_name not in workflow_definition["root_outputs"]: + data_spec = WFDataSpec() + data_spec.workflow_id = workflow_spec.workflow_id + data_spec.source_step_id = None # to be set when step starts + data_spec.name = output_data_name + data_spec.target_id = None # to be set when step starts + data_spec.set_parameter("output_types", step_definition.get("output_types", [])) + data_spec.status = WFDataStatus.registered + data_spec.type = WFDataType.mid + data_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable + data_spec.creation_time = now_time + data_specs.append(data_spec) # Update status to starting workflow_spec.status = WorkflowStatus.starting # Upsert DB From 6dc443d436ea401a1dc7ea9a695a5f113914c16f Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 18 Nov 2025 11:04:19 +0100 Subject: [PATCH 049/101] workflows4: introduce data 
status binding --- .../panda_task_data_handler.py | 109 ++++++++------- pandaserver/workflow/workflow_base.py | 1 + pandaserver/workflow/workflow_core.py | 130 ++++++++++++------ 3 files changed, 146 insertions(+), 94 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py index 88bc380a0..0edba5714 100644 --- a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py @@ -81,60 +81,59 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu check_result.message = f"flavor not {self.plugin_flavor}; skipped" return check_result # Check source step status - if data_spec.source_step_id is None: - check_result.success = True - check_result.message = "No source step yet; skipped" - tmp_log.warning(f"{check_result.message}") - return check_result - source_step_spec = self.tbif.get_workflow_step(data_spec.source_step_id) - if source_step_spec is None: - check_result.success = False - check_result.message = f"Failed to get source step step_id={data_spec.source_step_id}; skipped" - tmp_log.error(f"{check_result.message}") - return check_result - if source_step_spec.status == WFStepStatus.done: - # Source step done; consider data fully available - check_result.success = True - check_result.check_status = WFDataTargetCheckStatus.complete - tmp_log.info(f"Source step step_id={source_step_spec.step_id} done, data considered fully available; check_status={check_result.check_status}") - return check_result - elif source_step_spec.status in WFStepStatus.final_statuses: - # Source step in final status but not done; skip data availability - check_result.success = True - check_result.message = f"Source step step_id={source_step_spec.step_id} {source_step_spec.status}; skip data availability check" - tmp_log.warning(f"{check_result.message}") - return check_result + 
if data_spec.source_step_id is not None: + source_step_spec = self.tbif.get_workflow_step(data_spec.source_step_id) + if source_step_spec is None: + check_result.success = False + check_result.message = f"Failed to get source step step_id={data_spec.source_step_id}; skipped" + tmp_log.error(f"{check_result.message}") + return check_result + if source_step_spec.status == WFStepStatus.done: + # Source step done; consider data fully available + check_result.success = True + check_result.check_status = WFDataTargetCheckStatus.complete + tmp_log.info(f"Source step step_id={source_step_spec.step_id} done, data considered fully available; check_status={check_result.check_status}") + return check_result + elif source_step_spec.status in WFStepStatus.final_statuses: + # Source step in final status but not done; skip data availability + check_result.success = True + check_result.message = f"Source step step_id={source_step_spec.step_id} {source_step_spec.status}; skip data availability check" + tmp_log.warning(f"{check_result.message}") + return check_result + else: + tmp_log.info(f"Source step step_id={source_step_spec.step_id} status={source_step_spec.status}; checking data availability") else: - # Source step not terminated; check number of files in DDM collections - total_n_files = 0 - none_exist = True - output_types = data_spec.get_parameter("output_types") - if output_types is None: - output_types = [] - for output_type in output_types: - collection = f"{data_spec.target_id}_{output_type}" - tmp_stat, tmp_res = self.ddmIF.get_number_of_files(collection) - if tmp_stat is None: - tmp_log.debug(f"Collection {collection} does not exist") - elif not tmp_stat: - # Error in getting number of files - check_result.success = False - check_result.message = f"Failed to get number of files for collection {collection}: {tmp_res}" - tmp_log.error(f"{check_result.message}") - return check_result - else: - none_exist = False - n_files = tmp_res - total_n_files += n_files - 
tmp_log.debug(f"Got collection {collection} n_files={n_files}") - # Check number of files - if none_exist: - check_result.check_status = WFDataTargetCheckStatus.nonexist - elif total_n_files == 0: - check_result.check_status = WFDataTargetCheckStatus.insuff + tmp_log.info("No source step yet; checking data availability") + # Without source step or source step not terminated; check number of files in DDM collections + total_n_files = 0 + none_exist = True + output_types = data_spec.get_parameter("output_types") + if output_types is None: + output_types = [] + for output_type in output_types: + collection = f"{data_spec.target_id}_{output_type}" + tmp_stat, tmp_res = self.ddmIF.get_number_of_files(collection) + if tmp_stat is None: + tmp_log.debug(f"Collection {collection} does not exist") + elif not tmp_stat: + # Error in getting number of files + check_result.success = False + check_result.message = f"Failed to get number of files for collection {collection}: {tmp_res}" + tmp_log.error(f"{check_result.message}") + return check_result else: - # At least 1 file is sufficient for step input - check_result.check_status = WFDataTargetCheckStatus.partial - check_result.success = True - tmp_log.info(f"Got total_n_files={total_n_files}; check_status={check_result.check_status}") - return check_result + none_exist = False + n_files = tmp_res + total_n_files += n_files + tmp_log.debug(f"Got collection {collection} n_files={n_files}") + # Check number of files + if none_exist: + check_result.check_status = WFDataTargetCheckStatus.nonexist + elif total_n_files == 0: + check_result.check_status = WFDataTargetCheckStatus.insuff + else: + # At least 1 file is sufficient for step input + check_result.check_status = WFDataTargetCheckStatus.partial + check_result.success = True + tmp_log.info(f"Got total_n_files={total_n_files}; check_status={check_result.check_status}") + return check_result diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py 
index 3596856e3..bc3eb37fa 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -75,6 +75,7 @@ class WFDataStatus(object): checked_insuff = "checked_insuff" # data available but insufficient to be step input checked_partial = "checked_partial" # data partially available and sufficient to be step input checked_complete = "checked_complete" # data completely available + binding = "binding" # data being bound to a step to generate generating_start = "generating_start" generating_unready = "generating_unready" generating_ready = "generating_ready" diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 340030846..2f7761946 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -402,8 +402,8 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: now_time = naive_utcnow() match data_spec.status: case WFDataStatus.checked_nonexist: - # Data does not exist, advance to generating_start - data_spec.status = WFDataStatus.generating_start + # Data does not exist, advance to binding + data_spec.status = WFDataStatus.binding data_spec.start_time = now_time self.tbif.update_workflow_data(data_spec) case WFDataStatus.checked_insuff: @@ -427,6 +427,41 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.error(f"{traceback.format_exc()}") return process_result + def process_data_binding(self, data_spec: WFDataSpec) -> WFDataProcessResult: + """ + Process data in binding status + To bind the data to the step that will generate it + + Args: + data_spec (WFDataSpec): The workflow data specification to process + + Returns: + WFDataProcessResult: The result of processing the data + """ + tmp_log = LogWrapper(logger, f"process_data_binding workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + tmp_log.debug("Start") + # Initialize + process_result = WFDataProcessResult() + # Check status 
+ if data_spec.status != WFDataStatus.binding: + process_result.message = f"Data status changed unexpectedly from {WFDataStatus.binding} to {data_spec.status}; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Process + try: + # FIXME: find the step to bind to + ... + # For now, just update status to generating_start + # data_spec.status = WFDataStatus.generating_start + # self.tbif.update_workflow_data(data_spec) + # process_result.success = True + # process_result.new_status = data_spec.status + # tmp_log.info(f"Done, status={data_spec.status}") + except Exception as e: + process_result.message = f"Got error {str(e)}" + tmp_log.error(f"{traceback.format_exc()}") + return process_result + def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: """ Process data in generating status @@ -733,13 +768,13 @@ def process_step_checking(self, step_spec: WFStepSpec) -> WFStepProcessResult: to_generate_output = False output_data_names = step_spec.definition_json_map.get("output_data_list", []) for output_data_name in output_data_names: - data_spec = self.tbif.get_workflow_data_by_name(step_spec.workflow_id, output_data_name) + data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) if data_spec is None: tmp_log.warning(f"Output {output_data_name} not found in workflow data; skipped") to_run_step = None break - if data_spec.status == WFDataStatus.generating_start: - tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} needs generation") + if data_spec.status == WFDataStatus.binding: + tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} requires step to generate") to_generate_output = True break elif data_spec.status in (WFDataStatus.registered, WFDataStatus.checking) or data_spec.status in WFDataStatus.checked_statuses: @@ -747,7 +782,7 @@ def process_step_checking(self, step_spec: WFStepSpec) -> WFStepProcessResult: to_run_step = None break 
else: - tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} does not need generation") + tmp_log.debug(f"Output data {output_data_name} status {data_spec.status} does not require step to generate") continue if to_run_step is not None and to_generate_output: # Outputs are not all good; need to run the step @@ -871,36 +906,53 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W # All inputs are good, register outputs of the step and update step status to ready tmp_log.debug(f"All input data are good; proceeding") output_data_list = step_spec_definition.get("output_data_list", []) - outputs_raw_dict = step_spec_definition.get("outputs", {}) - output_types = step_spec_definition.get("output_types", []) - now_time = naive_utcnow() - if step_spec_definition.get("is_tail"): - # Tail step, set root output source_step_id - for output_data_name in output_data_list: - data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) - if data_spec is not None: + # outputs_raw_dict = step_spec_definition.get("outputs", {}) + # output_types = step_spec_definition.get("output_types", []) + # now_time = naive_utcnow() + # New code: for all output data, set source_step_id to this step + for output_data_name in output_data_list: + data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) + if data_spec is not None: + if data_spec.status == WFDataStatus.binding: data_spec.source_step_id = step_spec.step_id self.tbif.update_workflow_data(data_spec) - tmp_log.debug(f"Updated output data_id={data_spec.data_id} name={output_data_name} about source_step_id") + tmp_log.debug(f"Bound output data_id={data_spec.data_id} name={output_data_name} to the step") else: - tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") - else: - # Intermediate step, register their outputs as mid type - for output_data_name in output_data_list: - data_spec = WFDataSpec() - 
data_spec.workflow_id = step_spec.workflow_id - data_spec.source_step_id = step_spec.step_id - data_spec.name = output_data_name - data_spec.target_id = outputs_raw_dict.get(output_data_name, {}).get("value") # caution: may be None - data_spec.set_parameter("output_types", output_types) - data_spec.status = WFDataStatus.registered - data_spec.type = WFDataType.mid - data_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable - data_spec.creation_time = now_time - self.tbif.insert_workflow_data(data_spec) - tmp_log.debug(f"Registered mid data {output_data_name} of step_id={step_spec.step_id}") - # update data_spec_map - data_spec_map[output_data_name] = data_spec + tmp_log.debug(f"Output data_id={data_spec.data_id} name={output_data_name} status={data_spec.status} not in binding; skipped") + else: + tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") + # Old code for reference + # if step_spec_definition.get("is_tail"): + # # Tail step, set root output source_step_id + # for output_data_name in output_data_list: + # data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) + # if data_spec is not None: + # data_spec.source_step_id = step_spec.step_id + # self.tbif.update_workflow_data(data_spec) + # tmp_log.debug(f"Updated output data_id={data_spec.data_id} name={output_data_name} about source_step_id") + # else: + # tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") + # else: + # # Intermediate step, update mid output data specs source_step_id + # for output_data_name in output_data_list: + # data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) + # if data_spec is None: + # tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") + # continue + # elif data_spec.status == WFDataStatus.binding: + # # mid data in binding, bind it to the step + # data_spec.source_step_id = 
step_spec.step_id + # data_spec.name = output_data_name + # data_spec.target_id = outputs_raw_dict.get(output_data_name, {}).get("value") # caution: may be None + # data_spec.set_parameter("output_types", output_types) + # data_spec.status = WFDataStatus.registered + # data_spec.type = WFDataType.mid + # data_spec.flavor = "panda_task" # FIXME: hardcoded flavor, should be configurable + # data_spec.creation_time = now_time + # self.tbif.update_workflow_data(data_spec) + # tmp_log.debug(f"Updated mid data {output_data_name} about source step") + # # update data_spec_map + # data_spec_map[output_data_name] = data_spec step_spec.status = WFStepStatus.ready self.tbif.update_workflow_step(step_spec) process_result.success = True @@ -1270,7 +1322,7 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce step_definition["user_dn"] = workflow_definition.get("user_dn") # resolve inputs and outputs input_data_set = set() - output_data_set = set() + output_data_dict = dict() for input_target in step_definition.get("inputs", {}).values(): if not input_target.get("source"): continue @@ -1280,21 +1332,21 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce else: sources = [input_target["source"]] input_data_set.update(sources) - for output_name in step_definition.get("outputs", {}).keys(): - output_data_set.add(output_name) + for output_name, output_value in step_definition.get("outputs", {}).items(): + output_data_dict[output_name] = output_value.get("value") step_definition["input_data_list"] = list(input_data_set) - step_definition["output_data_list"] = list(output_data_set) + step_definition["output_data_list"] = list(output_data_dict.keys()) step_spec.definition_json_map = step_definition step_spec.creation_time = now_time step_specs.append(step_spec) # intermediate outputs of the step - for output_data_name in output_data_set: + for output_data_name in output_data_dict.keys(): if output_data_name not in 
workflow_definition["root_outputs"]: data_spec = WFDataSpec() data_spec.workflow_id = workflow_spec.workflow_id data_spec.source_step_id = None # to be set when step starts data_spec.name = output_data_name - data_spec.target_id = None # to be set when step starts + data_spec.target_id = output_data_dict[output_data_name] data_spec.set_parameter("output_types", step_definition.get("output_types", [])) data_spec.status = WFDataStatus.registered data_spec.type = WFDataType.mid From 5faeffcb1a4014f6a8141a9ed0ef0c15dda44030 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 18 Nov 2025 12:32:38 +0100 Subject: [PATCH 050/101] workflows4: fix binding --- pandaserver/workflow/workflow_core.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 2f7761946..a6a486676 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -427,13 +427,14 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.error(f"{traceback.format_exc()}") return process_result - def process_data_binding(self, data_spec: WFDataSpec) -> WFDataProcessResult: + def process_data_binding(self, data_spec: WFDataSpec, step_spec: WFStepSpec) -> WFDataProcessResult: """ Process data in binding status To bind the data to the step that will generate it Args: data_spec (WFDataSpec): The workflow data specification to process + step_spec (WFStepSpec): The workflow step specification to bind the data to Returns: WFDataProcessResult: The result of processing the data @@ -449,14 +450,13 @@ def process_data_binding(self, data_spec: WFDataSpec) -> WFDataProcessResult: return process_result # Process try: - # FIXME: find the step to bind to - ... 
- # For now, just update status to generating_start - # data_spec.status = WFDataStatus.generating_start - # self.tbif.update_workflow_data(data_spec) - # process_result.success = True - # process_result.new_status = data_spec.status - # tmp_log.info(f"Done, status={data_spec.status}") + original_status = data_spec.status + data_spec.source_step_id = step_spec.step_id + data_spec.status = WFDataStatus.generating_start + self.tbif.update_workflow_data(data_spec) + process_result.success = True + process_result.new_status = data_spec.status + tmp_log.info(f"Done, bound to step_id={step_spec.step_id}, from {original_status} to status={data_spec.status}") except Exception as e: process_result.message = f"Got error {str(e)}" tmp_log.error(f"{traceback.format_exc()}") @@ -681,6 +681,11 @@ def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: tmp_res = self.process_data_checking(data_spec) elif status in WFDataStatus.checked_statuses: tmp_res = self.process_data_checked(data_spec) + elif status == WFDataStatus.binding: + # dummy result since binding data are handled in step processing + dummy_process_result = WFDataProcessResult() + dummy_process_result.success = True + tmp_res = dummy_process_result elif status in WFDataStatus.generating_statuses: tmp_res = self.process_data_generating(data_spec) elif status in WFDataStatus.waiting_statuses: @@ -914,8 +919,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W data_spec = self.tbif.get_workflow_data_by_name(output_data_name, step_spec.workflow_id) if data_spec is not None: if data_spec.status == WFDataStatus.binding: - data_spec.source_step_id = step_spec.step_id - self.tbif.update_workflow_data(data_spec) + self.process_data_binding(data_spec, step_spec) tmp_log.debug(f"Bound output data_id={data_spec.data_id} name={output_data_name} to the step") else: tmp_log.debug(f"Output data_id={data_spec.data_id} name={output_data_name} status={data_spec.status} not in binding; 
skipped") From 6281fbcd6820a75d4dfd388af3a13bf716cbf099 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 18 Nov 2025 15:29:07 +0100 Subject: [PATCH 051/101] workflows4: rename statuses --- .../ddm_collection_data_handler.py | 4 +- .../panda_task_data_handler.py | 4 +- pandaserver/workflow/workflow_base.py | 32 ++++---- pandaserver/workflow/workflow_core.py | 78 +++++++++---------- 4 files changed, 59 insertions(+), 59 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py index f640bab70..b99698ff0 100644 --- a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -91,9 +91,9 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu check_result.check_status = WFDataTargetCheckStatus.nonexist case DDMCollectionState.open: if collection_meta.get("length", 0) == 0: - check_result.check_status = WFDataTargetCheckStatus.insuff + check_result.check_status = WFDataTargetCheckStatus.insuffi else: - check_result.check_status = WFDataTargetCheckStatus.partial + check_result.check_status = WFDataTargetCheckStatus.suffice case DDMCollectionState.closed: check_result.check_status = WFDataTargetCheckStatus.complete check_result.metadata = collection_meta diff --git a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py index 0edba5714..e91ddb2ca 100644 --- a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py @@ -130,10 +130,10 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu if none_exist: check_result.check_status = WFDataTargetCheckStatus.nonexist elif total_n_files == 0: - check_result.check_status = 
WFDataTargetCheckStatus.insuff + check_result.check_status = WFDataTargetCheckStatus.insuffi else: # At least 1 file is sufficient for step input - check_result.check_status = WFDataTargetCheckStatus.partial + check_result.check_status = WFDataTargetCheckStatus.suffice check_result.success = True tmp_log.info(f"Got total_n_files={total_n_files}; check_status={check_result.check_status}") return check_result diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index bc3eb37fa..97a425f66 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -72,30 +72,30 @@ class WFDataStatus(object): registered = "registered" checking = "checking" checked_nonexist = "checked_nonexist" # data does not exist - checked_insuff = "checked_insuff" # data available but insufficient to be step input - checked_partial = "checked_partial" # data partially available and sufficient to be step input + checked_insuffi = "checked_insuffi" # data available but insufficient to be step input + checked_suffice = "checked_suffice" # data partially available and sufficient to be step input checked_complete = "checked_complete" # data completely available binding = "binding" # data being bound to a step to generate - generating_start = "generating_start" - generating_unready = "generating_unready" - generating_ready = "generating_ready" - waiting_unready = "waiting_unready" - waiting_ready = "waiting_ready" + generating_bound = "generating_bound" + generating_insuffi = "generating_insuffi" + generating_suffice = "generating_suffice" + waiting_insuffi = "waiting_insuffi" + waiting_suffice = "waiting_suffice" done_generated = "done_generated" done_waited = "done_waited" done_skipped = "done_skipped" cancelled = "cancelled" retired = "retired" - checked_statuses = (checked_nonexist, checked_insuff, checked_partial, checked_complete) - generating_statuses = (generating_start, generating_unready, generating_ready) - 
waiting_statuses = (waiting_unready, waiting_ready) + checked_statuses = (checked_nonexist, checked_insuffi, checked_suffice, checked_complete) + generating_statuses = (generating_bound, generating_insuffi, generating_suffice) + waiting_statuses = (waiting_insuffi, waiting_suffice) done_statuses = (done_generated, done_waited, done_skipped) - good_input_statuses = (generating_ready, waiting_ready, done_generated, done_waited, done_skipped) + good_input_statuses = (generating_suffice, waiting_suffice, done_generated, done_waited, done_skipped) good_output_statuses = (done_generated, done_waited, done_skipped) - after_generating_start_statuses = (generating_ready, done_generated, cancelled) - after_generating_ready_statuses = (done_generated, cancelled) - after_waiting_ready_statuses = (done_waited, cancelled) + after_generating_bound_statuses = (generating_suffice, done_generated, cancelled) + after_generating_suffice_statuses = (done_generated, cancelled) + after_waiting_suffice_statuses = (done_waited, cancelled) terminated_statuses = (done_generated, done_waited, done_skipped, cancelled, retired) nonreusable_statuses = (cancelled, retired) @@ -486,8 +486,8 @@ class WFDataTargetCheckStatus: """ complete = "complete" # data completely exists - partial = "partial" # data partially exists - insuff = "insuff" # data exists but insufficient to be step input + suffice = "suffice" # data partially exists and suffices to be step input + insuffi = "insuffi" # data partially exists but is insufficient to be step input nonexist = "nonexist" # data does not exist diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index a6a486676..e2af64309 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -359,10 +359,10 @@ def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: match check_result.check_status: case WFDataTargetCheckStatus.nonexist: data_spec.status = 
WFDataStatus.checked_nonexist - case WFDataTargetCheckStatus.insuff: - data_spec.status = WFDataStatus.checked_insuff - case WFDataTargetCheckStatus.partial: - data_spec.status = WFDataStatus.checked_partial + case WFDataTargetCheckStatus.insuffi: + data_spec.status = WFDataStatus.checked_insuffi + case WFDataTargetCheckStatus.suffice: + data_spec.status = WFDataStatus.checked_suffice case WFDataTargetCheckStatus.complete: data_spec.status = WFDataStatus.checked_complete data_spec.check_time = now_time @@ -406,13 +406,13 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: data_spec.status = WFDataStatus.binding data_spec.start_time = now_time self.tbif.update_workflow_data(data_spec) - case WFDataStatus.checked_insuff: - # Data insufficient, advance to waiting_unready - data_spec.status = WFDataStatus.waiting_unready + case WFDataStatus.checked_insuffi: + # Data insufficient, advance to waiting_insuffi + data_spec.status = WFDataStatus.waiting_insuffi self.tbif.update_workflow_data(data_spec) - case WFDataStatus.checked_partial: - # Data partially exist, advance to waiting_ready - data_spec.status = WFDataStatus.waiting_ready + case WFDataStatus.checked_suffice: + # Data partially exist, advance to waiting_suffice + data_spec.status = WFDataStatus.waiting_suffice self.tbif.update_workflow_data(data_spec) case WFDataStatus.checked_complete: # Data already fully exist, advance to done_skipped @@ -452,7 +452,7 @@ def process_data_binding(self, data_spec: WFDataSpec, step_spec: WFStepSpec) -> try: original_status = data_spec.status data_spec.source_step_id = step_spec.step_id - data_spec.status = WFDataStatus.generating_start + data_spec.status = WFDataStatus.generating_bound self.tbif.update_workflow_data(data_spec) process_result.success = True process_result.new_status = data_spec.status @@ -501,30 +501,30 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: return process_result # Update data status 
now_time = naive_utcnow() - if original_status == WFDataStatus.generating_start: + if original_status == WFDataStatus.generating_bound: match check_result.check_status: - case WFDataTargetCheckStatus.partial | WFDataTargetCheckStatus.complete: - # Data exist, advance to generating_ready - data_spec.status = WFDataStatus.generating_ready + case WFDataTargetCheckStatus.suffice | WFDataTargetCheckStatus.complete: + # Data exist, advance to generating_suffice + data_spec.status = WFDataStatus.generating_suffice process_result.new_status = data_spec.status - case WFDataTargetCheckStatus.insuff: - # Data insufficient, move to generating_unready - data_spec.status = WFDataStatus.generating_unready + case WFDataTargetCheckStatus.insuffi: + # Data insufficient, move to generating_insuffi + data_spec.status = WFDataStatus.generating_insuffi process_result.new_status = data_spec.status case WFDataTargetCheckStatus.nonexist: - # Data not yet exist, stay in generating_start + # Data not yet exist, stay in generating_bound pass case _: # Unexpected status, log and skip tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") - elif original_status == WFDataStatus.generating_unready: + elif original_status == WFDataStatus.generating_insuffi: match check_result.check_status: - case WFDataTargetCheckStatus.partial | WFDataTargetCheckStatus.complete: - # Data now exist, advance to generating_ready - data_spec.status = WFDataStatus.generating_ready + case WFDataTargetCheckStatus.suffice | WFDataTargetCheckStatus.complete: + # Data now exist, advance to generating_suffice + data_spec.status = WFDataStatus.generating_suffice process_result.new_status = data_spec.status - case WFDataTargetCheckStatus.insuff: - # Data still insufficient, stay in generating_unready + case WFDataTargetCheckStatus.insuffi: + # Data still insufficient, stay in generating_insuffi pass case WFDataTargetCheckStatus.nonexist: # Data not exist anymore, unexpected, log 
and skip @@ -532,17 +532,17 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: case _: # Unexpected status, log and skip tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") - elif original_status == WFDataStatus.generating_ready: + elif original_status == WFDataStatus.generating_suffice: match check_result.check_status: case WFDataTargetCheckStatus.complete: # Data fully exist, advance to final status done_generated data_spec.status = WFDataStatus.done_generated process_result.new_status = data_spec.status data_spec.end_time = now_time - case WFDataTargetCheckStatus.partial: - # Data still partially exist, stay in generating_ready + case WFDataTargetCheckStatus.suffice: + # Data still partially exist, stay in generating_suffice pass - case WFDataTargetCheckStatus.insuff: + case WFDataTargetCheckStatus.insuffi: # Data not sufficient anymore, unexpected, log and skip tmp_log.warning(f"Data are not sufficient anymore, unexpected; skipped") case WFDataTargetCheckStatus.nonexist: @@ -602,17 +602,17 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: return process_result # Update data status now_time = naive_utcnow() - if original_status == WFDataStatus.waiting_ready: + if original_status == WFDataStatus.waiting_suffice: match check_result.check_status: case WFDataTargetCheckStatus.complete: # Data fully exist, advance to final status done_waited data_spec.status = WFDataStatus.done_waited process_result.new_status = data_spec.status data_spec.end_time = now_time - case WFDataTargetCheckStatus.partial: - # Data still partially exist, stay in waiting_ready + case WFDataTargetCheckStatus.suffice: + # Data still partially exist, stay in waiting_suffice pass - case WFDataTargetCheckStatus.insuff: + case WFDataTargetCheckStatus.insuffi: # Data not sufficient anymore, unexpected, log and skip tmp_log.warning(f"Data are not sufficient anymore, unexpected; skipped") 
case WFDataTargetCheckStatus.nonexist: @@ -620,19 +620,19 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.warning(f"Data do not exist anymore, unexpected; skipped") case _: tmp_log.warning(f"Invalid check_status {check_result.check_status} from target check result; skipped") - elif original_status == WFDataStatus.waiting_unready: + elif original_status == WFDataStatus.waiting_insuffi: match check_result.check_status: - case WFDataTargetCheckStatus.partial: - # Data partially exist, advance to waiting_ready - data_spec.status = WFDataStatus.waiting_ready + case WFDataTargetCheckStatus.suffice: + # Data partially exist, advance to waiting_suffice + data_spec.status = WFDataStatus.waiting_suffice process_result.new_status = data_spec.status case WFDataTargetCheckStatus.complete: # Data fully exist, advance to final status done_waited data_spec.status = WFDataStatus.done_waited process_result.new_status = data_spec.status data_spec.end_time = now_time - case WFDataTargetCheckStatus.insuff: - # Data still insufficient, stay in waiting_unready + case WFDataTargetCheckStatus.insuffi: + # Data still insufficient, stay in waiting_insuffi pass case WFDataTargetCheckStatus.nonexist: # Data not exist anymore, unexpected, log and skip From 568f4df5b5ca63291c0a8297a65428515be1e9a9 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 18 Nov 2025 15:49:49 +0100 Subject: [PATCH 052/101] workflows4: status pretty --- pandaserver/workflow/workflow_core.py | 40 +++++++++++++++------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index e2af64309..aa8e50332 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -1145,24 +1145,28 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W orig_status = step_spec.status # Process the step tmp_res = None - match step_spec.status: - case 
WFStepStatus.registered: - tmp_res = self.process_step_registered(step_spec) - case WFStepStatus.checking: - tmp_res = self.process_step_checking(step_spec) - case WFStepStatus.checked_true | WFStepStatus.checked_false: - tmp_res = self.process_step_checked(step_spec) - case WFStepStatus.pending: - tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map) - case WFStepStatus.ready: - tmp_res = self.process_step_ready(step_spec) - case WFStepStatus.starting: - tmp_res = self.process_step_starting(step_spec) - case WFStepStatus.running: - tmp_res = self.process_step_running(step_spec) - case _: - tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") - continue + if step_spec.status == WFStepStatus.registered: + tmp_res = self.process_step_registered(step_spec) + elif step_spec.status == WFStepStatus.checking: + tmp_res = self.process_step_checking(step_spec) + elif step_spec.status in WFStepStatus.checked_statuses: + tmp_res = self.process_step_checked(step_spec) + elif step_spec.status == WFStepStatus.pending: + tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map) + elif step_spec.status == WFStepStatus.ready: + tmp_res = self.process_step_ready(step_spec) + elif step_spec.status == WFStepStatus.starting: + tmp_res = self.process_step_starting(step_spec) + elif step_spec.status == WFStepStatus.running: + tmp_res = self.process_step_running(step_spec) + elif step_spec.status in WFStepStatus.final_statuses: + # dummy result since final steps need no processing + dummy_process_result = WFStepProcessResult() + dummy_process_result.success = True + tmp_res = dummy_process_result + else: + tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") + continue if tmp_res and tmp_res.success: # update stats if tmp_res.new_status and step_spec.status != orig_status: From cc485c937863860b36ab5372cd32f396480812e8 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 24 Nov 
2025 15:20:19 +0100 Subject: [PATCH 053/101] workflows4: add workflowHoldup task param --- pandajedi/jediorder/ContentsFeeder.py | 3 +++ pandajedi/jedirefine/TaskRefinerBase.py | 2 ++ pandaserver/taskbuffer/JediTaskSpec.py | 13 ++++++++++++- pandaserver/taskbuffer/task_split_rules.py | 1 + .../step_handler_plugins/panda_task_step_handler.py | 2 ++ 5 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pandajedi/jediorder/ContentsFeeder.py b/pandajedi/jediorder/ContentsFeeder.py index 3cc08a194..f47c7e633 100644 --- a/pandajedi/jediorder/ContentsFeeder.py +++ b/pandajedi/jediorder/ContentsFeeder.py @@ -628,6 +628,9 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): if noWaitParent: # parent is running taskOnHold = True + elif taskSpec.is_workflow_holdup(): + # hold up by the workflow + taskOnHold = True else: # the task has no parent or parent is finished if master_is_open and taskSpec.runUntilClosed(): diff --git a/pandajedi/jedirefine/TaskRefinerBase.py b/pandajedi/jedirefine/TaskRefinerBase.py index 74fdd4fbf..461b33544 100644 --- a/pandajedi/jedirefine/TaskRefinerBase.py +++ b/pandajedi/jedirefine/TaskRefinerBase.py @@ -432,6 +432,8 @@ def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper, splitRule): self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken["allowIncompleteInDS"]) if "noAutoPause" in taskParamMap and taskParamMap["noAutoPause"]: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken["noAutoPause"]) + if "workflowHoldup" in taskParamMap and taskParamMap["workflowHoldup"]: + self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken["workflowHoldup"]) # work queue workQueue = None if "workQueueName" in taskParamMap: diff --git a/pandaserver/taskbuffer/JediTaskSpec.py b/pandaserver/taskbuffer/JediTaskSpec.py index a5f419253..db0dd6346 100644 --- a/pandaserver/taskbuffer/JediTaskSpec.py +++ b/pandaserver/taskbuffer/JediTaskSpec.py @@ -839,7 +839,7 @@ def removeSplitRule(self, ruleName): items = self.splitRule.split(",") 
newItems = [] for item in items: - # remove rile + # remove rule tmpRuleName = item.split("=")[0] if ruleName == tmpRuleName: continue @@ -1617,6 +1617,17 @@ def is_msg_driven(self): def allow_incomplete_input(self): return self.check_split_rule("allowIncompleteInDS") + # check if workflow holdup + def is_workflow_holdup(self): + return self.check_split_rule("workflowHoldup") + + # set workflow holdup + def set_workflow_holdup(self, value: bool): + if value: + self.setSplitRule("workflowHoldup", "1") + else: + self.removeSplitRule(self.splitRuleToken["workflowHoldup"]) + # get queued time def get_queued_time(self): """ diff --git a/pandaserver/taskbuffer/task_split_rules.py b/pandaserver/taskbuffer/task_split_rules.py index 448311e03..052ab6ed7 100644 --- a/pandaserver/taskbuffer/task_split_rules.py +++ b/pandaserver/taskbuffer/task_split_rules.py @@ -99,6 +99,7 @@ "useExhausted": "UX", "useZipToPin": "UZ", "writeInputToFile": "WF", + "workflowHoldup": "WH", "waitInput": "WI", "maxAttemptES": "XA", "decAttOnFailedES": "XF", diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 628ef8c7f..332628954 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -116,6 +116,8 @@ def submit_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetSubmitRe user_dn = step_definition.get("user_dn") task_param_map = step_definition.get("task_params", {}) # task_param_map["userName"] = user_name + # Always set workflowHoldup to True to hold up the workflow until released by workflow processor + task_param_map["workflowHoldup"] = True # Submit task tmp_ret_flag, temp_ret_val = self.tbif.insertTaskParamsPanda(task_param_map, user_dn, False, decode=False) if tmp_ret_flag: From a9fab0995143226f8a69c0c8555a668d6e7720a0 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 24 Nov 
2025 17:41:02 +0100 Subject: [PATCH 054/101] workflows4: fix for workflowHoldup --- .../db_proxy_mods/task_complex_module.py | 5 +++ .../step_handler_plugins/base_step_handler.py | 11 +++++++ .../panda_task_step_handler.py | 33 +++++++++++++++++++ pandaserver/workflow/workflow_utils.py | 6 ++-- 4 files changed, 52 insertions(+), 3 deletions(-) diff --git a/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py b/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py index f1de5ebc3..a8e09270f 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py @@ -3897,6 +3897,11 @@ def prepareTasksToBeFinished_JEDI(self, vo, prodSourceLabel, nTasks=50, simTasks self.cur.execute(sqlMUT + comment, varMap) nRow = self.cur.rowcount tmpLog.debug(f"jediTaskID={jediTaskID} updated {nRow} mutable datasets") + elif taskSpec.is_workflow_holdup(): + # hold up workflow tasks + # go to defined to trigger CF + newTaskStatus = "defined" + tmpLog.debug(f"jediTaskID={jediTaskID} held up as workflow task") else: # update input datasets for varMap in varMapList: diff --git a/pandaserver/workflow/step_handler_plugins/base_step_handler.py b/pandaserver/workflow/step_handler_plugins/base_step_handler.py index f42210302..f7c9783d6 100644 --- a/pandaserver/workflow/step_handler_plugins/base_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/base_step_handler.py @@ -59,3 +59,14 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu WFStepTargetCheckResult: An object containing the result of the check, including success status, current step status, and message. """ raise NotImplementedError("Subclasses must implement this method.") + + def on_all_inputs_done(self, step_spec: WFStepSpec, **kwargs) -> None: + """ + Hook method called when all inputs for the step are done. + This method can be overridden by subclasses to perform actions when all inputs are ready. 
+ + Args: + step_spec (WFStepSpec): Specifications of the workflow step whose inputs are done. + **kwargs: Additional keyword arguments. + """ + raise NotImplementedError("Subclasses must implement this method.") diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 332628954..c6d29af28 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -192,3 +192,36 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu check_result.message = f"exception {str(e)}" tmp_log.error(f"Failed to check status: {traceback.format_exc()}") return check_result + + def on_all_inputs_done(self, step_spec: WFStepSpec, **kwargs) -> None: + """ + Hook method called when all inputs for the step are done. + For PanDA task steps, unset workflowHoldup of the target task to allow it to proceed. + + Args: + step_spec (WFStepSpec): The workflow step specification containing details about the step. + **kwargs: Additional keyword arguments. 
+ """ + tmp_log = LogWrapper(logger, f"on_all_inputs_done workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + try: + # Check step flavor + if step_spec.flavor != self.plugin_flavor: + tmp_log.warning(f"flavor={step_spec.flavor} not {self.plugin_flavor}; skipped") + return + if step_spec.target_id is None: + tmp_log.warning(f"target_id is None; skipped") + return + # Get task ID + task_id = int(step_spec.target_id) + # Get task spec + _, task_spec = self.tbif.getTaskWithID_JEDI(task_id) + if task_spec is None: + tmp_log.error(f"task_id={task_id} not found; skipped") + return + # Unset workflowHoldup + if task_spec.is_workflow_holdup(): + task_spec.set_workflow_holdup(False) + self.tbif.updateTask_JEDI(task_spec, {"jediTaskID": task_spec.jediTaskID}) + tmp_log.info(f"Unset workflowHoldup for task_id={task_id}") + except Exception as e: + tmp_log.error(f"Failed with: {traceback.format_exc()}") diff --git a/pandaserver/workflow/workflow_utils.py b/pandaserver/workflow/workflow_utils.py index dce483be5..765af1b5b 100644 --- a/pandaserver/workflow/workflow_utils.py +++ b/pandaserver/workflow/workflow_utils.py @@ -423,9 +423,9 @@ def make_task_params(self, task_template, id_map, workflow_node): tmp_item["value"] = f"-a {task_params['buildSpec']['archiveName']}" del task_params["buildSpec"] # parent - if self.parents and len(self.parents) == 1: - task_params["noWaitParent"] = True - task_params["parentTaskName"] = id_map[list(self.parents)[0]].task_params["taskName"] + # if self.parents and len(self.parents) == 1: + # task_params["noWaitParent"] = True + # task_params["parentTaskName"] = id_map[list(self.parents)[0]].task_params["taskName"] # notification if not self.is_workflow_output: task_params["noEmail"] = True From 4c4000eb1edd107c1a786cd51b0825160a515c7a Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 24 Nov 2025 18:51:22 +0100 Subject: [PATCH 055/101] workflows4: fix test mutable for workflowholdup --- pandajedi/jediorder/ContentsFeeder.py | 3 
+++ pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py | 5 ----- .../step_handler_plugins/panda_task_step_handler.py | 5 +++-- pandaserver/workflow/workflow_core.py | 6 ++++++ 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pandajedi/jediorder/ContentsFeeder.py b/pandajedi/jediorder/ContentsFeeder.py index f47c7e633..2a7e4a1d5 100644 --- a/pandajedi/jediorder/ContentsFeeder.py +++ b/pandajedi/jediorder/ContentsFeeder.py @@ -240,6 +240,9 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): ): # dummy metadata when parent is running tmpMetadata = {"state": "mutable"} + # set mutable when workflow holdup is set + if taskSpec.is_workflow_holdup(): + tmpMetadata = {"state": "mutable"} except Exception: errtype, errvalue = sys.exc_info()[:2] tmpLog.error(f"{self.__class__.__name__} failed to get metadata to {errtype.__name__}:{errvalue}") diff --git a/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py b/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py index a8e09270f..f1de5ebc3 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py @@ -3897,11 +3897,6 @@ def prepareTasksToBeFinished_JEDI(self, vo, prodSourceLabel, nTasks=50, simTasks self.cur.execute(sqlMUT + comment, varMap) nRow = self.cur.rowcount tmpLog.debug(f"jediTaskID={jediTaskID} updated {nRow} mutable datasets") - elif taskSpec.is_workflow_holdup(): - # hold up workflow tasks - # go to defined to trigger CF - newTaskStatus = "defined" - tmpLog.debug(f"jediTaskID={jediTaskID} held up as workflow task") else: # update input datasets for varMap in varMapList: diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index c6d29af28..d75d605a0 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ 
b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -116,8 +116,9 @@ def submit_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetSubmitRe user_dn = step_definition.get("user_dn") task_param_map = step_definition.get("task_params", {}) # task_param_map["userName"] = user_name - # Always set workflowHoldup to True to hold up the workflow until released by workflow processor - task_param_map["workflowHoldup"] = True + if not step_spec.get_parameter("all_inputs_complete"): + # Some inputs are not complete, set workflowHoldup to True to hold up the workflow until released by workflow processor + task_param_map["workflowHoldup"] = True # Submit task tmp_ret_flag, temp_ret_val = self.tbif.insertTaskParamsPanda(task_param_map, user_dn, False, decode=False) if tmp_ret_flag: diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index aa8e50332..f465c5a6c 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -893,6 +893,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W return process_result # Check if all input data are good, aka ready as input all_inputs_good = True + all_inputs_complete = True for input_data_name in input_data_list: data_spec = data_spec_map.get(input_data_name) if data_spec is None: @@ -903,6 +904,8 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W tmp_log.debug(f"Input data {input_data_name} status {data_spec.status} is not ready for input") all_inputs_good = False break + elif data_spec.status not in WFDataStatus.done_statuses: + all_inputs_complete = False # If not all inputs are good, just return and wait for next round if not all_inputs_good: tmp_log.debug(f"Some input data are not good; skipped") @@ -925,6 +928,9 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W tmp_log.debug(f"Output data_id={data_spec.data_id} name={output_data_name} 
status={data_spec.status} not in binding; skipped") else: tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") + if all_inputs_complete: + # All inputs are complete, mark in step_spec + step_spec.set_parameter("all_inputs_complete", True) # Old code for reference # if step_spec_definition.get("is_tail"): # # Tail step, set root output source_step_id From e2b227d24f99cd0436dac35f86c4ebcb9c9bb1ff Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 25 Nov 2025 10:15:51 +0100 Subject: [PATCH 056/101] workflows4: fix --- pandajedi/jediorder/ContentsFeeder.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandajedi/jediorder/ContentsFeeder.py b/pandajedi/jediorder/ContentsFeeder.py index 2a7e4a1d5..67a1c3624 100644 --- a/pandajedi/jediorder/ContentsFeeder.py +++ b/pandajedi/jediorder/ContentsFeeder.py @@ -621,6 +621,11 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): setFrozenTime = False skip_secondaries = True tmpLog.debug("end loop") + # task holdup by workflow + if not taskOnHold and not taskBroken and allUpdated and checkedMaster and taskSpec.is_workflow_holdup(): + # hold up by the workflow + taskOnHold = True + tmpLog.debug("task to hold up by workflow") # no master input if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster: tmpErrStr = "no master input files. 
input dataset is empty" @@ -631,9 +636,6 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): if noWaitParent: # parent is running taskOnHold = True - elif taskSpec.is_workflow_holdup(): - # hold up by the workflow - taskOnHold = True else: # the task has no parent or parent is finished if master_is_open and taskSpec.runUntilClosed(): From 7420a027848c4fb43092a928c0544bb6e082027e Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 25 Nov 2025 12:26:59 +0100 Subject: [PATCH 057/101] workflows4: fix --- .../panda_task_step_handler.py | 2 + pandaserver/workflow/workflow_core.py | 101 +++++++++++++----- 2 files changed, 79 insertions(+), 24 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index d75d605a0..230911845 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -224,5 +224,7 @@ def on_all_inputs_done(self, step_spec: WFStepSpec, **kwargs) -> None: task_spec.set_workflow_holdup(False) self.tbif.updateTask_JEDI(task_spec, {"jediTaskID": task_spec.jediTaskID}) tmp_log.info(f"Unset workflowHoldup for task_id={task_id}") + # Done + tmp_log.debug(f"Done") except Exception as e: tmp_log.error(f"Failed with: {traceback.format_exc()}") diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index f465c5a6c..25de97558 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -711,6 +711,40 @@ def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: # ---- Step status transitions ----------------------------- + def _check_all_inputs_of_step(self, tmp_log: LogWrapper, input_data_list: List[str], data_spec_map: Dict[str, WFDataSpec]) -> Dict[str, bool]: + """ + Check whether all input data of a step are sufficient or complete + + Args: + tmp_log (LogWrapper): 
Logger for logging messages + input_data_list (List[str]): List of input data names for the step + data_spec_map (Dict[str, WFDataSpec]): Mapping of data names to their specifications + + Returns: + Dict[str, bool]: Dictionary indicating whether all inputs sufficient and complete + """ + # Check if all input data sufficient or complete + ret_dict = {"all_inputs_sufficient": True, "all_inputs_complete": True} + for input_data_name in input_data_list: + data_spec = data_spec_map.get(input_data_name) + if data_spec is None: + tmp_log.warning(f"Input data {input_data_name} not found in workflow data") + ret_dict["all_inputs_sufficient"] = False + ret_dict["all_inputs_complete"] = False + break + elif data_spec.status not in WFDataStatus.good_input_statuses: + tmp_log.debug(f"Input data {input_data_name} status {data_spec.status} is not sufficient as input") + ret_dict["all_inputs_sufficient"] = False + ret_dict["all_inputs_complete"] = False + break + elif data_spec.status not in WFDataStatus.done_statuses: + ret_dict["all_inputs_complete"] = False + if ret_dict["all_inputs_complete"]: + tmp_log.debug("All input data are complete") + elif ret_dict["all_inputs_sufficient"]: + tmp_log.debug("All input data are sufficient as input") + return ret_dict + def process_step_registered(self, step_spec: WFStepSpec) -> WFStepProcessResult: """ Process a step in registered status @@ -880,10 +914,6 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W return process_result # Process try: - # Get data spec map of the workflow - if data_spec_map is None: - data_specs = self.tbif.get_workflow_data(workflow_id=step_spec.workflow_id) - data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} # Input data list of the step step_spec_definition = step_spec.definition_json_map input_data_list = step_spec_definition.get("input_data_list") @@ -891,28 +921,19 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W 
process_result.message = f"Step definition does not have input_data_list; skipped" tmp_log.warning(f"{process_result.message}") return process_result - # Check if all input data are good, aka ready as input - all_inputs_good = True - all_inputs_complete = True - for input_data_name in input_data_list: - data_spec = data_spec_map.get(input_data_name) - if data_spec is None: - tmp_log.warning(f"Input data {input_data_name} not found in workflow data") - all_inputs_good = False - break - elif data_spec.status not in WFDataStatus.good_input_statuses: - tmp_log.debug(f"Input data {input_data_name} status {data_spec.status} is not ready for input") - all_inputs_good = False - break - elif data_spec.status not in WFDataStatus.done_statuses: - all_inputs_complete = False - # If not all inputs are good, just return and wait for next round - if not all_inputs_good: - tmp_log.debug(f"Some input data are not good; skipped") + # Get data spec map of the workflow + if data_spec_map is None: + data_specs = self.tbif.get_data_of_workflow(workflow_id=step_spec.workflow_id) + data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} + # Check if all input data are good + all_inputs_stats = self._check_all_inputs_of_step(tmp_log, input_data_list, data_spec_map) + # If not all inputs are sufficient as input, just return and wait for next round + if not all_inputs_stats["all_inputs_sufficient"]: + tmp_log.debug(f"Some input data are not sufficient as input; skipped") process_result.success = True return process_result # All inputs are good, register outputs of the step and update step status to ready - tmp_log.debug(f"All input data are good; proceeding") + tmp_log.debug(f"All input data are sufficient as input; proceeding") output_data_list = step_spec_definition.get("output_data_list", []) # outputs_raw_dict = step_spec_definition.get("outputs", {}) # output_types = step_spec_definition.get("output_types", []) @@ -928,7 +949,7 @@ def process_step_pending(self, 
step_spec: WFStepSpec, data_spec_map: Dict[str, W tmp_log.debug(f"Output data_id={data_spec.data_id} name={output_data_name} status={data_spec.status} not in binding; skipped") else: tmp_log.warning(f"Output data {output_data_name} not found in workflow data; skipped") - if all_inputs_complete: + if all_inputs_stats["all_inputs_complete"]: # All inputs are complete, mark in step_spec step_spec.set_parameter("all_inputs_complete", True) # Old code for reference @@ -1037,6 +1058,18 @@ def process_step_starting(self, step_spec: WFStepSpec) -> WFStepProcessResult: return process_result # Process try: + # Input data list of the step + step_spec_definition = step_spec.definition_json_map + input_data_list = step_spec_definition.get("input_data_list") + if input_data_list is None: + process_result.message = f"Step definition does not have input_data_list; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Get data spec map of the workflow + data_specs = self.tbif.get_data_of_workflow(workflow_id=step_spec.workflow_id) + data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} + # Check if all input data are good + all_inputs_stats = self._check_all_inputs_of_step(tmp_log, input_data_list, data_spec_map) # Get the step handler plugin step_handler = self.get_plugin("step_handler", step_spec.flavor) # Check the step status @@ -1045,6 +1078,10 @@ def process_step_starting(self, step_spec: WFStepSpec) -> WFStepProcessResult: process_result.message = f"Failed to check step; {check_result.message}" tmp_log.error(f"{process_result.message}") return process_result + # If all inputs are complete, mark in step_spec and call the hook of step_handler + if all_inputs_stats["all_inputs_complete"]: + step_spec.set_parameter("all_inputs_complete", True) + step_handler.on_all_inputs_done(step_spec) # Update step status if check_result.step_status in WFStepStatus.after_starting_statuses: # Step status advanced @@ -1093,8 +1130,24 @@ def 
process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: return process_result # Process try: + # Input data list of the step + step_spec_definition = step_spec.definition_json_map + input_data_list = step_spec_definition.get("input_data_list") + if input_data_list is None: + process_result.message = f"Step definition does not have input_data_list; skipped" + tmp_log.warning(f"{process_result.message}") + return process_result + # Get data spec map of the workflow + data_specs = self.tbif.get_data_of_workflow(workflow_id=step_spec.workflow_id) + data_spec_map = {data_spec.name: data_spec for data_spec in data_specs} + # Check if all input data are good + all_inputs_stats = self._check_all_inputs_of_step(tmp_log, input_data_list, data_spec_map) # Get the step handler plugin step_handler = self.get_plugin("step_handler", step_spec.flavor) + # If all inputs are complete, mark in step_spec and call the hook of step_handler + if all_inputs_stats["all_inputs_complete"]: + step_spec.set_parameter("all_inputs_complete", True) + step_handler.on_all_inputs_done(step_spec) # Check the step status check_result = step_handler.check_target(step_spec) if not check_result.success or check_result.step_status is None: From 84cb281992dc1936a5f8a8f41302e83092f55bd1 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 25 Nov 2025 13:26:59 +0100 Subject: [PATCH 058/101] tmp fix --- pandajedi/jediorder/ContentsFeeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandajedi/jediorder/ContentsFeeder.py b/pandajedi/jediorder/ContentsFeeder.py index 67a1c3624..bd7c8df86 100644 --- a/pandajedi/jediorder/ContentsFeeder.py +++ b/pandajedi/jediorder/ContentsFeeder.py @@ -622,7 +622,7 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): skip_secondaries = True tmpLog.debug("end loop") # task holdup by workflow - if not taskOnHold and not taskBroken and allUpdated and checkedMaster and taskSpec.is_workflow_holdup(): + if not taskOnHold and not 
taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster and taskSpec.is_workflow_holdup(): # hold up by the workflow taskOnHold = True tmpLog.debug("task to hold up by workflow") From 94413deed3660a394ce7b1c2fc25b3e9320fc7fb Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 25 Nov 2025 14:29:22 +0100 Subject: [PATCH 059/101] workflows4: fix for holdup by nReady --- pandajedi/jediorder/ContentsFeeder.py | 12 +++++++++--- .../db_proxy_mods/task_complex_module.py | 15 ++++++++++----- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/pandajedi/jediorder/ContentsFeeder.py b/pandajedi/jediorder/ContentsFeeder.py index bd7c8df86..4e1e65a3f 100644 --- a/pandajedi/jediorder/ContentsFeeder.py +++ b/pandajedi/jediorder/ContentsFeeder.py @@ -185,6 +185,7 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): parentOutDatasets.add(tmpParentOutDataset.containerName + "/") # loop over all datasets nFilesMaster = 0 + nFilesMasterReady = 0 checkedMaster = False setFrozenTime = True master_offset = None @@ -532,7 +533,7 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): orderBy = None # feed files to the contents table tmpLog.debug("update contents") - retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI( + res_dict = self.taskBufferIF.insertFilesForDataset_JEDI( datasetSpec, tmpRet, tmpMetadata["state"], @@ -568,6 +569,10 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): maxFileRecords, skip_short_output, ) + retDB = res_dict["retDB"] + missingFileList = res_dict["missingFileList"] + nFilesUnique = res_dict["numUniqueLfn"] + diagMap = res_dict["diagMap"] if retDB is False: taskSpec.setErrDiag(f"failed to insert files for {datasetSpec.datasetName}. 
{diagMap['errMsg']}") allUpdated = False @@ -598,6 +603,7 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): if datasetSpec.isMaster(): checkedMaster = True nFilesMaster += nFilesUnique + nFilesMasterReady += res_dict.get("nReady", 0) master_offset = datasetSpec.getOffset() # running task if diagMap["isRunningTask"]: @@ -621,8 +627,8 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): setFrozenTime = False skip_secondaries = True tmpLog.debug("end loop") - # task holdup by workflow - if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster and taskSpec.is_workflow_holdup(): + # task holdup by workflow if no master inputs are ready + if not taskOnHold and not taskBroken and allUpdated and nFilesMasterReady == 0 and checkedMaster and taskSpec.is_workflow_holdup(): # hold up by the workflow taskOnHold = True tmpLog.debug("task to hold up by workflow") diff --git a/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py b/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py index f1de5ebc3..1bf9a191c 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/task_complex_module.py @@ -222,8 +222,10 @@ def insertFilesForDataset_JEDI( tmpLog.debug(f"skipShortInput={skipShortInput} skipShortOutput={skip_short_output} inputPreStaging={inputPreStaging} order_by={order_by}") # return value for failure diagMap = {"errMsg": "", "nChunksForScout": nChunksForScout, "nActivatedPending": 0, "isRunningTask": False} - failedRet = False, 0, None, diagMap - harmlessRet = None, 0, None, diagMap + # failedRet = False, 0, None, diagMap + # harmlessRet = None, 0, None, diagMap + failedRet = {"ret_val": False, "missingFileList": 0, "numUniqueLfn": None, "diagMap": diagMap} + harmlessRet = {"ret_val": None, "missingFileList": 0, "numUniqueLfn": None, "diagMap": diagMap} regStart = naive_utcnow() # mutable fake_mutable_for_skip_short_output = False @@ -632,7 
+634,8 @@ def insertFilesForDataset_JEDI( nEventsLost = 0 nEventsExist = 0 stagingLB = set() - retVal = None, missingFileList, None, diagMap + # retVal = None, missingFileList, None, diagMap + retVal = {"ret_val": None, "missingFileList": missingFileList, "numUniqueLfn": None, "diagMap": diagMap, "nReady": nReady} # begin transaction self.conn.begin() # check task @@ -741,7 +744,8 @@ def insertFilesForDataset_JEDI( self.cur.execute(sqlCo + comment, varMap) resCo = self.cur.fetchone() numUniqueLfn = resCo[0] - retVal = True, missingFileList, numUniqueLfn, diagMap + # retVal = True, missingFileList, numUniqueLfn, diagMap + retVal = {"ret_val": True, "missingFileList": missingFileList, "numUniqueLfn": numUniqueLfn, "diagMap": diagMap, "nReady": nReady} else: oldDsStatus, nFilesUnprocessed, dsStateInDB, nFilesToUseDS, nFilesUsedInDS = resDs tmpLog.debug(f"ds.state={dsStateInDB} in DB") @@ -1258,7 +1262,8 @@ def insertFilesForDataset_JEDI( if nFilesUnprocessed not in [0, None]: diagMap["nActivatedPending"] += nFilesUnprocessed # set return value - retVal = True, missingFileList, numUniqueLfn, diagMap + # retVal = True, missingFileList, numUniqueLfn, diagMap + retVal = {"ret_val": True, "missingFileList": missingFileList, "numUniqueLfn": numUniqueLfn, "diagMap": diagMap, "nReady": nReady} # fix secondary files in staging if inputPreStaging and datasetSpec.isSeqNumber(): get_task_utils_module(self).fix_associated_files_in_staging(datasetSpec.jediTaskID, secondary_id=datasetSpec.datasetID) From 6dbee698309c4159ff890c0c0e39ce92750c7dd0 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 25 Nov 2025 16:11:38 +0100 Subject: [PATCH 060/101] workflows4: fix --- pandajedi/jediorder/ContentsFeeder.py | 2 +- pandaserver/workflow/workflow_core.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandajedi/jediorder/ContentsFeeder.py b/pandajedi/jediorder/ContentsFeeder.py index 4e1e65a3f..38999df07 100644 --- a/pandajedi/jediorder/ContentsFeeder.py +++ 
b/pandajedi/jediorder/ContentsFeeder.py @@ -569,7 +569,7 @@ def feed_contents_to_tasks(self, task_ds_list, real_run=True): maxFileRecords, skip_short_output, ) - retDB = res_dict["retDB"] + retDB = res_dict["ret_val"] missingFileList = res_dict["missingFileList"] nFilesUnique = res_dict["numUniqueLfn"] diagMap = res_dict["diagMap"] diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 25de97558..8ae833992 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -933,7 +933,6 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W process_result.success = True return process_result # All inputs are good, register outputs of the step and update step status to ready - tmp_log.debug(f"All input data are sufficient as input; proceeding") output_data_list = step_spec_definition.get("output_data_list", []) # outputs_raw_dict = step_spec_definition.get("outputs", {}) # output_types = step_spec_definition.get("output_types", []) From 672caebd66b76ae7140a986c1369846ead4c1657 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 25 Nov 2025 19:28:36 +0100 Subject: [PATCH 061/101] workflows4: improve hook to release task --- .../step_handler_plugins/panda_task_step_handler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 230911845..bd5ed8b60 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -219,11 +219,14 @@ def on_all_inputs_done(self, step_spec: WFStepSpec, **kwargs) -> None: if task_spec is None: tmp_log.error(f"task_id={task_id} not found; skipped") return - # Unset workflowHoldup + # Unset workflowHoldup and release the task if task_spec.is_workflow_holdup(): 
task_spec.set_workflow_holdup(False) self.tbif.updateTask_JEDI(task_spec, {"jediTaskID": task_spec.jediTaskID}) - tmp_log.info(f"Unset workflowHoldup for task_id={task_id}") + tmp_log.info(f"task_id={task_id} unset workflowHoldup") + if task_spec.status == "pending": + self.tbif.release_task_on_hold(task_id) + tmp_log.info(f"task_id={task_id} released from pending") # Done tmp_log.debug(f"Done") except Exception as e: From 391d17803ec617fac9dbae98a404bb87b93683c8 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 25 Nov 2025 19:30:32 +0100 Subject: [PATCH 062/101] workflows4: fix --- .../step_handler_plugins/panda_task_step_handler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index bd5ed8b60..e2340679c 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -225,8 +225,11 @@ def on_all_inputs_done(self, step_spec: WFStepSpec, **kwargs) -> None: self.tbif.updateTask_JEDI(task_spec, {"jediTaskID": task_spec.jediTaskID}) tmp_log.info(f"task_id={task_id} unset workflowHoldup") if task_spec.status == "pending": - self.tbif.release_task_on_hold(task_id) - tmp_log.info(f"task_id={task_id} released from pending") + tmp_ret = self.tbif.release_task_on_hold(task_id) + if not tmp_ret: + tmp_log.error(f"task_id={task_id} failed to release from pending") + else: + tmp_log.info(f"task_id={task_id} released from pending") # Done tmp_log.debug(f"Done") except Exception as e: From b29aa284c2739b4f6d4cd12e95719b72ee25577e Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 27 Nov 2025 09:21:20 +0100 Subject: [PATCH 063/101] workflows4: rename watchdog to be workflow manager --- ...Dog.py => AtlasWorkflowManagerWatchDog.py} | 6 +- pandaserver/api/v1/workflow_api.py | 66 ------------------- 2 files changed, 3 
insertions(+), 69 deletions(-) rename pandajedi/jedidog/{AtlasWorkflowProcessorWatchDog.py => AtlasWorkflowManagerWatchDog.py} (91%) diff --git a/pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py b/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py similarity index 91% rename from pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py rename to pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py index 5d25fe497..2343f7811 100644 --- a/pandajedi/jedidog/AtlasWorkflowProcessorWatchDog.py +++ b/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py @@ -18,9 +18,9 @@ logger = PandaLogger().getLogger(__name__.split(".")[-1]) -class AtlasWorkflowProcessorWatchDog(WatchDogBase): +class AtlasWorkflowManagerWatchDog(WatchDogBase): """ - Workflow processor watchdog for ATLAS + Workflow manager watchdog for ATLAS """ # constructor @@ -37,7 +37,7 @@ def doProcessWorkflows(self): tmpLog.debug("start") try: # watchdog lock - got_lock = self.get_process_lock("AtlasWFProcDog.doProcessWorkflows", timeLimit=1) + got_lock = self.get_process_lock("AtlasWFManagerDog.doProcessWorkflows", timeLimit=1) if not got_lock: tmpLog.debug("locked by another watchdog process. Skipped") return diff --git a/pandaserver/api/v1/workflow_api.py b/pandaserver/api/v1/workflow_api.py index 32d38b7a8..065e359f1 100644 --- a/pandaserver/api/v1/workflow_api.py +++ b/pandaserver/api/v1/workflow_api.py @@ -134,69 +134,3 @@ def submit_workflow(req: PandaRequest, workflow_definition: dict) -> dict: tmp_logger.debug(f"Done. Took {time_delta.seconds}.{time_delta.microseconds // 1000:03d} sec") return generate_response(success, message, data) - - -# def put_workflow_request(panda_request: PandaRequest, data: str, check: bool = False, sync: bool = False) -> str: -# """ -# Upload workflow request to the server. -# Args: -# panda_request (PandaRequest): PanDA request object. -# data (string): workflow request data. -# check (bool): check flag. -# sync (bool): synchronous processing. 
-# Returns: -# string: String in json format with (boolean, message) -# """ - -# if not Protocol.isSecure(panda_request): -# return json.dumps((False, ERROR_NOT_SECURE)) - -# user_name = panda_request.subprocess_env["SSL_CLIENT_S_DN"] -# creation_time = naive_utcnow().strftime("%Y-%m-%d %H:%M:%S") - -# tmp_log = LogWrapper(_logger, "put_workflow_request") - -# tmp_log.debug(f"start user={user_name} check={check}") - -# if check in ("True", True): -# check = True -# elif sync in ("True", True): -# sync = True - -# try: -# # generate the filename -# file_name = f"{panda_config.cache_dir}/workflow.{str(uuid.uuid4())}" -# tmp_log.debug(f"file={file_name}") - -# # write -# with open(file_name, "w") as file_object: -# data_dict = { -# "userName": user_name, -# "creationTime": creation_time, -# "data": json.loads(data), -# } -# json.dump(data_dict, file_object) - -# if sync or check: -# from pandaserver.taskbuffer.workflow_processor import WorkflowProcessor - -# processor = WorkflowProcessor(log_stream=_logger) -# if check: -# ret = processor.process(file_name, True, True, True, True) -# else: -# ret = processor.process(file_name, True, False, True, False) -# if os.path.exists(file_name): -# try: -# os.remove(file_name) -# except Exception: -# pass -# tmp_log.debug("done") -# return json.dumps((True, ret)) - -# except Exception as exc: -# error_message = f"cannot put request due to {str(exc)} " -# tmp_log.error(error_message + traceback.format_exc()) -# return json.dumps((False, error_message)) - -# tmp_log.debug("done") -# return json.dumps((True, "request was accepted and will be processed in a few minutes")) From dc90947d9b9af05afa39e4b60fe3ac6338107fa6 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 27 Nov 2025 13:06:35 +0100 Subject: [PATCH 064/101] workflows4: add messaging for fast forward transient status --- .../workflow_manager_msg_processor.py | 80 ++++++++++++++++ pandaserver/workflow/workflow_base.py | 3 + pandaserver/workflow/workflow_core.py | 96 
+++++++++++++++++++ 3 files changed, 179 insertions(+) create mode 100644 pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py diff --git a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py new file mode 100644 index 000000000..8ea252866 --- /dev/null +++ b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py @@ -0,0 +1,80 @@ +import json + +from pandacommon.pandalogger import logger_utils + +from pandajedi.jediddm.DDMInterface import DDMInterface +from pandajedi.jedimsgprocessor.base_msg_processor import BaseMsgProcPlugin +from pandaserver.workflow.workflow_core import WorkflowInterface + +base_logger = logger_utils.setup_logger(__name__.split(".")[-1]) + + +# Workflow manager message processor plugin +class WorkflowManagerMsgProcPlugin(BaseMsgProcPlugin): + """ + Message-driven workflow manager + """ + + def initialize(self): + """ + Initialize the plugin + """ + BaseMsgProcPlugin.initialize(self) + ddmIF = DDMInterface() + ddmIF.setupInterface() + the_pid = self.get_pid() + self.workflow_interface = WorkflowInterface(self.tbIF) + + def process(self, msg_obj): + """ + Process the message + Typical message data looks like: + {"msg_type":"workflow", "workflow_id": 123, "timestamp": 987654321} + {"msg_type":"wfstep", "step_id": 456, "timestamp": 987654321} + {"msg_type":"wfdata", "data_id": 789, "timestamp": 987654321} + + Args: + msg_obj: message object + """ + tmp_log = logger_utils.make_logger(base_logger, token=self.get_pid(), method_name="process") + # start + tmp_log.info("start") + tmp_log.debug(f"sub_id={msg_obj.sub_id} ; msg_id={msg_obj.msg_id}") + # parse json + try: + msg_dict = json.loads(msg_obj.data) + except Exception as e: + err_str = f"failed to parse message json {msg_obj.data} , skipped. 
{e.__class__.__name__} : {e}" + tmp_log.error(err_str) + raise + # sanity check + try: + msg_type = msg_dict["msg_type"] + except Exception as e: + err_str = f"failed to parse message object dict {msg_dict} , skipped. {e.__class__.__name__} : {e}" + tmp_log.error(err_str) + raise + if msg_type not in ("workflow", "wfstep", "wfdata"): + err_str = f"got unknown msg_type {msg_type} , skipped " + tmp_log.error(err_str) + raise + # run + try: + if msg_type == "workflow": + workflow_id = msg_dict["workflow_id"] + stats = self.workflow_interface.process_workflow(workflow_id) + tmp_log.info(f"processed workflow_id={workflow_id} : {stats}") + elif msg_type == "wfstep": + step_id = msg_dict["step_id"] + stats = self.workflow_interface.process_workflow_step(step_id) + tmp_log.info(f"processed step_id={step_id} : {stats}") + elif msg_type == "wfdata": + data_id = msg_dict["data_id"] + stats = self.workflow_interface.process_workflow_data(data_id) + tmp_log.info(f"processed data_id={data_id} : {stats}") + except Exception as e: + err_str = f"failed to run, skipped. 
{e.__class__.__name__} : {e}" + tmp_log.error(err_str) + raise + # done + tmp_log.info("done") diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index 97a425f66..edb884262 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -36,6 +36,7 @@ class WorkflowStatus(object): active_statuses = (registered, parsed, checking, checked, starting, running) final_statuses = (done, failed, cancelled) + transient_statuses = (parsed, checking, checked, starting) class WFStepStatus(object): @@ -62,6 +63,7 @@ class WFStepStatus(object): after_starting_uninterrupted_statuses = (running, done, failed) after_running_statuses = (done, failed, cancelled) final_statuses = (done, failed, closed, cancelled) + transient_statuses = (checked_true, checked_false, ready) class WFDataStatus(object): @@ -98,6 +100,7 @@ class WFDataStatus(object): after_waiting_suffice_statuses = (done_waited, cancelled) terminated_statuses = (done_generated, done_waited, done_skipped, cancelled, retired) nonreusable_statuses = (cancelled, retired) + transient_statuses = (checked_nonexist, checked_insuffi, checked_suffice, checked_complete) # ==== Types =================================================== diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 8ae833992..e2c9edff3 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -1,3 +1,4 @@ +import atexit import copy import functools import importlib @@ -53,6 +54,7 @@ # ==== Global Parameters ======================================= WORKFLOW_CHECK_INTERVAL_SEC = 60 +MESSAGE_QUEUE_NAME = "jedi_workflow_manager" # ==== Plugin Map ============================================== @@ -122,6 +124,8 @@ def __init__(self, task_buffer, *args, **kwargs): self.ddm_if = rucioAPI self.full_pid = f"{socket.getfqdn().split('.')[0]}-{os.getpgrp()}-{os.getpid()}" self.plugin_map = {} + self.mb_proxy = None + 
self.set_mb_proxy() def get_plugin(self, plugin_type: str, flavor: str): """ @@ -145,6 +149,89 @@ def get_plugin(self, plugin_type: str, flavor: str): plugin = self.plugin_map[plugin_type][flavor] return plugin + def set_mb_proxy(self): + """ + Set the message broker proxy for workflow manager messaging + """ + try: + jedi_config = importlib.import_module("pandajedi.jediconfig.jedi_config") + if hasattr(jedi_config, "mq") and hasattr(jedi_config.mq, "configFile") and jedi_config.mq.configFile: + MsgProcAgent = importlib.import_module(f"pandajedi.jediorder.JediMsgProcessor.MsgProcAgent") + else: + logger.warning("Message queue config not found in jedi_config; skipped workflow manager messaging") + return None + out_q_list = [MESSAGE_QUEUE_NAME] + mq_agent = MsgProcAgent(config_file=jedi_config.mq.configFile) + mb_proxy_dict = mq_agent.start_passive_mode(in_q_list=None, out_q_list=out_q_list) + # stop with atexit + atexit.register(mq_agent.stop_passive_mode) + # set mb_proxy + self.mb_proxy = mb_proxy_dict["out"][MESSAGE_QUEUE_NAME] + logger.info(f"Set mb_proxy about queue {MESSAGE_QUEUE_NAME} for workflow manager messaging") + except Exception: + logger.warning(f"Failed to set mb_proxy about queue {MESSAGE_QUEUE_NAME}; skipped workflow manager messaging: {traceback.format_exc()}") + return None + + def _send_message(self, tmp_log, msg_type: str, data_dict: Dict[str, Any] = None): + """ + Send a message to the workflow manager message queue + + Args: + tmp_log (logging.Logger): Logger for logging messages + msg_type (str): Type of the message (e.g., "workflow", "wfstep", "wfdata") + data_dict (Dict[str, Any], optional): Additional data to include in the message + """ + if self.mb_proxy is None: + return None + try: + now_time = naive_utcnow() + now_ts = int(now_time.timestamp()) + # get mbproxy + msg_dict = {} + if data_dict: + msg_dict.update(data_dict) + msg_dict.update( + { + "msg_type": msg_type, + "timestamp": now_ts, + } + ) + msg = json.dumps(msg_dict) + 
self.mb_proxy.send(msg) + tmp_log.debug(f"Sent message to workflow manager queue {MESSAGE_QUEUE_NAME}: {msg}") + except Exception: + tmp_log.error(f"Failed to send message to workflow manager queue {MESSAGE_QUEUE_NAME}: {traceback.format_exc()}") + + def send_workflow_message(self, workflow_id: int): + """ + Send a message about the workflow to the workflow manager message queue + + Args: + workflow_id (int): ID of the workflow + """ + tmp_log = LogWrapper(logger, f"send_workflow_message workflow_id={workflow_id}") + self._send_message(tmp_log, "workflow", {"workflow_id": workflow_id}) + + def send_step_message(self, step_id: int): + """ + Send a message about the workflow step to the workflow manager message queue + + Args: + step_id (int): ID of the workflow step + """ + tmp_log = LogWrapper(logger, f"send_step_message step_id={step_id}") + self._send_message(tmp_log, "wfstep", {"step_id": step_id}) + + def send_data_message(self, data_id: int): + """ + Send a message about the workflow data to the workflow manager message queue + + Args: + data_id (int): ID of the workflow data + """ + tmp_log = LogWrapper(logger, f"send_data_message data_id={data_id}") + self._send_message(tmp_log, "wfdata", {"data_id": data_id}) + # --- Context managers for locking ------------------------- @contextmanager @@ -704,6 +791,9 @@ def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: data_status_stats["processed"].setdefault(data_spec.status, 0) data_status_stats["processed"][data_spec.status] += 1 data_status_stats["n_processed"] += 1 + # For changes into transient status, send message to trigger processing immediately + if data_spec.status != orig_status and data_spec.status in WFDataStatus.transient_statuses: + self.send_data_message(data_spec.data_id) tmp_log.info( f"Done, processed {data_status_stats['n_processed']}/{n_data} data specs, unchanged: {data_status_stats['unchanged']}, changed: {data_status_stats['changed']}" ) @@ -1236,6 +1326,9 @@ def 
process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W steps_status_stats["processed"].setdefault(step_spec.status, 0) steps_status_stats["processed"][step_spec.status] += 1 steps_status_stats["n_processed"] += 1 + # For changes into transient status, send message to trigger processing immediately + if step_spec.status != orig_status and step_spec.status in WFStepStatus.transient_statuses: + self.send_step_message(step_spec.step_id) tmp_log.info( f"Done, processed {steps_status_stats['n_processed']}/{n_steps} steps, unchanged: {steps_status_stats['unchanged']}, changed: {steps_status_stats['changed']}" ) @@ -1653,6 +1746,9 @@ def process_active_workflows(self) -> Dict: workflows_status_stats["processed"].setdefault(workflow_spec.status, 0) workflows_status_stats["processed"][workflow_spec.status] += 1 workflows_status_stats["n_processed"] += 1 + # For changes into transient status, send message to trigger processing immediately + if workflow_spec.status != orig_status and workflow_spec.status in WorkflowStatus.transient_statuses: + self.send_workflow_message(workflow_spec.workflow_id) workflows_status_stats["n_workflows"] = n_workflows tmp_log.info( f"Done, processed {workflows_status_stats['n_processed']}/{n_workflows} workflows, unchanged: {workflows_status_stats['unchanged']}, changed: {workflows_status_stats['changed']}" From ec662de861f9c24b0d86ed01921bd6bc6ea6ff25 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 27 Nov 2025 16:44:18 +0100 Subject: [PATCH 065/101] workflows4: fixes --- .../workflow_manager_msg_processor.py | 18 ++++++----- pandaserver/workflow/workflow_core.py | 30 +++++++++++-------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py index 8ea252866..ad59ae655 100644 --- a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py +++ 
b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py @@ -20,8 +20,6 @@ def initialize(self): Initialize the plugin """ BaseMsgProcPlugin.initialize(self) - ddmIF = DDMInterface() - ddmIF.setupInterface() the_pid = self.get_pid() self.workflow_interface = WorkflowInterface(self.tbIF) @@ -60,18 +58,22 @@ def process(self, msg_obj): raise # run try: + tmp_log.info(f"got message {msg_dict}") if msg_type == "workflow": workflow_id = msg_dict["workflow_id"] - stats = self.workflow_interface.process_workflow(workflow_id) - tmp_log.info(f"processed workflow_id={workflow_id} : {stats}") + workflow_spec = self.tbIF.get_workflow(workflow_id) + stats = self.workflow_interface.process_workflow(workflow_spec, by="msgproc") + tmp_log.info(f"processed workflow_id={workflow_id}") elif msg_type == "wfstep": step_id = msg_dict["step_id"] - stats = self.workflow_interface.process_workflow_step(step_id) - tmp_log.info(f"processed step_id={step_id} : {stats}") + step_spec = self.tbIF.get_workflow_step(step_id) + stats = self.workflow_interface.process_steps([step_spec], by="msgproc") + tmp_log.info(f"processed step_id={step_id}") elif msg_type == "wfdata": data_id = msg_dict["data_id"] - stats = self.workflow_interface.process_workflow_data(data_id) - tmp_log.info(f"processed data_id={data_id} : {stats}") + data_spec = self.tbIF.get_workflow_data(data_id) + stats = self.workflow_interface.process_datas([data_spec], by="msgproc") + tmp_log.info(f"processed data_id={data_id}") except Exception as e: err_str = f"failed to run, skipped. 
{e.__class__.__name__} : {e}" tmp_log.error(err_str) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index e2c9edff3..aa0dfb5f4 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -154,9 +154,9 @@ def set_mb_proxy(self): Set the message broker proxy for workflow manager messaging """ try: - jedi_config = importlib.import_module("pandajedi.jediconfig.jedi_config") + jedi_config = importlib.import_module("pandajedi.jediconfig").jedi_config if hasattr(jedi_config, "mq") and hasattr(jedi_config.mq, "configFile") and jedi_config.mq.configFile: - MsgProcAgent = importlib.import_module(f"pandajedi.jediorder.JediMsgProcessor.MsgProcAgent") + MsgProcAgent = importlib.import_module(f"pandajedi.jediorder.JediMsgProcessor").MsgProcAgent else: logger.warning("Message queue config not found in jedi_config; skipped workflow manager messaging") return None @@ -167,7 +167,7 @@ def set_mb_proxy(self): atexit.register(mq_agent.stop_passive_mode) # set mb_proxy self.mb_proxy = mb_proxy_dict["out"][MESSAGE_QUEUE_NAME] - logger.info(f"Set mb_proxy about queue {MESSAGE_QUEUE_NAME} for workflow manager messaging") + # logger.debug(f"Set mb_proxy about queue {MESSAGE_QUEUE_NAME} for workflow manager messaging") except Exception: logger.warning(f"Failed to set mb_proxy about queue {MESSAGE_QUEUE_NAME}; skipped workflow manager messaging: {traceback.format_exc()}") return None @@ -198,7 +198,7 @@ def _send_message(self, tmp_log, msg_type: str, data_dict: Dict[str, Any] = None ) msg = json.dumps(msg_dict) self.mb_proxy.send(msg) - tmp_log.debug(f"Sent message to workflow manager queue {MESSAGE_QUEUE_NAME}: {msg}") + tmp_log.debug(f"Sent message") except Exception: tmp_log.error(f"Failed to send message to workflow manager queue {MESSAGE_QUEUE_NAME}: {traceback.format_exc()}") @@ -738,17 +738,18 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: 
tmp_log.error(f"{traceback.format_exc()}") return process_result - def process_data_specs(self, data_specs: List[WFDataSpec]) -> Dict: + def process_datas(self, data_specs: List[WFDataSpec], by: str = "watchdog") -> Dict: """ Process a list of workflow data specifications Args: data_specs (List[WFDataSpec]): List of workflow data specifications to process + by (str): Identifier of the entity processing the data specifications Returns: Dict: Statistics of the processing results """ - tmp_log = LogWrapper(logger, f"process_data_specs workflow_id={data_specs[0].workflow_id}") + tmp_log = LogWrapper(logger, f"process_datas workflow_id={data_specs[0].workflow_id} by={by}") n_data = len(data_specs) tmp_log.debug(f"Start, processing {n_data} data specs") data_status_stats = {"n_data": n_data, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} @@ -1269,18 +1270,19 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result - def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None) -> Dict: + def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "watchdog") -> Dict: """ Process a list of workflow steps Args: step_specs (List[WFStepSpec]): List of workflow step specifications to process data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data name to WFDataSpec for the workflow + by (str): The entity processing the steps, e.g., "watchdog" or "user" Returns: Dict: Statistics of the processing results """ - tmp_log = LogWrapper(logger, f"process_steps workflow_id={step_specs[0].workflow_id}") + tmp_log = LogWrapper(logger, f"process_steps workflow_id={step_specs[0].workflow_id} by={by}") n_steps = len(step_specs) tmp_log.debug(f"Start, processing {n_steps} steps") steps_status_stats = {"n_steps": n_steps, "changed": {}, "unchanged": {}, 
"processed": {}, "n_processed": 0} @@ -1555,7 +1557,7 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc # Process data specs first data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFDataStatus.terminated_statuses)) if data_specs: - data_status_stats = self.process_data_specs(data_specs) + data_status_stats = self.process_datas(data_specs) # Get steps in registered status required_step_statuses = list(WFStepStatus.to_advance_step_statuses) over_advanced_step_statuses = list(WFStepStatus.after_starting_uninterrupted_statuses) @@ -1622,7 +1624,7 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce # Process data specs first data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id, status_exclusion_list=list(WFDataStatus.terminated_statuses)) if data_specs: - data_status_stats = self.process_data_specs(data_specs) + data_status_stats = self.process_datas(data_specs) # Get steps step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id) if not step_specs: @@ -1670,22 +1672,26 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce else: process_result.success = True tmp_log.info(f"Done, status remains {workflow_spec.status}") + if processed_steps_stats.get(WFStepStatus.done) == len(step_specs): + # all steps are done, trigger re-check to update workflow status + self.send_workflow_message(workflow_spec.workflow_id) except Exception as e: process_result.message = f"Got error {str(e)}" tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result - def process_workflow(self, workflow_spec: WorkflowSpec) -> WorkflowProcessResult: + def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "watchdog") -> WorkflowProcessResult: """ Process a workflow based on its current status Args: workflow_spec (WorkflowSpec): The workflow specification to process + 
by (str): The entity processing the workflow Returns: WorkflowProcessResult: The result of processing the workflow """ - tmp_log = LogWrapper(logger, f"process_workflow workflow_id={workflow_spec.workflow_id}") + tmp_log = LogWrapper(logger, f"process_workflow workflow_id={workflow_spec.workflow_id} by={by}") tmp_log.debug(f"Start, current status={workflow_spec.status}") # Initialize process_result = WorkflowProcessResult() From 51837c8f678e0cb08dce943d2b20bc90f8bab1a2 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 27 Nov 2025 17:12:16 +0100 Subject: [PATCH 066/101] workflows4: break down processing methods --- .../workflow_manager_msg_processor.py | 4 +- pandaserver/workflow/workflow_core.py | 205 ++++++++++-------- 2 files changed, 121 insertions(+), 88 deletions(-) diff --git a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py index ad59ae655..92b7583fb 100644 --- a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py +++ b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py @@ -67,12 +67,12 @@ def process(self, msg_obj): elif msg_type == "wfstep": step_id = msg_dict["step_id"] step_spec = self.tbIF.get_workflow_step(step_id) - stats = self.workflow_interface.process_steps([step_spec], by="msgproc") + stats = self.workflow_interface.process_step(step_spec, by="msgproc") tmp_log.info(f"processed step_id={step_id}") elif msg_type == "wfdata": data_id = msg_dict["data_id"] data_spec = self.tbIF.get_workflow_data(data_id) - stats = self.workflow_interface.process_datas([data_spec], by="msgproc") + stats = self.workflow_interface.process_data(data_spec, by="msgproc") tmp_log.info(f"processed data_id={data_id}") except Exception as e: err_str = f"failed to run, skipped. 
{e.__class__.__name__} : {e}" diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index aa0dfb5f4..9bfc91b14 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -738,6 +738,49 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.error(f"{traceback.format_exc()}") return process_result + def process_data(self, data_spec: WFDataSpec, by: str = "watchdog") -> WFDataProcessResult | None: + """ + Process a single workflow data specification + + Args: + data_spec (WFDataSpec): The workflow data specification to process + by (str): Identifier of the entity processing the data specification + + Returns: + WFDataProcessResult | None: The result of processing the data specification, or None if skipped + """ + tmp_log = LogWrapper(logger, f"process_data workflow_id={data_spec.workflow_id} data_id={data_spec.data_id} by={by}") + tmp_log.debug("Start") + tmp_res = None + with self.workflow_data_lock(data_spec.data_id) as locked_data_spec: + if locked_data_spec is None: + tmp_log.warning(f"Failed to acquire lock for data_id={data_spec.data_id}; skipped") + return None + data_spec = locked_data_spec + orig_status = data_spec.status + # Process the data + if data_spec.status == WFDataStatus.registered: + tmp_res = self.process_data_registered(data_spec) + elif data_spec.status == WFDataStatus.checking: + tmp_res = self.process_data_checking(data_spec) + elif data_spec.status in WFDataStatus.checked_statuses: + tmp_res = self.process_data_checked(data_spec) + elif data_spec.status == WFDataStatus.binding: + # dummy result since binding data are handled in step processing + dummy_process_result = WFDataProcessResult() + dummy_process_result.success = True + tmp_res = dummy_process_result + elif data_spec.status in WFDataStatus.generating_statuses: + tmp_res = self.process_data_generating(data_spec) + elif data_spec.status in WFDataStatus.waiting_statuses: + 
tmp_res = self.process_data_waiting(data_spec) + else: + tmp_log.debug(f"Data status {data_spec.status} is not handled in this context; skipped") + # For changes into transient status, send message to trigger processing immediately + if data_spec.status != orig_status and data_spec.status in WFDataStatus.transient_statuses: + self.send_data_message(data_spec.data_id) + return tmp_res + def process_datas(self, data_specs: List[WFDataSpec], by: str = "watchdog") -> Dict: """ Process a list of workflow data specifications @@ -754,47 +797,19 @@ def process_datas(self, data_specs: List[WFDataSpec], by: str = "watchdog") -> D tmp_log.debug(f"Start, processing {n_data} data specs") data_status_stats = {"n_data": n_data, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} for data_spec in data_specs: - with self.workflow_data_lock(data_spec.data_id) as locked_data_spec: - if locked_data_spec is None: - tmp_log.warning(f"Failed to acquire lock for data_id={data_spec.data_id}; skipped") - continue - data_spec = locked_data_spec - orig_status = data_spec.status - # Process the data - tmp_res = None - status = data_spec.status - if status == WFDataStatus.registered: - tmp_res = self.process_data_registered(data_spec) - elif status == WFDataStatus.checking: - tmp_res = self.process_data_checking(data_spec) - elif status in WFDataStatus.checked_statuses: - tmp_res = self.process_data_checked(data_spec) - elif status == WFDataStatus.binding: - # dummy result since binding data are handled in step processing - dummy_process_result = WFDataProcessResult() - dummy_process_result.success = True - tmp_res = dummy_process_result - elif status in WFDataStatus.generating_statuses: - tmp_res = self.process_data_generating(data_spec) - elif status in WFDataStatus.waiting_statuses: - tmp_res = self.process_data_waiting(data_spec) + orig_status = data_spec.status + tmp_res = self.process_data(data_spec, by=by) + if tmp_res and tmp_res.success: + # update stats + if 
tmp_res.new_status and data_spec.status != orig_status: + data_status_stats["changed"].setdefault(data_spec.status, 0) + data_status_stats["changed"][data_spec.status] += 1 else: - tmp_log.debug(f"Data status {data_spec.status} is not handled in this context; skipped") - continue - if tmp_res and tmp_res.success: - # update stats - if tmp_res.new_status and data_spec.status != orig_status: - data_status_stats["changed"].setdefault(data_spec.status, 0) - data_status_stats["changed"][data_spec.status] += 1 - else: - data_status_stats["unchanged"].setdefault(data_spec.status, 0) - data_status_stats["unchanged"][data_spec.status] += 1 - data_status_stats["processed"].setdefault(data_spec.status, 0) - data_status_stats["processed"][data_spec.status] += 1 - data_status_stats["n_processed"] += 1 - # For changes into transient status, send message to trigger processing immediately - if data_spec.status != orig_status and data_spec.status in WFDataStatus.transient_statuses: - self.send_data_message(data_spec.data_id) + data_status_stats["unchanged"].setdefault(data_spec.status, 0) + data_status_stats["unchanged"][data_spec.status] += 1 + data_status_stats["processed"].setdefault(data_spec.status, 0) + data_status_stats["processed"][data_spec.status] += 1 + data_status_stats["n_processed"] += 1 tmp_log.info( f"Done, processed {data_status_stats['n_processed']}/{n_data} data specs, unchanged: {data_status_stats['unchanged']}, changed: {data_status_stats['changed']}" ) @@ -1270,6 +1285,54 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result + def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "watchdog") -> WFStepProcessResult | None: + """ + Process a single workflow step + + Args: + step_spec (WFStepSpec): The workflow step specification to process + data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data 
name to WFDataSpec for the workflow + by (str): The entity processing the step, e.g., "watchdog" or "user" + + Returns: + WFStepProcessResult | None: The result of processing the step, or None if the step was skipped + """ + tmp_log = LogWrapper(logger, f"process_step workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} by={by}") + tmp_log.debug("Start") + tmp_res = None + with self.workflow_step_lock(step_spec.step_id) as locked_step_spec: + if locked_step_spec is None: + tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped") + return None + step_spec = locked_step_spec + orig_status = step_spec.status + # Process the step + if step_spec.status == WFStepStatus.registered: + tmp_res = self.process_step_registered(step_spec) + elif step_spec.status == WFStepStatus.checking: + tmp_res = self.process_step_checking(step_spec) + elif step_spec.status in WFStepStatus.checked_statuses: + tmp_res = self.process_step_checked(step_spec) + elif step_spec.status == WFStepStatus.pending: + tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map) + elif step_spec.status == WFStepStatus.ready: + tmp_res = self.process_step_ready(step_spec) + elif step_spec.status == WFStepStatus.starting: + tmp_res = self.process_step_starting(step_spec) + elif step_spec.status == WFStepStatus.running: + tmp_res = self.process_step_running(step_spec) + elif step_spec.status in WFStepStatus.final_statuses: + # dummy result since final steps need no processing + dummy_process_result = WFStepProcessResult() + dummy_process_result.success = True + tmp_res = dummy_process_result + else: + tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") + # For changes into transient status, send message to trigger processing immediately + if step_spec.status != orig_status and step_spec.status in WFStepStatus.transient_statuses: + self.send_step_message(step_spec.step_id) + return tmp_res + def process_steps(self, 
step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "watchdog") -> Dict: """ Process a list of workflow steps @@ -1287,50 +1350,19 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W tmp_log.debug(f"Start, processing {n_steps} steps") steps_status_stats = {"n_steps": n_steps, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} for step_spec in step_specs: - with self.workflow_step_lock(step_spec.step_id) as locked_step_spec: - if locked_step_spec is None: - tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped") - continue - step_spec = locked_step_spec - orig_status = step_spec.status - # Process the step - tmp_res = None - if step_spec.status == WFStepStatus.registered: - tmp_res = self.process_step_registered(step_spec) - elif step_spec.status == WFStepStatus.checking: - tmp_res = self.process_step_checking(step_spec) - elif step_spec.status in WFStepStatus.checked_statuses: - tmp_res = self.process_step_checked(step_spec) - elif step_spec.status == WFStepStatus.pending: - tmp_res = self.process_step_pending(step_spec, data_spec_map=data_spec_map) - elif step_spec.status == WFStepStatus.ready: - tmp_res = self.process_step_ready(step_spec) - elif step_spec.status == WFStepStatus.starting: - tmp_res = self.process_step_starting(step_spec) - elif step_spec.status == WFStepStatus.running: - tmp_res = self.process_step_running(step_spec) - elif step_spec.status in WFStepStatus.final_statuses: - # dummy result since final steps need no processing - dummy_process_result = WFStepProcessResult() - dummy_process_result.success = True - tmp_res = dummy_process_result + orig_status = step_spec.status + tmp_res = self.process_step(step_spec, data_spec_map=data_spec_map, by=by) + if tmp_res and tmp_res.success: + # update stats + if tmp_res.new_status and step_spec.status != orig_status: + steps_status_stats["changed"].setdefault(step_spec.status, 0) + 
steps_status_stats["changed"][step_spec.status] += 1 else: - tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") - continue - if tmp_res and tmp_res.success: - # update stats - if tmp_res.new_status and step_spec.status != orig_status: - steps_status_stats["changed"].setdefault(step_spec.status, 0) - steps_status_stats["changed"][step_spec.status] += 1 - else: - steps_status_stats["unchanged"].setdefault(step_spec.status, 0) - steps_status_stats["unchanged"][step_spec.status] += 1 - steps_status_stats["processed"].setdefault(step_spec.status, 0) - steps_status_stats["processed"][step_spec.status] += 1 - steps_status_stats["n_processed"] += 1 - # For changes into transient status, send message to trigger processing immediately - if step_spec.status != orig_status and step_spec.status in WFStepStatus.transient_statuses: - self.send_step_message(step_spec.step_id) + steps_status_stats["unchanged"].setdefault(step_spec.status, 0) + steps_status_stats["unchanged"][step_spec.status] += 1 + steps_status_stats["processed"].setdefault(step_spec.status, 0) + steps_status_stats["processed"][step_spec.status] += 1 + steps_status_stats["n_processed"] += 1 tmp_log.info( f"Done, processed {steps_status_stats['n_processed']}/{n_steps} steps, unchanged: {steps_status_stats['unchanged']}, changed: {steps_status_stats['changed']}" ) @@ -1695,6 +1727,7 @@ def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "watchdog") -> tmp_log.debug(f"Start, current status={workflow_spec.status}") # Initialize process_result = WorkflowProcessResult() + orig_status = workflow_spec.status # Process based on status match workflow_spec.status: case WorkflowStatus.registered: @@ -1708,6 +1741,9 @@ def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "watchdog") -> case _: process_result.message = f"Workflow status {workflow_spec.status} is not handled in this context; skipped" tmp_log.warning(f"{process_result.message}") + # For changes into 
transient status, send message to trigger processing immediately + if workflow_spec.status != orig_status and workflow_spec.status in WorkflowStatus.transient_statuses: + self.send_workflow_message(workflow_spec.workflow_id) return process_result # ---- Process all workflows ------------------------------------- @@ -1752,9 +1788,6 @@ def process_active_workflows(self) -> Dict: workflows_status_stats["processed"].setdefault(workflow_spec.status, 0) workflows_status_stats["processed"][workflow_spec.status] += 1 workflows_status_stats["n_processed"] += 1 - # For changes into transient status, send message to trigger processing immediately - if workflow_spec.status != orig_status and workflow_spec.status in WorkflowStatus.transient_statuses: - self.send_workflow_message(workflow_spec.workflow_id) workflows_status_stats["n_workflows"] = n_workflows tmp_log.info( f"Done, processed {workflows_status_stats['n_processed']}/{n_workflows} workflows, unchanged: {workflows_status_stats['unchanged']}, changed: {workflows_status_stats['changed']}" From b198f3e7e6895ed6c4468f176319cb77d808d46d Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 27 Nov 2025 17:21:51 +0100 Subject: [PATCH 067/101] workflows4: fix step and data lock for messaging --- pandaserver/workflow/workflow_core.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 9bfc91b14..7da7d410b 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -776,9 +776,9 @@ def process_data(self, data_spec: WFDataSpec, by: str = "watchdog") -> WFDataPro tmp_res = self.process_data_waiting(data_spec) else: tmp_log.debug(f"Data status {data_spec.status} is not handled in this context; skipped") - # For changes into transient status, send message to trigger processing immediately - if data_spec.status != orig_status and data_spec.status in WFDataStatus.transient_statuses: - 
self.send_data_message(data_spec.data_id) + # For changes into transient status, send message to trigger processing immediately + if data_spec.status != orig_status and data_spec.status in WFDataStatus.transient_statuses: + self.send_data_message(data_spec.data_id) return tmp_res def process_datas(self, data_specs: List[WFDataSpec], by: str = "watchdog") -> Dict: @@ -1328,9 +1328,9 @@ def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpe tmp_res = dummy_process_result else: tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") - # For changes into transient status, send message to trigger processing immediately - if step_spec.status != orig_status and step_spec.status in WFStepStatus.transient_statuses: - self.send_step_message(step_spec.step_id) + # For changes into transient status, send message to trigger processing immediately + if step_spec.status != orig_status and step_spec.status in WFStepStatus.transient_statuses: + self.send_step_message(step_spec.step_id) return tmp_res def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "watchdog") -> Dict: From dfcd9b0e663f7cdf102854af602e4c044ff84200 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 27 Nov 2025 17:34:25 +0100 Subject: [PATCH 068/101] workflows4: pretty log --- pandaserver/workflow/workflow_core.py | 54 ++++++++++++++------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 7da7d410b..fb0bec24b 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -209,7 +209,7 @@ def send_workflow_message(self, workflow_id: int): Args: workflow_id (int): ID of the workflow """ - tmp_log = LogWrapper(logger, f"send_workflow_message workflow_id={workflow_id}") + tmp_log = LogWrapper(logger, f"send_workflow_message ") self._send_message(tmp_log, 
"workflow", {"workflow_id": workflow_id}) def send_step_message(self, step_id: int): @@ -219,7 +219,7 @@ def send_step_message(self, step_id: int): Args: step_id (int): ID of the workflow step """ - tmp_log = LogWrapper(logger, f"send_step_message step_id={step_id}") + tmp_log = LogWrapper(logger, f"send_step_message ") self._send_message(tmp_log, "wfstep", {"step_id": step_id}) def send_data_message(self, data_id: int): @@ -229,7 +229,7 @@ def send_data_message(self, data_id: int): Args: data_id (int): ID of the workflow data """ - tmp_log = LogWrapper(logger, f"send_data_message data_id={data_id}") + tmp_log = LogWrapper(logger, f"send_data_message ") self._send_message(tmp_log, "wfdata", {"data_id": data_id}) # --- Context managers for locking ------------------------- @@ -359,7 +359,7 @@ def register_workflow( if ret_workflow_id is None: tmp_log.error(f"Failed to register workflow") return None - tmp_log.info(f"Registered workflow workflow_id={ret_workflow_id}") + tmp_log.info(f"Registered workflow ") return ret_workflow_id def cancel_workflow(self, workflow_id: int) -> bool: ... 
@@ -381,7 +381,7 @@ def process_data_registered(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_registered workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_registered ") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -414,7 +414,7 @@ def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_checking data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_checking ") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -473,7 +473,7 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_checked workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_checked ") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -526,7 +526,7 @@ def process_data_binding(self, data_spec: WFDataSpec, step_spec: WFStepSpec) -> Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_binding workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_binding ") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -560,7 +560,7 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_generating workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_generating 
") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -661,7 +661,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_waiting workflow_id={data_spec.workflow_id} data_id={data_spec.data_id}") + tmp_log = LogWrapper(logger, f"process_data_waiting ") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -749,7 +749,7 @@ def process_data(self, data_spec: WFDataSpec, by: str = "watchdog") -> WFDataPro Returns: WFDataProcessResult | None: The result of processing the data specification, or None if skipped """ - tmp_log = LogWrapper(logger, f"process_data workflow_id={data_spec.workflow_id} data_id={data_spec.data_id} by={by}") + tmp_log = LogWrapper(logger, f"process_data ") tmp_log.debug("Start") tmp_res = None with self.workflow_data_lock(data_spec.data_id) as locked_data_spec: @@ -792,7 +792,7 @@ def process_datas(self, data_specs: List[WFDataSpec], by: str = "watchdog") -> D Returns: Dict: Statistics of the processing results """ - tmp_log = LogWrapper(logger, f"process_datas workflow_id={data_specs[0].workflow_id} by={by}") + tmp_log = LogWrapper(logger, f"process_datas ") n_data = len(data_specs) tmp_log.debug(f"Start, processing {n_data} data specs") data_status_stats = {"n_data": n_data, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} @@ -862,7 +862,9 @@ def process_step_registered(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_registered workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") + tmp_log = LogWrapper( + logger, f"process_step_registered " + ) tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -894,7 +896,7 @@ def process_step_checking(self, 
step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_checking workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") + tmp_log = LogWrapper(logger, f"process_step_checking ") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -965,7 +967,7 @@ def process_step_checked(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_checked workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") + tmp_log = LogWrapper(logger, f"process_step_checked ") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1009,7 +1011,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_pending workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") + tmp_log = LogWrapper(logger, f"process_step_pending ") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1110,7 +1112,7 @@ def process_step_ready(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_ready workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") + tmp_log = LogWrapper(logger, f"process_step_ready ") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1152,7 +1154,7 @@ def process_step_starting(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_starting 
workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") + tmp_log = LogWrapper(logger, f"process_step_starting ") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1224,7 +1226,7 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_running workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} member_id={step_spec.member_id}") + tmp_log = LogWrapper(logger, f"process_step_running ") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1297,7 +1299,7 @@ def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpe Returns: WFStepProcessResult | None: The result of processing the step, or None if the step was skipped """ - tmp_log = LogWrapper(logger, f"process_step workflow_id={step_spec.workflow_id} step_id={step_spec.step_id} by={by}") + tmp_log = LogWrapper(logger, f"process_step ") tmp_log.debug("Start") tmp_res = None with self.workflow_step_lock(step_spec.step_id) as locked_step_spec: @@ -1345,7 +1347,7 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W Returns: Dict: Statistics of the processing results """ - tmp_log = LogWrapper(logger, f"process_steps workflow_id={step_specs[0].workflow_id} by={by}") + tmp_log = LogWrapper(logger, f"process_steps ") n_steps = len(step_specs) tmp_log.debug(f"Start, processing {n_steps} steps") steps_status_stats = {"n_steps": n_steps, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} @@ -1381,7 +1383,7 @@ def process_workflow_registered(self, workflow_spec: WorkflowSpec) -> WorkflowPr Returns: WorkflowProcessResult: The result of processing the workflow """ - tmp_log = LogWrapper(logger, f"process_workflow_registered workflow_id={workflow_spec.workflow_id}") + tmp_log = LogWrapper(logger, 
f"process_workflow_registered ") tmp_log.debug("Start") # Initialize process_result = WorkflowProcessResult() @@ -1449,7 +1451,7 @@ def process_workflow_checked(self, workflow_spec: WorkflowSpec) -> WorkflowProce Returns: WorkflowProcessResult: The result of processing the workflow """ - tmp_log = LogWrapper(logger, f"process_workflow_checked workflow_id={workflow_spec.workflow_id}") + tmp_log = LogWrapper(logger, f"process_workflow_checked ") tmp_log.debug("Start") # Initialize process_result = WorkflowProcessResult() @@ -1575,7 +1577,7 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc Returns: WorkflowProcessResult: The result of processing the workflow """ - tmp_log = LogWrapper(logger, f"process_workflow_starting workflow_id={workflow_spec.workflow_id}") + tmp_log = LogWrapper(logger, f"process_workflow_starting ") tmp_log.debug("Start") # Initialize process_result = WorkflowProcessResult() @@ -1642,7 +1644,7 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce Returns: WorkflowProcessResult: The result of processing the workflow """ - tmp_log = LogWrapper(logger, f"process_workflow_running workflow_id={workflow_spec.workflow_id}") + tmp_log = LogWrapper(logger, f"process_workflow_running ") tmp_log.debug("Start") # Initialize process_result = WorkflowProcessResult() @@ -1723,7 +1725,7 @@ def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "watchdog") -> Returns: WorkflowProcessResult: The result of processing the workflow """ - tmp_log = LogWrapper(logger, f"process_workflow workflow_id={workflow_spec.workflow_id} by={by}") + tmp_log = LogWrapper(logger, f"process_workflow ") tmp_log.debug(f"Start, current status={workflow_spec.status}") # Initialize process_result = WorkflowProcessResult() From 256eeb1cff6fb72a7d101236f253a23b2ae485de Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 27 Nov 2025 17:37:01 +0100 Subject: [PATCH 069/101] pretty --- 
.../workflow_manager_msg_processor.py | 6 +++--- pandaserver/workflow/workflow_core.py | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py index 92b7583fb..90f051385 100644 --- a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py +++ b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py @@ -62,17 +62,17 @@ def process(self, msg_obj): if msg_type == "workflow": workflow_id = msg_dict["workflow_id"] workflow_spec = self.tbIF.get_workflow(workflow_id) - stats = self.workflow_interface.process_workflow(workflow_spec, by="msgproc") + stats = self.workflow_interface.process_workflow(workflow_spec, by="msg") tmp_log.info(f"processed workflow_id={workflow_id}") elif msg_type == "wfstep": step_id = msg_dict["step_id"] step_spec = self.tbIF.get_workflow_step(step_id) - stats = self.workflow_interface.process_step(step_spec, by="msgproc") + stats = self.workflow_interface.process_step(step_spec, by="msg") tmp_log.info(f"processed step_id={step_id}") elif msg_type == "wfdata": data_id = msg_dict["data_id"] data_spec = self.tbIF.get_workflow_data(data_id) - stats = self.workflow_interface.process_data(data_spec, by="msgproc") + stats = self.workflow_interface.process_data(data_spec, by="msg") tmp_log.info(f"processed data_id={data_id}") except Exception as e: err_str = f"failed to run, skipped. 
{e.__class__.__name__} : {e}" diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index fb0bec24b..e9ea070b8 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -738,7 +738,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.error(f"{traceback.format_exc()}") return process_result - def process_data(self, data_spec: WFDataSpec, by: str = "watchdog") -> WFDataProcessResult | None: + def process_data(self, data_spec: WFDataSpec, by: str = "dog") -> WFDataProcessResult | None: """ Process a single workflow data specification @@ -781,7 +781,7 @@ def process_data(self, data_spec: WFDataSpec, by: str = "watchdog") -> WFDataPro self.send_data_message(data_spec.data_id) return tmp_res - def process_datas(self, data_specs: List[WFDataSpec], by: str = "watchdog") -> Dict: + def process_datas(self, data_specs: List[WFDataSpec], by: str = "dog") -> Dict: """ Process a list of workflow data specifications @@ -1287,14 +1287,14 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result - def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "watchdog") -> WFStepProcessResult | None: + def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "dog") -> WFStepProcessResult | None: """ Process a single workflow step Args: step_spec (WFStepSpec): The workflow step specification to process data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data name to WFDataSpec for the workflow - by (str): The entity processing the step, e.g., "watchdog" or "user" + by (str): The entity processing the step, e.g., "dog" or "user" Returns: WFStepProcessResult | None: The result of processing the step, or None if the step was skipped @@ -1335,14 +1335,14 @@ def 
process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpe self.send_step_message(step_spec.step_id) return tmp_res - def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "watchdog") -> Dict: + def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "dog") -> Dict: """ Process a list of workflow steps Args: step_specs (List[WFStepSpec]): List of workflow step specifications to process data_spec_map (Dict[str, WFDataSpec] | None): Optional map of data name to WFDataSpec for the workflow - by (str): The entity processing the steps, e.g., "watchdog" or "user" + by (str): The entity processing the steps, e.g., "dog" or "user" Returns: Dict: Statistics of the processing results @@ -1714,7 +1714,7 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result - def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "watchdog") -> WorkflowProcessResult: + def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "dog") -> WorkflowProcessResult: """ Process a workflow based on its current status From 0165649d916d3b92931ea2bb772365a455b19e01 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 28 Nov 2025 06:05:44 +0100 Subject: [PATCH 070/101] workflows4: log --- pandaserver/workflow/workflow_core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index e9ea070b8..366fd3644 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -770,6 +770,7 @@ def process_data(self, data_spec: WFDataSpec, by: str = "dog") -> WFDataProcessR dummy_process_result = WFDataProcessResult() dummy_process_result.success = True tmp_res = dummy_process_result + tmp_log.debug(f"Data status {data_spec.status} ; wait for step 
processing") elif data_spec.status in WFDataStatus.generating_statuses: tmp_res = self.process_data_generating(data_spec) elif data_spec.status in WFDataStatus.waiting_statuses: @@ -1328,6 +1329,7 @@ def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpe dummy_process_result = WFStepProcessResult() dummy_process_result.success = True tmp_res = dummy_process_result + tmp_log.debug(f"Step in final status {step_spec.status} ; skipped") else: tmp_log.debug(f"Step status {step_spec.status} is not handled in this context; skipped") # For changes into transient status, send message to trigger processing immediately From b2034d50d9ee4ae361df28ad486f205623c85b1e Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 19 Jan 2026 16:09:40 +0100 Subject: [PATCH 071/101] workflows4: improve step status map for pending task --- pandaserver/taskbuffer/TaskBuffer.py | 8 +++++ .../db_proxy_mods/misc_standalone_module.py | 32 +++++++++++++++++++ .../panda_task_step_handler.py | 11 +++++-- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 1e27c4bfd..6b593917f 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2006,6 +2006,14 @@ def getTaskStatus(self, jediTaskID): res = proxy.getTaskStatus(jediTaskID) return res + # get task status and oldstatus + def getTaskStatusOldstatus(self, jediTaskID): + # get DB proxy + with self.proxyPool.get() as proxy: + # exec + res = proxy.getTaskStatusOldstatus(jediTaskID) + return res + # reactivate task def reactivateTask(self, jediTaskID, keep_attempt_nr=False, trigger_job_generation=False): # get DB proxy diff --git a/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py b/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py index c64e08c4d..e670f13dd 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py +++ 
b/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py @@ -536,6 +536,38 @@ def getTaskStatus(self, jediTaskID): # error self.dump_error_message(tmp_log) return [] + + # get task status and oldstatus + def getTaskStatusOldstatus(self, jediTaskID): + comment = " /* DBProxy.getTaskStatusOldstatus */" + tmp_log = self.create_tagged_logger(comment, f"jediTaskID={jediTaskID}") + tmp_log.debug("start") + try: + # sql to update input file status + varMap = {} + varMap[":jediTaskID"] = jediTaskID + sql = f"SELECT status,oldStatus FROM {panda_config.schemaJEDI}.JEDI_Tasks " + sql += "WHERE jediTaskID=:jediTaskID " + # start transaction + self.conn.begin() + self.cur.arraysize = 1000 + self.cur.execute(sql + comment, varMap) + res = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError("Commit error") + if res: + tmp_log.debug(f"task {jediTaskID} has status={res[0]} oldstatus={res[1]}") + else: + res = [] + tmp_log.debug(f"task {jediTaskID} not found") + return res + except Exception: + # roll back + self._rollback() + # error + self.dump_error_message(tmp_log) + return [] # reactivate task def reactivateTask(self, jediTaskID, keep_attempt_nr=False, trigger_job_generation=False): diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index e2340679c..fdbc4d51b 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -165,19 +165,26 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu return check_result # Get task ID and status task_id = int(step_spec.target_id) - res = self.tbif.getTaskStatus(task_id) + res = self.tbif.getTaskStatusOldstatus(task_id) if not res: check_result.message = f"task_id={task_id} not found" tmp_log.error(f"{check_result.message}") return check_result # Interpret status task_status = res[0] 
+ task_oldstatus = res[1] check_result.success = True check_result.native_status = task_status if task_status in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: check_result.step_status = WFStepStatus.running - elif task_status in ["defined", "assigned", "activated", "starting", "ready", "pending"]: + elif task_status in ["defined", "assigned", "activated", "starting", "ready"]: check_result.step_status = WFStepStatus.starting + elif task_status in ["pending"]: + # Check oldstatus for pending to distinguish between starting and running + if task_oldstatus in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: + check_result.step_status = WFStepStatus.running + else: + check_result.step_status = WFStepStatus.starting elif task_status in ["done", "finished"]: check_result.step_status = WFStepStatus.done elif task_status in ["failed", "exhausted", "aborted", "toabort", "aborting", "broken", "tobroken"]: From 8a1be8879101a3c03aa780f85a13fccd51dc9815 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 19 Jan 2026 16:29:38 +0100 Subject: [PATCH 072/101] test --- .../workflow/step_handler_plugins/panda_task_step_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index fdbc4d51b..140c35452 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -180,7 +180,7 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu elif task_status in ["defined", "assigned", "activated", "starting", "ready"]: check_result.step_status = WFStepStatus.starting elif task_status in ["pending"]: - # Check oldstatus for pending to distinguish between starting and running + # Check oldstatus for repetitive status (e.g. 
pending) to distinguish between starting and running if task_oldstatus in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: check_result.step_status = WFStepStatus.running else: From 2b0d14e390a4d1b8893ed961d82c858c83c89c5f Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 19 Jan 2026 16:37:50 +0100 Subject: [PATCH 073/101] test --- .../workflow/step_handler_plugins/panda_task_step_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 140c35452..e2ea8ae3a 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -180,7 +180,7 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu elif task_status in ["defined", "assigned", "activated", "starting", "ready"]: check_result.step_status = WFStepStatus.starting elif task_status in ["pending"]: - # Check oldstatus for repetitive status (e.g. pending) to distinguish between starting and running + # Check oldstatus for repetitive statuses (e.g. 
pending) to distinguish between starting and running if task_oldstatus in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: check_result.step_status = WFStepStatus.running else: From 17248f61793be444621ffd5c60b07d5418f067d0 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 19 Jan 2026 16:49:26 +0100 Subject: [PATCH 074/101] test --- .../workflow/step_handler_plugins/panda_task_step_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index e2ea8ae3a..140c35452 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -180,7 +180,7 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu elif task_status in ["defined", "assigned", "activated", "starting", "ready"]: check_result.step_status = WFStepStatus.starting elif task_status in ["pending"]: - # Check oldstatus for repetitive statuses (e.g. pending) to distinguish between starting and running + # Check oldstatus for repetitive status (e.g. 
pending) to distinguish between starting and running if task_oldstatus in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: check_result.step_status = WFStepStatus.running else: From d8aab0863b103609beb12b7f09d450ad21af7420 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 27 Jan 2026 17:51:00 +0100 Subject: [PATCH 075/101] workflows4: improve step status mapping with task superstatus --- pandaserver/taskbuffer/TaskBuffer.py | 8 ++++---- .../db_proxy_mods/misc_standalone_module.py | 12 ++++++------ .../step_handler_plugins/panda_task_step_handler.py | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index 6b593917f..2a0fed3db 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -2006,14 +2006,14 @@ def getTaskStatus(self, jediTaskID): res = proxy.getTaskStatus(jediTaskID) return res - # get task status and oldstatus - def getTaskStatusOldstatus(self, jediTaskID): + # get task status and superstatus + def getTaskStatusSuperstatus(self, jediTaskID): # get DB proxy with self.proxyPool.get() as proxy: # exec - res = proxy.getTaskStatusOldstatus(jediTaskID) + res = proxy.getTaskStatusSuperstatus(jediTaskID) return res - + # reactivate task def reactivateTask(self, jediTaskID, keep_attempt_nr=False, trigger_job_generation=False): # get DB proxy diff --git a/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py b/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py index e670f13dd..8821cf075 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/misc_standalone_module.py @@ -536,17 +536,17 @@ def getTaskStatus(self, jediTaskID): # error self.dump_error_message(tmp_log) return [] - - # get task status and oldstatus - def getTaskStatusOldstatus(self, jediTaskID): - comment = " /* DBProxy.getTaskStatusOldstatus */" + 
+ # get task status and superstatus + def getTaskStatusSuperstatus(self, jediTaskID): + comment = " /* DBProxy.getTaskStatusSuperstatus */" tmp_log = self.create_tagged_logger(comment, f"jediTaskID={jediTaskID}") tmp_log.debug("start") try: # sql to update input file status varMap = {} varMap[":jediTaskID"] = jediTaskID - sql = f"SELECT status,oldStatus FROM {panda_config.schemaJEDI}.JEDI_Tasks " + sql = f"SELECT status,superStatus FROM {panda_config.schemaJEDI}.JEDI_Tasks " sql += "WHERE jediTaskID=:jediTaskID " # start transaction self.conn.begin() @@ -557,7 +557,7 @@ def getTaskStatusOldstatus(self, jediTaskID): if not self._commit(): raise RuntimeError("Commit error") if res: - tmp_log.debug(f"task {jediTaskID} has status={res[0]} oldstatus={res[1]}") + tmp_log.debug(f"task {jediTaskID} has status={res[0]} superstatus={res[1]}") else: res = [] tmp_log.debug(f"task {jediTaskID} not found") diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 140c35452..5e257fcd0 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -165,14 +165,14 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCheckResu return check_result # Get task ID and status task_id = int(step_spec.target_id) - res = self.tbif.getTaskStatusOldstatus(task_id) + res = self.tbif.getTaskStatusSuperstatus(task_id) if not res: check_result.message = f"task_id={task_id} not found" tmp_log.error(f"{check_result.message}") return check_result # Interpret status task_status = res[0] - task_oldstatus = res[1] + task_superstatus = res[1] check_result.success = True check_result.native_status = task_status if task_status in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: @@ -180,8 +180,8 @@ def check_target(self, step_spec: WFStepSpec, **kwargs) -> 
WFStepTargetCheckResu elif task_status in ["defined", "assigned", "activated", "starting", "ready"]: check_result.step_status = WFStepStatus.starting elif task_status in ["pending"]: - # Check oldstatus for repetitive status (e.g. pending) to distinguish between starting and running - if task_oldstatus in ["running", "scouting", "scouted", "throttled", "prepared", "finishing", "passed"]: + # Check superstatus for repetitive status (e.g. pending) to distinguish between starting and running + if task_superstatus in ["running"]: check_result.step_status = WFStepStatus.running else: check_result.step_status = WFStepStatus.starting From 5f2ecbb85f02fd616b3e99036f51fb2ccfa9b4ce Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 12:24:24 +0100 Subject: [PATCH 076/101] workflows4: fixes by copilot --- .../workflow_manager_msg_processor.py | 11 +++++++++-- pandaserver/workflow/workflow_core.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py index 90f051385..3dde094c2 100644 --- a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py +++ b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py @@ -2,7 +2,6 @@ from pandacommon.pandalogger import logger_utils -from pandajedi.jediddm.DDMInterface import DDMInterface from pandajedi.jedimsgprocessor.base_msg_processor import BaseMsgProcPlugin from pandaserver.workflow.workflow_core import WorkflowInterface @@ -20,7 +19,6 @@ def initialize(self): Initialize the plugin """ BaseMsgProcPlugin.initialize(self) - the_pid = self.get_pid() self.workflow_interface = WorkflowInterface(self.tbIF) def process(self, msg_obj): @@ -62,16 +60,25 @@ def process(self, msg_obj): if msg_type == "workflow": workflow_id = msg_dict["workflow_id"] workflow_spec = self.tbIF.get_workflow(workflow_id) + if workflow_spec is None: + tmp_log.warning(f"workflow_id={workflow_id} not found; 
skipped") + return stats = self.workflow_interface.process_workflow(workflow_spec, by="msg") tmp_log.info(f"processed workflow_id={workflow_id}") elif msg_type == "wfstep": step_id = msg_dict["step_id"] step_spec = self.tbIF.get_workflow_step(step_id) + if step_spec is None: + tmp_log.warning(f"step_id={step_id} not found; skipped") + return stats = self.workflow_interface.process_step(step_spec, by="msg") tmp_log.info(f"processed step_id={step_id}") elif msg_type == "wfdata": data_id = msg_dict["data_id"] data_spec = self.tbIF.get_workflow_data(data_id) + if data_spec is None: + tmp_log.warning(f"data_id={data_id} not found; skipped") + return stats = self.workflow_interface.process_data(data_spec, by="msg") tmp_log.info(f"processed data_id={data_id}") except Exception as e: diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 366fd3644..f71b009c1 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -1700,7 +1700,7 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce # mark workflow as failed tmp_log.warning(f"workflow failed due to some steps failed or cancelled") workflow_spec.status = WorkflowStatus.failed - workflow_spec.start_time = naive_utcnow() + workflow_spec.end_time = naive_utcnow() self.tbif.update_workflow(workflow_spec) process_result.success = True process_result.new_status = workflow_spec.status From b6b59b1fd26d6931df811a1d31c05d5f08f5637f Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 13:45:04 +0100 Subject: [PATCH 077/101] Update pandaserver/workflow/workflow_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pandaserver/workflow/workflow_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandaserver/workflow/workflow_utils.py b/pandaserver/workflow/workflow_utils.py index 765af1b5b..b9b3e19b8 100644 --- 
a/pandaserver/workflow/workflow_utils.py +++ b/pandaserver/workflow/workflow_utils.py @@ -672,7 +672,8 @@ def resolve_nodes(node_list, root_inputs, data, serial_id, parent_ids, out_ds_na if isinstance(tmp_data["source"], list): tmp_sources = tmp_data["source"] if "parent_id" in tmp_data: - tmp_parent_ids = tmp_data["parent_id"] + # Make a copy to avoid mutating the original list stored in node.inputs + tmp_parent_ids = list(tmp_data["parent_id"]) tmp_parent_ids += [None] * (len(tmp_sources) - len(tmp_parent_ids)) else: tmp_parent_ids = [None] * len(tmp_sources) From bb507731c27dfb3cf997cbb23e100779b5f21318 Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 13:50:59 +0100 Subject: [PATCH 078/101] Update pandaserver/workflow/workflow_parser.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pandaserver/workflow/workflow_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaserver/workflow/workflow_parser.py b/pandaserver/workflow/workflow_parser.py index 1452f9c83..1dfed4713 100644 --- a/pandaserver/workflow/workflow_parser.py +++ b/pandaserver/workflow/workflow_parser.py @@ -77,7 +77,7 @@ def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tu os.chdir(tmp_dirname) # download sandbox tmp_log.info(f"downloading sandbox from {sandbox_url}") - with requests.get(sandbox_url, allow_redirects=True, verify=False, stream=True) as r: + with requests.get(sandbox_url, allow_redirects=True, stream=True) as r: if r.status_code == 400: tmp_log.error("not found") is_fatal = True From 1bae97ae0013163c57fb505967ffb9329209c3d2 Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 13:52:29 +0100 Subject: [PATCH 079/101] Update pandaserver/workflow/workflow_parser.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pandaserver/workflow/workflow_parser.py | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/pandaserver/workflow/workflow_parser.py b/pandaserver/workflow/workflow_parser.py index 1dfed4713..2d0d6c618 100644 --- a/pandaserver/workflow/workflow_parser.py +++ b/pandaserver/workflow/workflow_parser.py @@ -131,7 +131,7 @@ def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tu is_fatal = True is_ok = False else: - dump_str = "{} is not supported to describe the workflow" + dump_str = f"{wf_lang} is not supported to describe the workflow" tmp_log.error(dump_str) is_fatal = True is_ok = False From 31bd56b44f4b0613506e7092b602fd3a114cefb4 Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:03:42 +0100 Subject: [PATCH 080/101] typo in pandaserver/taskbuffer/db_proxy_mods/workflow_module.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pandaserver/taskbuffer/db_proxy_mods/workflow_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py index 409474f9b..31f06238a 100644 --- a/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py +++ b/pandaserver/taskbuffer/db_proxy_mods/workflow_module.py @@ -737,11 +737,11 @@ def upsert_workflow_entities( "data": action_of_data, } # log - tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_spec.workflow_id}") + tmp_log = self.create_tagged_logger(comment, f"workflow_id={workflow_id}") tmp_log.debug(f"start, actions={actions_dict}") # skip if no action specified if not any(actions_dict.values()): - self.log.warning("no action specified; skipped") + tmp_log.warning("no action specified; skipped") return None try: n_steps_upserted = 0 From 209c3939643b4b0d3a9f943c22a19f61cd57bcc8 Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:09:03 +0100 Subject: [PATCH 081/101] 
typo pandaserver/workflow/workflow_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pandaserver/workflow/workflow_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaserver/workflow/workflow_utils.py b/pandaserver/workflow/workflow_utils.py index b9b3e19b8..3ba9e8ba1 100644 --- a/pandaserver/workflow/workflow_utils.py +++ b/pandaserver/workflow/workflow_utils.py @@ -650,7 +650,7 @@ def convert_params_in_condition_to_parent_ids(condition_item, input_data, id_map setattr(condition_item, item, id_map[tmp_data["parent_id"]]) break if not isOK: - raise ReferenceError(f"unresolved paramter {param} in the condition string") + raise ReferenceError(f"unresolved parameter {param} in the condition string") elif isinstance(param, ConditionItem): convert_params_in_condition_to_parent_ids(param, input_data, id_map) From 8e8a66a3be309bbce9eda807a405a78cc4863cd6 Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:12:18 +0100 Subject: [PATCH 082/101] Update pandaserver/workflow/workflow_parser.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pandaserver/workflow/workflow_parser.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pandaserver/workflow/workflow_parser.py b/pandaserver/workflow/workflow_parser.py index 2d0d6c618..145738049 100644 --- a/pandaserver/workflow/workflow_parser.py +++ b/pandaserver/workflow/workflow_parser.py @@ -85,17 +85,35 @@ def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tu elif r.status_code != 200: tmp_log.error(f"bad HTTP response {r.status_code}") is_ok = False + # validate sandbox filename + sandbox_name = raw_request_dict.get("sandbox") + if is_ok: + if not isinstance(sandbox_name, str): + tmp_log.error("sandbox filename is missing or not a string") + is_fatal = True + is_ok = False + else: + # sandbox filename must not contain any 
path separators + seps = [os.path.sep] + if os.path.altsep: + seps.append(os.path.altsep) + if any(sep in sandbox_name for sep in seps): + tmp_log.error("sandbox filename must not contain path separators") + is_fatal = True + is_ok = False + else: + sandbox_name = os.path.basename(sandbox_name) # extract sandbox if is_ok: - with open(raw_request_dict["sandbox"], "wb") as fs: + with open(sandbox_name, "wb") as fs: for chunk in r.raw.stream(1024, decode_content=False): if chunk: fs.write(chunk) fs.close() - tmp_stat, tmp_out = commands_get_status_output(f"tar xvfz {raw_request_dict['sandbox']}") + tmp_stat, tmp_out = commands_get_status_output(f"tar xvfz {sandbox_name}") if tmp_stat != 0: tmp_log.error(tmp_out) - dump_str = f"failed to extract {raw_request_dict['sandbox']}" + dump_str = f"failed to extract {sandbox_name}" tmp_log.error(dump_str) is_fatal = True is_ok = False From 6cc92f639a02dcda3fefa024865b8148dbfbce99 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 14:21:41 +0100 Subject: [PATCH 083/101] workflows4: fix parser to guarantee chdir to original dir --- pandaserver/workflow/workflow_parser.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandaserver/workflow/workflow_parser.py b/pandaserver/workflow/workflow_parser.py index 145738049..631dfd867 100644 --- a/pandaserver/workflow/workflow_parser.py +++ b/pandaserver/workflow/workflow_parser.py @@ -70,9 +70,9 @@ def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tu is_fatal = False # request_id = None workflow_definition_dict = dict() + cur_dir = os.getcwd() try: # go to temp dir - cur_dir = os.getcwd() with tempfile.TemporaryDirectory() as tmp_dirname: os.chdir(tmp_dirname) # download sandbox @@ -179,11 +179,15 @@ def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tu "root_outputs": root_outputs_dict, "nodes": nodes_list, } - os.chdir(cur_dir) except Exception as e: is_ok = False is_fatal = True 
tmp_log.error(f"failed to run with {str(e)} {traceback.format_exc()}") + finally: + try: + os.chdir(cur_dir) + except Exception as e: + tmp_log.error(f"failed to restore working directory to {cur_dir}: {traceback.format_exc()}") # with tempfile.NamedTemporaryFile(delete=False, mode="w") as tmp_json: # json.dump([is_ok, is_fatal, request_id, tmp_log.dumpToString()], tmp_json) From f97ac894bb86ea51751a7ca205de02066782180d Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 14:33:49 +0100 Subject: [PATCH 084/101] workflows4: use tarfile instead of tar command --- pandaserver/workflow/workflow_parser.py | 39 +++++++++++++++++++++---- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/pandaserver/workflow/workflow_parser.py b/pandaserver/workflow/workflow_parser.py index 631dfd867..c828d0136 100644 --- a/pandaserver/workflow/workflow_parser.py +++ b/pandaserver/workflow/workflow_parser.py @@ -4,6 +4,7 @@ import re import shlex import sys +import tarfile import tempfile import traceback @@ -16,7 +17,7 @@ from pandacommon.pandalogger.PandaLogger import PandaLogger from ruamel.yaml import YAML -from pandaserver.srvcore.CoreUtils import clean_user_id, commands_get_status_output +# from pandaserver.srvcore.CoreUtils import clean_user_id from pandaserver.workflow import pcwl_utils, workflow_utils from pandaserver.workflow.snakeparser import Parser @@ -71,6 +72,34 @@ def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tu # request_id = None workflow_definition_dict = dict() cur_dir = os.getcwd() + + def _is_within_directory(base_dir: str, target_path: str) -> bool: + abs_base_dir = os.path.abspath(base_dir) + abs_target_path = os.path.abspath(target_path) + return os.path.commonpath([abs_base_dir, abs_target_path]) == abs_base_dir + + def _safe_extract_tar_gz(tar_path: str, extract_dir: str): + with tarfile.open(tar_path, mode="r:gz") as tar: + members = tar.getmembers() + for member in members: + member_name = member.name + 
normalized_name = os.path.normpath(member_name) + # security checks for tar member name + if os.path.isabs(member_name): + raise ValueError(f"absolute path in tar member is not allowed: {member_name}") + if normalized_name in ("", ".", "..") or normalized_name.startswith(".." + os.path.sep): + raise ValueError(f"path traversal in tar member is not allowed: {member_name}") + if member.issym() or member.islnk(): + raise ValueError(f"links in tar archive are not allowed: {member_name}") + if member.ischr() or member.isblk() or member.isfifo(): + raise ValueError(f"special file in tar archive is not allowed: {member_name}") + # check that the extraction target is within the extract_dir + extraction_target = os.path.join(extract_dir, normalized_name) + if not _is_within_directory(extract_dir, extraction_target): + raise ValueError(f"tar member extracts outside target directory: {member_name}") + # all checks passed, safe to extract + tar.extractall(path=extract_dir, members=members) + try: # go to temp dir with tempfile.TemporaryDirectory() as tmp_dirname: @@ -110,10 +139,10 @@ def parse_raw_request(sandbox_url, log_token, user_name, raw_request_dict) -> tu if chunk: fs.write(chunk) fs.close() - tmp_stat, tmp_out = commands_get_status_output(f"tar xvfz {sandbox_name}") - if tmp_stat != 0: - tmp_log.error(tmp_out) - dump_str = f"failed to extract {sandbox_name}" + try: + _safe_extract_tar_gz(sandbox_name, tmp_dirname) + except Exception as e: + dump_str = f"failed to extract {sandbox_name}: {traceback.format_exc()}" tmp_log.error(dump_str) is_fatal = True is_ok = False From b9eb18174f584ef41fc3139d22087a1e37a1227a Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 14:44:06 +0100 Subject: [PATCH 085/101] workflows4: fix utils according to copilot --- pandaserver/workflow/workflow_utils.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/pandaserver/workflow/workflow_utils.py b/pandaserver/workflow/workflow_utils.py 
index 3ba9e8ba1..7f0d8c04e 100644 --- a/pandaserver/workflow/workflow_utils.py +++ b/pandaserver/workflow/workflow_utils.py @@ -643,10 +643,22 @@ def convert_params_in_condition_to_parent_ids(condition_item, input_data, id_map isOK = True if isinstance(tmp_data["parent_id"], list): if idx is not None: - setattr(condition_item, item, id_map[tmp_data["parent_id"][idx]]) + if idx < 0 or idx >= len(tmp_data["parent_id"]): + raise IndexError(f"index {idx} is out of bounds for parameter {param} with {len(tmp_data['parent_id'])} parents") + parent_id = tmp_data["parent_id"][idx] + if parent_id not in id_map: + raise ReferenceError(f"unresolved parent_id {parent_id} for parameter {param}[{idx}]") + setattr(condition_item, item, id_map[parent_id]) else: - setattr(condition_item, item, id_map[tmp_data["parent_id"]]) + resolved_parent_ids = set() + for parent_id in tmp_data["parent_id"]: + if parent_id not in id_map: + raise ReferenceError(f"unresolved parent_id {parent_id} for parameter {param}") + resolved_parent_ids |= id_map[parent_id] + setattr(condition_item, item, list(resolved_parent_ids)) else: + if tmp_data["parent_id"] not in id_map: + raise ReferenceError(f"unresolved parent_id {tmp_data['parent_id']} for parameter {param}") setattr(condition_item, item, id_map[tmp_data["parent_id"]]) break if not isOK: @@ -778,10 +790,9 @@ def resolve_nodes(node_list, root_inputs, data, serial_id, parent_ids, out_ds_na tail_nodes = [] for node in all_nodes: if node.is_tail: - if node.is_tail: - tail_nodes.append(node) - else: - tail_nodes += resolved_map[node.id] + tail_nodes.append(node) + else: + tail_nodes += resolved_map[node.id] return serial_id, tail_nodes, all_nodes From d2433daa81d0ff2cac4577ca7ad33f1c85cb2b30 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 14:50:55 +0100 Subject: [PATCH 086/101] workflows4: improve test script --- ...unctions.py => workflow_core_smoketest.py} | 72 +++++++++++-------- 1 file changed, 42 insertions(+), 30 deletions(-) rename 
pandaserver/workflow/{test_workflow_core_functions.py => workflow_core_smoketest.py} (88%) diff --git a/pandaserver/workflow/test_workflow_core_functions.py b/pandaserver/workflow/workflow_core_smoketest.py similarity index 88% rename from pandaserver/workflow/test_workflow_core_functions.py rename to pandaserver/workflow/workflow_core_smoketest.py index 496f711be..a8d95746b 100644 --- a/pandaserver/workflow/test_workflow_core_functions.py +++ b/pandaserver/workflow/workflow_core_smoketest.py @@ -1,3 +1,4 @@ +import argparse import json import sys @@ -17,7 +18,12 @@ username = "testuser" workflow_name = "test_workflow_bg_comb_00" -WFID = sys.argv[1] # workflow ID to be used in this test + +def parse_args(): + parser = argparse.ArgumentParser(description="Workflow core smoke test helper") + parser.add_argument("workflow_id", nargs="?", default=None, help="Workflow ID to use in commented smoke test calls") + return parser.parse_args() + # workflow definition json # wfd_json = json.dumps( @@ -330,40 +336,46 @@ # ) -# interface for workflow operations -requester_id = GenericThread().get_full_id(__name__, sys.modules[__name__].__file__) -taskBuffer.init( - panda_config.dbhost, - panda_config.dbpasswd, - nDBConnection=panda_config.nDBConnection, - useTimeout=True, - requester=requester_id, -) +def main(): + args = parse_args() + WFID = args.workflow_id -wfif = WorkflowInterface(taskBuffer) + # interface for workflow operations + requester_id = GenericThread().get_full_id(__name__, sys.modules[__name__].__file__) + taskBuffer.init( + panda_config.dbhost, + panda_config.dbpasswd, + nDBConnection=panda_config.nDBConnection, + useTimeout=True, + requester=requester_id, + ) + wfif = WorkflowInterface(taskBuffer) -# Test cases for workflow core + # Test cases for workflow core -# Register the workflow -# print("Registering workflow...") -# wf_spec = wfif.register_workflow( -# prodsourcelabel=prodsourcelabel, -# username=username, -# workflow_name=workflow_name, -# 
workflow_definition_json=wfd_json, -# ) + # Register the workflow + # print("Registering workflow...") + # wf_spec = wfif.register_workflow( + # prodsourcelabel=prodsourcelabel, + # username=username, + # workflow_name=workflow_name, + # workflow_definition_json=wfd_json, + # ) + + # Process the registered workflow + # wf_spec = taskBuffer.get_workflow(workflow_id=WFID) + # print("Processing registered workflow...") + # wfif.process_workflow_registered(wf_spec) + # wf_spec = taskBuffer.get_workflow(workflow_id=WFID) + # print("Processing checked workflow...") + # wfif.process_workflow_checked(wf_spec) -# Process the registered workflow -# wf_spec = taskBuffer.get_workflow(workflow_id=WFID) -# print("Processing registered workflow...") -# wfif.process_workflow_registered(wf_spec) + # wf_spec = taskBuffer.get_workflow(workflow_id=WFID) + # print("Processing starting workflow...") + # wfif.process_workflow_starting(wf_spec) -# wf_spec = taskBuffer.get_workflow(workflow_id=WFID) -# print("Processing checked workflow...") -# wfif.process_workflow_checked(wf_spec) -# wf_spec = taskBuffer.get_workflow(workflow_id=WFID) -# print("Processing starting workflow...") -# wfif.process_workflow_starting(wf_spec) +if __name__ == "__main__": + main() From 9dcb224d5378cbe91ae3b07d9e1025cfcd1760a4 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 15:03:35 +0100 Subject: [PATCH 087/101] workflows4: fix stats of status change of entities --- .../workflow_manager_msg_processor.py | 6 ++--- pandaserver/workflow/workflow_core.py | 27 +++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py index 3dde094c2..1129e652e 100644 --- a/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py +++ b/pandajedi/jedimsgprocessor/workflow_manager_msg_processor.py @@ -63,7 +63,7 @@ def process(self, msg_obj): if workflow_spec is None: 
tmp_log.warning(f"workflow_id={workflow_id} not found; skipped") return - stats = self.workflow_interface.process_workflow(workflow_spec, by="msg") + stats, workflow_spec = self.workflow_interface.process_workflow(workflow_spec, by="msg") tmp_log.info(f"processed workflow_id={workflow_id}") elif msg_type == "wfstep": step_id = msg_dict["step_id"] @@ -71,7 +71,7 @@ def process(self, msg_obj): if step_spec is None: tmp_log.warning(f"step_id={step_id} not found; skipped") return - stats = self.workflow_interface.process_step(step_spec, by="msg") + stats, step_spec = self.workflow_interface.process_step(step_spec, by="msg") tmp_log.info(f"processed step_id={step_id}") elif msg_type == "wfdata": data_id = msg_dict["data_id"] @@ -79,7 +79,7 @@ def process(self, msg_obj): if data_spec is None: tmp_log.warning(f"data_id={data_id} not found; skipped") return - stats = self.workflow_interface.process_data(data_spec, by="msg") + stats, data_spec = self.workflow_interface.process_data(data_spec, by="msg") tmp_log.info(f"processed data_id={data_id}") except Exception as e: err_str = f"failed to run, skipped. 
{e.__class__.__name__} : {e}" diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index f71b009c1..8279fcc7f 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -738,7 +738,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: tmp_log.error(f"{traceback.format_exc()}") return process_result - def process_data(self, data_spec: WFDataSpec, by: str = "dog") -> WFDataProcessResult | None: + def process_data(self, data_spec: WFDataSpec, by: str = "dog") -> tuple[WFDataProcessResult | None, WFDataSpec]: """ Process a single workflow data specification @@ -748,6 +748,7 @@ def process_data(self, data_spec: WFDataSpec, by: str = "dog") -> WFDataProcessR Returns: WFDataProcessResult | None: The result of processing the data specification, or None if skipped + WFDataSpec: The updated workflow data specification """ tmp_log = LogWrapper(logger, f"process_data ") tmp_log.debug("Start") @@ -755,7 +756,7 @@ def process_data(self, data_spec: WFDataSpec, by: str = "dog") -> WFDataProcessR with self.workflow_data_lock(data_spec.data_id) as locked_data_spec: if locked_data_spec is None: tmp_log.warning(f"Failed to acquire lock for data_id={data_spec.data_id}; skipped") - return None + return None, data_spec data_spec = locked_data_spec orig_status = data_spec.status # Process the data @@ -780,7 +781,7 @@ def process_data(self, data_spec: WFDataSpec, by: str = "dog") -> WFDataProcessR # For changes into transient status, send message to trigger processing immediately if data_spec.status != orig_status and data_spec.status in WFDataStatus.transient_statuses: self.send_data_message(data_spec.data_id) - return tmp_res + return tmp_res, data_spec def process_datas(self, data_specs: List[WFDataSpec], by: str = "dog") -> Dict: """ @@ -799,7 +800,7 @@ def process_datas(self, data_specs: List[WFDataSpec], by: str = "dog") -> Dict: data_status_stats = {"n_data": n_data, "changed": 
{}, "unchanged": {}, "processed": {}, "n_processed": 0} for data_spec in data_specs: orig_status = data_spec.status - tmp_res = self.process_data(data_spec, by=by) + tmp_res, data_spec = self.process_data(data_spec, by=by) if tmp_res and tmp_res.success: # update stats if tmp_res.new_status and data_spec.status != orig_status: @@ -1288,7 +1289,9 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result - def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "dog") -> WFStepProcessResult | None: + def process_step( + self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "dog" + ) -> tuple[WFStepProcessResult | None, WFStepSpec]: """ Process a single workflow step @@ -1299,6 +1302,7 @@ def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpe Returns: WFStepProcessResult | None: The result of processing the step, or None if the step was skipped + WFStepSpec: The updated workflow step specification """ tmp_log = LogWrapper(logger, f"process_step ") tmp_log.debug("Start") @@ -1306,7 +1310,7 @@ def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpe with self.workflow_step_lock(step_spec.step_id) as locked_step_spec: if locked_step_spec is None: tmp_log.warning(f"Failed to acquire lock for step_id={step_spec.step_id}; skipped") - return None + return None, step_spec step_spec = locked_step_spec orig_status = step_spec.status # Process the step @@ -1335,7 +1339,7 @@ def process_step(self, step_spec: WFStepSpec, data_spec_map: Dict[str, WFDataSpe # For changes into transient status, send message to trigger processing immediately if step_spec.status != orig_status and step_spec.status in WFStepStatus.transient_statuses: self.send_step_message(step_spec.step_id) - return tmp_res + return tmp_res, step_spec def process_steps(self, 
step_specs: List[WFStepSpec], data_spec_map: Dict[str, WFDataSpec] | None = None, by: str = "dog") -> Dict: """ @@ -1355,7 +1359,7 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W steps_status_stats = {"n_steps": n_steps, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} for step_spec in step_specs: orig_status = step_spec.status - tmp_res = self.process_step(step_spec, data_spec_map=data_spec_map, by=by) + tmp_res, step_spec = self.process_step(step_spec, data_spec_map=data_spec_map, by=by) if tmp_res and tmp_res.success: # update stats if tmp_res.new_status and step_spec.status != orig_status: @@ -1716,7 +1720,7 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce tmp_log.error(f"Got error ; {traceback.format_exc()}") return process_result - def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "dog") -> WorkflowProcessResult: + def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "dog") -> tuple[WorkflowProcessResult, WorkflowSpec]: """ Process a workflow based on its current status @@ -1726,6 +1730,7 @@ def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "dog") -> Work Returns: WorkflowProcessResult: The result of processing the workflow + WorkflowSpec: The updated workflow specification """ tmp_log = LogWrapper(logger, f"process_workflow ") tmp_log.debug(f"Start, current status={workflow_spec.status}") @@ -1748,7 +1753,7 @@ def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "dog") -> Work # For changes into transient status, send message to trigger processing immediately if workflow_spec.status != orig_status and workflow_spec.status in WorkflowStatus.transient_statuses: self.send_workflow_message(workflow_spec.workflow_id) - return process_result + return process_result, workflow_spec # ---- Process all workflows ------------------------------------- @@ -1780,7 +1785,7 @@ def process_active_workflows(self) -> Dict: 
workflow_spec = locked_workflow_spec orig_status = workflow_spec.status # Process the workflow - tmp_res = self.process_workflow(workflow_spec) + tmp_res, workflow_spec = self.process_workflow(workflow_spec) if tmp_res and tmp_res.success: # update stats if tmp_res.new_status and workflow_spec.status != orig_status: From 2d2d2bf3c1dfbbb3d4141c02c270e65cb4c1977a Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 15:25:38 +0100 Subject: [PATCH 088/101] Update pandaserver/workflow/workflow_core_smoketest.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pandaserver/workflow/workflow_core_smoketest.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandaserver/workflow/workflow_core_smoketest.py b/pandaserver/workflow/workflow_core_smoketest.py index a8d95746b..00420619b 100644 --- a/pandaserver/workflow/workflow_core_smoketest.py +++ b/pandaserver/workflow/workflow_core_smoketest.py @@ -1,17 +1,10 @@ import argparse -import json import sys from pandacommon.pandautils.thread_utils import GenericThread from pandaserver.config import panda_config from pandaserver.taskbuffer.TaskBuffer import taskBuffer -from pandaserver.workflow.workflow_core import ( - WFDataSpec, - WFStepSpec, - WorkflowInterface, - WorkflowSpec, -) # parameters for the workflow prodsourcelabel = "user" From 812cfafc3cf7dc0fa998e8ea3e92f38345fce98b Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 15:26:24 +0100 Subject: [PATCH 089/101] Update pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../step_handler_plugins/panda_task_step_handler.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 5e257fcd0..641ab6afb 
100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -1,22 +1,15 @@ -import json import traceback -import uuid from pandacommon.pandalogger.LogWrapper import LogWrapper from pandacommon.pandalogger.PandaLogger import PandaLogger from pandaserver.workflow.step_handler_plugins.base_step_handler import BaseStepHandler from pandaserver.workflow.workflow_base import ( - WFDataSpec, - WFDataStatus, - WFDataType, WFStepSpec, WFStepStatus, WFStepTargetCheckResult, WFStepTargetSubmitResult, WFStepType, - WorkflowSpec, - WorkflowStatus, ) # main logger From c498facebcbf4a0b2c925de4d8acbf184bcccd09 Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 15:27:05 +0100 Subject: [PATCH 090/101] Update pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py b/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py index 2343f7811..680914c81 100644 --- a/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py +++ b/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py @@ -1,16 +1,10 @@ -import datetime -import os -import re -import socket import sys import traceback # logger from pandacommon.pandalogger.PandaLogger import PandaLogger -from pandacommon.pandautils.PandaUtils import naive_utcnow from pandajedi.jedicore.MsgWrapper import MsgWrapper -from pandajedi.jedicore.ThreadUtils import ListWithLock, ThreadPool, WorkerThread from pandaserver.workflow.workflow_core import WorkflowInterface from .WatchDogBase import WatchDogBase From e8fd04f2f921c50203b139a8e5f5e0a0bdf986d6 Mon Sep 17 00:00:00 2001 From: FaHui Lin <19180940+mightqxc@users.noreply.github.com> Date: Wed, 4 Mar 2026 15:29:55 +0100 Subject: [PATCH 091/101] 
Update pandaserver/workflow/step_handler_plugins/base_step_handler.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pandaserver/workflow/step_handler_plugins/base_step_handler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/base_step_handler.py b/pandaserver/workflow/step_handler_plugins/base_step_handler.py index f7c9783d6..6d75cd7f0 100644 --- a/pandaserver/workflow/step_handler_plugins/base_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/base_step_handler.py @@ -1,5 +1,3 @@ -import dataclasses - from pandaserver.workflow.workflow_base import ( WFDataSpec, WFDataStatus, From 7b3f5682fdcaa9d56e713a19df93d3281faab311 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 15:42:16 +0100 Subject: [PATCH 092/101] workflows4: update workflow checktime --- pandaserver/workflow/workflow_core.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 8279fcc7f..a2114c12b 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -1624,14 +1624,18 @@ def process_workflow_starting(self, workflow_spec: WorkflowSpec) -> WorkflowProc # Process steps steps_status_stats = self.process_steps(step_specs, data_spec_map=data_spec_map) # Update workflow status to running if any of step is starting + now_time = naive_utcnow() if steps_status_stats["processed"].get(WFStepStatus.starting): workflow_spec.status = WorkflowStatus.running - workflow_spec.start_time = naive_utcnow() + workflow_spec.start_time = now_time + workflow_spec.check_time = now_time self.tbif.update_workflow(workflow_spec) process_result.success = True process_result.new_status = workflow_spec.status tmp_log.info(f"Done, advanced to status={workflow_spec.status}") else: + workflow_spec.check_time = now_time + self.tbif.update_workflow(workflow_spec) process_result.success = 
True tmp_log.info(f"Done, status remains {workflow_spec.status}") except Exception as e: @@ -1696,6 +1700,7 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce # Process each step steps_status_stats = self.process_steps(step_specs, data_spec_map=data_spec_map) # Update workflow status by steps + now_time = naive_utcnow() if (processed_steps_stats := steps_status_stats["processed"]) and ( processed_steps_stats.get(WFStepStatus.failed) or processed_steps_stats.get(WFStepStatus.cancelled) ): @@ -1704,12 +1709,15 @@ def process_workflow_running(self, workflow_spec: WorkflowSpec) -> WorkflowProce # mark workflow as failed tmp_log.warning(f"workflow failed due to some steps failed or cancelled") workflow_spec.status = WorkflowStatus.failed - workflow_spec.end_time = naive_utcnow() + workflow_spec.end_time = now_time + workflow_spec.check_time = now_time self.tbif.update_workflow(workflow_spec) process_result.success = True process_result.new_status = workflow_spec.status tmp_log.info(f"Done, advanced to status={workflow_spec.status}") else: + workflow_spec.check_time = now_time + self.tbif.update_workflow(workflow_spec) process_result.success = True tmp_log.info(f"Done, status remains {workflow_spec.status}") if processed_steps_stats.get(WFStepStatus.done) == len(step_specs): From 178455ec46101d8664f3973afb2b7d24e706d660 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 15:57:44 +0100 Subject: [PATCH 093/101] workflows4: fix resolve_nodes --- pandaserver/workflow/workflow_utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandaserver/workflow/workflow_utils.py b/pandaserver/workflow/workflow_utils.py index 7f0d8c04e..5ed1e60ab 100644 --- a/pandaserver/workflow/workflow_utils.py +++ b/pandaserver/workflow/workflow_utils.py @@ -675,6 +675,8 @@ def resolve_nodes(node_list, root_inputs, data, serial_id, parent_ids, out_ds_na root_inputs[k] = data[kk] tmp_to_real_id_map = {} resolved_map 
= {} + # map of object identity to original temporary node ID used in resolved_map keys + node_key_map = {} all_nodes = [] for node in node_list: # resolve input @@ -746,10 +748,12 @@ def resolve_nodes(node_list, root_inputs, data, serial_id, parent_ids, out_ds_na sc_nodes = [node] # loop over scattered nodes for sc_node in sc_nodes: + original_node_id = sc_node.id all_nodes.append(sc_node) + node_key_map[id(sc_node)] = original_node_id # set real node ID - resolved_map.setdefault(sc_node.id, []) - tmp_to_real_id_map.setdefault(sc_node.id, set()) + resolved_map.setdefault(original_node_id, []) + tmp_to_real_id_map.setdefault(original_node_id, set()) # resolve parents real_parens = set() for i in sc_node.parents: @@ -758,8 +762,8 @@ def resolve_nodes(node_list, root_inputs, data, serial_id, parent_ids, out_ds_na if sc_node.is_head: sc_node.parents |= parent_ids if sc_node.is_leaf: - resolved_map[sc_node.id].append(sc_node) - tmp_to_real_id_map[sc_node.id].add(serial_id) + resolved_map[original_node_id].append(sc_node) + tmp_to_real_id_map[original_node_id].add(serial_id) sc_node.id = serial_id serial_id += 1 else: @@ -772,8 +776,8 @@ def resolve_nodes(node_list, root_inputs, data, serial_id, parent_ids, out_ds_na out_ds_name, log_stream, ) - resolved_map[sc_node.id] += sub_tail_nodes - tmp_to_real_id_map[sc_node.id] |= set([n.id for n in sub_tail_nodes]) + resolved_map[original_node_id] += sub_tail_nodes + tmp_to_real_id_map[original_node_id] |= set([n.id for n in sub_tail_nodes]) sc_node.id = serial_id serial_id += 1 # convert parameters to parent IDs in conditions @@ -789,10 +793,11 @@ def resolve_nodes(node_list, root_inputs, data, serial_id, parent_ids, out_ds_na # return tails tail_nodes = [] for node in all_nodes: + original_node_id = node_key_map.get(id(node), node.id) if node.is_tail: tail_nodes.append(node) else: - tail_nodes += resolved_map[node.id] + tail_nodes += resolved_map[original_node_id] return serial_id, tail_nodes, all_nodes From 
18a5fd77586397241b81f020393caa777071205d Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 23:04:18 +0100 Subject: [PATCH 094/101] workflows4: default lock time --- pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py b/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py index 680914c81..38e96fefb 100644 --- a/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py +++ b/pandajedi/jedidog/AtlasWorkflowManagerWatchDog.py @@ -31,7 +31,7 @@ def doProcessWorkflows(self): tmpLog.debug("start") try: # watchdog lock - got_lock = self.get_process_lock("AtlasWFManagerDog.doProcessWorkflows", timeLimit=1) + got_lock = self.get_process_lock("AtlasWFManagerDog.doProcessWorkflows", timeLimit=5) if not got_lock: tmpLog.debug("locked by another watchdog process. Skipped") return From a5a7e9767b2311650f70c34deb7f5b4555145b37 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 4 Mar 2026 23:04:32 +0100 Subject: [PATCH 095/101] v0.7.2 --- PandaPkgInfo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PandaPkgInfo.py b/PandaPkgInfo.py index 6d8b9c455..e7b057782 100644 --- a/PandaPkgInfo.py +++ b/PandaPkgInfo.py @@ -1 +1 @@ -release_version = "0.7.1" +release_version = "0.7.2" From cf5eba81683e9c26a26f0f0f981531096efd54a2 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 5 Mar 2026 09:11:16 +0100 Subject: [PATCH 096/101] v0.8.0; update DB schema version --- PandaPkgInfo.py | 2 +- pandajedi/jedicore/JediDBSchemaInfo.py | 2 +- pandaserver/taskbuffer/PandaDBSchemaInfo.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PandaPkgInfo.py b/PandaPkgInfo.py index e7b057782..c93370d47 100644 --- a/PandaPkgInfo.py +++ b/PandaPkgInfo.py @@ -1 +1 @@ -release_version = "0.7.2" +release_version = "0.8.0" diff --git a/pandajedi/jedicore/JediDBSchemaInfo.py b/pandajedi/jedicore/JediDBSchemaInfo.py index ee5961c57..fb25f745a 100644 --- 
a/pandajedi/jedicore/JediDBSchemaInfo.py +++ b/pandajedi/jedicore/JediDBSchemaInfo.py @@ -12,6 +12,6 @@ class JediDBSchemaInfo: schema_version = None def method(self): - schema_version = "0.0.30" + schema_version = "0.1.1" _logger.debug(f"PanDA schema version required for JEDI is : {schema_version}") return schema_version diff --git a/pandaserver/taskbuffer/PandaDBSchemaInfo.py b/pandaserver/taskbuffer/PandaDBSchemaInfo.py index 5a5f077be..4787ac599 100644 --- a/pandaserver/taskbuffer/PandaDBSchemaInfo.py +++ b/pandaserver/taskbuffer/PandaDBSchemaInfo.py @@ -13,6 +13,6 @@ class PandaDBSchemaInfo: schema_version = None def method(self): - schema_version = "0.0.28" + schema_version = "0.1.1" _logger.debug(f"PanDA schema version required for Server is : {schema_version}") return schema_version From 00086db1684ef7b69fb0f4f02eff26b569292d28 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 11 Mar 2026 08:45:12 +0100 Subject: [PATCH 097/101] update dependency --- pandaserver/workflow/workflow_core.py | 5 ++++- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index a2114c12b..19c796c66 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -166,7 +166,10 @@ def set_mb_proxy(self): # stop with atexit atexit.register(mq_agent.stop_passive_mode) # set mb_proxy - self.mb_proxy = mb_proxy_dict["out"][MESSAGE_QUEUE_NAME] + self.mb_proxy = mb_proxy_dict["out"].get(MESSAGE_QUEUE_NAME) + if self.mb_proxy is None: + logger.warning(f"Message queue {MESSAGE_QUEUE_NAME} not found in mb_proxy_dict; skipped workflow manager messaging") + return None # logger.debug(f"Set mb_proxy about queue {MESSAGE_QUEUE_NAME} for workflow manager messaging") except Exception: logger.warning(f"Failed to set mb_proxy about queue {MESSAGE_QUEUE_NAME}; skipped workflow manager messaging: {traceback.format_exc()}") diff --git a/pyproject.toml 
b/pyproject.toml index d7a72e7b0..4a43d1530 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ authors = [ { name = "PanDA Team", email = "panda-support@cern.ch" }, ] dependencies = [ - 'panda-common>=0.1.6', + 'panda-common>=0.1.7', 'panda-client-light>=1.5.55', 'pyOpenSSL', 'python-daemon', From ff8c93aa9ab0101c2d8248b2f0e27b7e8413f4b0 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 11 Mar 2026 10:58:33 +0100 Subject: [PATCH 098/101] workflows4: add cancel_workflow --- .../step_handler_plugins/base_step_handler.py | 15 ++ .../panda_task_step_handler.py | 42 ++++++ pandaserver/workflow/workflow_base.py | 20 ++- pandaserver/workflow/workflow_core.py | 128 +++++++++++++++++- .../workflow/workflow_core_smoketest.py | 26 +++- 5 files changed, 221 insertions(+), 10 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/base_step_handler.py b/pandaserver/workflow/step_handler_plugins/base_step_handler.py index 6d75cd7f0..c352fd538 100644 --- a/pandaserver/workflow/step_handler_plugins/base_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/base_step_handler.py @@ -4,6 +4,7 @@ WFDataType, WFStepSpec, WFStepStatus, + WFStepTargetCancelResult, WFStepTargetCheckResult, WFStepTargetSubmitResult, WFStepType, @@ -68,3 +69,17 @@ def on_all_inputs_done(self, step_spec: WFStepSpec, **kwargs) -> None: **kwargs: Additional keyword arguments. """ raise NotImplementedError("Subclasses must implement this method.") + + def cancel_target(self, step_spec: WFStepSpec, **kwargs) -> WFStepTargetCancelResult: + """ + Cancel the submitted target. + This method can be overridden by subclasses to handle target cancellation. + + Args: + step_spec (WFStepSpec): Specifications of the workflow step whose target is to be cancelled. + **kwargs: Additional keyword arguments. + + Returns: + WFStepTargetCancelResult: An object containing the result of the cancellation, including success status and message. 
+ """ + raise NotImplementedError("Subclasses must implement this method.") diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index 641ab6afb..d08f12660 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -7,6 +7,7 @@ from pandaserver.workflow.workflow_base import ( WFStepSpec, WFStepStatus, + WFStepTargetCancelResult, WFStepTargetCheckResult, WFStepTargetSubmitResult, WFStepType, @@ -234,3 +235,44 @@ def on_all_inputs_done(self, step_spec: WFStepSpec, **kwargs) -> None: tmp_log.debug(f"Done") except Exception as e: tmp_log.error(f"Failed with: {traceback.format_exc()}") + + def cancel_target(self, step_spec, **kwargs) -> WFStepTargetCancelResult: + """ + Cancel the target task for the given step. + This method should be implemented to handle the specifics of task cancellation. + + Args: + step_spec (WFStepSpec): The workflow step specification containing details about the step to be processed. + **kwargs: Additional keyword arguments that may be required for cancellation. + + Returns: + WFStepTargetCancelResult: An object containing the result of the cancellation, including success status and message. 
+ """ + tmp_log = LogWrapper(logger, f"cancel_target workflow_id={step_spec.workflow_id} step_id={step_spec.step_id}") + cancel_result = WFStepTargetCancelResult() + try: + # Check step flavor + if step_spec.flavor != self.plugin_flavor: + cancel_result.message = f"flavor not {self.plugin_flavor}; skipped" + tmp_log.warning(f"flavor={step_spec.flavor} not {self.plugin_flavor}; skipped") + return cancel_result + if step_spec.target_id is None: + cancel_result.message = f"target_id is None; skipped" + tmp_log.warning(f"target_id is None; skipped") + return cancel_result + # Get task ID + task_id = int(step_spec.target_id) + # Cancel task + ret_val, ret_str = self.taskBufferIF.sendCommandTaskPanda(task_id, "PanDA Task Step Handler cancel_target", True, "kill", properErrorCode=True) + # check if ok + if ret_val == 0: + cancel_result.success = True + tmp_log.info(f"target_id={step_spec.target_id} cancelled") + else: + cancel_result.success = False + cancel_result.message = f"failed to cancel the task: error_code={ret_val} {ret_str}" + tmp_log.warning(f"{cancel_result.message}") + except Exception as e: + cancel_result.message = f"exception {str(e)}" + tmp_log.error(f"Failed to cancel task: {traceback.format_exc()}") + return cancel_result diff --git a/pandaserver/workflow/workflow_base.py b/pandaserver/workflow/workflow_base.py index edb884262..ad04911fa 100644 --- a/pandaserver/workflow/workflow_base.py +++ b/pandaserver/workflow/workflow_base.py @@ -4,13 +4,13 @@ from datetime import datetime, timedelta from typing import Any, Dict, List -from pandacommon.pandalogger.PandaLogger import PandaLogger +# from pandacommon.pandalogger.PandaLogger import PandaLogger from pandacommon.pandautils.base import SpecBase from pandaserver.config import panda_config # main logger -logger = PandaLogger().getLogger(__name__.split(".")[-1]) +# logger = PandaLogger().getLogger(__name__.split(".")[-1]) # named tuple for attribute with type AttributeWithType = 
namedtuple("AttributeWithType", ["attribute", "type"]) @@ -480,6 +480,22 @@ class WFStepTargetCheckResult: message: str = "" +@dataclass(slots=True) +class WFStepTargetCancelResult: + """ + Result of cancelling a target of a step. + + Fields: + success (bool | None): Indicates if the cancellation was successful. + target_id (str | None): The ID of the cancelled target (e.g., task ID). + message (str): A message providing additional information about the cancellation result. + """ + + success: bool | None = None + target_id: str | None = None + message: str = "" + + # ==== Return objects of data handler methods ================== diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 19c796c66..b477c747f 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -365,11 +365,135 @@ def register_workflow( tmp_log.info(f"Registered workflow ") return ret_workflow_id - def cancel_workflow(self, workflow_id: int) -> bool: ... 
+ def cancel_workflow(self, workflow_id: int, force: bool = False) -> bool: + """ + Cancel the workflow + + Args: + workflow_id (int): ID of the workflow to cancel + force (bool): Whether to force into cancelled status + + Returns: + bool: True if the workflow was successfully cancelled, otherwise False + """ + tmp_log = LogWrapper(logger, f"cancel_workflow ") + tmp_log.debug("Start") + try: + with self.workflow_lock(workflow_id) as workflow_spec: + if workflow_spec is None: + tmp_log.warning(f"Failed to acquire lock for workflow_id={workflow_id}; skipped") + return False + if workflow_spec.status in WorkflowStatus.final_statuses: + tmp_log.debug(f"Workflow already in final status {workflow_spec.status}; skipped") + return True + # Cancel all steps and data of the workflow + all_cancelled = True + for step_id in workflow_spec.step_ids: + if not self.cancel_step(step_id, force): + all_cancelled = False + for data_id in workflow_spec.data_ids: + if not self.cancel_data(data_id, force): + all_cancelled = False + # Update workflow status to cancelled if all steps and data are cancelled + if not all_cancelled and not force: + tmp_log.warning(f"Not all steps and data could be cancelled; skipped updating workflow status") + return False + else: + workflow_spec.status = WorkflowStatus.cancelled + workflow_spec.end_time = naive_utcnow() + self.tbif.update_workflow(workflow_spec) + if force and not all_cancelled: + tmp_log.warning(f"Force cancelled workflow without cancelling all steps and data") + else: + tmp_log.info(f"Cancelled workflow, updated status to {workflow_spec.status}") + return True + except Exception as e: + tmp_log.error(f"Got error {str(e)}") + return False # --- Step operation --------------------------------------- - def cancel_step(self, step_id: int) -> bool: ... 
+ def cancel_step(self, step_id: int, force: bool = False) -> bool: + """ + Cancel the workflow step + + Args: + step_id (int): ID of the workflow step to cancel + force (bool): Whether to force into cancelled status; if False, the step will only be cancelled if the target cancellation is successful, while if True, the step will be marked as cancelled regardless of the target cancellation result + + Returns: + bool: True if the step was successfully cancelled, otherwise False + """ + tmp_log = LogWrapper(logger, f"cancel_step ") + tmp_log.debug("Start") + try: + with self.workflow_step_lock(step_id) as step_spec: + if step_spec is None: + tmp_log.warning(f"Failed to acquire lock for step_id={step_id}; skipped") + return False + if step_spec.status in WFStepStatus.final_statuses: + tmp_log.debug(f"Step already in final status {step_spec.status}; skipped") + return True + # Call plugin to cancel the target of the step + target_is_cancelled = False + step_handler = self.get_plugin("step_handler", step_spec.flavor) + if step_handler is None: + tmp_log.warning(f"Step handler plugin not found for flavor {step_spec.flavor}; skipped target cancellation") + else: + cancel_result = step_handler.cancel_target(step_spec) + if not cancel_result.success: + tmp_log.warning(f"Failed to cancel target with plugin {step_spec.flavor}; got message: {cancel_result.message}") + else: + tmp_log.debug(f"Cancelled target with flavor {step_spec.flavor}") + target_is_cancelled = True + # Update step status to cancelled + if not target_is_cancelled and not force: + tmp_log.warning(f"Target not cancelled; skipped updating step status") + return False + else: + step_spec.status = WFStepStatus.cancelled + step_spec.end_time = naive_utcnow() + self.tbif.update_workflow_step(step_spec) + if force and not target_is_cancelled: + tmp_log.warning(f"Force cancelled step without cancelling target") + else: + tmp_log.info(f"Cancelled step, updated status to {step_spec.status}") + return True + except 
Exception as e: + tmp_log.error(f"Got error {str(e)}") + return False + + # --- Data operation --------------------------------------- + + def cancel_data(self, data_id: int, force: bool = False) -> bool: + """ + Cancel the workflow data + + Args: + data_id (int): ID of the workflow data to cancel + force (bool): Whether to force into cancelled status; currently has no effect since data cancellation is not implemented in plugins, but reserved for future use + + Returns: + bool: True if the data was successfully cancelled, otherwise False + """ + tmp_log = LogWrapper(logger, f"cancel_data ") + tmp_log.debug("Start") + try: + with self.workflow_data_lock(data_id) as data_spec: + if data_spec is None: + tmp_log.warning(f"Failed to acquire lock for data_id={data_id}; skipped") + return False + if data_spec.status in WFDataStatus.terminated_statuses: + tmp_log.debug(f"Data already terminated with status {data_spec.status}; skipped") + return True + data_spec.status = WFDataStatus.cancelled + data_spec.end_time = naive_utcnow() + self.tbif.update_workflow_data(data_spec) + tmp_log.info(f"Cancelled data, updated status to {data_spec.status}") + return True + except Exception as e: + tmp_log.error(f"Got error {str(e)}") + return False # ---- Data status transitions ----------------------------- diff --git a/pandaserver/workflow/workflow_core_smoketest.py b/pandaserver/workflow/workflow_core_smoketest.py index 00420619b..34ead7f8c 100644 --- a/pandaserver/workflow/workflow_core_smoketest.py +++ b/pandaserver/workflow/workflow_core_smoketest.py @@ -6,18 +6,20 @@ from pandaserver.config import panda_config from pandaserver.taskbuffer.TaskBuffer import taskBuffer -# parameters for the workflow -prodsourcelabel = "user" -username = "testuser" -workflow_name = "test_workflow_bg_comb_00" - def parse_args(): parser = argparse.ArgumentParser(description="Workflow core smoke test helper") - parser.add_argument("workflow_id", nargs="?", default=None, help="Workflow ID to use in 
commented smoke test calls") + parser.add_argument("action", choices=["cancel_workflow"], help="Action to perform in the smoke test") + parser.add_argument("--force", action="store_true", help="Force into cancelled status") + parser.add_argument("workflow_id", help="Workflow ID to use in commented smoke test calls") return parser.parse_args() +# parameters for the workflow +# prodsourcelabel = "user" +# username = "testuser" +# workflow_name = "test_workflow_bg_comb_00" + # workflow definition json # wfd_json = json.dumps( # json.loads( @@ -332,6 +334,10 @@ def parse_args(): def main(): args = parse_args() WFID = args.workflow_id + action = args.action + force = args.force + + from pandaserver.workflow.workflow_core import WorkflowInterface # interface for workflow operations requester_id = GenericThread().get_full_id(__name__, sys.modules[__name__].__file__) @@ -369,6 +375,14 @@ def main(): # print("Processing starting workflow...") # wfif.process_workflow_starting(wf_spec) + if args.action == "cancel_workflow": + print(f"Cancelling workflow_id={WFID} ...") + res = wfif.cancel_workflow(workflow_id=WFID, force=args.force) + if res: + print(f"Cancelled workflow_id={WFID} successfully.") + else: + print(f"Failed to cancel workflow_id={WFID}.") + if __name__ == "__main__": main() From d5b6f1ddd6ad3bcc723a3581ef02eecac31be529 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 11 Mar 2026 11:24:16 +0100 Subject: [PATCH 099/101] workflows4: fixes --- .../panda_task_step_handler.py | 8 ++++-- pandaserver/workflow/workflow_core.py | 28 ++++++++++++++----- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py index d08f12660..022fd08fb 100644 --- a/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py +++ b/pandaserver/workflow/step_handler_plugins/panda_task_step_handler.py @@ -257,13 +257,15 @@ def 
cancel_target(self, step_spec, **kwargs) -> WFStepTargetCancelResult: tmp_log.warning(f"flavor={step_spec.flavor} not {self.plugin_flavor}; skipped") return cancel_result if step_spec.target_id is None: - cancel_result.message = f"target_id is None; skipped" - tmp_log.warning(f"target_id is None; skipped") + # If target_id is None, consider it as already cancelled since there is no task to cancel + cancel_result.success = True + cancel_result.message = f"target_id is None so considered already cancelled; skipped" + tmp_log.debug(f"{cancel_result.message}") return cancel_result # Get task ID task_id = int(step_spec.target_id) # Cancel task - ret_val, ret_str = self.taskBufferIF.sendCommandTaskPanda(task_id, "PanDA Task Step Handler cancel_target", True, "kill", properErrorCode=True) + ret_val, ret_str = self.tbif.sendCommandTaskPanda(task_id, "PanDA Task Step Handler cancel_target", True, "kill", properErrorCode=True) # check if ok if ret_val == 0: cancel_result.success = True diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index b477c747f..6165bb885 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -154,7 +154,11 @@ def set_mb_proxy(self): Set the message broker proxy for workflow manager messaging """ try: - jedi_config = importlib.import_module("pandajedi.jediconfig").jedi_config + jedi_config = None + try: + jedi_config = importlib.import_module("pandajedi.jediconfig.jedi_config") + except Exception: + jedi_config = importlib.import_module("pandajedi.jediconfig").jedi_config if hasattr(jedi_config, "mq") and hasattr(jedi_config.mq, "configFile") and jedi_config.mq.configFile: MsgProcAgent = importlib.import_module(f"pandajedi.jediorder.JediMsgProcessor").MsgProcAgent else: @@ -388,12 +392,22 @@ def cancel_workflow(self, workflow_id: int, force: bool = False) -> bool: return True # Cancel all steps and data of the workflow all_cancelled = True - for step_id in 
workflow_spec.step_ids: - if not self.cancel_step(step_id, force): - all_cancelled = False - for data_id in workflow_spec.data_ids: - if not self.cancel_data(data_id, force): - all_cancelled = False + step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id) + if step_specs is None: + tmp_log.warning(f"Failed to get steps of workflow_id={workflow_id}; skipped cancelling steps") + all_cancelled = False + else: + for step_spec in step_specs: + if not self.cancel_step(step_spec.step_id, force): + all_cancelled = False + data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) + if data_specs is None: + tmp_log.warning(f"Failed to get data of workflow_id={workflow_id}; skipped cancelling data") + all_cancelled = False + else: + for data_spec in data_specs: + if not self.cancel_data(data_spec.data_id, force): + all_cancelled = False # Update workflow status to cancelled if all steps and data are cancelled if not all_cancelled and not force: tmp_log.warning(f"Not all steps and data could be cancelled; skipped updating workflow status") From c0f77c9b1484200cc5ffa803154967900db7d622 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 11 Mar 2026 12:19:46 +0100 Subject: [PATCH 100/101] log pretty --- pandaserver/workflow/workflow_core.py | 56 +++++++++++++++------------ 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/pandaserver/workflow/workflow_core.py b/pandaserver/workflow/workflow_core.py index 6165bb885..37a1da767 100644 --- a/pandaserver/workflow/workflow_core.py +++ b/pandaserver/workflow/workflow_core.py @@ -385,7 +385,7 @@ def cancel_workflow(self, workflow_id: int, force: bool = False) -> bool: try: with self.workflow_lock(workflow_id) as workflow_spec: if workflow_spec is None: - tmp_log.warning(f"Failed to acquire lock for workflow_id={workflow_id}; skipped") + tmp_log.warning(f"Failed to acquire lock; skipped") return False if workflow_spec.status in WorkflowStatus.final_statuses: 
tmp_log.debug(f"Workflow already in final status {workflow_spec.status}; skipped") @@ -394,7 +394,7 @@ def cancel_workflow(self, workflow_id: int, force: bool = False) -> bool: all_cancelled = True step_specs = self.tbif.get_steps_of_workflow(workflow_id=workflow_spec.workflow_id) if step_specs is None: - tmp_log.warning(f"Failed to get steps of workflow_id={workflow_id}; skipped cancelling steps") + tmp_log.warning(f"Failed to get steps of the workflow; skipped cancelling steps") all_cancelled = False else: for step_spec in step_specs: @@ -402,7 +402,7 @@ def cancel_workflow(self, workflow_id: int, force: bool = False) -> bool: all_cancelled = False data_specs = self.tbif.get_data_of_workflow(workflow_id=workflow_spec.workflow_id) if data_specs is None: - tmp_log.warning(f"Failed to get data of workflow_id={workflow_id}; skipped cancelling data") + tmp_log.warning(f"Failed to get data of the workflow; skipped cancelling data") all_cancelled = False else: for data_spec in data_specs: @@ -438,13 +438,16 @@ def cancel_step(self, step_id: int, force: bool = False) -> bool: Returns: bool: True if the step was successfully cancelled, otherwise False """ - tmp_log = LogWrapper(logger, f"cancel_step ") + log_prefix = f"cancel_step " + tmp_log = LogWrapper(logger, log_prefix) tmp_log.debug("Start") try: with self.workflow_step_lock(step_id) as step_spec: if step_spec is None: - tmp_log.warning(f"Failed to acquire lock for step_id={step_id}; skipped") + tmp_log.warning(f"Failed to acquire lock; skipped") return False + log_prefix += f" workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}" + tmp_log = LogWrapper(logger, log_prefix) if step_spec.status in WFStepStatus.final_statuses: tmp_log.debug(f"Step already in final status {step_spec.status}; skipped") return True @@ -490,13 +493,16 @@ def cancel_data(self, data_id: int, force: bool = False) -> bool: Returns: bool: True if the data was successfully cancelled, otherwise False """ - tmp_log = 
LogWrapper(logger, f"cancel_data ") + log_prefix = f"cancel_data " + tmp_log = LogWrapper(logger, log_prefix) tmp_log.debug("Start") try: with self.workflow_data_lock(data_id) as data_spec: if data_spec is None: - tmp_log.warning(f"Failed to acquire lock for data_id={data_id}; skipped") + tmp_log.warning(f"Failed to acquire lock; skipped") return False + log_prefix += f" workflow_id={data_spec.workflow_id}" + tmp_log = LogWrapper(logger, log_prefix) if data_spec.status in WFDataStatus.terminated_statuses: tmp_log.debug(f"Data already terminated with status {data_spec.status}; skipped") return True @@ -522,7 +528,7 @@ def process_data_registered(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_registered ") + tmp_log = LogWrapper(logger, f"process_data_registered workflow_id={data_spec.workflow_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -555,7 +561,7 @@ def process_data_checking(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_checking ") + tmp_log = LogWrapper(logger, f"process_data_checking workflow_id={data_spec.workflow_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -614,7 +620,7 @@ def process_data_checked(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_checked ") + tmp_log = LogWrapper(logger, f"process_data_checked workflow_id={data_spec.workflow_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -667,7 +673,7 @@ def process_data_binding(self, data_spec: WFDataSpec, step_spec: WFStepSpec) -> Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, 
f"process_data_binding ") + tmp_log = LogWrapper(logger, f"process_data_binding workflow_id={data_spec.workflow_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -701,7 +707,7 @@ def process_data_generating(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_generating ") + tmp_log = LogWrapper(logger, f"process_data_generating workflow_id={data_spec.workflow_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -802,7 +808,7 @@ def process_data_waiting(self, data_spec: WFDataSpec) -> WFDataProcessResult: Returns: WFDataProcessResult: The result of processing the data """ - tmp_log = LogWrapper(logger, f"process_data_waiting ") + tmp_log = LogWrapper(logger, f"process_data_waiting workflow_id={data_spec.workflow_id}") tmp_log.debug("Start") # Initialize process_result = WFDataProcessResult() @@ -891,7 +897,7 @@ def process_data(self, data_spec: WFDataSpec, by: str = "dog") -> tuple[WFDataPr WFDataProcessResult | None: The result of processing the data specification, or None if skipped WFDataSpec: The updated workflow data specification """ - tmp_log = LogWrapper(logger, f"process_data ") + tmp_log = LogWrapper(logger, f"process_data workflow_id={data_spec.workflow_id} by={by}") tmp_log.debug("Start") tmp_res = None with self.workflow_data_lock(data_spec.data_id) as locked_data_spec: @@ -935,7 +941,7 @@ def process_datas(self, data_specs: List[WFDataSpec], by: str = "dog") -> Dict: Returns: Dict: Statistics of the processing results """ - tmp_log = LogWrapper(logger, f"process_datas ") + tmp_log = LogWrapper(logger, f"process_datas by={by}") n_data = len(data_specs) tmp_log.debug(f"Start, processing {n_data} data specs") data_status_stats = {"n_data": n_data, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} @@ -1006,7 +1012,7 @@ def process_step_registered(self, step_spec: 
WFStepSpec) -> WFStepProcessResult: WFStepProcessResult: The result of processing the step """ tmp_log = LogWrapper( - logger, f"process_step_registered " + logger, f"process_step_registered workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}" ) tmp_log.debug("Start") # Initialize @@ -1039,7 +1045,7 @@ def process_step_checking(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_checking ") + tmp_log = LogWrapper(logger, f"process_step_checking workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1110,7 +1116,7 @@ def process_step_checked(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_checked ") + tmp_log = LogWrapper(logger, f"process_step_checked workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1154,7 +1160,7 @@ def process_step_pending(self, step_spec: WFStepSpec, data_spec_map: Dict[str, W Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_pending ") + tmp_log = LogWrapper(logger, f"process_step_pending workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1255,7 +1261,7 @@ def process_step_ready(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_ready ") + tmp_log = LogWrapper(logger, f"process_step_ready workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ 
-1297,7 +1303,7 @@ def process_step_starting(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_starting ") + tmp_log = LogWrapper(logger, f"process_step_starting workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1369,7 +1375,7 @@ def process_step_running(self, step_spec: WFStepSpec) -> WFStepProcessResult: Returns: WFStepProcessResult: The result of processing the step """ - tmp_log = LogWrapper(logger, f"process_step_running ") + tmp_log = LogWrapper(logger, f"process_step_running workflow_id={step_spec.workflow_id} member_id={step_spec.member_id}") tmp_log.debug("Start") # Initialize process_result = WFStepProcessResult() @@ -1445,7 +1451,7 @@ def process_step( WFStepProcessResult | None: The result of processing the step, or None if the step was skipped WFStepSpec: The updated workflow step specification """ - tmp_log = LogWrapper(logger, f"process_step ") + tmp_log = LogWrapper(logger, f"process_step workflow_id={step_spec.workflow_id} member_id={step_spec.member_id} by={by}") tmp_log.debug("Start") tmp_res = None with self.workflow_step_lock(step_spec.step_id) as locked_step_spec: @@ -1494,7 +1500,7 @@ def process_steps(self, step_specs: List[WFStepSpec], data_spec_map: Dict[str, W Returns: Dict: Statistics of the processing results """ - tmp_log = LogWrapper(logger, f"process_steps ") + tmp_log = LogWrapper(logger, f"process_steps by={by}") n_steps = len(step_specs) tmp_log.debug(f"Start, processing {n_steps} steps") steps_status_stats = {"n_steps": n_steps, "changed": {}, "unchanged": {}, "processed": {}, "n_processed": 0} @@ -1881,7 +1887,7 @@ def process_workflow(self, workflow_spec: WorkflowSpec, by: str = "dog") -> tupl WorkflowProcessResult: The result of processing the workflow WorkflowSpec: The updated workflow specification """ - tmp_log 
= LogWrapper(logger, f"process_workflow ") + tmp_log = LogWrapper(logger, f"process_workflow by={by}") tmp_log.debug(f"Start, current status={workflow_spec.status}") # Initialize process_result = WorkflowProcessResult() From 388667c6f5badaf2d9294ae8603d72c4646215f4 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 12 Mar 2026 11:01:01 +0100 Subject: [PATCH 101/101] workflows4: reuse DDM interface --- .../workflow/data_handler_plugins/base_data_handler.py | 4 +++- .../data_handler_plugins/ddm_collection_data_handler.py | 4 +--- .../workflow/data_handler_plugins/panda_task_data_handler.py | 4 +--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandaserver/workflow/data_handler_plugins/base_data_handler.py b/pandaserver/workflow/data_handler_plugins/base_data_handler.py index f756d499c..bd2f3a19e 100644 --- a/pandaserver/workflow/data_handler_plugins/base_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/base_data_handler.py @@ -19,16 +19,18 @@ class BaseDataHandler: This class provides a common interface and some utility methods for data handlers. """ - def __init__(self, task_buffer, *args, **kwargs): + def __init__(self, task_buffer, ddm_if, *args, **kwargs): """ Initialize the step handler with necessary parameters. Args: task_buffer: The task buffer interface to interact with the task database. + ddm_if: The DDM interface to interact with the DDM system. *args: Additional positional arguments. **kwargs: Additional keyword arguments. 
""" self.tbif = task_buffer + self.ddm_if = ddm_if def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: """ diff --git a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py index b99698ff0..70500cb46 100644 --- a/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/ddm_collection_data_handler.py @@ -5,7 +5,6 @@ from pandacommon.pandalogger.LogWrapper import LogWrapper from pandacommon.pandalogger.PandaLogger import PandaLogger -from pandaserver.dataservice.ddm import rucioAPI from pandaserver.workflow.data_handler_plugins.base_data_handler import BaseDataHandler from pandaserver.workflow.workflow_base import ( WFDataSpec, @@ -55,7 +54,6 @@ def __init__(self, *args, **kwargs): """ # Initialize base class or any required modules here super().__init__(*args, **kwargs) - self.ddmIF = rucioAPI self.plugin_flavor = "ddm_collection" def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: @@ -80,7 +78,7 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu return check_result # Check DDM collection status collection = data_spec.target_id - collection_meta = self.ddmIF.get_dataset_metadata(collection, ignore_missing=True) + collection_meta = self.ddm_if.get_dataset_metadata(collection, ignore_missing=True) if collection_meta is None: check_result.success = False check_result.message = f"Failed to get metadata for collection {collection}" diff --git a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py index e91ddb2ca..a80c96857 100644 --- a/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py +++ b/pandaserver/workflow/data_handler_plugins/panda_task_data_handler.py @@ -5,7 +5,6 @@ from pandacommon.pandalogger.LogWrapper 
import LogWrapper from pandacommon.pandalogger.PandaLogger import PandaLogger -from pandaserver.dataservice.ddm import rucioAPI from pandaserver.workflow.data_handler_plugins.base_data_handler import BaseDataHandler from pandaserver.workflow.workflow_base import ( WFDataSpec, @@ -57,7 +56,6 @@ def __init__(self, *args, **kwargs): """ # Initialize base class or any required modules here super().__init__(*args, **kwargs) - self.ddmIF = rucioAPI self.plugin_flavor = "panda_task" def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResult: @@ -112,7 +110,7 @@ def check_target(self, data_spec: WFDataSpec, **kwargs) -> WFDataTargetCheckResu output_types = [] for output_type in output_types: collection = f"{data_spec.target_id}_{output_type}" - tmp_stat, tmp_res = self.ddmIF.get_number_of_files(collection) + tmp_stat, tmp_res = self.ddm_if.get_number_of_files(collection) if tmp_stat is None: tmp_log.debug(f"Collection {collection} does not exist") elif not tmp_stat: