From 2bb6edb47a90c2a9c6d2b1b2dc5723a8c3b87fd1 Mon Sep 17 00:00:00 2001 From: Towsifa Akhter Date: Tue, 13 Jan 2026 03:44:21 +0100 Subject: [PATCH 1/6] distutils has been removed from python3.12 library --- bin_opt/optimize_binning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin_opt/optimize_binning.py b/bin_opt/optimize_binning.py index a5b2b23..2b54feb 100644 --- a/bin_opt/optimize_binning.py +++ b/bin_opt/optimize_binning.py @@ -12,7 +12,7 @@ import threading import time from sortedcontainers import SortedSet -import distutils.util +import setuptools._distutils.util import yaml input_binning_opt_config = os.path.join(os.environ["ANALYSIS_PATH"], "StatInference", "bin_opt", "bin_optimization.yaml") @@ -595,7 +595,7 @@ def dispatcher(): def_value = getattr(bkg_yields, name) def_value_type = type(def_value) if def_value_type == bool: - new_value = bool(distutils.util.strtobool(value)) + new_value = bool(setuptools._distutils.util.strtobool(value)) else: new_value = def_value_type(value) print("Setting {} = {}".format(name, new_value)) From e9a9655e7eb222887c31e6911a5a63ff3a5409d0 Mon Sep 17 00:00:00 2001 From: Towsifa Akhter Date: Wed, 14 Jan 2026 20:42:03 +0100 Subject: [PATCH 2/6] removing the dependency on setuptools or distutils package --- bin_opt/optimize_binning.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/bin_opt/optimize_binning.py b/bin_opt/optimize_binning.py index 2b54feb..c14ddbc 100644 --- a/bin_opt/optimize_binning.py +++ b/bin_opt/optimize_binning.py @@ -12,7 +12,6 @@ import threading import time from sortedcontainers import SortedSet -import setuptools._distutils.util import yaml input_binning_opt_config = os.path.join(os.environ["ANALYSIS_PATH"], "StatInference", "bin_opt", "bin_optimization.yaml") @@ -48,6 +47,20 @@ def HistToNumpy(hist): def arrayToStr(a): return '[ ' + ', '.join([ str(x) for x in a ]) + ' ]' +def strtobool(val): + """Convert a string representation of truth to true (1) or false (0). + True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values + are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if + 'val' is anything else. 
+ """ + val = val.lower() + if val in ('y', 'yes', 't', 'true', 'on', '1'): + return 1 + elif val in ('n', 'no', 'f', 'false', 'off', '0'): + return 0 + else: + raise ValueError(f"invalid truth value '{val!r}'") + class Yields: from sortedcontainers import SortedSet def __init__(self, ref_bkgs, n_bins): @@ -595,7 +608,7 @@ def dispatcher(): def_value = getattr(bkg_yields, name) def_value_type = type(def_value) if def_value_type == bool: - new_value = bool(setuptools._distutils.util.strtobool(value)) + new_value = bool(strtobool(value)) else: new_value = def_value_type(value) print("Setting {} = {}".format(name, new_value)) From 5babe36959a44a155bd5911dd5174bf9758c93ea Mon Sep 17 00:00:00 2001 From: Towsifa Akhter Date: Wed, 11 Feb 2026 13:40:48 +0100 Subject: [PATCH 3/6] adding multi-category binning optimizing configuration --- bin_opt/multi_optimize_binning.py | 475 ++++++++++++++++++++++++++++++ 1 file changed, 475 insertions(+) create mode 100644 bin_opt/multi_optimize_binning.py diff --git a/bin_opt/multi_optimize_binning.py b/bin_opt/multi_optimize_binning.py new file mode 100644 index 0000000..4661d31 --- /dev/null +++ b/bin_opt/multi_optimize_binning.py @@ -0,0 +1,475 @@ +import bayes_opt +import json +import math +import numpy as np +import os +import queue +import random +import re +import ROOT +import shutil +import threading +import time +from sortedcontainers import SortedSet +import yaml +from optimize_binning import HistToNumpy, arrayToStr, strtobool, Yields, ExtractYields + +input_binning_opt_config = os.path.join(os.environ["ANALYSIS_PATH"], "StatInference", "bin_opt", "bin_optimization.yaml") +with open(input_binning_opt_config, "r") as f: + input_binning_opt_config_dict = yaml.safe_load(f) + +if input_binning_opt_config_dict["input"]["num_original_bins"] == 1000: + min_step = 0.001 + max_value_int = input_binning_opt_config_dict["input"]["num_original_bins"] +elif input_binning_opt_config_dict["input"]["num_original_bins"] == 5000: + min_step = 0.0002 + max_value_int = input_binning_opt_config_dict["input"]["num_original_bins"] + +class MultiBinning: + def __init__(self, edges): + self.edges = edges + self.exp_limit = None + self.input_datacard = None + + + @staticmethod + def fromEntry(entry): + binning = MultiBinning(np.array(entry['bin_edges'])) + binning.exp_limit = float(entry['exp_limit']) + binning.input_datacard = entry['input_datacard'] + return binning + + def isEquivalent(self, other): + if len(self.edges) != len(other.edges): + return False + if self.input_datacard != other.input_datacard: + return False + return np.count_nonzero(self.edges == other.edges) == len(self.edges) + + def toPoint(self, n_thrs, n_cards): + rel_thrs = self.getRelativeThresholds(n_thrs, n_cards) + point = {} + for idx in range(n_cards): + for n in range(n_thrs): + point[f'rel_thr_card{idx}_{n}'] = rel_thrs[idx][n] + return point + + def isBetter(self, other): + if other is None or other.exp_limit is None: + return True + if self.exp_limit != other.exp_limit: + return self.exp_limit < other.exp_limit + return len(self.edges) < len(other.edges) + + + def getRelativeThresholds(self, n_thrs, n_cards): + n_inner_edges = len(self.edges) - 2 + if n_thrs < n_inner_edges: + raise RuntimeError(f"Invalid number of output thresholds: {n_thrs} should not be < {n_inner_edges}") + rel_thrs = {} + for i in range(n_cards): + rel_thrs[i] = (np.random.rand(n_thrs) + min_step) / (1 - min_step) + rel_thrs[i][0:n_inner_edges] = np.flip(self.edges[2:] - self.edges[1:-1]) + if n_inner_edges < n_thrs: + 
rel_thrs[i][n_inner_edges] = max(self.edges[1], rel_thrs[i][n_inner_edges]) + return rel_thrs + + @staticmethod + def fromRelativeThresholds(rel_thrs, bkg_yields): + edges = [ max_value_int ] + prev_yield = -1 + for rel_thr in rel_thrs: + edge_up = edges[-1] + edge_down = max(edge_up - int(round(rel_thr * step_int_scale)), 0) + all_ok = False + while edge_down >= 0: + all_ok, new_yield = bkg_yields.test(edge_down, edge_up, prev_yield) + if all_ok: break + edge_down -= 1 + if all_ok: + edges.append(edge_down) + prev_yield = new_yield + if edge_down == 0: break + if len(edges) == 1: + edges.append(0) + edges.reverse() + edges = np.array(edges, dtype=float) + edges = edges / step_int_scale + if edges[-1] != 1: + edges[-1] = 1 + if edges[0] != 0: + edges[0] = 0 + return Binning(edges) + +class MultiBayesianOptimization: + def __init__(self, max_n_bins, working_area, workers_dir, acq, kappa, xi, + input_datacards_list, poi, bkg_yields_dict, input_queue_size, random_seed=12345): + self.max_n_bins = max_n_bins + self.binnings = [] + self.best_binning = None + self.working_area = working_area + self.workers_dir = workers_dir + self.log_output = os.path.join(working_area, 'results_multibinning.json') + self.poi = poi + self.best_binning_split = False + self.input_queue = queue.Queue(input_queue_size) + self.open_requests = [] + self.binning_lock = threading.Lock() + self.optimizer_lock = threading.Lock() + self.print_lock = threading.Lock() + self.bkg_yields_dict = bkg_yields_dict + + self.input_datacards = {} + bounds = {} + for idx,card in enumerate(input_datacards_list): + self.input_datacards[idx] = card + for n in range(max_n_bins): + upper_bound = 1. if n > 0 else 1.5 + bounds[f'rel_thr_card{idx}_{n}'] = (min_step, upper_bound) + + self.bounds_transformer = None + self.optimizer = bayes_opt.BayesianOptimization(f=None, pbounds=bounds, random_state=random_seed, verbose=1) + self.utilities = [ + bayes_opt.acquisition.UpperConfidenceBound(kappa=kappa), + bayes_opt.acquisition.ExpectedImprovement(xi=xi), + bayes_opt.acquisition.ProportionalIntegralDerivative() + ] + + if not os.path.isdir(working_area): + os.makedirs(working_area) + if not os.path.isdir(workers_dir): + os.makedirs(workers_dir) + if os.path.isfile(self.log_output): + with open(self.log_output, 'r') as f: + prev_binnins = json.loads('[' + ', '.join(f.readlines()) + ']') + for entry in prev_binnings: + binning = MultiBinning.fromEntry(entry) + if self.findEquivalent(binning) is None: + point = binning.toPoint(self.max_n_bins, len(self.input_datacards)) + self.register(point, len(binning.edges), binning.exp_limit) + if binning.isBetter(self.best_binning): + self.best_binning = binning + self.binnings.append(binning) + + if self.best_binning is not None: + self.print(f'The best binning from previous iterations: {arrayToStr(self.best_binning.edges)}, exp_limit = {self.best_binning.exp_limit}, datacard = {self.best_binning.input_datacard}') + + self.bounds_transformer = bayes_opt.SequentialDomainReductionTransformer() + self.bounds_transformer.initialize(self.optimizer.space) + + suggestions_file = os.path.join(working_area, 'to_try.json') + self.suggestions = [] + if os.path.isfile(suggestions_file): + with open(suggestions_file, 'r') as f: + suggestions = json.load(f) + for edges in suggestions: + self.addSuggestion(edges) + + def findEquivalent(self, binning, lock=True): + equivalent_binning = None + if lock: + self.binning_lock.acquire() + for prev_binning in self.binnings: + if prev_binning.isEquivalent(binning): + 
equivalent_binning = prev_binning + break + if lock: + self.binning_lock.release() + return equivalent_binning + + def register(self, point, n_edges, exp_limit): + loss = -(exp_limit*1e6 + n_edges) + self.optimizer_lock.acquire() + self.optimizer.register(params=point, target=loss) + self.updateBounds(n_edges - 2) + self.optimizer_lock.release() + + def print(self, msg): + self.print_lock.acquire() + print(msg) + self.print_lock.release() + + def updateBounds(self, n_points): + if self.bound_transformer: + new_bounds = self.bounds_transformer.transform(self.optimizer.space) + prev_fixed = True + for cat_i in range(len(self.input_datacards)): + for n in range(self.max_n_bins): + key = f'rel_thr_card{cat_i}_{n}' + if prev_fixed: + prev_fixed = new_bounds[key][1] - new_bounds[key][0] < 0.1 and n < n_points + else: + new_bounds[key] = np.array([min_step, 1.]) + for key in new_bounds: + if new_bounds[key][0] >= new_bounds[key][1]: + new_bounds[key] = np.array([new_bounds[key][1], new_bounds[key][0]]) + if new_bounds[key][0] < min_step: + new_bounds[key][0] = min_step + + upper_bound = 1. if key != f'rel_thr_card{cat_i}_0' else 1.5 + if new_bounds[key][1] > upper_bound: + new_bounds[key][1] = upper_bound + if np.any(np.isnan(new_bounds[key])): + new_bounds[key] = np.array([min_step, upper_bound]) + self.optimizer.set_bounds(new_bounds) + + def suggest(self, utility_index): + self.optimizer_lock.acquire() + if self.suggestions is None or len(self.suggestions) == 0: + self.optimizer._acquisition_function = self.utilities[utility_index] + point = self.optimizer.suggest() + else: + point = self.suggestions.pop(0) + self.suggestions.remove(point) + self.optimize_lock.release() + return point + + def addSuggestion(self, edges): + suggested_binning = Binning(np.array(edges)) + rel_thrs = suggested_binning.getRelativeThresholds(self.max_n_bins, len(self.input_datacards)) + binning = Binning.fromRelativeThresholds(rel_thrs, self.bkg_yields) + if self.findEquivalent(binning, False) is None: + point = binning.toPoint(self.max_n_bins, len(self.input_datacards)) + self.suggestions.append(point) + +###check below for compatibility with multi category optimization + + def addNewBinning(self, binning): + self.binning_lock.acquire() + if binning.isBetter(self.best_binning): + self.best_binning = binning + self.best_binning_split = False + self.print('New best binning: {}, exp_limit = {}' \ + .format(arrayToStr(binning.edges), binning.exp_limit)) + self.binnings.append(binning) + self.binning_lock.release() + + def tryBestBinningSplit(self): + self.binning_lock.acquire() + if not self.best_binning_split: + self.print('Splitting best binning...') + n_best = len(self.best_binning.edges) + self.optimizer_lock.acquire() + if n_best - 2 < self.max_n_bins: + for k in range(n_best - 1): + edges = np.zeros(n_best + 1) + edges[0:k+1] = self.best_binning.edges[0:k+1] + edges[k+2:] = self.best_binning.edges[k+1:] + for delta_edge in [ 0.1, 0.5, 0.9 ]: + edges[k+1] = self.best_binning.edges[k] \ + + (self.best_binning.edges[k+1] - self.best_binning.edges[k]) * delta_edge + self.addSuggestion(edges) + if n_best > 2: + for k in range(1, n_best): + edges = np.zeros(n_best - 1) + edges[0:k] = self.best_binning.edges[0:k] + edges[k:] = self.best_binning.edges[k+1:] + self.addSuggestion(edges) + self.optimizer_lock.release() + self.best_binning_split = True + self.binning_lock.release() + + def findOpenRequest(self, binning): + equivalent_binning = None + self.binning_lock.acquire() + for prev_binning in self.open_requests: + if 
prev_binning.isEquivalent(binning): + equivalent_binning = prev_binning + break + self.binning_lock.release() + return equivalent_binning + + def addOpenRequest(self, binning): + self.binning_lock.acquire() + self.open_requests.append(binning) + self.binning_lock.release() + + def clearOpenRequest(self, binning): + self.binning_lock.acquire() + to_remove = [] + for prev_binning in self.open_requests: + if prev_binning.isEquivalent(binning): + to_remove.append(prev_binning) + for b in to_remove: + self.open_requests.remove(b) + self.binning_lock.release() + + def waitOpenRequestsToFinish(self): + has_open_requests = True + while has_open_requests: + self.binning_lock.acquire() + has_open_requests = len(self.open_requests) > 0 + self.binning_lock.release() + time.sleep(1) + + def PointGenerator(self, n_eq_steps): + n = 0 + utility_index = 0 + open_request_sleep = 1 + while n < n_eq_steps: + point = self.suggest(utility_index) + + rel_thrs = np.zeros(len(point)) + for k in range(self.max_n_bins): + rel_thrs[k] = point['rel_thr_{}'.format(k)] + self.print('rel_thrs: {}'.format(arrayToStr(rel_thrs))) + + binning = Binning.fromRelativeThresholds(rel_thrs, self.bkg_yields) + equivalent_binning = self.findEquivalent(binning) + if equivalent_binning is None: + n = 0 + open_request = self.findOpenRequest(binning) + if open_request is None: + self.print('Next binning to probe: {}'.format(arrayToStr(binning.edges))) + self.input_queue.put(binning) + self.addOpenRequest(binning) + utility_index = 0 + open_request_sleep = 1 + else: + self.print('Open request for binning found: {}'.format(arrayToStr(open_request.edges))) + time.sleep(open_request_sleep) + open_request_sleep = open_request_sleep * 2 + utility_index = (utility_index + 1) % len(self.utilities) + else: + self.print('Equivalent binning found: {}'.format(arrayToStr(equivalent_binning.edges))) + self.register(point, len(equivalent_binning.edges), equivalent_binning.exp_limit) + n += 1 + if n >= 5: + self.tryBestBinningSplit() + if n == n_eq_steps - 1: + self.print('Waiting for open requests to finish...') + self.waitOpenRequestsToFinish() + + self.input_queue.put(None) + + def JobDispatcher(self): + while True: + time.sleep(1) + for worker_dir in os.listdir(self.workers_dir): + worker_dir = os.path.join(self.workers_dir, worker_dir) + if not os.path.isdir(worker_dir): continue + task_file = os.path.join(worker_dir, 'task.txt') + task_file_tmp = os.path.join(worker_dir, '.task.txt') + if os.path.isfile(task_file): + result_file = os.path.join(worker_dir, 'result.txt') + if os.path.isfile(result_file): + with open(result_file, 'r') as f: + result = json.load(f) + if result['input_datacard'] == self.input_datacard: + binning = Binning.fromEntry(result) + self.clearOpenRequest(binning) + if self.findEquivalent(binning) is None: + point = binning.toPoint(self.max_n_bins) + self.addNewBinning(binning) + with open(self.log_output, 'a') as f: + f.write(json.dumps(result) + '\n') + self.register(point, len(binning.edges), binning.exp_limit) + + os.remove(task_file) + os.remove(result_file) + else: + try: + binning = self.input_queue.get(True, 1) + if binning is None: return + task = { + 'input_datacard': self.input_datacard, + 'bin_edges': [ x for x in binning.edges ], + 'poi': self.poi, + 'other_datacards': self.other_datacards, + } + with open(task_file_tmp, 'w') as f: + json.dump(task, f) + shutil.move(task_file_tmp, task_file) + except queue.Empty: + pass + + def maximize(self, n_eq_steps): + + def generator(): + self.PointGenerator(n_eq_steps) + + 
def dispatcher(): + self.JobDispatcher() + + generator_thread = threading.Thread(target=generator) + generator_thread.start() + + dispatcher_thread = threading.Thread(target=dispatcher) + dispatcher_thread.start() + + generator_thread.join() + dispatcher_thread.join() + + +###end of check for multi_bo + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='Find optimal binning for multiple catgories that minimizes the expected limits.') + parser.add_argument('--input', required=True, type=str, help="comma separated input datacards, e.g. datacard_cat1,datacard_cat2,..") + parser.add_argument('--shape-file', required=True, type=str, help="input root file with shapes, comma separated") + parser.add_argument('--output', required=True, type=str, help="output directory") + parser.add_argument('--workers-dir', required=True, type=str, help="output directory for workers results") + parser.add_argument('--max_n_bins', required=True, type=int, help="maximum number of bins per category") + parser.add_argument('--poi', required=False, type=str, default='r', help="parameter of interest") + parser.add_argument('--params', required=False, type=str, default=None, + help="algorithm parameters in format param1=value1,param2=value2,...") + parser.add_argument('--verbosity', required=False, type=int, default=1, help="verbosity") + + args = parser.parse_args() + + param_dict = {} + if args.params is not None: + params = args.params.split(',') + for param in params: + p = param.split('=') + if len(p) != 2: + raise RuntimeError(f'invalid parameter definition {param}.') + param_dict[p[0]] = p[1] + + ref_bkgs = {} + for main_bkg in input_binning_opt_config_dict["background"]["main"]: + ref_bkgs[f'{main_bkg}'] = re.compile(f'^{main_bkg}$') + + nonbkgg_regex = re.compile('(data_obs|^ggHH.*|^qqHH.*|^DY_[0-2]b.*)') + ignore_unc_variations = re.compile('(CMS_bbtt_201[6-8]_DYSFunc[0-9]+|CMS_bbtt_.*_QCDshape)(Up|Down)') + + input_datacards = args.input.split(',') + for idx,card in enumerate(input_datacards): + input_datacards[idx] = os.path.abspath(card) + + bkg_yields = {} + input_shape_files = args.shape_file.split(',') + for idx,shape in enumerate(input_shape_files): + input_shape_files[idx] = os.path.abspath(shape) + print("Extracting yields for background processes", ref_bkgs, "and shape", shape) + bkg_yields[idx] = ExtractYields(input_shape_files[idx], ref_bkgs, nonbkg_regex, ignore_unc_variations) + bkg_yields[idx].printSummary() + + for name,value in param_dict.items(): + if not hasattr(bkg_yields[idx], name): + raise RuntimeError(f'Unknown parameter {name}') + def_value = getattr(bkg_yields[idx], name) + def_value_type = type(def_value) + if def_value_type == bool: + new_value = bool(strtobool(value)) + else: + new_value = def_value_type(value) #this might throw an error because def_value_type function is not defined. 
+ + print(f"Setting {name} = {new_value}") + setattr(bkg_yields[idx], name, new_value) + + multi_bo = MultiBayesianOptimization(max_n_bins=args.max_n_bins, + working_area=args.output, + workers_dir=args.workers_dir, + acq='ucb', kappa=2.57, xi=0.1, + input_datacards=input_datacards, + poi=args.poi, + bkg_yields=bkg_yields, + input_queue_size=2, random_seed=None) + + multi_bo.maximize(20) + + print("Minimization finished.") + print("Best binning per category:\n") From 0c79966d107f7ef56946e55dbc70452ff999f249 Mon Sep 17 00:00:00 2001 From: Towsifa Akhter Date: Wed, 11 Feb 2026 16:12:44 +0100 Subject: [PATCH 4/6] multiple datacard option in point generator function --- bin_opt/multi_optimize_binning.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/bin_opt/multi_optimize_binning.py b/bin_opt/multi_optimize_binning.py index 4661d31..330c983 100644 --- a/bin_opt/multi_optimize_binning.py +++ b/bin_opt/multi_optimize_binning.py @@ -310,9 +310,10 @@ def PointGenerator(self, n_eq_steps): point = self.suggest(utility_index) rel_thrs = np.zeros(len(point)) - for k in range(self.max_n_bins): - rel_thrs[k] = point['rel_thr_{}'.format(k)] - self.print('rel_thrs: {}'.format(arrayToStr(rel_thrs))) + for idx in range(len(self.input_datacards)): + for k in range(self.max_n_bins): + rel_thrs[k] = point[f'rel_thr_card{idx}_{k}'] + self.print(f'rel_thrs: {arrayToStr(rel_thrs)}') binning = Binning.fromRelativeThresholds(rel_thrs, self.bkg_yields) equivalent_binning = self.findEquivalent(binning) @@ -320,18 +321,18 @@ def PointGenerator(self, n_eq_steps): n = 0 open_request = self.findOpenRequest(binning) if open_request is None: - self.print('Next binning to probe: {}'.format(arrayToStr(binning.edges))) + self.print(f'Next binning to probe: {arrayToStr(binning.edges)}') self.input_queue.put(binning) self.addOpenRequest(binning) utility_index = 0 open_request_sleep = 1 else: - self.print('Open request for binning found: {}'.format(arrayToStr(open_request.edges))) + self.print(f'Open request for binning found: {arrayToStr(open_request.edges)}') time.sleep(open_request_sleep) open_request_sleep = open_request_sleep * 2 utility_index = (utility_index + 1) % len(self.utilities) else: - self.print('Equivalent binning found: {}'.format(arrayToStr(equivalent_binning.edges))) + self.print(f'Equivalent binning found: {arrayToStr(equivalent_binning.edges)}') self.register(point, len(equivalent_binning.edges), equivalent_binning.exp_limit) n += 1 if n >= 5: @@ -355,7 +356,7 @@ def JobDispatcher(self): if os.path.isfile(result_file): with open(result_file, 'r') as f: result = json.load(f) - if result['input_datacard'] == self.input_datacard: + if result['input_datacard'] in self.input_datacards.values(): binning = Binning.fromEntry(result) self.clearOpenRequest(binning) if self.findEquivalent(binning) is None: @@ -371,12 +372,13 @@ def JobDispatcher(self): try: binning = self.input_queue.get(True, 1) if binning is None: return - task = { - 'input_datacard': self.input_datacard, - 'bin_edges': [ x for x in binning.edges ], - 'poi': self.poi, - 'other_datacards': self.other_datacards, - } + for card in self.input_datacards: + task = { + 'input_datacard': card, + 'bin_edges': [ x for x in binning.edges ], + 'poi': self.poi, + 'other_datacards': self.other_datacards, + } with open(task_file_tmp, 'w') as f: json.dump(task, f) shutil.move(task_file_tmp, task_file) From ad6e97648a6e3df88e8ec66edaab3ce77a6dc269 Mon Sep 17 00:00:00 2001 From: Towsifa Akhter Date: Wed, 11 Feb 2026 
16:28:47 +0100 Subject: [PATCH 5/6] adding options for multi-category to the server script --- bin_opt/bin_optimization.yaml | 1 + bin_opt/optimize_channel.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/bin_opt/bin_optimization.yaml b/bin_opt/bin_optimization.yaml index 616e09f..d55433a 100644 --- a/bin_opt/bin_optimization.yaml +++ b/bin_opt/bin_optimization.yaml @@ -13,6 +13,7 @@ input: binning_suggestions: [] #[bin_opt/to_try.json] number_of_workers: 5 worker_condor_max_runtime_hours: 72 + multi_category_optimization: True output: directory: output_test/binning_run3_v4 diff --git a/bin_opt/optimize_channel.py b/bin_opt/optimize_channel.py index 8ae2544..61f60bc 100644 --- a/bin_opt/optimize_channel.py +++ b/bin_opt/optimize_channel.py @@ -98,11 +98,15 @@ def optimize_channel(channel, output, era, categories, max_n_bins, params, binni for cat_index in range(first_cat_index, len(categories)): category, poi = categories[cat_index] print(f"Optimizing {channel} {category} for era {era}") + datacards = [] + shape_files = [] for file in os.listdir(input_dir): if (channel in file or channel.lower() in file) and (category in file) and (era in file) and file.endswith('.txt'): input_card = f'{input_dir}/{file}' + datacards.append(f'{input_dir}/{file}') if (channel in file or channel.lower() in file) and (category in file) and (era in file) and file.endswith('.root'): input_shape = f'{input_dir}/{file}' + shape_files.append(f'{input_dir}/{file}') else: continue @@ -116,7 +120,10 @@ def optimize_channel(channel, output, era, categories, max_n_bins, params, binni json.dump(suggested_binnings[category], f) cat_log = os.path.join(cat_dir, 'results.json') - opt_cmd = f"python3 bin_opt/optimize_binning.py --input {input_card} --shape-file {input_shape} --output {cat_dir} --workers-dir {workers_dir} --max_n_bins {max_n_bins} --poi {poi}" + if input_binning_opt_config_dict["input"]["multi_category_optimization"]: + opt_cmd = f"python3 bin_opt/multi_optimize_binning.py --input {datacards} --shape-file {shape_files} --output {cat_dir} --workers-dir {workers_dir} --max_n_bins {max_n_bins} --poi {poi}" + else: + opt_cmd = f"python3 bin_opt/optimize_binning.py --input {input_card} --shape-file {input_shape} --output {cat_dir} --workers-dir {workers_dir} --max_n_bins {max_n_bins} --poi {poi}" if params is not None: opt_cmd += f" --params {params} " for cat_idx in range(cat_index): From 44767b9ef3555e928087261c696c1f68cb83113f Mon Sep 17 00:00:00 2001 From: Towsifa Akhter Date: Wed, 25 Feb 2026 16:38:44 +0100 Subject: [PATCH 6/6] multi category optimization --- bin_opt/multi_optimize_binning.py | 287 ++++++++++++++++-------------- bin_opt/optimize_channel.py | 153 +++++++++++++++- 2 files changed, 309 insertions(+), 131 deletions(-) diff --git a/bin_opt/multi_optimize_binning.py b/bin_opt/multi_optimize_binning.py index 330c983..8857e4e 100644 --- a/bin_opt/multi_optimize_binning.py +++ b/bin_opt/multi_optimize_binning.py @@ -26,84 +26,73 @@ max_value_int = input_binning_opt_config_dict["input"]["num_original_bins"] class MultiBinning: - def __init__(self, edges): - self.edges = edges + def __init__(self, edges_by_category, input_datacards): + self.edges_by_category = edges_by_category self.exp_limit = None - self.input_datacard = None + self.input_datacards = input_datacards - - @staticmethod - def fromEntry(entry): - binning = MultiBinning(np.array(entry['bin_edges'])) - binning.exp_limit = float(entry['exp_limit']) - binning.input_datacard = entry['input_datacard'] - return 
binning + @property + def n_categories(self): + return len(self.input_datacards) + + def n_edges_total(self): + return sum(len(self.edges_by_category[i]) for i in range(self.n_categories)) def isEquivalent(self, other): - if len(self.edges) != len(other.edges): + if self.input_datacards != other.input_datacards: return False - if self.input_datacard != other.input_datacard: + if self.n_categories != other.n_categories: return False - return np.count_nonzero(self.edges == other.edges) == len(self.edges) - - def toPoint(self, n_thrs, n_cards): - rel_thrs = self.getRelativeThresholds(n_thrs, n_cards) - point = {} - for idx in range(n_cards): - for n in range(n_thrs): - point[f'rel_thr_card{idx}_{n}'] = rel_thrs[idx][n] - return point + for n in range(self.n_categories): + a = self.edges_by_category.get(n, None) + b = other.edges_by_category.get(n, None) + if a is None or b is None: + return False + if len(a) != len(b): + return False + if np.count_nonzero(a == b) != len(a): + return False + return True def isBetter(self, other): if other is None or other.exp_limit is None: return True if self.exp_limit != other.exp_limit: return self.exp_limit < other.exp_limit - return len(self.edges) < len(other.edges) - + return self.n_edges_total() < other.n_edges_total() - def getRelativeThresholds(self, n_thrs, n_cards): - n_inner_edges = len(self.edges) - 2 - if n_thrs < n_inner_edges: - raise RuntimeError(f"Invalid number of output thresholds: {n_thrs} should not be < {n_inner_edges}") - rel_thrs = {} - for i in range(n_cards): - rel_thrs[i] = (np.random.rand(n_thrs) + min_step) / (1 - min_step) - rel_thrs[i][0:n_inner_edges] = np.flip(self.edges[2:] - self.edges[1:-1]) - if n_inner_edges < n_thrs: - rel_thrs[i][n_inner_edges] = max(self.edges[1], rel_thrs[i][n_inner_edges]) - return rel_thrs + def toTask(self, poi, other_datacards=None): + if other_datacards is None: + other_datacards = [] + categories = [] + for idx, card in enumerate(self.input_datacards): + categories.append({ + 'cat_id': idx, + 'datacard': card, + 'bin_edges': [float(x) for x in self.edges_by_category[idx]] + }) + task = { + 'task_type': 'multi_category_limit', + 'poi': poi, + 'categories': categories, + 'other_datacards': [os.path.abspath(p) for p in other_datacards] + } + return task @staticmethod - def fromRelativeThresholds(rel_thrs, bkg_yields): - edges = [ max_value_int ] - prev_yield = -1 - for rel_thr in rel_thrs: - edge_up = edges[-1] - edge_down = max(edge_up - int(round(rel_thr * step_int_scale)), 0) - all_ok = False - while edge_down >= 0: - all_ok, new_yield = bkg_yields.test(edge_down, edge_up, prev_yield) - if all_ok: break - edge_down -= 1 - if all_ok: - edges.append(edge_down) - prev_yield = new_yield - if edge_down == 0: break - if len(edges) == 1: - edges.append(0) - edges.reverse() - edges = np.array(edges, dtype=float) - edges = edges / step_int_scale - if edges[-1] != 1: - edges[-1] = 1 - if edges[0] != 0: - edges[0] = 0 - return Binning(edges) + def fromResult(result): + cats = result['categories'] + sorted_cats = sorted(cats, key=lambda x: int(x['cat_id'])) + datacards = [ cat['datacard'] for cat in sorted_cats ] + edges_by_category = { int(cat['cat_id']): np.array(cat['bin_edges'], dtype=float) for cat in sorted_cats } + binning = MultiBinning(edges_by_category, datacards) + binning.exp_limit = float(result['exp_limit']) + return binning class MultiBayesianOptimization: def __init__(self, max_n_bins, working_area, workers_dir, acq, kappa, xi, - input_datacards_list, poi, bkg_yields_dict, input_queue_size, 
random_seed=12345): + input_datacards_list, poi, bkg_yields_dict, input_queue_size, + random_seed=12345, other_datacards=None): self.max_n_bins = max_n_bins self.binnings = [] self.best_binning = None @@ -118,41 +107,43 @@ def __init__(self, max_n_bins, working_area, workers_dir, acq, kappa, xi, self.optimizer_lock = threading.Lock() self.print_lock = threading.Lock() self.bkg_yields_dict = bkg_yields_dict + self.other_datacards = other_datacards - self.input_datacards = {} + self.input_datacards = input_datacards_list bounds = {} - for idx,card in enumerate(input_datacards_list): - self.input_datacards[idx] = card - for n in range(max_n_bins): + for card in range(len(self.input_datacards)): + for n in range(self.max_n_bins): upper_bound = 1. if n > 0 else 1.5 - bounds[f'rel_thr_card{idx}_{n}'] = (min_step, upper_bound) + bounds[f'rel_thr_cat{card}_{n}'] = (min_step, upper_bound) self.bounds_transformer = None self.optimizer = bayes_opt.BayesianOptimization(f=None, pbounds=bounds, random_state=random_seed, verbose=1) self.utilities = [ bayes_opt.acquisition.UpperConfidenceBound(kappa=kappa), bayes_opt.acquisition.ExpectedImprovement(xi=xi), - bayes_opt.acquisition.ProportionalIntegralDerivative() + bayes_opt.acquisition.ProportionalIntegralDerivative(xi=xi) ] - if not os.path.isdir(working_area): - os.makedirs(working_area) - if not os.path.isdir(workers_dir): - os.makedirs(workers_dir) + os.makedirs(working_area, exist_ok=True) + os.makedirs(workers_dir, exist_ok=True) + if os.path.isfile(self.log_output): with open(self.log_output, 'r') as f: prev_binnins = json.loads('[' + ', '.join(f.readlines()) + ']') for entry in prev_binnings: - binning = MultiBinning.fromEntry(entry) + binning = MultiBinning.fromResult(entry) if self.findEquivalent(binning) is None: - point = binning.toPoint(self.max_n_bins, len(self.input_datacards)) - self.register(point, len(binning.edges), binning.exp_limit) + # point = binning.toPoint(self.max_n_bins, len(self.input_datacards)) + # self.register(point, len(binning.edges), binning.exp_limit) if binning.isBetter(self.best_binning): self.best_binning = binning self.binnings.append(binning) + + point = self.bundleToPoint(binning) + self.register(point, binning.n_edges_total(), binning.exp_limit) if self.best_binning is not None: - self.print(f'The best binning from previous iterations: {arrayToStr(self.best_binning.edges)}, exp_limit = {self.best_binning.exp_limit}, datacard = {self.best_binning.input_datacard}') + self.print(f'The best limit from previous iterations: exp_limit = {self.best_binning.exp_limit}') self.bounds_transformer = bayes_opt.SequentialDomainReductionTransformer() self.bounds_transformer.initialize(self.optimizer.space) @@ -165,6 +156,28 @@ def __init__(self, max_n_bins, working_area, workers_dir, acq, kappa, xi, for edges in suggestions: self.addSuggestion(edges) + def pointToBundle(self, point): + + edges_by_cat = {} + for c in range(len(self.datacards)): + rel_thrs = np.zeros(self.max_n_bins, dtype=float) + for k in range(self.max_n_bins): + rel_thrs[k] = float(point[f"rel_thr_cat{c}_{k}"]) + + binning = Binning.fromRelativeThresholds(rel_thrs, self.bkg_yields_dict[c]) + edges_by_cat[c] = binning.edges + return MultiBinning(edges_by_category=edges_by_cat, input_datacards=self.input_datacards) + + def bundleToPoint(self, bundle): + + point = {} + for c in range(bundle.n_categories): + b = Binning(np.array(bundle.edges_by_category[c], dtype=float)) + rel_thrs = b.getRelativeThresholds(self.max_n_bins) + for k in range(self.max_n_bins): + 
point[f"rel_thr_cat{c}_{k}"] = float(rel_thrs[k]) + return point + def findEquivalent(self, binning, lock=True): equivalent_binning = None if lock: @@ -177,11 +190,11 @@ def findEquivalent(self, binning, lock=True): self.binning_lock.release() return equivalent_binning - def register(self, point, n_edges, exp_limit): - loss = -(exp_limit*1e6 + n_edges) + def register(self, point, n_edges_total, exp_limit): + loss = -(exp_limit*1e6 + n_edges_total) self.optimizer_lock.acquire() self.optimizer.register(params=point, target=loss) - self.updateBounds(n_edges - 2) + self.updateBounds(n_edges_total - 2) self.optimizer_lock.release() def print(self, msg): @@ -190,27 +203,31 @@ def print(self, msg): self.print_lock.release() def updateBounds(self, n_points): - if self.bound_transformer: + if self.bounds_transformer: new_bounds = self.bounds_transformer.transform(self.optimizer.space) prev_fixed = True - for cat_i in range(len(self.input_datacards)): - for n in range(self.max_n_bins): - key = f'rel_thr_card{cat_i}_{n}' - if prev_fixed: - prev_fixed = new_bounds[key][1] - new_bounds[key][0] < 0.1 and n < n_points - else: - new_bounds[key] = np.array([min_step, 1.]) - for key in new_bounds: - if new_bounds[key][0] >= new_bounds[key][1]: - new_bounds[key] = np.array([new_bounds[key][1], new_bounds[key][0]]) - if new_bounds[key][0] < min_step: - new_bounds[key][0] = min_step - - upper_bound = 1. if key != f'rel_thr_card{cat_i}_0' else 1.5 - if new_bounds[key][1] > upper_bound: - new_bounds[key][1] = upper_bound - if np.any(np.isnan(new_bounds[key])): - new_bounds[key] = np.array([min_step, upper_bound]) + # for cat_i in range(len(self.input_datacards)): + # for n in range(self.max_n_bins): + # key = f'rel_thr_cat{cat_i}_{n}' + # if prev_fixed: + # prev_fixed = new_bounds[key][1] - new_bounds[key][0] < 0.1 and n < n_points + # else: + # new_bounds[key] = np.array([min_step, 1.]) + for key, bounds in new_bounds.items(): + lo, hi = float(bounds[0]), float(bounds[1]) + if math.isnan(lo) or math.isnan(hi): + # restore default + upper = 1.5 if key.endswith("_0") else 1.0 + new_bounds[key] = np.array([min_step, upper]) + continue + if lo >= hi: + lo, hi = hi, lo + lo = max(lo, min_step) + upper = 1.5 if key.endswith("_0") else 1.0 + hi = min(hi, upper) + if lo >= hi: + lo, hi = min_step, upper + new_bounds[key] = np.array([lo, hi]) self.optimizer.set_bounds(new_bounds) def suggest(self, utility_index): @@ -232,15 +249,12 @@ def addSuggestion(self, edges): point = binning.toPoint(self.max_n_bins, len(self.input_datacards)) self.suggestions.append(point) -###check below for compatibility with multi category optimization - def addNewBinning(self, binning): self.binning_lock.acquire() if binning.isBetter(self.best_binning): self.best_binning = binning self.best_binning_split = False - self.print('New best binning: {}, exp_limit = {}' \ - .format(arrayToStr(binning.edges), binning.exp_limit)) + self.print(f'New best exp_limit = {binning.exp_limit}, n_edges_total = {binning.n_edges_total()}') self.binnings.append(binning) self.binning_lock.release() @@ -308,35 +322,36 @@ def PointGenerator(self, n_eq_steps): open_request_sleep = 1 while n < n_eq_steps: point = self.suggest(utility_index) + bundle = self.pointToBundle(point) - rel_thrs = np.zeros(len(point)) - for idx in range(len(self.input_datacards)): - for k in range(self.max_n_bins): - rel_thrs[k] = point[f'rel_thr_card{idx}_{k}'] - self.print(f'rel_thrs: {arrayToStr(rel_thrs)}') + msg = [] + for c in range(bundle.n_categories): + 
msg.append(f"cat{c}_edges={arrayToStr(bundle.edges_by_category[c])}") + self.print("Proposed bundle: " + " | ".join(msg)) - binning = Binning.fromRelativeThresholds(rel_thrs, self.bkg_yields) - equivalent_binning = self.findEquivalent(binning) + + # binning = Binning.fromRelativeThresholds(rel_thrs, self.bkg_yields) + equivalent_binning = self.findEquivalent(bundle) if equivalent_binning is None: n = 0 - open_request = self.findOpenRequest(binning) + open_request = self.findOpenRequest(bundle) if open_request is None: - self.print(f'Next binning to probe: {arrayToStr(binning.edges)}') - self.input_queue.put(binning) - self.addOpenRequest(binning) + self.print(f'Next binning to probe: {arrayToStr(bundle.edges_by_category)}') + self.input_queue.put((bundle, point)) + self.addOpenRequest(bundle) utility_index = 0 open_request_sleep = 1 else: - self.print(f'Open request for binning found: {arrayToStr(open_request.edges)}') + self.print(f'Open request for binning found. Waiting for it to finish.') time.sleep(open_request_sleep) open_request_sleep = open_request_sleep * 2 utility_index = (utility_index + 1) % len(self.utilities) else: - self.print(f'Equivalent binning found: {arrayToStr(equivalent_binning.edges)}') - self.register(point, len(equivalent_binning.edges), equivalent_binning.exp_limit) + self.print(f'Equivalent binning found with exp_limit = {equivalent_binning.exp_limit}') + self.register(point, equivalent_binning.n_edges_total(), equivalent_binning.exp_limit) n += 1 - if n >= 5: - self.tryBestBinningSplit() + # if n >= 5: + # self.tryBestBinningSplit() if n == n_eq_steps - 1: self.print('Waiting for open requests to finish...') self.waitOpenRequestsToFinish() @@ -357,14 +372,15 @@ def JobDispatcher(self): with open(result_file, 'r') as f: result = json.load(f) if result['input_datacard'] in self.input_datacards.values(): - binning = Binning.fromEntry(result) + binning = MultiBinning.fromResult(result) self.clearOpenRequest(binning) if self.findEquivalent(binning) is None: - point = binning.toPoint(self.max_n_bins) + # point = binning.toPoint(self.max_n_bins) self.addNewBinning(binning) with open(self.log_output, 'a') as f: f.write(json.dumps(result) + '\n') - self.register(point, len(binning.edges), binning.exp_limit) + point = self.BundleToPoint(binning) + self.register(point, binning.n_edges_total(), binning.exp_limit) os.remove(task_file) os.remove(result_file) @@ -372,13 +388,16 @@ def JobDispatcher(self): try: binning = self.input_queue.get(True, 1) if binning is None: return - for card in self.input_datacards: - task = { - 'input_datacard': card, - 'bin_edges': [ x for x in binning.edges ], - 'poi': self.poi, - 'other_datacards': self.other_datacards, - } + + bundle, _point = binning + # for card in self.input_datacards: + # task = { + # 'input_datacard': card, + # 'bin_edges': [ x for x in binning.edges ], + # 'poi': self.poi, + # 'other_datacards': self.other_datacards, + # } + task = bundle.toTask(poi=self.poi, other_datacards=self.other_datacards) with open(task_file_tmp, 'w') as f: json.dump(task, f) shutil.move(task_file_tmp, task_file) @@ -408,15 +427,17 @@ def dispatcher(): if __name__ == '__main__': import argparse - parser = argparse.ArgumentParser(description='Find optimal binning for multiple catgories that minimizes the expected limits.') + parser = argparse.ArgumentParser(description='Find optimal binning for multiple catgories that minimizes the combined expected limits.') parser.add_argument('--input', required=True, type=str, help="comma separated input 
datacards, e.g. datacard_cat1,datacard_cat2,..") - parser.add_argument('--shape-file', required=True, type=str, help="input root file with shapes, comma separated") + parser.add_argument('--shape-file', required=True, type=str, help="input root file with shapes, comma separated. keep the same order as input datacards.") parser.add_argument('--output', required=True, type=str, help="output directory") parser.add_argument('--workers-dir', required=True, type=str, help="output directory for workers results") parser.add_argument('--max_n_bins', required=True, type=int, help="maximum number of bins per category") parser.add_argument('--poi', required=False, type=str, default='r', help="parameter of interest") parser.add_argument('--params', required=False, type=str, default=None, help="algorithm parameters in format param1=value1,param2=value2,...") + parser.add_argument('other_datacards',type=str, nargs='*', + help="list of other datacards to be combined together with the current target") parser.add_argument('--verbosity', required=False, type=int, default=1, help="verbosity") args = parser.parse_args() @@ -469,9 +490,15 @@ def dispatcher(): input_datacards=input_datacards, poi=args.poi, bkg_yields=bkg_yields, - input_queue_size=2, random_seed=None) + input_queue_size=2, + random_seed=None, + other_datacards=[os.path.abspath(p) for p in args.other_datacards] + ) multi_bo.maximize(20) print("Minimization finished.") - print("Best binning per category:\n") + if multi_bo.best_binning is not None: + print(f"Best combined exp_limit: {multi_bo.best_binning.exp_limit}") + for cat in range(multi_bo.best_binning.n_categories): + print(f"\tcat{cat} edges: {arrayToStr(multi_bo.best_binning.edges_by_category[cat])}") diff --git a/bin_opt/optimize_channel.py b/bin_opt/optimize_channel.py index 61f60bc..6477441 100644 --- a/bin_opt/optimize_channel.py +++ b/bin_opt/optimize_channel.py @@ -7,6 +7,7 @@ import subprocess import sys import yaml +import math file_dir = os.path.dirname(os.path.abspath(__file__)) pkg_dir = os.path.dirname(file_dir) @@ -33,6 +34,29 @@ def sh_call(cmd, error_message, verbose=0): if returncode != 0: raise RuntimeError(error_message) +def _total_bins_in_multicategory_entry(entry): + categories = entry.get("categories", []) + return sum(len(cat.get("bin_edges", [])) - 1 for cat in categories) + +def getBestMultiCategoryResult(log_file): + best = None + with open(log_file, 'r') as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + if best is None: + best = entry + continue + if entry['exp_limit'] != best['exp_limit']: + if entry['exp_limit'] < best['exp_limit']: + best = entry + else: + if _total_bins_in_multicategory_entry(entry) < _total_bins_in_multicategory_entry(best): + best = entry + return best + def compare_binnings(b1, b2): if b2 is None: return True if b1['exp_limit'] != b2['exp_limit']: return b1['exp_limit'] < b2['exp_limit'] @@ -50,6 +74,26 @@ def getBestBinning(log_file): best_binning = binning return best_binning +def inputs_for_category(input_dir, channel, category, era): + cards = [] + shapes = [] + for file in os.listdir(input_dir): + if not ((channel in file or channel.lower() in file) and (category in file) and (era in file)): + continue + if file.endswith(".txt"): + cards.append(os.path.join(input_dir, file)) + elif file.endswith(".root"): + shapes.append(os.path.join(input_dir, file)) + if len(cards) != 1: + raise RuntimeError( + f"Expected exactly 1 datacard for channel={channel}, category={category}, era={era}, found 
{len(cards)}: {cards}" + ) + if len(shapes) != 1: + raise RuntimeError( + f"Expected exactly 1 shape file for channel={channel}, category={category}, era={era}, found {len(shapes)}: {shapes}" + ) + return os.path.abspath(cards[0]), os.path.abspath(shapes[0]) + def optimize_channel(channel, output, era, categories, max_n_bins, params, binning_suggestions, verbose): output_dir = os.path.join(output, channel+'_'+era) workers_dir = os.path.join(output_dir, 'workers') @@ -95,6 +139,110 @@ def optimize_channel(channel, output, era, categories, max_n_bins, params, binni while first_cat_index < len(categories) and categories[first_cat_index][0] in best_binnings[channel+'_'+f"{era}"]: first_cat_index += 1 + if input_binning_opt_config_dict["input"]["multi_category_optimization"]: + remaining = categories[first_cat_index:] + if len(remaining) == 0: + print(f"Nothing to do: all categories already optimized for {channel}_{era}") + return + + pois = sorted(set([poi for (_cat, poi) in remaining])) + if len(pois) != 1: + raise RuntimeError(f"Simultaneous multi-category optimization requires a single shared POI across the bundle. Found POIs: {pois}") + poi = pois[0] + + bundle_categories = [cat for (cat, _poi) in remaining] + datacards = [] + shape_files = [] + cat_inputs = {} + for category in bundle_categories: + card, shape = inputs_for_category(input_dir, channel, category, era) + datacards.append(card) + shape_files.append(shape) + cat_inputs[category] = (card, shape) + + bundle_name = "multicat_" + "_".join(bundle_categories) + bundle_dir = os.path.join(output_dir, bundle_name) + os.makedirs(bundle_dir, exist_ok=True) + + input_arg = ",".join(datacards) + shape_arg = ",".join(shape_files) + opt_cmd = ( + f"python3 bin_opt/multi_optimize_binning.py " + f"--input {input_arg} " + f"--shape-file {shape_arg} " + f"--output {bundle_dir} " + f"--workers-dir {workers_dir} " + f"--max_n_bins {max_n_bins} " + f"--poi {poi}" + ) + if params is not None: + opt_cmd += f" --params {params} " + + ps_call(opt_cmd, shell=True, env=None, verbose=verbose) + + multicat_log = os.path.join(bundle_dir, "results_multibinning.json") + if not os.path.isfile(multicat_log): + raise RuntimeError(f"Multi-category log file not found: {multicat_log}") + + best_entry = getBestMultiCategoryResult(multicat_log) + if best_entry is None: + raise RuntimeError(f"Unable to find best multi-category result in {multicat_log}") + + combined_exp_limit = best_entry["exp_limit"] + + cats_sorted = sorted(best_entry["categories"], key=lambda x: int(x["cat_id"])) + if len(cats_sorted) != len(bundle_categories): + raise RuntimeError( + "Mismatch between optimized categories and result categories: " + f"{len(bundle_categories)} vs {len(cats_sorted)}" + ) + + for idx, (category, _poi) in enumerate(remaining): + cat_result = cats_sorted[idx] + bin_edges = cat_result["bin_edges"] + + input_card, _input_shape = cat_inputs[category] + + cat_best = { + "bin_edges": bin_edges, + "exp_limit": combined_exp_limit, # combined objective + "poi": poi, + "mode": "simultaneous_multicat", + "multicat_categories": bundle_categories, + } + best_binnings[channel + "_" + era][category] = cat_best + + bin_edges_str = ", ".join([str(edge) for edge in bin_edges]) + rebin_cmd = ( + f'python3 bin_opt/rebinAndRunLimits.py ' + f'--input {input_card} ' + f'--output {best_dir} ' + f'--bin-edges "{bin_edges_str}" ' + f'--rebin-only ' + ) + ps_call(rebin_cmd, shell=True, env=None, verbose=verbose) + + best_binnings_file = os.path.join(output_dir, "best.json") + with 
open(best_binnings_file, "w") as f: + f.write('{{\n\t"{}": {{\n'.format(channel + "_" + era)) + for i, (category, _poi) in enumerate(categories): + if category not in best_binnings[channel + "_" + era]: + continue + f.write('\t\t "{}": '.format(category)) + json.dump(best_binnings[channel + "_" + era][category], f) + # trailing commas only between written entries + f.write(",\n") + f.write("\t}\n}\n") + + final_binning_file = output_dir + ".json" + shutil.copy(best_binnings_file, final_binning_file) + print( + "Binning for {} has been successfully optimised (simultaneous multi-category). " + "The results can be found in {}".format(channel + "_" + era, final_binning_file) + ) + return + + for cat_index in range(first_cat_index, len(categories)): category, poi = categories[cat_index] print(f"Optimizing {channel} {category} for era {era}") @@ -121,7 +269,10 @@ def optimize_channel(channel, output, era, categories, max_n_bins, params, binni cat_log = os.path.join(cat_dir, 'results.json') if input_binning_opt_config_dict["input"]["multi_category_optimization"]: - opt_cmd = f"python3 bin_opt/multi_optimize_binning.py --input {datacards} --shape-file {shape_files} --output {cat_dir} --workers-dir {workers_dir} --max_n_bins {max_n_bins} --poi {poi}" + input_arg = ",".join([os.path.abspath(p) for p in datacards]) + shape_arg = ",".join([os.path.abspath(p) for p in shape_files]) + opt_cmd = f"python3 bin_opt/multi_optimize_binning.py --input {input_arg} --shape-file {shape_arg} --output {cat_dir} --workers-dir {workers_dir} --max_n_bins {max_n_bins} --poi {poi}" + else: opt_cmd = f"python3 bin_opt/optimize_binning.py --input {input_card} --shape-file {input_shape} --output {cat_dir} --workers-dir {workers_dir} --max_n_bins {max_n_bins} --poi {poi}" if params is not None:
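
Note on the worker protocol introduced in PATCH 6: the dispatcher in multi_optimize_binning.py hands each proposed bundle to a worker by writing task.txt (via an atomic move from .task.txt) and then polls for result.txt, which must carry the combined exp_limit together with the per-category cat_id/datacard/bin_edges so that MultiBinning.fromResult() can rebuild the bundle and register it with the optimizer. Below is a minimal, hypothetical sketch of the worker side of that contract; the function run_combined_limit is a placeholder assumption (in the real workflow it would rebin each category datacard with the proposed edges, combine them with other_datacards and run the fit for the requested POI, e.g. via bin_opt/rebinAndRunLimits.py), not part of this patch series.

```python
# Hypothetical worker loop illustrating the task.txt / result.txt JSON contract
# implied by MultiBinning.toTask() and MultiBinning.fromResult() in this patch.
import json
import os
import shutil
import time


def run_combined_limit(task):
    """Placeholder (assumption): rebin each datacard listed in task['categories']
    with its 'bin_edges', combine with task['other_datacards'] and return the
    expected limit on task['poi']. Must be replaced by the real fitting step."""
    raise NotImplementedError("hook up the actual rebin + limit computation here")


def worker_loop(worker_dir, poll_interval=5):
    task_file = os.path.join(worker_dir, 'task.txt')
    result_file = os.path.join(worker_dir, 'result.txt')
    result_file_tmp = os.path.join(worker_dir, '.result.txt')
    while True:
        # Process a task only if it has not been answered yet; the dispatcher
        # removes both files once it has consumed result.txt.
        if os.path.isfile(task_file) and not os.path.isfile(result_file):
            with open(task_file, 'r') as f:
                task = json.load(f)
            if task.get('task_type') == 'multi_category_limit':
                exp_limit = run_combined_limit(task)
                # Echo the categories back so MultiBinning.fromResult() can
                # reconstruct the per-category edges on the dispatcher side.
                result = {
                    'categories': task['categories'],
                    'exp_limit': exp_limit,
                    'poi': task['poi'],
                }
                with open(result_file_tmp, 'w') as f:
                    json.dump(result, f)
                # Atomic hand-off, mirroring how the dispatcher publishes task.txt.
                shutil.move(result_file_tmp, result_file)
        time.sleep(poll_interval)
```

Usage would follow the existing single-category workers: each worker owns one subdirectory under --workers-dir and runs worker_loop on it while the dispatcher thread in MultiBayesianOptimization feeds tasks and appends accepted results to results_multibinning.json.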