from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import numpy as np
from ..utils.interfaces.madminer_hdf5 import madminer_event_loader, load_madminer_settings
from ..utils.interfaces.madminer_hdf5 import save_preformatted_events_to_madminer_file
from ..utils.interfaces.madminer_hdf5 import save_sample_summary_to_madminer_file
from ..utils.various import shuffle
logger = logging.getLogger(__name__)
def _calculate_n_events(sampling_ids, n_benchmarks):
if sampling_ids is None:
return None, None
unique, counts = np.unique(sampling_ids, return_counts=True)
results = dict(zip(unique, counts))
n_events_backgrounds = results.get(-1, 0)
n_events_signal_per_benchmark = np.array([results.get(i, 0) for i in range(n_benchmarks)], dtype=np.int)
return n_events_signal_per_benchmark, n_events_backgrounds
def combine_and_shuffle(
    input_filenames, output_filename, k_factors=None, overwrite_existing_file=True, recalculate_header=True
):
    """
    Combines multiple MadMiner files into one, and shuffles the order of the events.

    Note that this function assumes that all samples are generated with the same setup, including identical benchmarks
    (and thus morphing setup). If it is used with samples with different settings, there will be wrong results!
    There are no explicit cross checks in place yet!

    Parameters
    ----------
    input_filenames : list of str
        List of paths to the input MadMiner files.

    output_filename : str
        Path to the combined MadMiner file.

    k_factors : float or list of float, optional
        Multiplies the weights in input_filenames with a universal factor (if k_factors is a float) or with independent
        factors (if it is a list of float). Default value: None.

    overwrite_existing_file : bool, optional
        If True and if the output file exists, it is overwritten. Default value: True.

    recalculate_header : bool, optional
        Recalculates the total number of events. Default value: True.

    Returns
    -------
        None

    """

    # TODO: merge different nuisance setups

    logger.debug("Combining and shuffling samples")

    if len(input_filenames) > 1:
        logger.warning(
            "Careful: this tool assumes that all samples are generated with the same setup, including"
            " identical benchmarks (and thus morphing setup), and identical nuisance parameters. If it is used with "
            "samples with different"
            " settings, there will be wrong results! There are no explicit cross checks in place yet."
        )

    if len(input_filenames) <= 0:
        raise ValueError("Need to provide at least one input filename")

    # Normalize k factors: None -> all 1.0, scalar -> broadcast to every file.
    # Accept plain ints as well as floats for the scalar case.
    if k_factors is None:
        k_factors = [1.0 for _ in input_filenames]
    elif isinstance(k_factors, (float, int)):
        k_factors = [k_factors for _ in input_filenames]

    if len(input_filenames) != len(k_factors):
        # Format the message eagerly -- exceptions (unlike logging calls) do not
        # apply %-style lazy formatting to extra arguments
        raise RuntimeError(
            "Inconsistent length of input filenames and k factors: {} vs {}".format(
                len(input_filenames), len(k_factors)
            )
        )

    # The setup (benchmarks, morphing, ...) is copied from the first input file
    # when saving below (copy_setup_from)
    logger.debug("Copying setup from %s to %s", input_filenames[0], output_filename)

    # TODO: More memory efficient strategy

    # Load events from all input files, concatenating in memory
    all_observations = None
    all_weights = None
    all_sampling_ids = None

    # Running totals of the header event counts from the input files
    all_n_events_background = 0
    all_n_events_signal_per_benchmark = 0

    for i, (filename, k_factor) in enumerate(zip(input_filenames, k_factors)):
        logger.debug(
            "Loading samples from file %s / %s at %s, multiplying weights with k factor %s",
            i + 1,
            len(input_filenames),
            filename,
            k_factor,
        )

        # Only the benchmarks and the generated-event counts are needed here
        (
            _,
            benchmarks,
            _,
            _,
            _,
            _,
            _,
            _,
            _,
            _,
            n_signal_events_generated_per_benchmark,
            n_background_events,
        ) = load_madminer_settings(filename)
        n_benchmarks = len(benchmarks)

        # Accumulate header counts; if a file does not provide them, skip it
        # (the header can be recalculated from sampling ids below)
        if n_signal_events_generated_per_benchmark is not None and n_background_events is not None:
            all_n_events_signal_per_benchmark += n_signal_events_generated_per_benchmark
            all_n_events_background += n_background_events

        for observations, weights, sampling_ids in madminer_event_loader(filename, return_sampling_ids=True):
            logger.debug("Sampling benchmarks: %s", sampling_ids)
            if all_observations is None:
                all_observations = observations
                all_weights = k_factor * weights
            else:
                all_observations = np.vstack((all_observations, observations))
                all_weights = np.vstack((all_weights, k_factor * weights))

            if all_sampling_ids is None:
                all_sampling_ids = sampling_ids
            elif sampling_ids is not None:
                all_sampling_ids = np.hstack((all_sampling_ids, sampling_ids))

    logger.debug("Combined sampling benchmarks: %s", all_sampling_ids)

    # Shuffle all arrays consistently (same permutation for each)
    all_observations, all_weights, all_sampling_ids = shuffle(all_observations, all_weights, all_sampling_ids)

    # Recalculate header info: number of events per benchmark / background,
    # derived from the (shuffled) sampling ids rather than the input headers
    if recalculate_header:
        all_n_events_signal_per_benchmark, all_n_events_background = _calculate_n_events(all_sampling_ids, n_benchmarks)

        logger.debug(
            "Recalculated event numbers per benchmark: %s, background: %s",
            all_n_events_signal_per_benchmark,
            all_n_events_background,
        )

    # Save combined events, copying the setup from the first input file
    save_preformatted_events_to_madminer_file(
        filename=output_filename,
        observations=all_observations,
        weights=all_weights,
        sampling_benchmarks=all_sampling_ids,
        copy_setup_from=input_filenames[0],
        overwrite_existing_samples=overwrite_existing_file,
    )
    # Only write the sample summary when we actually have event counts
    if all_n_events_background + np.sum(all_n_events_signal_per_benchmark) > 0:
        save_sample_summary_to_madminer_file(
            filename=output_filename,
            n_events_background=all_n_events_background,
            n_events_per_sampling_benchmark=all_n_events_signal_per_benchmark,
        )