Source code for madminer.sampling.combine

import logging
import numpy as np
import shutil

from contextlib import suppress
from typing import List
from typing import Union

from ..utils.interfaces.hdf5 import load_events
from ..utils.interfaces.hdf5 import load_madminer_settings
from ..utils.interfaces.hdf5 import _save_samples
from ..utils.interfaces.hdf5 import _save_samples_summary
from ..utils.various import shuffle

logger = logging.getLogger(__name__)


def _calculate_n_events(sampling_ids, n_benchmarks):
    if sampling_ids is None:
        return None, None

    unique, counts = np.unique(sampling_ids, return_counts=True)
    results = dict(zip(unique, counts))

    n_events_backgrounds = results.get(-1, 0)
    n_events_signal_per_benchmark = np.array([results.get(i, 0) for i in range(n_benchmarks)], dtype=int)
    return n_events_signal_per_benchmark, n_events_backgrounds


[docs]def combine_and_shuffle(
    input_filenames: List[str],
    output_filename: str,
    k_factors: Union[List[float], float] = None,
    recalculate_header: bool = True,
):
    """
    Combines multiple MadMiner files into one, and shuffles the order of the events.

    Note that this function assumes that all samples are generated with the same setup, including identical benchmarks
    (and thus morphing setup). If it is used with samples with different settings, there will be wrong results!
    There are no explicit cross checks in place yet!

    Parameters
    ----------
    input_filenames : list of str
        List of paths to the input MadMiner files.

    output_filename : str
        Path to the combined MadMiner file.

    k_factors : float or list of float, optional
        Multiplies the weights in input_filenames with a universal factor (if k_factors is a float)
        or with independent factors (if it is a list of float). Default value: None.

    recalculate_header : bool, optional
        Recalculates the total number of events. Default value: True.

    Returns
    -------
        None
    """

    # TODO: merge different nuisance setups

    logger.debug("Combining and shuffling samples")

    if len(input_filenames) > 1:
        logger.warning(
            "Careful: this tool assumes that all samples are generated with the same setup, including"
            " identical benchmarks (and thus morphing setup), and identical nuisance parameters. If it is used with "
            "samples with different"
            " settings, there will be wrong results! There are no explicit cross checks in place yet."
        )

    if len(input_filenames) <= 0:
        raise ValueError("Need to provide at least one input filename")

    # k factors
    if k_factors is None:
        k_factors = [1.0 for _ in input_filenames]
    elif isinstance(k_factors, float):
        k_factors = [k_factors for _ in input_filenames]

    if len(input_filenames) != len(k_factors):
        raise RuntimeError(
            f"Inconsistent length of input filenames and k factors: "
            f"{len(input_filenames)} vs {len(k_factors)}"
        )

    # Copy first file to output_filename
    logger.debug("Copying setup from %s to %s", input_filenames[0], output_filename)

    # TODO: More memory efficient strategy

    # Load events
    all_observations = None
    all_weights = None
    all_sampling_ids = None

    all_n_events_background = 0
    all_n_events_signal_per_benchmark = 0

    for i, (filename, k_factor) in enumerate(zip(input_filenames, k_factors), start=1):
        logger.debug(
            "Loading samples from file %s / %s at %s, multiplying weights with k factor %s",
            i,
            len(input_filenames),
            filename,
            k_factor,
        )

        (
            _,
            benchmarks,
            _,
            _,
            _,
            _,
            _,
            _,
            _,
            _,
            n_signal_events_generated_per_benchmark,
            n_background_events,
            _,
            _,
        ) = load_madminer_settings(filename, include_nuisance_benchmarks=False)

        n_benchmarks = len(benchmarks)

        if n_signal_events_generated_per_benchmark is not None and n_background_events is not None:
            all_n_events_signal_per_benchmark += n_signal_events_generated_per_benchmark
            all_n_events_background += n_background_events

        for observations, weights, sampling_ids in load_events(filename, include_sampling_ids=True):
            logger.debug("Sampling benchmarks: %s", sampling_ids)
            if all_observations is None:
                all_observations = observations
                all_weights = k_factor * weights
            else:
                all_observations = np.vstack((all_observations, observations))
                all_weights = np.vstack((all_weights, k_factor * weights))

            if all_sampling_ids is None:
                all_sampling_ids = sampling_ids
            elif sampling_ids is not None:
                all_sampling_ids = np.hstack((all_sampling_ids, sampling_ids))

        logger.debug("Combined sampling benchmarks: %s", all_sampling_ids)

    # Shuffle
    all_observations, all_weights, all_sampling_ids = shuffle(all_observations, all_weights, all_sampling_ids)

    # Recalculate header info: number of events
    if recalculate_header:
        all_n_events_signal_per_benchmark, all_n_events_background = _calculate_n_events(all_sampling_ids, n_benchmarks)

        logger.debug(
            "Recalculated event numbers per benchmark: %s, background: %s",
            all_n_events_signal_per_benchmark,
            all_n_events_background,
        )

    file_copy_path = input_filenames[0]
    if file_copy_path is not None:
        with suppress(OSError):
            shutil.copyfile(file_copy_path, output_filename)

    # Save result
    _save_samples(
        file_name=output_filename,
        file_override=True,
        sample_observations=all_observations,
        sample_weights=all_weights,
        sampling_ids=all_sampling_ids,
    )

    if all_n_events_background + np.sum(all_n_events_signal_per_benchmark) > 0:
        _save_samples_summary(
            file_name=output_filename,
            file_override=True,
            num_signal_events=all_n_events_signal_per_benchmark,
            num_background_events=all_n_events_background,
        )