# Source code for madminer.sampling.combine

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import numpy as np

from ..utils.interfaces.madminer_hdf5 import madminer_event_loader, load_madminer_settings
from ..utils.interfaces.madminer_hdf5 import save_preformatted_events_to_madminer_file
from ..utils.interfaces.madminer_hdf5 import save_sample_summary_to_madminer_file
from ..utils.various import shuffle

logger = logging.getLogger(__name__)


def _calculate_n_events(sampling_ids, n_benchmarks):
    if sampling_ids is None:
        return None, None
    unique, counts = np.unique(sampling_ids, return_counts=True)
    results = dict(zip(unique, counts))
    n_events_backgrounds = results.get(-1, 0)
    n_events_signal_per_benchmark = np.array([results.get(i, 0) for i in range(n_benchmarks)], dtype=np.int)
    return n_events_signal_per_benchmark, n_events_backgrounds


def combine_and_shuffle(
    input_filenames, output_filename, k_factors=None, overwrite_existing_file=True, recalculate_header=True
):
    """
    Combines multiple MadMiner files into one, and shuffles the order of the events.

    Note that this function assumes that all samples are generated with the same setup, including
    identical benchmarks (and thus morphing setup). If it is used with samples with different settings,
    there will be wrong results! There are no explicit cross checks in place yet!

    Parameters
    ----------
    input_filenames : list of str
        List of paths to the input MadMiner files.

    output_filename : str
        Path to the combined MadMiner file.

    k_factors : float or list of float, optional
        Multiplies the weights in input_filenames with a universal factor (if k_factors is a float)
        or with independent factors (if it is a list of float). Default value: None.

    overwrite_existing_file : bool, optional
        If True and if the output file exists, it is overwritten. Default value: True.

    recalculate_header : bool, optional
        Recalculates the total number of events. Default value: True.

    Returns
    -------
        None

    """

    # TODO: merge different nuisance setups

    logger.debug("Combining and shuffling samples")

    if len(input_filenames) > 1:
        logger.warning(
            "Careful: this tool assumes that all samples are generated with the same setup, including"
            " identical benchmarks (and thus morphing setup), and identical nuisance parameters. If it is used with "
            "samples with different"
            " settings, there will be wrong results! There are no explicit cross checks in place yet."
        )

    if len(input_filenames) <= 0:
        raise ValueError("Need to provide at least one input filename")

    # Normalize k factors: default to 1.0 per file, or broadcast a single float to all files
    if k_factors is None:
        k_factors = [1.0 for _ in input_filenames]
    elif isinstance(k_factors, float):
        k_factors = [k_factors for _ in input_filenames]

    if len(input_filenames) != len(k_factors):
        # Bug fix: the message must be formatted before raising -- passing extra args to
        # RuntimeError (logging style) leaves the placeholders unfilled.
        raise RuntimeError(
            "Inconsistent length of input filenames and k factors: %s vs %s"
            % (len(input_filenames), len(k_factors))
        )

    # Copy first file to output_filename
    logger.debug("Copying setup from %s to %s", input_filenames[0], output_filename)

    # TODO: More memory efficient strategy

    # Load events from all input files, accumulating observations, weights, and sampling ids
    all_observations = None
    all_weights = None
    all_sampling_ids = None

    all_n_events_background = 0
    all_n_events_signal_per_benchmark = 0

    for i, (filename, k_factor) in enumerate(zip(input_filenames, k_factors)):
        logger.debug(
            "Loading samples from file %s / %s at %s, multiplying weights with k factor %s",
            i + 1,
            len(input_filenames),
            filename,
            k_factor,
        )
        (
            _,
            benchmarks,
            _,
            _,
            _,
            _,
            _,
            _,
            _,
            _,
            n_signal_events_generated_per_benchmark,
            n_background_events,
        ) = load_madminer_settings(filename)
        n_benchmarks = len(benchmarks)

        if n_signal_events_generated_per_benchmark is not None and n_background_events is not None:
            all_n_events_signal_per_benchmark += n_signal_events_generated_per_benchmark
            all_n_events_background += n_background_events

        for observations, weights, sampling_ids in madminer_event_loader(filename, return_sampling_ids=True):
            logger.debug("Sampling benchmarks: %s", sampling_ids)
            if all_observations is None:
                all_observations = observations
                all_weights = k_factor * weights
            else:
                all_observations = np.vstack((all_observations, observations))
                all_weights = np.vstack((all_weights, k_factor * weights))

            if all_sampling_ids is None:
                all_sampling_ids = sampling_ids
            elif sampling_ids is not None:
                all_sampling_ids = np.hstack((all_sampling_ids, sampling_ids))

        logger.debug("Combined sampling benchmarks: %s", all_sampling_ids)

    # Shuffle events, keeping observations, weights, and sampling ids aligned
    all_observations, all_weights, all_sampling_ids = shuffle(all_observations, all_weights, all_sampling_ids)

    # Recalculate header info: number of events
    if recalculate_header:
        all_n_events_signal_per_benchmark, all_n_events_background = _calculate_n_events(
            all_sampling_ids, n_benchmarks
        )
        logger.debug(
            "Recalculated event numbers per benchmark: %s, background: %s",
            all_n_events_signal_per_benchmark,
            all_n_events_background,
        )

    # Save result, copying the (assumed identical) setup from the first input file
    save_preformatted_events_to_madminer_file(
        filename=output_filename,
        observations=all_observations,
        weights=all_weights,
        sampling_benchmarks=all_sampling_ids,
        copy_setup_from=input_filenames[0],
        overwrite_existing_samples=overwrite_existing_file,
    )
    if all_n_events_background + np.sum(all_n_events_signal_per_benchmark) > 0:
        save_sample_summary_to_madminer_file(
            filename=output_filename,
            n_events_background=all_n_events_background,
            n_events_per_sampling_benchmark=all_n_events_signal_per_benchmark,
        )