Source code for madminer.limits

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import numpy as np
import os
from scipy.stats import chi2, poisson

from madminer.analysis import DataAnalyzer
from madminer.utils.various import mdot
from madminer.ml import ParameterizedRatioEstimator, Ensemble, ScoreEstimator, load_estimator
from madminer.utils.histo import Histo
from madminer.sampling import SampleAugmenter
from madminer import sampling

logger = logging.getLogger(__name__)


[docs]class AsymptoticLimits(DataAnalyzer):
    """
    Functions to calculate observed and expected constraints, using asymptotic properties of the likelihood ratio as
    test statistics.

    Parameters
    ----------
    filename : str
        Path to MadMiner file (for instance the output of `madminer.delphes.DelphesProcessor.save()`).

    include_nuisance_parameters : bool, optional
        If True, nuisance parameters are taken into account. Default value: False.
    """

    def __init__(self, filename=None, include_nuisance_parameters=False):
        super(AsymptoticLimits, self).__init__(filename, False, include_nuisance_parameters)

[docs]    def observed_limits(
        self,
        x_observed,
        theta_ranges,
        mode="ml",
        model_file=None,
        hist_vars=None,
        hist_bins=20,
        include_xsec=True,
        resolutions=25,
        luminosity=300000.0,
        n_toys_per_theta=10000,
        returns="pval",
        dof=None,
        n_observed=None,
        histo_theta_batchsize=100,
    ):
        if n_observed is None:
            n_observed = len(x_observed)
        theta_grid, return_values, i_ml = self._analyse(
            n_observed,
            x_observed,
            theta_ranges,
            resolutions,
            mode,
            model_file,
            hist_vars,
            hist_bins,
            include_xsec,
            None,
            luminosity,
            n_toys_per_theta,
            returns=returns,
            dof=dof,
            histo_theta_batchsize=histo_theta_batchsize,
        )
        return theta_grid, return_values, i_ml

[docs]    def expected_limits(
        self,
        theta_true,
        theta_ranges,
        mode="ml",
        model_file=None,
        hist_vars=None,
        hist_bins=20,
        include_xsec=True,
        resolutions=25,
        luminosity=300000.0,
        n_toys_per_theta=10000,
        returns="pval",
        dof=None,
        histo_theta_batchsize=100,
    ):
        logger.info("Generating Asimov data")
        x_asimov, x_weights = self._asimov_data(theta_true)
        n_observed = luminosity * self._calculate_xsecs([theta_true])[0]
        logger.info("Expected events: %s", n_observed)
        theta_grid, return_values, i_ml = self._analyse(
            n_observed,
            x_asimov,
            theta_ranges,
            resolutions,
            mode,
            model_file,
            hist_vars,
            hist_bins,
            include_xsec,
            x_weights,
            luminosity,
            n_toys_per_theta,
            returns=returns,
            dof=dof,
            histo_theta_batchsize=histo_theta_batchsize,
        )
        return theta_grid, return_values, i_ml

[docs]    def asymptotic_p_value(self, log_likelihood_ratio, dof=None):
        if dof is None:
            dof = self.n_parameters
        q = -2.0 * log_likelihood_ratio
        p_value = chi2.sf(x=q, df=dof)
        return p_value

    def _analyse(
        self,
        n_events,
        x,
        theta_ranges,
        theta_resolutions,
        mode="ml",
        model_file=None,
        hist_vars=None,
        hist_bins=20,
        include_xsec=True,
        obs_weights=None,
        luminosity=300000.0,
        n_toys_per_theta=10000,
        returns="pval",
        dof=None,
        histo_theta_batchsize=100,
    ):
        logger.debug("Calculating p-values for %s expected events", n_events)

        assert returns in ["pval", "llr", "llr_raw"], "returns has to be either 'pval','llr' or 'llr_raw'!"

        # Observation weights
        if obs_weights is None:
            obs_weights = np.ones(len(x))
        obs_weights /= np.sum(obs_weights)
        obs_weights = obs_weights.astype(np.float64)

        # Theta grid
        theta_grid = self._make_theta_grid(theta_ranges, theta_resolutions)

        # Kinematic part
        if mode == "rate":
            log_r_kin = 0.0
        elif mode == "ml":
            assert model_file is not None
            logger.info("Loading kinematic likelihood ratio estimator")
            model = load_estimator(model_file)

            logger.info("Calculating kinematic log likelihood ratio with estimator")
            log_r_kin = self._calculate_log_likelihood_ratio_kinematics(x, theta_grid, model)
            log_r_kin = log_r_kin.astype(np.float64)
            log_r_kin = self._clean_nans(log_r_kin)
            logger.debug("Raw mean -2 log r: %s", np.mean(-2.0 * log_r_kin, axis=1))
            log_r_kin = n_events * np.sum(log_r_kin * obs_weights[np.newaxis, :], axis=1)
            logger.debug("Rescaled -2 log r: %s", -2.0 * log_r_kin)

        elif mode == "histo":
            if hist_vars is not None:
                logger.info("Setting up standard summary statistics")
                summary_function = self._make_summary_statistic_function("observables", observables=hist_vars)
            elif model_file is not None:
                logger.info("Loading score estimator and setting it up as summary statistics")
                model = load_estimator(model_file)
                summary_function = self._make_summary_statistic_function("sally", model=model)
            else:
                raise RuntimeError("For 'histo' mode, either provide histo_vars or model_file!")
            summary_stats = summary_function(x)
            del x

            logger.info("Creating histogram with %s bins for the summary statistics", hist_bins)
            histo = self._make_histo(
                summary_function,
                hist_bins,
                theta_grid,
                theta_resolutions,
                n_toys_per_theta,
                histo_theta_batchsize=histo_theta_batchsize,
            )

            logger.info("Calculating kinematic log likelihood with histograms")
            log_r_kin = self._calculate_log_likelihood_histo(summary_stats, theta_grid, histo)
            log_r_kin = log_r_kin.astype(np.float64)
            log_r_kin = self._clean_nans(log_r_kin)
            log_r_kin = n_events * np.sum(log_r_kin * obs_weights[np.newaxis, :], axis=1)

        else:
            raise ValueError("Unknown mode {}, has to be 'ml' or 'histo' or 'xsec'".format(mode))

        # xsec part
        if include_xsec:
            logger.info("Calculating rate log likelihood")
            log_p_xsec = self._calculate_log_likelihood_xsec(n_events, theta_grid, luminosity)
            logger.debug("Rate -2 log p: %s", -2.0 * log_p_xsec)
        else:
            log_p_xsec = 0.0

        # Combine and get p-values
        logger.info("Calculating p-values")
        log_r = log_r_kin + log_p_xsec
        if returns == "llr_raw":
            return theta_grid, log_r, 0

        logger.debug("Combined -2 log r: %s", -2.0 * log_r)
        log_r, i_ml = self._subtract_ml(log_r)
        logger.debug("Min-subtracted -2 log r: %s", -2.0 * log_r)
        p_values = self.asymptotic_p_value(log_r, dof=dof)

        if returns == "llr":
            return theta_grid, log_r, i_ml
        return theta_grid, p_values, i_ml

    def _make_summary_statistic_function(self, mode, model=None, observables=None):
        if mode == "observables":
            assert observables is not None
            x_indices = self._find_x_indices(observables)

            logger.debug("Preparing observables %s as summary statistic function", x_indices)

            def summary_function(x):
                return x[:, x_indices]

        elif mode == "sally":
            assert isinstance(model, ScoreEstimator)

            logger.debug("Preparing score estimator as summary statistic function")

            def summary_function(x):
                score = model.evaluate_score(x)
                score = score[:, : self.n_parameters]
                return score

        else:
            raise RuntimeError("Unknown mode {}, has to be 'observables' or 'sally'".format(mode))

        return summary_function

    def _calculate_xsecs(self, thetas, test_split=0.2):
        # Test split
        start_event, end_event, correction_factor = self._train_test_split(False, test_split)

        # Total xsecs for benchmarks
        xsecs_benchmarks = 0.0
        for observations, weights in self.event_loader(start=start_event, end=end_event):
            xsecs_benchmarks += np.sum(weights, axis=0)

        # xsecs at thetas
        xsecs = []
        for theta in thetas:
            theta_matrix = self._get_theta_benchmark_matrix(theta)
            xsecs.append(mdot(theta_matrix, xsecs_benchmarks) * correction_factor)
        return np.asarray(xsecs)

    def _asimov_data(self, theta, test_split=0.2):
        start_event, end_event, correction_factor = self._train_test_split(False, test_split)
        x, weights_benchmarks = next(self.event_loader(start=start_event, end=end_event, batch_size=None))
        weights_benchmarks *= correction_factor

        theta_matrix = self._get_theta_benchmark_matrix(theta)
        weights_theta = mdot(theta_matrix, weights_benchmarks)
        weights_theta /= np.sum(weights_theta)

        return x, weights_theta

    @staticmethod
    def _make_theta_grid(theta_ranges, resolutions):
        if isinstance(resolutions, int):
            resolutions = [resolutions for _ in range(theta_ranges)]
        theta_each = []
        for resolution, (theta_min, theta_max) in zip(resolutions, theta_ranges):
            theta_each.append(np.linspace(theta_min, theta_max, resolution))
        theta_grid_each = np.meshgrid(*theta_each, indexing="ij")
        theta_grid_each = [theta.flatten() for theta in theta_grid_each]
        theta_grid = np.vstack(theta_grid_each).T
        return theta_grid

    def _make_histo(
        self, summary_function, x_bins, theta_grid, theta_bins, n_toys_per_theta=1000, histo_theta_batchsize=100
    ):
        logger.info("Building histogram with %s bins per parameter and %s bins per observable", theta_bins, x_bins)
        histo = Histo(theta_bins, x_bins)
        logger.debug("Generating histo data")
        theta, summary_stats = self._make_histo_data(
            summary_function, theta_grid, n_toys_per_theta, histo_theta_batchsize=histo_theta_batchsize
        )
        logger.debug(
            "Histo data has theta dimensions %s and summary stats dimensions %s", theta.shape, summary_stats.shape
        )
        logger.debug("Filling histogram with summary statistics")
        histo.fit(theta, summary_stats, fill_empty_bins=True)
        return histo

    def _make_histo_data(self, summary_function, thetas, n_toys_per_theta, test_split=0.2, histo_theta_batchsize=100):
        sampler = SampleAugmenter(self.madminer_filename, include_nuisance_parameters=self.include_nuisance_parameters)
        all_summary_stats, all_theta = None, None

        n_thetas = len(thetas)
        n_batches = (n_thetas - 1) // histo_theta_batchsize + 1
        for i_batch in range(n_batches):
            logger.debug("Generating histogram data for batch %s / %s", i_batch + 1, n_batches)
            theta_batch = thetas[i_batch * histo_theta_batchsize : (i_batch + 1) * histo_theta_batchsize]
            logger.debug(
                "Theta data: indices %s to %s, shape %s",
                i_batch * histo_theta_batchsize,
                (i_batch + 1) * histo_theta_batchsize,
                theta_batch.shape,
            )
            x, theta, _ = sampler.sample_train_plain(
                theta=sampling.morphing_points(theta_batch),
                n_samples=n_toys_per_theta * len(theta_batch),
                test_split=test_split,
                filename=None,
                folder=None,
                suppress_logging=True,
            )
            summary_stats = summary_function(x)
            logger.debug(
                "Output: x has shape %s, summary_stats %s, theta %s", x.shape, summary_stats.shape, theta.shape
            )
            if all_theta is None or all_summary_stats is None:
                all_theta = theta
                all_summary_stats = summary_stats
            else:
                all_theta = np.concatenate((all_theta, theta), 0)
                all_summary_stats = np.concatenate((all_summary_stats, summary_stats), 0)
        return all_theta, all_summary_stats

    def _find_x_indices(self, observables):
        x_names = list(self.observables.keys())
        x_indices = []
        for obs in observables:
            try:
                x_indices.append(x_names.index(obs))
            except ValueError:
                raise RuntimeError("Unknown observable {}, has to be one of {}".format(obs, x_names))
        logger.debug("Using x indices %s", x_indices)
        return x_indices

    @staticmethod
    def _calculate_log_likelihood_histo(x, theta_grid, histo):
        log_p = []
        for theta in theta_grid:
            log_p.append(histo.log_likelihood(theta, x))
        log_p = np.asarray(log_p)
        return log_p

    def _calculate_log_likelihood_xsec(self, n_observed, theta_grid, luminosity=300000.0):
        n_observed_rounded = int(np.round(n_observed, 0))
        n_predicted = self._calculate_xsecs(theta_grid) * luminosity
        logger.debug("Observed events: %s", n_observed)
        logger.debug("Expected events: %s", n_predicted)
        log_p = poisson.logpmf(k=n_observed_rounded, mu=n_predicted)
        return log_p

    def _calculate_log_likelihood_ratio_kinematics(self, x_observed, theta_grid, model, theta1=None):
        if isinstance(model, ParameterizedRatioEstimator):
            log_r, _ = model.evaluate_log_likelihood_ratio(
                x=x_observed, theta=theta_grid, test_all_combinations=True, evaluate_score=False
            )
        elif isinstance(model, Ensemble) and model.estimator_type == "parameterized_ratio":
            log_r, _ = model.evaluate_log_likelihood_ratio(
                x=x_observed,
                theta=theta_grid,
                test_all_combinations=True,
                evaluate_score=False,
                calculate_covariance=False,
            )
        else:
            raise NotImplementedError(
                "Likelihood ratio estimation is currently only implemented for "
                "ParameterizedRatioEstimator instancees"
            )
        return log_r

    @staticmethod
    def _subtract_ml(log_r):
        i_ml = np.argmax(log_r)
        log_r_subtracted = log_r[:] - log_r[i_ml]
        return log_r_subtracted, i_ml

    @staticmethod
    def _clean_nans(array):
        not_finite = np.any(~np.isfinite(array), axis=0)
        if np.sum(not_finite) > 0:
            logger.warning("Removing %s inf / nan results from calculation")
            array[:, not_finite] = 0.0
        return array