Source code for SALib.analyze.pawn

from typing import Dict

import numpy as np
from scipy.stats import ks_2samp

from . import common_args
from ..util import read_param_file, ResultDict, extract_group_names, _check_groups


def analyze(
    problem: Dict,
    X: np.ndarray,
    Y: np.ndarray,
    S: int = 10,
    print_to_console: bool = False,
    seed: int = None,
):
    """Performs PAWN sensitivity analysis.

    The PAWN method [1] is a moment-independent approach to Global
    Sensitivity Analysis (GSA). It is described as producing robust results
    at relatively low sample sizes (see [2]) for the purpose of factor
    ranking and screening.

    The distribution of model outputs is examined rather than their
    variation, as is typical in other common GSA approaches.

    The PAWN method further distinguishes itself from other
    moment-independent approaches by characterizing outputs by their
    cumulative distribution function (CDF) as opposed to their probability
    distribution function. Because it works with the full output CDF rather
    than a single moment such as the variance, PAWN remains applicable when
    outputs are highly skewed or multi-modal, cases for which variance-based
    methods may produce unreliable results.

    PAWN characterizes the relationship between inputs and outputs by
    quantifying the variation in the output distributions after conditioning
    on an input. A factor is deemed non-influential if the conditional and
    unconditional distributions coincide at all ``S`` conditioning intervals.
    The Kolmogorov-Smirnov statistic is used as a measure of distance
    between the distributions.

    This implementation reports the PAWN index at the min, mean, median, and
    max across the slices/conditioning intervals, as well as the coefficient
    of variation (``CV``) and standard deviation (``stdev``). The median is
    the typically reported value. As the ``CV`` is
    (standard deviation / mean), it indicates the level of variability across
    the slices, with values closer to zero indicating lower variation.

    Notes
    -----
    Compatible with:
        all samplers

    This implementation ignores all NaNs.

    When applied to grouped factors, the analysis is conducted on each factor
    individually, and the mean of their results is reported.

    Examples
    --------
    >>> X = latin.sample(problem, 1000)
    >>> Y = Ishigami.evaluate(X)
    >>> Si = pawn.analyze(problem, X, Y, S=10, print_to_console=False)

    Parameters
    ----------
    problem : dict
        The problem definition
    X : numpy.array
        A NumPy array containing the model inputs
    Y : numpy.array
        A NumPy array containing the model outputs
    S : int
        Number of slices; the conditioning intervals (default 10)
    print_to_console : bool
        Print results directly to console (default False)
    seed : int
        Seed value to ensure deterministic results

    References
    ----------
    1. Pianosi, F., Wagener, T., 2015.
       A simple and efficient method for global sensitivity analysis
       based on cumulative distribution functions.
       Environmental Modelling & Software 67, 1-11.
       https://doi.org/10.1016/j.envsoft.2015.01.004

    2. Pianosi, F., Wagener, T., 2018.
       Distribution-based sensitivity analysis from a generic input-output
       sample.
       Environmental Modelling & Software 108, 197-207.
       https://doi.org/10.1016/j.envsoft.2018.07.019

    3. Baroni, G., Francke, T., 2020.
       An effective strategy for combining variance- and
       distribution-based global sensitivity analysis.
       Environmental Modelling & Software 134, 104851.
       https://doi.org/10.1016/j.envsoft.2020.104851

    4. Baroni, G., Francke, T., 2020.
       GSA-cvd: Combining variance- and distribution-based global
       sensitivity analysis.
       https://github.com/baronig/GSA-cvd
    """
    if seed:
        np.random.seed(seed)

    D = problem["num_vars"]
    var_names, _ = extract_group_names(problem)

    results = np.full((D, 6), np.nan)
    temp_pawn = np.full((S, D), np.nan)

    step = 1 / S
    for d_i in range(D):
        seq = np.arange(0, 1 + step, step)
        X_di = X[:, d_i]
        X_q = np.nanquantile(X_di, seq)

        for s in range(S):
            Y_sel = Y[(X_di >= X_q[s]) & (X_di < X_q[s + 1])]
            if len(Y_sel) == 0:
                # No samples fall in this conditioning interval
                continue

            # KS value
            # ks_2samp returns an object holding the KS statistic and
            # p-value.
            # Note from the scipy documentation:
            # if the KS statistic is small or the p-value is high, then
            # we cannot reject the hypothesis that the distributions of
            # the two samples are the same.
            ks = ks_2samp(Y_sel, Y)
            temp_pawn[s, d_i] = ks.statistic

        p_ind = temp_pawn[:, d_i]
        mins = np.nanmin(p_ind)
        mean = np.nanmean(p_ind)
        med = np.nanmedian(p_ind)
        maxs = np.nanmax(p_ind)
        stdev = np.nanstd(p_ind)
        cv = stdev / mean
        results[d_i, :] = [mins, mean, med, maxs, cv, stdev]

    groups = _check_groups(problem)
    if groups:
        unique_grps, n_groups = extract_group_names(problem)
        tmp = np.full((n_groups, results.shape[1]), np.nan)

        # Take the mean of effects from parameters that are grouped together
        unique_grps = np.array(unique_grps)
        grps = np.array(groups)
        for grp_id, grp in enumerate(unique_grps):
            tmp[grp_id, :] = np.mean(results[grps == grp, :], axis=0)

        results = tmp
        tmp = None

    Si = ResultDict(
        [
            ("minimum", results[:, 0]),
            ("mean", results[:, 1]),
            ("median", results[:, 2]),
            ("maximum", results[:, 3]),
            ("CV", results[:, 4]),
            ("stdev", results[:, 5]),
        ]
    )
    Si["names"] = var_names

    if print_to_console:
        print(Si.to_df())

    return Si
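
The heart of the implementation is the per-factor loop above: slice the sample on one input's quantiles, then measure with the Kolmogorov-Smirnov statistic how far each conditional output distribution sits from the unconditional one. The following is a minimal standalone sketch of that step on synthetic data; the toy model and variable names are illustrative, not part of SALib:

    import numpy as np
    from scipy.stats import ks_2samp

    rng = np.random.default_rng(42)
    X = rng.uniform(size=(1000, 2))                   # two inputs on [0, 1]
    Y = np.sin(2 * np.pi * X[:, 0]) + 0.1 * X[:, 1]   # toy model: x0 dominates

    S = 10                                            # conditioning intervals
    edges = np.nanquantile(X[:, 0], np.linspace(0, 1, S + 1))

    ks_stats = np.full(S, np.nan)
    for s in range(S):
        # Half-open interval, mirroring the loop in analyze() above
        sel = (X[:, 0] >= edges[s]) & (X[:, 0] < edges[s + 1])
        if sel.any():
            # Distance between conditional and unconditional output CDFs
            ks_stats[s] = ks_2samp(Y[sel], Y).statistic

    print(np.nanmedian(ks_stats))   # the typically reported PAWN index for x0

The median of the per-slice KS statistics is the robust summary usually reported; the min/max/CV columns returned by ``analyze`` describe how much that distance varies across the conditioning intervals.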
def cli_parse(parser):
    parser.add_argument(
        "-X", "--model-input-file", type=str, required=True, help="Model input file"
    )
    parser.add_argument(
        "-S", "--slices", type=int, required=False, help="Number of slices to take"
    )
    return parser
def cli_action(args):
    problem = read_param_file(args.paramfile)
    X = np.loadtxt(args.model_input_file, delimiter=args.delimiter)
    Y = np.loadtxt(
        args.model_output_file, delimiter=args.delimiter, usecols=(args.column,)
    )

    analyze(problem, X, Y, S=args.slices, print_to_console=True, seed=args.seed)
if __name__ == "__main__":
    common_args.run_cli(cli_parse, cli_action)
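
An example invocation through SALib's command line (a sketch: ``-X`` and ``-S`` are defined in ``cli_parse`` above, while the parameter-file, output-file, column, and seed flags are assumed to be the shared ones supplied by ``common_args``):

    salib analyze pawn -p problem.txt -X model_inputs.txt -Y model_outputs.txt -c 0 -S 10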