from typing import Dict, Optional
import numpy as np
from scipy.stats import ks_2samp
from . import common_args
from ..util import read_param_file, ResultDict, extract_group_names, _check_groups
def analyze(
    problem: Dict,
    X: np.ndarray,
    Y: np.ndarray,
    S: int = 10,
    print_to_console: bool = False,
    seed: Optional[int] = None,
):
"""Performs PAWN sensitivity analysis.
The PAWN method [1] is a moment-independent approach to Global Sensitivity
Analysis (GSA). It is described as producing robust results at relatively
low sample sizes (see [2]) for the purpose of factor ranking and screening.
The distribution of model outputs is examined rather than
their variation as is typical in other common GSA approaches. The PAWN
method further distinguishes itself from other moment-independent
approaches by characterizing outputs by their cumulative distribution
function (CDF) as opposed to their probability distribution function.
As the CDF for a given random variable is typically normally distributed,
PAWN can be more appropriately applied when outputs are highly-skewed or
multi-modal, for which variance-based methods may produce unreliable
results.
PAWN characterizes the relationship between inputs and outputs by
quantifying the variation in the output distributions after conditioning
an input. A factor is deemed non-influential if distributions coincide at
all ``S`` conditioning intervals. The Kolmogorov-Smirnov statistic is used
as a measure of distance between the distributions.
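    Following [1], the distance for a single conditioning interval ``I_s`` of
    input ``x_i`` is the KS statistic between the unconditional output CDF
    and the CDF conditional on ``x_i`` falling within that interval::

        KS_s(x_i) = max_y | F_y(y) - F_{y | x_i in I_s}(y) |

    The statistics reported below summarize these ``S`` KS values.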
    This implementation reports the PAWN index at the minimum, mean, median,
    and maximum across the slides/conditioning intervals, as well as the
    coefficient of variation (``CV``) and standard deviation (``stdev``).
    The median is the value typically reported. As the ``CV`` is
    (standard deviation / mean), it indicates the level of variability across
    the slides, with values closer to zero indicating lower variation.
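    For example, a factor whose KS values have a mean of 0.25 and a standard
    deviation of 0.05 across the slides would have a ``CV`` of 0.2.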
    Notes
    -----
    Compatible with:
        all samplers

    This implementation ignores all NaNs.

    When applied to grouped factors, the analysis is conducted on each factor
    individually, and the mean of their results is reported.
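    A grouped problem definition might look like the following (a minimal
    sketch; the variable names, groups, and bounds are illustrative only)::

        problem = {
            "num_vars": 3,
            "names": ["x1", "x2", "x3"],
            "groups": ["A", "A", "B"],
            "bounds": [[-3.14159265, 3.14159265]] * 3,
        }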
    Examples
    --------
    >>> from SALib.sample import latin
    >>> from SALib.analyze import pawn
    >>> from SALib.test_functions import Ishigami
    >>> X = latin.sample(problem, 1000)
    >>> Y = Ishigami.evaluate(X)
    >>> Si = pawn.analyze(problem, X, Y, S=10, print_to_console=False)
    Parameters
    ----------
    problem : dict
        The problem definition
    X : numpy.array
        A NumPy array containing the model inputs
    Y : numpy.array
        A NumPy array containing the model outputs
    S : int
        Number of slides; the conditioning intervals (default 10)
    print_to_console : bool
        Print results directly to console (default False)
    seed : int
        Seed value to ensure deterministic results
    References
    ----------
    1. Pianosi, F., Wagener, T., 2015.
       A simple and efficient method for global sensitivity analysis
       based on cumulative distribution functions.
       Environmental Modelling & Software 67, 1-11.
       https://doi.org/10.1016/j.envsoft.2015.01.004

    2. Pianosi, F., Wagener, T., 2018.
       Distribution-based sensitivity analysis from a generic input-output
       sample.
       Environmental Modelling & Software 108, 197-207.
       https://doi.org/10.1016/j.envsoft.2018.07.019

    3. Baroni, G., Francke, T., 2020.
       An effective strategy for combining variance- and distribution-based
       global sensitivity analysis.
       Environmental Modelling & Software 134, 104851.
       https://doi.org/10.1016/j.envsoft.2020.104851

    4. Baroni, G., Francke, T., 2020.
       GSA-cvd: Combining variance- and distribution-based global
       sensitivity analysis.
       https://github.com/baronig/GSA-cvd
    """
    if seed:
        np.random.seed(seed)

    D = problem["num_vars"]
    var_names, _ = extract_group_names(problem)

    results = np.full((D, 6), np.nan)
    temp_pawn = np.full((S, D), np.nan)

    # Quantile edges delimiting the S conditioning intervals; these are the
    # same for every input, so compute them once outside the loop
    step = 1 / S
    seq = np.arange(0, 1 + step, step)

    for d_i in range(D):
        X_di = X[:, d_i]
        X_q = np.nanquantile(X_di, seq)
        for s in range(S):
            # Select outputs whose input value falls within the s-th interval
            Y_sel = Y[(X_di >= X_q[s]) & (X_di < X_q[s + 1])]
            if len(Y_sel) == 0:
                # No samples available in this conditioning interval
                continue

            # KS statistic
            # ks_2samp returns an object which holds the KS statistic
            # and p-value.
            # Note from the scipy documentation:
            # if the KS statistic is small or the p-value is high, then
            # we cannot reject the hypothesis that the distributions of
            # the two samples are the same.
            ks = ks_2samp(Y_sel, Y)
            temp_pawn[s, d_i] = ks.statistic
        # Summarize the S KS values for this input; the nan-aware statistics
        # skip any conditioning intervals that had no samples
        p_ind = temp_pawn[:, d_i]
        mins = np.nanmin(p_ind)
        mean = np.nanmean(p_ind)
        med = np.nanmedian(p_ind)
        maxs = np.nanmax(p_ind)
        stdev = np.nanstd(p_ind)
        cv = stdev / mean
        results[d_i, :] = [mins, mean, med, maxs, cv, stdev]
    groups = _check_groups(problem)
    if groups:
        unique_grps, n_groups = extract_group_names(problem)
        tmp = np.full((n_groups, results.shape[1]), np.nan)

        # Take the mean of effects from parameters that are grouped together
        unique_grps = np.array(unique_grps)
        grps = np.array(groups)
        for grp_id, grp in enumerate(unique_grps):
            tmp[grp_id, :] = np.mean(results[grps == grp, :], axis=0)

        results = tmp
        tmp = None
    Si = ResultDict(
        [
            ("minimum", results[:, 0]),
            ("mean", results[:, 1]),
            ("median", results[:, 2]),
            ("maximum", results[:, 3]),
            ("CV", results[:, 4]),
            ("stdev", results[:, 5]),
        ]
    )
    Si["names"] = var_names

    if print_to_console:
        print(Si.to_df())

    return Si

def cli_parse(parser):
    parser.add_argument(
        "-X", "--model-input-file", type=str, required=True, help="Model input file"
    )
    parser.add_argument(
        "-S",
        "--slices",
        type=int,
        required=False,
        default=10,  # match the default of analyze() so S is never None
        help="Number of slices to take",
    )
    return parser

def cli_action(args):
    problem = read_param_file(args.paramfile)
    X = np.loadtxt(args.model_input_file, delimiter=args.delimiter)
    Y = np.loadtxt(
        args.model_output_file, delimiter=args.delimiter, usecols=(args.column,)
    )
    analyze(problem, X, Y, S=args.slices, print_to_console=True, seed=args.seed)
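
# Example command-line invocation (a sketch; the `-p`, `-Y`, `--delimiter`,
# `--column`, and `--seed` flags are assumed to be supplied by `common_args`):
#
#   python -m SALib.analyze.pawn -p problem.txt \
#       -X model_inputs.txt -Y model_outputs.txt -S 10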
if __name__ == "__main__":
    common_args.run_cli(cli_parse, cli_action)