Rejection probability is not predicted by loss distribution#
\(\require{mathtools} \newcommand{\notag}{} \newcommand{\tag}{} \newcommand{\label}[1]{} \newcommand{\sfrac}[2]{#1/#2} \newcommand{\bm}[1]{\boldsymbol{#1}} \newcommand{\num}[1]{#1} \newcommand{\qty}[2]{#1\,#2} \renewenvironment{align} {\begin{aligned}} {\end{aligned}} \renewenvironment{alignat} {\begin{alignedat}} {\end{alignedat}} \newcommand{\pdfmspace}[1]{} % Ignore PDF-only spacing commands \newcommand{\htmlmspace}[1]{\mspace{#1}} % Ignore PDF-only spacing commands \newcommand{\scaleto}[2]{#1} % Allow to use scaleto from scalerel package \newcommand{\RR}{\mathbb R} \newcommand{\NN}{\mathbb N} \newcommand{\PP}{\mathbb P} \newcommand{\EE}{\mathbb E} \newcommand{\XX}{\mathbb X} \newcommand{\ZZ}{\mathbb Z} \newcommand{\QQ}{\mathbb Q} \newcommand{\fF}{\mathcal F} \newcommand{\dD}{\mathcal D} \newcommand{\lL}{\mathcal L} \newcommand{\gG}{\mathcal G} \newcommand{\hH}{\mathcal H} \newcommand{\nN}{\mathcal N} \newcommand{\pP}{\mathcal P} \newcommand{\BB}{\mathbb B} \newcommand{\Exp}{\operatorname{Exp}} \newcommand{\Binomial}{\operatorname{Binomial}} \newcommand{\Poisson}{\operatorname{Poisson}} \newcommand{\linop}{\mathcal{L}(\mathbb{B})} \newcommand{\linopell}{\mathcal{L}(\ell_1)} \DeclareMathOperator{\trace}{trace} \DeclareMathOperator{\Var}{Var} \DeclareMathOperator{\Span}{span} \DeclareMathOperator{\proj}{proj} \DeclareMathOperator{\col}{col} \DeclareMathOperator*{\argmin}{arg\,min} \DeclareMathOperator*{\argmax}{arg\,max} \DeclareMathOperator*{\gt}{>} \definecolor{highlight-blue}{RGB}{0,123,255} % definition, theorem, proposition \definecolor{highlight-yellow}{RGB}{255,193,7} % lemma, conjecture, example \definecolor{highlight-orange}{RGB}{253,126,20} % criterion, corollary, property \definecolor{highlight-red}{RGB}{220,53,69} % criterion \newcommand{\logL}{\ell} \newcommand{\eE}{\mathcal{E}} \newcommand{\oO}{\mathcal{O}} \newcommand{\defeq}{\stackrel{\mathrm{def}}{=}} \newcommand{\Bspec}{\mathcal{B}} % Spectral radiance \newcommand{\X}{\mathcal{X}} % X space \newcommand{\Y}{\mathcal{Y}} % Y space \newcommand{\M}{\mathcal{M}} % Model \newcommand{\Tspace}{\mathcal{T}} \newcommand{\Vspace}{\mathcal{V}} \newcommand{\Mtrue}{\mathcal{M}_{\mathrm{true}}} \newcommand{\MP}{\M_{\mathrm{P}}} \newcommand{\MRJ}{\M_{\mathrm{RJ}}} \newcommand{\qproc}{\mathfrak{Q}} \newcommand{\D}{\mathcal{D}} % Data (true or generic) \newcommand{\Dt}{\tilde{\mathcal{D}}} \newcommand{\Phit}{\widetilde{\Phi}} \newcommand{\Phis}{\Phi^*} \newcommand{\qt}{\tilde{q}} \newcommand{\qs}{q^*} \newcommand{\qh}{\hat{q}} \newcommand{\AB}[1]{\mathtt{AB}~\mathtt{#1}} \newcommand{\LP}[1]{\mathtt{LP}~\mathtt{#1}} \newcommand{\NML}{\mathrm{NML}} \newcommand{\iI}{\mathcal{I}} \newcommand{\true}{\mathrm{true}} \newcommand{\dist}{D} \newcommand{\Mtheo}[1]{\mathcal{M}_{#1}} % Model (theoretical model); index: param set \newcommand{\DL}[1][L]{\mathcal{D}^{(#1)}} % Data (RV or generic) \newcommand{\DLp}[1][L]{\mathcal{D}^{(#1')}} % Data (RV or generic) \newcommand{\DtL}[1][L]{\tilde{\mathcal{D}}^{(#1)}} % Data (RV or generic) \newcommand{\DpL}[1][L]{{\mathcal{D}'}^{(#1)}} % Data (RV or generic) \newcommand{\Dobs}[1][]{\mathcal{D}_{\mathrm{obs}}^{#1}} % Data (observed) \newcommand{\calibset}{\mathcal{C}} \newcommand{\N}{\mathcal{N}} % Normal distribution \newcommand{\Z}{\mathcal{Z}} % Partition function \newcommand{\VV}{\mathbb{V}} % Variance \newcommand{\T}{\mathsf{T}} % Transpose \newcommand{\EMD}{\mathrm{EMD}} \newcommand{\dEMD}{d_{\mathrm{EMD}}} \newcommand{\dEMDtilde}{\tilde{d}_{\mathrm{EMD}}} 
\newcommand{\dEMDsafe}{d_{\mathrm{EMD}}^{\text{(safe)}}} \newcommand{\e}{ε} % Model confusion threshold \newcommand{\falsifythreshold}{ε} \newcommand{\bayes}[1][]{B_{#1}} \newcommand{\bayesthresh}[1][]{B_{0}} \newcommand{\bayesm}[1][]{B^{\mathcal{M}}_{#1}} \newcommand{\bayesl}[1][]{B^l_{#1}} \newcommand{\bayesphys}[1][]{B^{{p}}_{#1}} \newcommand{\Bconf}[1]{B^{\mathrm{epis}}_{#1}} \newcommand{\Bemd}[1]{B^{\mathrm{EMD}}_{#1}} \newcommand{\BQ}[1]{B^{Q}_{#1}} \newcommand{\Bconfbin}[1][]{\bar{B}^{\mathrm{conf}}_{#1}} \newcommand{\Bemdbin}[1][]{\bar{B}_{#1}^{\mathrm{EMD}}} \newcommand{\bin}{\mathcal{B}} \newcommand{\Bconft}[1][]{\tilde{B}^{\mathrm{conf}}_{#1}} \newcommand{\fc}{f_c} \newcommand{\fcbin}{\bar{f}_c} \newcommand{\paramphys}[1][]{Θ^{{p}}_{#1}} \newcommand{\paramobs}[1][]{Θ^{ε}_{#1}} \newcommand{\test}{\mathrm{test}} \newcommand{\train}{\mathrm{train}} \newcommand{\synth}{\mathrm{synth}} \newcommand{\rep}{\mathrm{rep}} \newcommand{\MNtrue}{\mathcal{M}^{{p}}_{\text{true}}} \newcommand{\MN}[1][]{\mathcal{M}^{{p}}_{#1}} \newcommand{\MNA}{\mathcal{M}^{{p}}_{Θ_A}} \newcommand{\MNB}{\mathcal{M}^{{p}}_{Θ_B}} \newcommand{\Me}[1][]{\mathcal{M}^ε_{#1}} \newcommand{\Metrue}{\mathcal{M}^ε_{\text{true}}} \newcommand{\Meobs}{\mathcal{M}^ε_{\text{obs}}} \newcommand{\Meh}[1][]{\hat{\mathcal{M}}^ε_{#1}} \newcommand{\MNa}{\mathcal{M}^{\mathcal{N}}_a} \newcommand{\MeA}{\mathcal{M}^ε_A} \newcommand{\MeB}{\mathcal{M}^ε_B} \newcommand{\Ms}{\mathcal{M}^*} \newcommand{\MsA}{\mathcal{M}^*_A} \newcommand{\MsB}{\mathcal{M}^*_B} \newcommand{\Msa}{\mathcal{M}^*_a} \newcommand{\MsAz}{\mathcal{M}^*_{A,z}} \newcommand{\MsBz}{\mathcal{M}^*_{B,z}} \newcommand{\Msaz}{\mathcal{M}^*_{a,z}} \newcommand{\MeAz}{\mathcal{M}^ε_{A,z}} \newcommand{\MeBz}{\mathcal{M}^ε_{B,z}} \newcommand{\Meaz}{\mathcal{M}^ε_{a,z}} \newcommand{\zo}{z^{0}} \renewcommand{\lL}[2][]{\mathcal{L}_{#1|{#2}}} % likelihood \newcommand{\Lavg}[2][]{\mathcal{L}^{/#2}_{#1}} % Geometric average of likelihood \newcommand{\lLphys}[2][]{\mathcal{L}^{{p}}_{#1|#2}} \newcommand{\Lavgphys}[2][]{\mathcal{L}^{{p}/#2}_{#1}} % Geometric average of likelihood \newcommand{\lLL}[3][]{\mathcal{L}^{(#3)}_{#1|#2}} \newcommand{\lLphysL}[3][]{\mathcal{L}^{{p},(#3)}_{#1|#2}} \newcommand{\lnL}[2][]{l_{#1|#2}} % Per-sample log likelihood \newcommand{\lnLt}[2][]{\widetilde{l}_{#1|#2}} \newcommand{\lnLtt}{\widetilde{l}} % Used only in path_sampling \newcommand{\lnLh}[1][]{\hat{l}_{#1}} \newcommand{\lnLphys}[2][]{l^{{p}}_{#1|#2}} \newcommand{\lnLphysL}[3][]{l^{{p},(#3)}_{#1|#2}} \newcommand{\Elmu}[2][1]{μ_{{#2}}^{(#1)}} \newcommand{\Elmuh}[2][1]{\hat{μ}_{{#2}}^{(#1)}} \newcommand{\Elsig}[2][1]{Σ_{{#2}}^{(#1)}} \newcommand{\Elsigh}[2][1]{\hat{Σ}_{{#2}}^{(#1)}} \newcommand{\pathP}{\mathop{{p}}} % Path-sampling process (generic) \newcommand{\pathPhb}{\mathop{{p}}_{\mathrm{Beta}}} % Path-sampling process (hierarchical beta) \newcommand{\interval}{\mathcal{I}} \newcommand{\Phiset}[1]{\{\Phi\}^{\small (#1)}} \newcommand{\Phipart}[1]{\{\mathcal{I}_Φ\}^{\small (#1)}} \newcommand{\qhset}[1]{\{\qh\}^{\small (#1)}} \newcommand{\Dqpart}[1]{\{Δ\qh_{2^{#1}}\}} \newcommand{\LsAzl}{\mathcal{L}_{\smash{{}^{\,*}_A},z,L}} \newcommand{\LsBzl}{\mathcal{L}_{\smash{{}^{\,*}_B},z,L}} \newcommand{\lsA}{l_{\smash{{}^{\,*}_A}}} \newcommand{\lsB}{l_{\smash{{}^{\,*}_B}}} \newcommand{\lsAz}{l_{\smash{{}^{\,*}_A},z}} \newcommand{\lsAzj}{l_{\smash{{}^{\,*}_A},z_j}} \newcommand{\lsAzo}{l_{\smash{{}^{\,*}_A},z^0}} \newcommand{\leAz}{l_{\smash{{}^{\,ε}_A},z}} \newcommand{\lsAez}{l_{\smash{{}^{*ε}_A},z}} 
\newcommand{\lsBz}{l_{\smash{{}^{\,*}_B},z}} \newcommand{\lsBzj}{l_{\smash{{}^{\,*}_B},z_j}} \newcommand{\lsBzo}{l_{\smash{{}^{\,*}_B},z^0}} \newcommand{\leBz}{l_{\smash{{}^{\,ε}_B},z}} \newcommand{\lsBez}{l_{\smash{{}^{*ε}_B},z}} \newcommand{\LaszL}{\mathcal{L}_{\smash{{}^{*}_a},z,L}} \newcommand{\lasz}{l_{\smash{{}^{*}_a},z}} \newcommand{\laszj}{l_{\smash{{}^{*}_a},z_j}} \newcommand{\laszo}{l_{\smash{{}^{*}_a},z^0}} \newcommand{\laez}{l_{\smash{{}^{ε}_a},z}} \newcommand{\lasez}{l_{\smash{{}^{*ε}_a},z}} \newcommand{\lhatasz}{\hat{l}_{\smash{{}^{*}_a},z}} \newcommand{\pasz}{p_{\smash{{}^{*}_a},z}} \newcommand{\paez}{p_{\smash{{}^{ε}_a},z}} \newcommand{\pasez}{p_{\smash{{}^{*ε}_a},z}} \newcommand{\phatsaz}{\hat{p}_{\smash{{}^{*}_a},z}} \newcommand{\phateaz}{\hat{p}_{\smash{{}^{ε}_a},z}} \newcommand{\phatseaz}{\hat{p}_{\smash{{}^{*ε}_a},z}} \newcommand{\Phil}[2][]{Φ_{#1|#2}} % Φ_{\la} \newcommand{\Philt}[2][]{\widetilde{Φ}_{#1|#2}} % Φ_{\la} \newcommand{\Philhat}[2][]{\hat{Φ}_{#1|#2}} % Φ_{\la} \newcommand{\Philsaz}{Φ_{\smash{{}^{*}_a},z}} % Φ_{\lasz} \newcommand{\Phileaz}{Φ_{\smash{{}^{ε}_a},z}} % Φ_{\laez} \newcommand{\Philseaz}{Φ_{\smash{{}^{*ε}_a},z}} % Φ_{\lasez} \newcommand{\mus}[1][1]{μ^{(#1)}_*} \newcommand{\musA}[1][1]{μ^{(#1)}_{\smash{{}^{\,*}_A}}} \newcommand{\SigsA}[1][1]{Σ^{(#1)}_{\smash{{}^{\,*}_A}}} \newcommand{\musB}[1][1]{μ^{(#1)}_{\smash{{}^{\,*}_B}}} \newcommand{\SigsB}[1][1]{Σ^{(#1)}_{\smash{{}^{\,*}_B}}} \newcommand{\musa}[1][1]{μ^{(#1)}_{\smash{{}^{*}_a}}} \newcommand{\Sigsa}[1][1]{Σ^{(#1)}_{\smash{{}^{*}_a}}} \newcommand{\Msah}{{\color{highlight-red}\mathcal{M}^{*}_a}} \newcommand{\Msazh}{{\color{highlight-red}\mathcal{M}^{*}_{a,z}}} \newcommand{\Meah}{{\color{highlight-blue}\mathcal{M}^{ε}_a}} \newcommand{\Meazh}{{\color{highlight-blue}\mathcal{M}^{ε}_{a,z}}} \newcommand{\lsazh}{{\color{highlight-red}l_{\smash{{}^{*}_a},z}}} \newcommand{\leazh}{{\color{highlight-blue}l_{\smash{{}^{ε}_a},z}}} \newcommand{\lseazh}{{\color{highlight-orange}l_{\smash{{}^{*ε}_a},z}}} \newcommand{\Philsazh}{{\color{highlight-red}Φ_{\smash{{}^{*}_a},z}}} % Φ_{\lasz} \newcommand{\Phileazh}{{\color{highlight-blue}Φ_{\smash{{}^{ε}_a},z}}} % Φ_{\laez} \newcommand{\Philseazh}{{\color{highlight-orange}Φ_{\smash{{}^{*ε}_a},z}}} % Φ_{\lasez} \newcommand{\emdstd}{\tilde{σ}} \DeclareMathOperator{\Mvar}{Mvar} \DeclareMathOperator{\AIC}{AIC} \DeclareMathOperator{\epll}{epll} \DeclareMathOperator{\elpd}{elpd} \DeclareMathOperator{\MDL}{MDL} \DeclareMathOperator{\comp}{COMP} \DeclareMathOperator{\Lognorm}{Lognorm} \DeclareMathOperator{\erf}{erf} \DeclareMathOperator*{\argmax}{arg\,max} \DeclareMathOperator{\Image}{Image} \DeclareMathOperator{\sgn}{sgn} \DeclareMathOperator{\SE}{SE} % standard error \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator{\Poisson}{Poisson} \DeclareMathOperator{\SkewNormal}{SkewNormal} \DeclareMathOperator{\TruncNormal}{TruncNormal} \DeclareMathOperator{\Exponential}{Exponential} \DeclareMathOperator{\exGaussian}{exGaussian} \DeclareMathOperator{\IG}{IG} \DeclareMathOperator{\NIG}{NIG} \DeclareMathOperator{\Gammadist}{Gamma} \DeclareMathOperator{\Lognormal}{Lognormal} \DeclareMathOperator{\Beta}{Beta} \newcommand{\sinf}{{s_{\infty}}}\)
This notebook explores a question similar to the one in Aleatoric vs Sampling vs Replication uncertainty, namely why we cannot use the loss distribution directly and instead really need to estimate the epistemic distribution.
Here we explore the question in the context of our calibration experiments. Ultimately the goal is to predict the probability that a replication of the experiment would select model \(A\) over model \(B\), and this is exactly what our calibration experiments test (with synthetic replications). So if we can replace \(\Bemd{}\) by a hypothetical \(\BQ{}\) defined solely in terms of the loss distribution and still find a correlation with \(\Bconf{}\), that would suggest that we don’t really need \(\Bemd{}\) (which depends on a more complicated distribution and ad hoc parameter \(c\)) – we could get away with considering just the loss distribution, which is much simpler and therefore more appealing.
Of course, there is no reason to expect that the loss distribution (which is a result of aleatoric uncertainty) will be related to replication uncertainty – indeed, that is the point of the aforementioned notebook. But the question gets asked, so in addition to arguing on the basis of principle, it is good to tackle the question head-on, in the most direct way possible, with an actual experiment.
One challenge is how best to define \(B^Q\): since it measures aleatoric instead of replication/epistemic uncertainty, there is no ontologically obvious way to do this. So instead we will use the mathematically obvious way[1] and “hope for the best”:[2] we match the form of Eq. 2.13, replacing the \(R\)-distribution by the \(Q\)-distribution:
Note that the probability is now over the samples of \(\Mtrue\) rather than over replications. The parameter \(c_Q\) is an additional degree of freedom; it effectively convolves the \(Q\) distributions with a Gaussian, thus increasing their spread and (similar to the effect of \(c\) on \(\Bemd{}\)) decreasing the sensitivity of \(\BQ{}\). We include \(η\) so that both \(\BQ{}\) and \(\Bemd{}\) have the same number of free parameters, which makes the comparison fairer. We will select the \(c_Q\) which maximizes the correlation with \(\Bconf{}\), thus giving \(\BQ{}\) the best chance to match the performance of \(\Bemd{}\).
The advantage of Eq. 7.1 is that it is easily estimated from observed samples, with no need to define an ad hoc distribution over quantile functions. The disadvantage is that it replaces an ad hoc distribution by an ad hoc ontology, since there is no clear link between the distribution of \(Q\) values and the ability of a model to replicate. (The two are not unrelated, but the relation is highly confounded by the aleatoric uncertainty.) As we will see below, this makes it a poor predictor for \(\Bconf{}\).
Note also that appropriate values of \(c_Q\) depend entirely on the typical values of the loss. If the loss is given in terms of a probability density rather than a probability mass, it is not even scale independent. There is therefore even less reason to expect values of \(c_Q\) to generalize to other problems, in contrast to what we observed with \(\Bemd{}\).
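To make the idea concrete, the following is a minimal sketch of how a quantity of this form could be estimated from per-sample losses. The function name, signature and exact form are illustrative assumptions (the actual computation is done by the CalibrateBQ task loaded below), but it follows the description above: resample the losses of each candidate under \(\Mtrue\), add Gaussian noise of scale \(c_Q\), and count how often model \(A\) comes out ahead.

import numpy as np

def estimate_BQ(QA, QB, cQ, n_draws=4000, rng=None):
    # Illustrative sketch only: QA and QB are arrays of per-sample losses of the two
    # candidates on data drawn from the true model; the η draws implement the Gaussian
    # convolution of scale cQ, and n_draws plays the role of LQ below.
    rng = np.random.default_rng(rng)
    qa = rng.choice(QA, size=n_draws) + cQ * rng.standard_normal(n_draws)
    qb = rng.choice(QB, size=n_draws) + cQ * rng.standard_normal(n_draws)
    return np.mean(qa < qb)  # fraction of draws where model A has the lower (smoothed) loss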
Imports#
import numpy as np
from functools import partial
from itertools import chain
import holoviews as hv
hv.extension("matplotlib", "bokeh")
emdcmp library
import emdcmp
Project code
from config import config
import viz
import viz.emdcmp # Backported updates to emdcmp.viz
Load the epistemic distributions used in the calibration experiments.
Load the customized calibration task which computes \(\BQ{}\) instead of \(\Bemd{}\).
Execution#
How many experiments
The function emdcmp.viz.calibration_plot attempts to collect results into 16 bins,[3] so making \(N\) a multiple of 16 works nicely.
Numbers that worked well for us
For an initial pilot run, we found \(N=64\) or \(N=128\) to be good numbers. These numbers produce respectively 4 or 8 bins, which is often enough to check that \(\Bemd{}\) and \(\Bconf{}\) are reasonably distributed and that the epistemic distribution is actually probing the transition from strong to equivocal evidence. A subsequent run with \(N \in \{256, 512, 1024, 2048, 4096\}\) can then refine and smooth the curve.
We only run the experiments on the 6 epistemic distributions used in the main paper.
#N = 64
#N = 128
#N = 512
N = 2048
Ωdct = {(f"{Ω.a} vs {Ω.b}", Ω.ξ_name, Ω.σo_dist, Ω.τ_dist, Ω.σi_dist): Ω
for Ω in [
EpistemicDist(N, "C", "D", "Gaussian", "low noise", "short input correlations", "weak input"),
EpistemicDist(N, "A", "B", "Gaussian", "low noise", "short input correlations", "weak input"),
EpistemicDist(N, "A", "D", "Gaussian", "low noise", "short input correlations", "weak input"),
EpistemicDist(N, "C", "D", "Gaussian", "low noise", "short input correlations", "strong input"),
EpistemicDist(N, "A", "B", "Gaussian", "low noise", "short input correlations", "strong input"),
EpistemicDist(N, "A", "D", "Gaussian", "low noise", "short input correlations", "strong input"),
]
}
Hint
Calibrate will iterate over the data models twice, so it is important that the iterable passed as data_models not be consumable.
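To illustrate the pitfall with generic placeholder names (not the project's classes): a bare generator is exhausted after one pass, whereas an object whose __iter__ builds a fresh generator can safely be iterated twice.

def make_experiments():              # generator function: each call returns a fresh iterator
    for seed in range(4):
        yield f"condition-{seed}"    # placeholder for one calibration condition

gen = make_experiments()
list(gen)                            # first pass: 4 conditions
assert list(gen) == []               # second pass: already consumed

class Experiments:                   # re-iterable wrapper: safe where two passes are needed
    def __iter__(self):
        return make_experiments()

exps = Experiments()
assert list(exps) == list(exps)      # both passes yield the same 4 conditions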
c_list = [2**-6, 2**-4, 2**-2, 2**0, 2**2, 2**4]
LQ = 4000 # Number of draws for the Monte Carlo estimate of normal dist addition
cQ_chosen = -2**-3
cQ_list = [#-2**1, 2**1,
           #-2**0, 2**0,
           -2**-1, 2**-1,
           #-2**-2, 2**-2,
           -2**-3, 2**-3,
           #0
          ]
tasks_normal = {}
tasks_BQ = {}
for Ωkey, Ω in Ωdct.items():
    task = CalibrateBQ(
        reason = f"BQ calibration attempt – {Ω.a} vs {Ω.b} - {Ω.ξ_name} - {Ω.σo_dist} - {Ω.τ_dist} - {Ω.σi_dist} - {N=}",
        #c_list = cQ_list,
        c_list = c_list,
        experiments = Ω.generate(N),
        Ldata = L_data,
        Linf = Linf,
        LQ = LQ
    )
    tasks_BQ[Ωkey] = task
    task = emdcmp.tasks.Calibrate(
        reason = f"Prinz calibration – {Ω.a} vs {Ω.b} - {Ω.ξ_name} - {Ω.σo_dist} - {Ω.τ_dist} - {Ω.σi_dist} - {N=}",
        c_list = c_list,
        #c_list = [2**-6, 2**-4, 2**-2, 2**0, 2**4],
        experiments = Ω.generate(N),
        Ldata = L_data,
        Linf = Linf
    )
    tasks_normal[Ωkey] = task
The code below creates task files for any missing tasks.
for task in chain(tasks_normal.values(), tasks_BQ.values()):
    if not task.has_run:  # Don’t create task files for tasks which have already run
        Ω = task.experiments
        taskfilename = f"prinz_{type(task).__qualname__}__{Ω.a}vs{Ω.b}_{Ω.ξ_name}_{Ω.σo_dist}_{Ω.τ_dist}_{Ω.σi_dist}_N={Ω.N}_c={task.c_list}"
        task.save(taskfilename)
If any files were created, run those tasks from the command line with
smttask run -n1 --import config <task file>
before continuing.
assert all(task.has_run for task in chain(tasks_normal.values(), tasks_BQ.values()))
Comparison of \(\Bemd{}\) and \(\BQ{}\) calibration curves#
Important
The basic principles for calibration
- Wide support
The goal of calibration is to probe the intermediate regime where selection of either model \(A\) or \(B\) is not certain. It is therefore important that we obtain values of \(\Bemd{}\) over the entire interval \([0, 1]\).
Whether this is the case will be a function of two things:
- the design of the calibration experiments: whether it produces ambiguous selection problems;
- the choice of \(c\): generally, a larger \(c\) will concentrate \(\Bemd{}\) towards 0.5, while a smaller \(c\) will concentrate values towards 0 and 1.
So we want the support of \(\Bemd{}\) to be as large as possible.
- Flat distribution
As a secondary goal, we also want the distribution to be as flat as possible, since this leads to more efficient sampling: we need enough samples in every subinterval of \(\Bemd{}\), so it is the most sparsely sampled regions which determine how many calibration datasets we need to generate, and therefore how long the computation needs to run. (With \(N=2048\) and 16 bins, a perfectly flat distribution would place 128 samples in each bin.)
Beyond making for shorter compute times, however, a flat distribution isn’t in and of itself a good thing: more important is that the criterion is able to resolve the models when it should.
Hint: Diagnosing \(\Bemd{}\) and \(\Bconf{}\) histograms
- \(\Bemd{}\) distribution which bulges around 0.5.
May indicate that \(c\) is too large and the criterion underconfident.
May also indicate that the calibration distribution is generating a large number of (data, modelA, modelB) triples which are essentially undecidable. If neither model is a good fit to the data, then their \(δ^{\mathrm{EMD}}\) discrepancies between mixed and synthetic PPFs will be large, and they will have broad distributions for the expected risk. Broad distributions overlap more, hence the skew of \(\Bemd{}\) towards 0.5.
- \(\Bemd{}\) distribution which is heavy at both ends.
May indicate that \(c\) is too small and the criterion overconfident.
May also indicate that the calibration distribution is not sampling enough ambiguous conditions. In this case the answer is not to increase the value of \(c\) but rather to tighten the calibration distribution to focus on the area with \(\Bemd{}\) close to 0.5. It may be possible to simply run the calibration longer until there are enough samples everywhere, but this is generally less effective than adjusting the calibration distribution.
- \(\Bemd{}\) distribution which is heavily skewed either towards 0 or 1.
Check that the calibration distribution is using both candidate models to generate datasets. Usually the best approach is to use each candidate to generate half of the datasets (see the sketch after this list): then each model should fit best in roughly half the cases. The skew need not be removed entirely – one model may just be more permissive than the other.
This can also happen when \(c\) is too small.
- \(\Bconf{}\) distribution which is almost entirely on either 0 or 1.
Again, check that the calibration distribution is using both models to generate datasets.
If each candidate is used for half the datasets and we still see an uneven distribution of \(\Bconf{}\), then this can indicate a problem: it means that the ideal measure we are striving towards (true expected risk) is unable to identify the model used to generate the data. In this case, tweaking the \(c\) value is a waste of time: the issue is with the problem statement rather than with the \(\Bemd{}\) calibration. Most likely the loss is ill-suited to the problem:
It might not account for rotation/translation symmetries in images, or time dilation in time-series.
One model’s loss might be lower, even on data generated with the other model. This can happen with a log posterior, when one model has more parameters: its higher dimensional prior “dilutes” the likelihood. This may be grounds to reject the more complex model on the basis of preferring simplicity, but it is not grounds to falsify that model. (Since it may still fit the data equally well.)
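As a schematic illustration of the half-and-half recommendation above (using made-up placeholder names, not this notebook's EpistemicDist), a calibration distribution can simply alternate which candidate generates each dataset while still varying the other experimental parameters:

import numpy as np

def toy_calibration_conditions(N, rng=None):
    # Hypothetical sketch: yield (data_model, candidate_A, candidate_B) labels for N conditions.
    rng = np.random.default_rng(rng)
    for n in range(N):
        σ_obs = rng.uniform(0.05, 0.5)                     # example parameter that varies per condition
        candidate_A, candidate_B = ("A", σ_obs), ("B", σ_obs)
        # Each candidate generates half of the datasets, so Bconf should split roughly 50/50
        data_model = candidate_A if n % 2 == 0 else candidate_B
        yield data_model, candidate_A, candidate_B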
Hint: Diagnosing calibration curves
- Flat calibration curve
This is the most critical issue, since it indicates that \(\Bemd{}\) is actually not predictive of \(\Bconf{}\) at all. The most common reason is a mistake in the definition of the calibration distribution, where some values are fixed when they shouldn’t be.
Remember that the model construction pipeline used on the real data needs to be repeated in full for each experimental condition produced by the calibration distribution. For example, in Experiment above we refit the observation noise \(σ\) for each experimental condition generated within __iter__.
Treat any global used within EpistemicDist with particular suspicion, as it is likely to fix values which should be variable. To minimize the risk of accidental global variables, you can define EpistemicDist in its own separate module.
To help investigate issues, it is often helpful to reconstruct conditions that produce the unexpected behaviour. The following code snippet recovers the first calibration dataset for which both Bemd > 0.9 and Bconf = False; the recovered dataset is D:

Bemd = calib_results[1.0]["Bemd"]
Bconf = calib_results[1.0]["Bconf"]
i = next(iter(i for i in range(len(Bemd)) if Bemd[i] > 0.9 and not Bconf[i]))
for j, D in zip(range(i+1), task.models_Qs):
    pass
assert j == i
- Calibration curve with shortened domain
I.e. \(\Bemd{}\) values don’t reach 0 and/or 1. This is not necessarily critical: the calibration distribution we want to test may simply not allow the candidate models to be fully distinguished under any condition.
If it is acceptable to change the calibration distribution (or to add one to the test suite), then the most common way to address this is to ensure the distribution produces conditions where \(\Bemd{}\) can achieve maximum confidence – for example conditions with negligible observation noise.
calibs_normal = {key: viz.emdcmp.calibration_plot(task.unpack_results(task.run()))
                 for key, task in tasks_normal.items()}
calibs_BQ = {key: viz.emdcmp.calibration_plot(task.unpack_results(task.run()))
             for key, task in tasks_BQ.items()}
def mkkey(taskkey): return taskkey[0], taskkey[4]
taskdims = ["models", "input"]
hm = hv.HoloMap({("EMD", "Bemd", c, *mkkey(taskkey)): hist for taskkey, calplot in calibs_normal.items() for c, hist in calplot.Bemd_hists.items()}
| {("EMD", "Bepis", c, *mkkey(taskkey)): hist for taskkey, calplot in calibs_normal.items() for c, hist in calplot.Bepis_hists.items()}
| {("Q", "Bemd", c, *mkkey(taskkey)): hist for taskkey, calplot in calibs_BQ.items() for c, hist in calplot.Bemd_hists.items()}
| {("Q", "Bepis", c, *mkkey(taskkey)): hist for taskkey, calplot in calibs_BQ.items() for c, hist in calplot.Bepis_hists.items()},
kdims=["uncertainty", "B", "c", *taskdims])
# NB: The set of c values is disjoint, so attempting to plot the two uncertainty dists together will lead to artifacts
hm.select(uncertainty="EMD").overlay("B").layout(taskdims).cols(3).opts(
hv.opts.Layout(backend="matplotlib", fig_inches=3, tight=True),
hv.opts.NdLayout(backend="matplotlib", fig_inches=3, tight=True),
hv.opts.Histogram(backend="matplotlib", aspect=2)
)
hm.select(uncertainty="Q").overlay("B").layout(taskdims).cols(3).opts(
hv.opts.Layout(backend="matplotlib", fig_inches=3, tight=True),
hv.opts.NdLayout(backend="matplotlib", fig_inches=3, tight=True),
hv.opts.Histogram(backend="matplotlib", aspect=2)
)
fig = hm.select(uncertainty="EMD", B="Bemd").drop_dimension(["uncertainty", "B"]).overlay("c") \
+ hm.select(uncertainty="Q", B="Bemd").drop_dimension(["uncertainty", "B"]).overlay("c")\
.redim(Bemd="BQ").opts(title="BQ")
fig.opts(hv.opts.Layout(backend="matplotlib", fig_inches=2, sublabel_format=""),
hv.opts.Histogram(backend="matplotlib", aspect=1.5),
hv.opts.NdOverlay(legend_position="best")
)
calibopts = (
hv.opts.Overlay(backend="matplotlib",
#legend_cols=3,
#legend_opts={"columnspacing": .5, "alignment": "center",
# "loc": "upper center"},
#hooks=[partial(viz.despine_hook(), left=False)],
#fig_inches=config.figures.defaults.fig_inches,
aspect=1, fontscale=1.3),
hv.opts.Scatter(backend="matplotlib", s=20),
#hv.opts.Scatter(color=hv.Palette("copper", range=(0., 1), reverse=True)),
hv.opts.Layout(sublabel_format=""),
hv.opts.Layout(backend="matplotlib", hspace=0.1, vspace=0.05,
fig_inches=0.65*config.figures.defaults.fig_inches)
)
def format_calib_curves(panels, tasks):
    assert len(panels) == 6
    top_row = panels[:3]
    bot_row = panels[3:]
    # Column titles
    pairs = [(task.experiments.a, task.experiments.b) for task in tasks]
    assert pairs[:3] == pairs[3:]
    pairs = pairs[:3]
    col_titles = [rf"$\mathcal{{{{M}}}}_{a}$ vs $\mathcal{{{{M}}}}_{b}$" for a,b in pairs]
    panels[0].opts(title=col_titles[0])
    panels[1].opts(title=col_titles[1])
    panels[2].opts(title=col_titles[2])
    # Row titles
    assert all(task.experiments.σi_dist == "weak input" for task in tasks[:3])
    assert all(task.experiments.σi_dist == "strong input" for task in tasks[3:])
    panels[0].opts(ylabel=r"weak $I_{\mathrm{ext}}$")
    panels[3].opts(ylabel=r"strong $I_{\mathrm{ext}}$")
    # Legend placement
    panels[0].opts(hv.opts.Overlay(legend_cols=5, legend_position="top_left", legend_opts={"columnspacing": 2.}))
    for i in (1, 2, 3, 4, 5):
        panels[i].opts(hv.opts.Overlay(show_legend=False))
    # xlabel only on the centred bottom panel
    for i in (0, 1, 2, 3, 5):
        panels[i].opts(xlabel="")
    # Despine, set axis ticks, display labels only on outer panels
    hooks = {i: [viz.despine_hook, viz.set_xticks_hook([0, 0.5, 1]), viz.set_yticks_hook([0, 0.5, 1])] for i in range(6)}
    for i in (0, 3):
        hooks[i].extend([viz.set_yticklabels_hook(["$0$", "$0.5$", "$1$"])])
    for i in (3, 4, 5):
        hooks[i].extend([viz.set_xticklabels_hook(["$0$", "$0.5$", "$1$"])])
    for i in (0, 1, 2):
        hooks[i].append(viz.xaxis_off_hook)
    for i in (1, 2, 4, 5):
        hooks[i].append(viz.yaxis_off_hook)
    for i, hook_lst in hooks.items():
        panels[i].opts(hooks=hook_lst)
    return panels
def mkkey(taskkey): return taskkey[0], taskkey[4]
taskdims = ["models", "input"]
hm = hv.HoloMap({("EMD", "Bemd", float(scat.label[2:]), *mkkey(taskkey)): scat
for taskkey, calplot in calibs_normal.items()
for scat in calplot.overlayed_scatters.Scatter.values()}
| {("Q", "Bemd", float(scat.label[2:]), *mkkey(taskkey)): scat
for taskkey, calplot in calibs_BQ.items()
for scat in calplot.overlayed_scatters.Scatter.values()},
kdims=["uncertainty", "B", "c", *taskdims])
hm.select(uncertainty="EMD")
fig = hv.Layout(
format_calib_curves([calplot.overlayed_scatters.redim(Bemd=hv.Dimension("Bemd", label=r"$B^{\mathrm{EMD}}$"))
for calplot in calibs_normal.values()],
list(tasks_normal.values()))
).cols(3).opts(*calibopts, hv.opts.Scatter(backend="matplotlib", s=12),
#hv.opts.Overlay(legend_position="left", legend_cols=1)
hv.opts.Overlay(show_legend=False),
hv.opts.Layout(fig_inches=1.15)
)
display(fig)
# The legend will use Curve data, which are all grey dotted lines.
# To get a legend with coloured dots, we do another figure with only the scatter plots
# We don’t want the figure (just the legend), so we make the figure tiny, remove its axes, and let the legend overflow
#hv.Overlay(list(fig.Overlay.II.Scatter.data.values())).opts(
# fig_inches=0.01,
# hooks=[viz.xaxis_off_hook, viz.yaxis_off_hook])
fig = hv.Layout(
format_calib_curves([calplot.overlayed_scatters.redim(Bemd=hv.Dimension("BQ", label="$B^Q$"))
for calplot in calibs_BQ.values()],
list(tasks_BQ.values()))
).cols(3).opts(*calibopts, hv.opts.Scatter(backend="matplotlib", s=12),
#hv.opts.Layout(backend="matplotlib", hspace=-0.2, vspace=.1),
hv.opts.Overlay(show_legend=False),
hv.opts.Layout(fig_inches=1.15)
)
display(fig)
legend = hv.Overlay(
[next(iter(plot.Scatter.values())).clone()
for (c,), plot in next(iter(calibs_normal.values())).scatters.overlay("c", sort=False).data.items()])
c_values = hv.Table([(c, np.log2(c)) for c in c_list], kdims=["c"], vdims=["log2(c)"])
(legend + c_values).opts(
hv.opts.Overlay(fontscale=2), # fig_inches=0.01
hv.opts.Scatter(hooks=[viz.xaxis_off_hook, viz.yaxis_off_hook], s=40, xlim=(0,.01), ylim=(0,.01)),
hv.opts.Layout(sublabel_format="")
)
# Print panel descriptions
from tabulate import tabulate
headers = ["models", "input corr", "input strength", "obs noise", "obs dist"]
data = [(f"Panel ({lbl})",
f"{(Ω:=task.experiments).a} vs {Ω.b}", f"{Ω.τ_dist}", f"{Ω.σi_dist}", f"{Ω.σo_dist}", f"{Ω.ξ_name} noise")
for lbl, task in zip("abcdef", tasks_BQ.values())]
print(tabulate(data, headers, tablefmt="simple_outline"))
┌───────────┬──────────┬──────────────────────────┬──────────────────┬─────────────┬────────────────┐
│ │ models │ input corr │ input strength │ obs noise │ obs dist │
├───────────┼──────────┼──────────────────────────┼──────────────────┼─────────────┼────────────────┤
│ Panel (a) │ C vs D │ short input correlations │ weak input │ low noise │ Gaussian noise │
│ Panel (b) │ A vs B │ short input correlations │ weak input │ low noise │ Gaussian noise │
│ Panel (c) │ A vs D │ short input correlations │ weak input │ low noise │ Gaussian noise │
│ Panel (d) │ C vs D │ short input correlations │ strong input │ low noise │ Gaussian noise │
│ Panel (e) │ A vs B │ short input correlations │ strong input │ low noise │ Gaussian noise │
│ Panel (f) │ A vs D │ short input correlations │ strong input │ low noise │ Gaussian noise │
└───────────┴──────────┴──────────────────────────┴──────────────────┴─────────────┴────────────────┘
Redraw the calibration curves in the main text#
Since in the course of this experiment we redid the original six-panel calibration with more experiments, we might as well update the figure in the main text with a higher-resolution one. The original code is found here.
fig_calib_highres = hv.Layout(
format_calib_curves([calplot.overlayed_lines.redim(Bemd=hv.Dimension("Bemd", label=r"$B^{\mathrm{EMD}}$"))
for calplot in calibs_normal.values()],
list(tasks_normal.values()))
).cols(3).opts(*calibopts,
#hv.opts.Overlay(legend_position="left", legend_cols=1),
hv.opts.Overlay(show_legend=False),
hv.opts.Layout(fig_inches=1.15),
hv.opts.Curve(color=hv.Palette("copper", range=(0., 1), reverse=True))
)
display(fig_calib_highres)
The legend is created separately and assembled with Inkscape.
import math
legend_calib_highres = hv.Overlay([curve.clone().redim.range(Bemd=(0,0.1), Bepis=(0.5,0.55)).relabel(label=f"$c=2^{{{int(round(math.log2(c)))}}}$")
for c, curve in zip(c_list, fig_calib_highres.Overlay.I.Curve)]).opts(
hv.opts.Overlay(show_legend=True, legend_cols=6,
hooks=[viz.xaxis_off_hook, viz.yaxis_off_hook],
aspect=6)
)
legend_calib_highres
hv.save(fig_calib_highres, config.paths.figures/"prinz_calibrations_high-res_raw.svg")
hv.save(legend_calib_highres, config.paths.figures/"prinz_calibrations_high-res_legend_raw.svg")
Compute comparison tables for each \(c\) value#
from addict import Dict
from functools import partial
import math
import multiprocessing as mp
import emdcmp
import utils
from Ex_Prinz2004 import LP_data, phys_models, AdditiveNoise, fit_gaussian_σ, generate_synth_samples, Q
import holoviews as hv
hv.extension("matplotlib")
Recreate the synthetic and mixed PPFs as we do in the base notebook.
candidate_models = Dict()
Qrisk = Dict()
with mp.Pool(4) as pool:
    fitted_σs = pool.starmap(fit_gaussian_σ, [(LP_data, phys_models[a], "Gaussian") for a in "ABCD"])
for a, σ in zip("ABCD", fitted_σs):
    candidate_models[a] = utils.compose(AdditiveNoise("Gaussian", σ),
                                        phys_models[a])
    Qrisk[a] = Q(phys_model=phys_models[a], obs_model="Gaussian", σ=σ)
/usr/lib64/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.
self.pid = os.fork()
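Should this warning turn into an actual deadlock, one possible workaround (not used here) is to create the pool with the spawn start method, which starts fresh interpreter processes instead of forking the JAX-threaded parent; note that spawned workers re-import the calling module, so the mapped functions must be importable and picklable.

import multiprocessing as mp
ctx = mp.get_context("spawn")   # avoid os.fork() under multithreaded JAX
with ctx.Pool(4) as pool:
    fitted_σs = pool.starmap(fit_gaussian_σ, [(LP_data, phys_models[a], "Gaussian") for a in "ABCD"])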
def get_synth_ppf(a):
    return emdcmp.make_empirical_risk_ppf(Qrisk[a](generate_synth_samples(candidate_models[a])))
def get_mixed_ppf(a):
    return emdcmp.make_empirical_risk_ppf(Qrisk[a](LP_data.get_data()))

with mp.Pool(4) as pool:
    synth_ppf = Dict(zip("ABCD", pool.map(get_synth_ppf, "ABCD")))
    mixed_ppf = Dict(zip("ABCD", pool.map(get_mixed_ppf, "ABCD")))
Draw sets of R samples for each value of \(c\).
c_list = [2**-6, 2**-4, 2**-2, 2**0, 2**2, 2**4]

def draw_R_samples(a, c):
    return emdcmp.draw_R_samples(mixed_ppf[a], synth_ppf[a], c=c)

with mp.Pool(4) as pool:
    R_samples = {}
    for c in c_list:
        R_samples[c] = Dict(zip("ABCD", pool.map(partial(draw_R_samples, c=c), "ABCD")))
hm = hv.HoloMap({int(round(math.log2(c))): hv.Table(emdcmp.utils.compare_matrix(R_samples[c]).reset_index().rename(columns={"index": "models"}))
                 for c in c_list},
                kdims=["log2(c)"])
hm
emdcmp.utils.GitSHA(packages=["emdcmp", "pyloric-network-simulator"])
2025-07-18 git: emd-paper main #05b68aa4
emdcmp : 1.1.0
pyloric-network-simulator : 0.1.4.dev0+g2093368.d20250708