Implementations of common model comparison criteria
This file collects definitions for other popular model comparison criteria.
Devising a criterion involves recasting the model selection problem as the computation of a model score; the higher (or, depending on convention, lower) the score, the more favoured the model. A criterion is then an estimator for that score.
It is important to remember that not all criteria start from the same score, and therefore they are not all interchangeable. In particular, \(\Bemd{}\) is the only criterion which attempts to estimate the uncertainty of its own score.
import dataclasses, logging, multiprocessing
from dataclasses import replace
from functools import cache, cached_property, partial
from typing import Literal
from joblib import Memory
import psutil

memory = Memory(".joblib-cache", verbose=0)
ncores = min(psutil.cpu_count(logical=False), 64)
# ncores = max(2, min(psutil.cpu_count(logical=False)-4, 100))
logger = logging.getLogger(__name__)
import numpy as np
from scipy import stats
from tqdm.auto import tqdm
import dynesty
import xarray
import arviz
import emdcmp as emd
import matplotlib as mpl
import utils

# Logs are stored in base 2 throughout; constants for converting between bases
base_2_to_10 = np.log10(2)    # multiply a log₂ value to get a log₁₀ value
base_e_to_2  = np.log2(np.e)  # multiply a natural log to get a log₂ value
Criteria definitions
For comparability, we write all criteria as ratios of probabilities, so a criterion \(B^C_{AB}\) is understood as
Model \(A\) is \(B^C_{AB}\) times more probable than model \(B\),
where \(C\) can be “model evidence”, “relative likelihood”, etc. Thus, if \(P(A)\) is the “probability of model \(A\)” and \(P(B)\) the “probability of model \(B\)”, then a criterion corresponds to the ratio \(B^C_{AB} = P(A)/P(B)\).
@dataclasses.dataclass
class Criterion:
    lognum  : float   # log₂ P(A)
    logdenom: float   # log₂ P(B)
    @property
    def ratio(self): return 2**(self.lognum - self.logdenom)   # logs are stored in base 2
    @property
    def logratio(self): return (self.lognum - self.logdenom)*base_2_to_10   # log₁₀ of the ratio
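For example, with hypothetical numbers (not taken from the paper): if \(\log_2 P(A) = -12\) and \(\log_2 P(B) = -16\), the criterion reports that model \(A\) is \(2^4 = 16\) times more probable than model \(B\):

crit = Criterion(lognum=-12, logdenom=-16)   # hypothetical log₂ probabilities
crit.ratio      # 16.0
crit.logratio   # ≈ 1.204  (= log₁₀ 16)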
Criteria are often presented this way because the notion of “probability of a model” is ill-defined, and the quantities \(P(A)\) and \(P(B)\) often diverge or go to zero – making the ratio the only quantity with some degree of stability. Of course, ratios of the form \(\tfrac{0}{0}\) or \(\tfrac{\infty}{\infty}\) are known to lead to all kinds of pathologies, which is one way to understand why many criteria work better in theory than in practice.
The \(\Bemd{}\) attempts to resolve this by defining a proper probability, which does not require a ratio to obtain a finite number: \(\Bemd{AB}\) is the probability that model \(A\) incurs a lower expected risk than model \(B\), \(\Bemd{AB} = P(R_A < R_B)\).
We consider this a better assessment quantity, less prone to over- or underconfidence than a ratio with diverging denominator. However, since other criteria have no equivalent form, for the purpose of comparability we convert \(\Bemd{}\) into the ratio-like quantity \(\underline{B}^{\mathrm{EMD}} \approx^* \Bemd{}/(1-\Bemd{})\) (we use \(\approx^*\) to denote “conceptual equivalence”, rather than a strict mathematical equality).
Note
\(\Bemd{}\) does not require specifying priors on the parameters or reserving test data.
def get_ppfs(𝓜, 𝒟, rng=None, L_synth=2**14):
    # Mixed PPF: risk quantiles of model 𝓜 evaluated on the observed data;
    # synthetic PPF: risk quantiles on data generated by 𝓜 itself
    mixed_ppf = emd.make_empirical_risk_ppf(𝓜.Q(𝒟.get_data()))
    synth_ppf = emd.make_empirical_risk_ppf(
        𝓜.Q(𝓜.gen_data(L_synth, λmin=𝒟.λmin, λmax=𝒟.λmax, rng=rng)))
    return mixed_ppf, synth_ppf
@memory.cache
def Bemd(𝒟):
    #c = 2**-1 if 𝒟.λmax == 30*μm else 2**0
    c = c_chosen   # EMD sensitivity parameter, calibrated elsewhere in the notebook
    # `models` (defined earlier in the notebook) is assumed to map model names to model objects
    _mixed_ppf, _synth_ppf = {}, {}
    for name, 𝓜 in models.items():
        _mixed_ppf[name], _synth_ppf[name] = get_ppfs(𝓜, 𝒟)
    _Bemd = emd.Bemd(_mixed_ppf["Planck"], _mixed_ppf["Rayleigh-Jeans"],
                     _synth_ppf["Planck"], _synth_ppf["Rayleigh-Jeans"],
                     c=c, res=8, M=128,
                     progbarA=None, progbarB=None)
    # Store base-2 logs, per the Criterion convention
    return Criterion(np.log2(_Bemd), np.log2(1-_Bemd))
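A minimal usage sketch, assuming a dataset object 𝒟 constructed as elsewhere in the notebook (the variable names here are illustrative):

# Hypothetical usage: 𝒟 is a dataset object as used throughout this notebook
crit = Bemd(𝒟)
print(f"Planck is favoured over Rayleigh-Jeans by a factor {crit.ratio:.2f} (EMD)")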
| Symbol | Variable | Meaning |
|---|---|---|
| \(L\) | 𝒟.L | Number of data samples used to fit the model. |
| \(L'\) | Lᑊ | Number of new data samples used only to test the model. |
| \(\mathcal{D}\) | 𝒟 | Dataset. |
| \(\hat{Θ}\) | Θ | Maximum likelihood parameters. |
| \(\logL(Θ)\) | 𝓁 | Log likelihood function. |
- \(\logL\): Log likelihood function
Recall that the likelihood is a function of the model parameters, so we can write \(\logL(Θ) \defeq \log p(\{x_i, y_i\} \mid Θ) = -\sum_{i=1}^L Q[Θ](x_i, y_i)\),
since we defined the loss \(Q\) to be the negative log probability.
To assign a likelihood to a model, some criteria use \(\max_{Θ} \logL_a(Θ)\); i.e. they evaluate the likelihood at the fitted parameters. In the following we denote this \(\logL_a(\hat{Θ})\).
def 𝓁(Θ, Q, 𝒟):
    return -Q[Θ](𝒟.get_data()).sum().squeeze()

def 𝓁(Θ, Q, 𝒟):
    # This version can deal with larger datasets, by splitting the data into manageable chunks
    x, y = 𝒟.get_data()
    w = 2**12   # chunk width
    return sum(
        -Q[Θ]((x[i*w:(i+1)*w], y[i*w:(i+1)*w])).sum().squeeze()
        for i in range(int(np.ceil(len(x)/w)))
    )
Note
When we check the fitted temperatures, they are all in the range 3200–4100 K, so these are reasonable datasets.
- \(R\): Risk
The (true) risk is a scalar obtained by averaging the loss on test samples: \(R_Q(Θ) \defeq \EE\bigl[Q[Θ](x, y)\bigr] \approx \frac{1}{L'} \sum_{j=1}^{L'} Q[Θ](x_j', y_j')\).
The \(R\)-distributions computed in our paper approximate the uncertainty on \(R_Q\) due to modelling errors.
Note that although the true risk is technically an expectation over the true data distribution, we get a very good estimate by using an empirical average, because:
we can generate test data by using a different random seed from the training data;
the models are very simple, and therefore we can make \(L'\) very large.
@cache
def R(Q, Θ, 𝒟, Lᑊ=2**12, purpose="expected risk"):
    """Compute the expected risk by averaging the loss over _new_ data samples.
    Change the value of `purpose` to change the RNG seed used to generate new samples."""
    return Q[Θ](replace(𝒟, purpose=purpose, L=Lᑊ).get_data()).mean() * base_e_to_2
- \(π\): Prior
We assume the prior factorizes into an independent distribution for each variable: \(π(Θ) = \prod_k π_k(θ_k)\).
~ The FactorizedPrior class comes equipped with an expect method.
This computes a Monte Carlo estimate of \(\int f(Θ) \, dπ(Θ)\): samples are drawn from the prior, and we track both their mean and variance.
~ Experience has shown that already for 2d priors, this is much more efficient than computing the marginal by direct integration with dblquad.
~ The variance is used to compute the standard error on the mean: since the estimated variance of \(f\) is \(M_2/L_π\), the standard error is \(\sqrt{M_2}/L_π\). Once this is below a certain threshold, we return the mean. A threshold of \(2^{-6} \approx 1.5\%\) runs fairly fast; computation time becomes noticeable for thresholds \(2^{-n}\) with \(n > 6\), and increases exponentially in \(n\).
~ The number of samples is doubled each time until the standard error threshold is reached. We use the parallel variance algorithm to combine the estimated means and variances of successive blocks.
~ Important: expect can average any function \(f\) over the prior, but that function must be provided as \(\log_2 f\) (i.e. it must return the base-2 logarithm of the function we actually want to average).
This allows us to use logaddexp2 to minimize rounding errors, as the sketch below illustrates. The use of base 2 instead of \(e\) is not only more computationally efficient, but also reduces rounding errors.
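To make the base-2 log-domain trick concrete, here is a minimal sketch (not part of the notebook) of averaging values which would underflow if exponentiated directly:

log2f = np.array([-1100., -1101., -1102.])   # log₂ f values; 2.0**log2f underflows to exactly 0
log2_mean = np.logaddexp2.reduce(log2f) - np.log2(len(log2f))
# log2_mean ≈ -1100.78: the log₂ of the mean of f, computed entirely in the log domain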
# FactorizedPrior
# To use joblib.Memory, we need objects which can be hashed & pickled consistently.
# Stats distributions don't do this, but we can store their arguments instead.
@dataclasses.dataclass(frozen=True)
class FactorizedPrior:
    """
    Container for a factorized prior.
    Each factor is given by an independent 1d distribution from scipy.stats.
    """
    distnames: list[str]
    args     : list[tuple]
    rng      : int|str|tuple[int|str]

    def __and__(self, other: 'FactorizedPrior'):
        # Combine two independent priors into one
        return FactorizedPrior(self.distnames + other.distnames,
                               self.args + other.args,
                               self.rng + other.rng)

    @cached_property  # a cached property avoids resetting the RNG to the same value
    def rv_list(self):
        _rv_list = []
        for i, (distname, _args) in enumerate(zip(self.distnames, self.args)):
            rv = getattr(stats, distname)(*_args)
            rv.random_state = utils.get_rng(self.rng, i)
            _rv_list.append(rv)
        return _rv_list

    def rvs(self, size=1, random_state=None):
        return np.stack([rv.rvs(size=size, random_state=random_state)
                         for rv in self.rv_list]).T

    def pdf(self, x):
        return np.prod([rv.pdf(xi)
                        for xi, rv in zip(np.atleast_2d(x).T, self.rv_list)], axis=0)

    def logpdf(self, x):
        return np.sum([rv.logpdf(xi)
                       for xi, rv in zip(np.atleast_2d(x).T, self.rv_list)], axis=0)

    @property
    def prior_transform(self):
        # Map the unit cube to the prior, as required by dynesty
        def prior_transform(u):
            return np.array([rv.ppf(ui) for ui, rv in zip(u, self.rv_list)])
        return prior_transform
    def _avg_M2(self, log2f, args=(), rtol=2**-6, Lπ_min=2**10, num_doublings=6):
        """
        Return an estimate of the mean of f under the prior, along with the sum of
        squared differences (M2) and the number of samples used.
        For numerical reasons we take a function which returns the _logarithm_ of f in base 2.
        """
        def one_block(Lπ):
            log2farr = np.fromiter((log2f(Θ, *args) for Θ in self.rvs(size=Lπ)),
                                   dtype=float, count=Lπ)
            log2farr.sort()  # Minimize rounding errors by sorting values before summing
            μ = 2**np.logaddexp2.reduce(log2farr) / Lπ  # Faster, but requires more memory, than reduce(np.logaddexp2, log2farr)
            M2arr = (2**log2farr - μ)**2  # NB: using logM2arr could run into issues for points where the difference is near zero
            M2arr.sort()     # Idem: minimize rounding errors
            return μ, M2arr.sum()

        Lπ = Lπ_min
        μ, M2 = one_block(Lπ)
        stderr = np.sqrt(M2) / Lπ
        if stderr < rtol * μ:
            return μ, M2, Lπ
        # Keep doubling the number of samples until the relative error is below the tolerance
        for _ in range(num_doublings):
            μB, M2B = one_block(Lπ)
            δ = μ - μB
            # Parallel-algorithm update: merge two blocks of equal size Lπ
            μ = (μ + μB)/2
            M2 += M2B + δ**2 * Lπ/2
            Lπ *= 2
            stderr = np.sqrt(M2) / Lπ
            if stderr < rtol * μ:
                break
        else:
            logger.warning("`Prior.expect` did not achieve the target relative accuracy:\n"
                           f"Std err is {stderr/μ*100:.3}% of mean, but it should be less than {rtol*100:.3}%.")
        return μ, M2, Lπ

    def expect(self, log2f, args=()):
        """
        Return the expectation of f under the prior.
        .. Caution:: For numerical reasons we take a function which returns the
           _logarithm_ of f in base 2.
        """
        μ, M2, Lπ = self._avg_M2(log2f, args)
        return μ

    def variance(self, log2f, args=()):
        """
        Return an estimator for the variance of f under the prior.
        .. Caution:: For numerical reasons we take a function which returns the
           _logarithm_ of f in base 2.
        """
        μ, M2, Lπ = self._avg_M2(log2f, args)
        return M2/Lπ
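As a usage sketch: the distributions and parameters below are hypothetical, chosen only for illustration, and utils.get_rng is the notebook's seed helper:

# Hypothetical 2d prior: two independent log-normal factors
π = FactorizedPrior(distnames=["lognorm", "lognorm"],
                    args=[(1.0,), (0.5,)],
                    rng=("prior example",))
def log2f(Θ):                  # log₂ of the function to average; here f(Θ) = ‖Θ‖²
    return np.log2((np.asarray(Θ)**2).sum())
π.expect(log2f)                # Monte Carlo estimate of ∫ ‖Θ‖² dπ(Θ)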
- \(\eE\): Model evidence
The Bayesian model evidence is obtained by marginalizing the likelihood over the prior: \(\eE \defeq \int p(\{x_i, y_i\} \mid Θ) \, π(Θ) \, dΘ\).
An initial attempt to compute the evidence by Monte Carlo fails, because even in this simple example the likelihood is too peaked: none (or too few) of the samples are drawn in the high-probability region.
@memory.cache
def logℰ(Q, π, 𝒟):
    return np.log(π.expect(partial(𝓁, Q=Q, 𝒟=𝒟)))
Instead we use dynesty, a package specialized for computing the model evidence with nested sampling. Nested sampling proceeds through a series of refinements, locating the mode(s) of the posterior and focusing the samples on those regions.
@memory.cache(ignore=["ncores"])
def sample_posterior(Q, π, 𝒟, dlogz=0.01, nlive=1024, sample='rwalk', enlarge=1.25, bootstrap=0, ncores=ncores):
    """
    Sample the posterior with nested sampling (specifically with dynesty).
    The resulting samples can be used for evaluating both the model evidence
    and, after uniform resampling, more usual Bayesian statistics like the WAIC and posterior variance.
    Optional arguments are passed on to `dynesty.NestedSampler`; the defaults define
    a more robust and accurate sampler than the dynesty defaults. This allows it to
    accommodate a more complex posterior, at the cost of longer convergence times.
    .. Hint:: The dynesty documentation recommends that `nlive` be an integer multiple
       of the number of cores.
    """
    with dynesty.pool.Pool(ncores, 𝓁, π.prior_transform) as pool:
        sampler = dynesty.NestedSampler(pool.loglike, pool.prior_transform,
                                        logl_kwargs={'Q': Q, '𝒟': 𝒟},
                                        ndim=π.rvs().size, pool=pool,
                                        nlive=nlive, sample=sample, enlarge=enlarge, bootstrap=bootstrap)
        sampler.run_nested(print_progress=False, dlogz=dlogz)
        # Within a tqdm loop, the default print_func spams the console (it prints below
        # the old progress instead of on top of it).
        # Possible solution: set up our own tqdm progbar, and pass it as the `pbar`
        # argument to dynesty.results.print_fn.
    return sampler.results
def logℰ(Q, π, 𝒟) -> tuple[float, float]:
    """
    Return an estimate of the log evidence and the standard error on that estimate,
    as computed by `dynesty` during sampling.
    """
    dynres = sample_posterior(Q, π, 𝒟)
    return dynres.logz[-1], dynres.logzerr[-1]
- \(\mathrm{elpd}\): Expected log pointwise predictive density, WAIC, LOO
The expected log pointwise predictive density (\(\mathrm{elpd}\)) on unseen data is often considered a gold standard when it comes to comparing models. Directly estimating the elpd requires putting aside a large amount of data for testing, which is rarely feasible; hence approximations have been developed, like the widely applicable information criterion (WAIC) and leave-one-out (LOO) cross-validation [33]. In this example however we can generate data at will, so there is no need for approximations: we can compute the \(\mathrm{elpd}\) directly.
In the following we use \(\{x_i, y_i\} := \{x_i, y_i\}_{i=1}^L\) to denote the original data points used to fit the model, and \(x_j'\), \(y_j'\) (with \(j \in \{1,\dotsc,L'\}\)) to denote the new data points on which we evaluate the \(\mathrm{elpd}\).
The prior and posterior over parameters are denoted respectively \(π(Θ)\) and \(p(Θ \mid \{x_i, y_i\})\).
The likelihood and the posterior predictive density are denoted respectively \(\logL(Θ) \stackrel{\mathrm{def}}{=} p(\{x_i, y_i\} \mid Θ)\) and \(p(x_j', y_j' \mid \{x_i, y_i\})\).
The \(\mathrm{elpd}\) is closely related to the expected risk \(R\); in fact, if we define \(Q\) to be the negative log posterior predictive density instead of the negative log likelihood, the two become equivalent.
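In this notation, the quantity being estimated is the standard one underlying WAIC and LOO estimates (cf. [33]): \(\elpd \defeq \sum_{j=1}^{L'} \EE\bigl[\log p(x_j', y_j' \mid \{x_i, y_i\})\bigr]\), where the expectation is taken over the true distribution of each test point \((x_j', y_j')\).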
- \(B^l\): Relative likelihood / AIC
Computing the ratio of likelihoods is equivalent to taking the difference of log likelihoods: \(B^l_{AB} = \exp\bigl[\logL_A(\hat{Θ}_A) - \logL_B(\hat{Θ}_B)\bigr]\).
~ The Akaike information criterion (AIC) for a model with \(k\) parameters reads \(\AIC \defeq 2k - 2\logL(\hat{Θ})\).
Since the Rayleigh-Jeans and Planck models have the same number of parameters, this is equivalent to the likelihood ratio (up to a factor of 2).
# NB: We can’t just use len(Θ) because Θ may contain lists
def num_params(Θ): return sum(1 for _ in utils.flatten(Θ))
def AIC(Q, Θ, 𝒟): return -2*𝓁(Θ, Q, 𝒟) + 2*num_params(Θ)
def BIC(Q, Θ, 𝒟): return -2*𝓁(Θ, Q, 𝒟) + np.log(𝒟.L)*num_params(Θ)
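A usage sketch, assuming (as in the wrapper functions further below) dicts Q and Θ mapping model names to loss functions and fitted parameters:

# Hypothetical usage: Q and Θ map model names to losses and fitted parameters
ΔAIC = AIC(Q["Planck"], Θ["Planck"], 𝒟) - AIC(Q["Rayleigh-Jeans"], Θ["Rayleigh-Jeans"], 𝒟)
np.exp(-ΔAIC/2)   # relative likelihood of the Planck model vs Rayleigh-Jeans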
#@memory.cache
def DIC(Q, Θ, π, 𝒟):
    # DIC = -2𝓁(Θ̂) + 2 p_D, with the effective number of parameters p_D
    # estimated as twice the posterior variance of the log likelihood
    #return -2*𝓁(Θ, Q, 𝒟) + 4*π.variance(partial(𝓁, Q=Q, 𝒟=𝒟))
    samples = sample_posterior(Q, π, 𝒟)
    weights = samples.importance_weights()
    var = (samples.logl**2).dot(weights) - samples.logl.dot(weights)**2
    return -2*𝓁(Θ, Q, 𝒟) + 4*var
WAIC and PSIS-LOO-CV assume uniformly drawn MCMC samples. We could use an MCMC sampler like pymc or emcee, but since we already run dynesty’s nested sampler to compute the model evidence, this seems wasteful. Instead we subsample the log likelihood values returned by dynesty according to their probability, thus emulating what a uniform MCMC sampler would return.
For the calculations themselves, we use the implementations arviz.waic and arviz.loo. At least for PSIS-LOO-CV, this is the recommendation of that method’s official implementation repo.
Difference with dynesty’s resample_equal function
dynesty provides the resample_equal function, which also generates an MCMC-like set of uniform samples. The difference is that resample_equal samples with replacement to create a new dataset of the same size as the original: so it generates a bigger dataset, but with repeated values.
I found that the estimated number of parameters using subsampling was more plausible (1.4 instead of 2.3), which is why I use this approach.
Ultimately this choice does not really matter, because the important values (the elpd and its standard error) are effectively the same for both methods.
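The two strategies side by side, as a sketch (assuming dynresult is a dynesty results object and rng a NumPy Generator, as in the function below):

# Subsampling (used below): keep each draw with probability ∝ its likelihood → fewer, unique samples
select = rng.binomial(1, np.exp(dynresult.logl - dynresult.logl.max())).astype(bool)
θ_thinned = dynresult.samples[select]
# dynesty's equal-weight resampling: same size as the original, but with repeated values
θ_equal = dynresult.samples_equal()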
@memory.cache
def elpd(Q, π, 𝒟, purpose="elpd",
         scale: Literal["log", "negative_log", "deviance"]="log",
         method: Literal["waic", "loo"]="waic"
        ) -> tuple[float, float]:
    """
    Return the elpd, estimated using either WAIC or PSIS-LOO-CV.
    Returns:
        estimated elpd
        standard error
    """
    rng = utils.get_rng(purpose)
    if method not in {"waic", "loo"}:
        raise ValueError(f"`method` must be either 'waic' or 'loo'. Received {method}")
    dynresult = sample_posterior(Q, π, 𝒟)
    # Subsample the nested-sampling draws with probability ∝ likelihood,
    # to emulate uniform draws from the posterior
    select = rng.binomial(1, np.exp(dynresult.logl - dynresult.logl.max())).astype(bool)
    θsamples = dynresult.samples[select]
    #θsamples = dynresult.samples_equal()
    # Massage the log likelihood data into the form expected by ArviZ.
    # Note that we want _pointwise_ log likelihoods, so we can use neither the values
    # in dynresult.logl nor our own log likelihood function 𝓁.
    logL_dataset = xarray.Dataset(
        {"logL": (["chain", "draw", "B"], [[-Q[θ](𝒟.get_data()) for θ in θsamples]])},
        coords = {"chain": (["chain"], [0]),
                  "draw": (["draw"], np.arange(len(θsamples))),
                  "B": (["B"], np.arange(𝒟.L)),
                 }
    )
    idata = arviz.InferenceData(log_likelihood=logL_dataset)
    if method == "waic":
        elpd_res = arviz.waic(idata, pointwise=True, scale=scale)
        return elpd_res.elpd_waic, elpd_res.se
    else:
        weights = np.exp(dynresult.logl[select] - dynresult.logl[select].max())
        weights /= weights.sum()
        reff = 1/(weights**2).sum() / len(weights)  # relative effective sample size
        elpd_res = arviz.loo(idata, pointwise=True, reff=reff, scale=scale)
        return elpd_res.elpd_loo, elpd_res.se
Old implementation using direct expectation
The idea here was to compute the elpd “directly”, to avoid possible confounds from the approximations inherent in other methods like WAIC or PSIS-LOO-CV. Unfortunately direct evaluation of the integral is still too slow, even for this simple problem, and the Monte Carlo implementation below doesn’t draw enough samples from the high-probability region to produce an accurate estimate.
# Functions must be defined in module scope in order to be pickleable
def h(Θ, xy, Q, 𝒟): return (𝓁(Θ, Q, 𝒟) - Q[Θ](xy)) * base_e_to_2
def lpd(xy, π, h): return π.expect(partial(h, xy=xy))

@memory.cache
def elpd(Q, π, 𝒟, Lᑊ=16, purpose="elpd"):
    # Collecting multiple param draws before summing allows sorting, which reduces rounding errors
    𝒟test = replace(𝒟, L=Lᑊ, purpose=f"{𝒟.purpose} - {purpose}")
    _lpd = partial(lpd, π=π, h=partial(h, Q=Q, 𝒟=𝒟))
    args = tqdm(zip(*𝒟test.get_data()), desc="elpd test sample", total=Lᑊ)
    with multiprocessing.Pool(ncores) as pool:
        lpd_arr = np.fromiter(pool.imap_unordered(_lpd, args, chunksize=max(1, Lᑊ//8)),
                              dtype=float, count=Lᑊ)
    lpd_arr.sort()   # Sort before taking the mean to minimize rounding errors
    return np.log(lpd_arr).mean()
- \(B^{\mathrm{Bayes}}\): Bayes factor
The Bayes factor is the ratio of the evidence for each model: \(B^{\mathrm{Bayes}}_{AB} = \eE_A / \eE_B\).
# `Q` is assumed to map model names to loss functions; the independent priors on
# log σ and log T combine with `&`. logℰ returns natural logs, converted here to
# base 2 for the Criterion convention.
def BBayes(𝒟, πlogσ, πlogT):
    π = πlogσ & πlogT
    return Criterion(logℰ(Q["Planck"], π, 𝒟)[0] * base_e_to_2,
                     logℰ(Q["Rayleigh-Jeans"], π, 𝒟)[0] * base_e_to_2)
- \(B^{\mathrm{elpd}}\): \(\mathrm{elpd}\) criterion
The \(\mathrm{elpd}\) is usually reported as-is, but since it does scale like a log probability, we can make it comparable to the other criteria by defining \(\log B^{\mathrm{elpd}}_{AB} \defeq \elpd_A - \elpd_B\).
In practice, a large positive value for \(\log B^{\mathrm{elpd}}_{AB}\) would be interpreted as strong evidence for model \(A\).
# Same conventions as `BBayes` above; `elpd` returns (estimate, stderr)
def Belpd(𝒟, πlogσ, πlogT):
    π = πlogσ & πlogT
    return Criterion(elpd(Q["Planck"], π, 𝒟)[0] * base_e_to_2,
                     elpd(Q["Rayleigh-Jeans"], π, 𝒟)[0] * base_e_to_2)
- \(\underline{B}^R\): Ratio of expected risk
As with the \(\mathrm{elpd}\), the expected risk is more commonly reported directly. However, since we chose \(Q\) to be the negative log likelihood, it is reasonable to present it as the ratio \(\underline{B}^R_{AB} \defeq 2^{R_B - R_A}\) (recall that \(R\) is returned in base-2 units), to make it comparable with the other criteria:
# `Q` and `Θ` are assumed to map model names to loss functions and fitted parameters
def BR(𝒟):
    return Criterion(-R(Q["Planck"], Θ["Planck"], 𝒟),
                     -R(Q["Rayleigh-Jeans"], Θ["Rayleigh-Jeans"], 𝒟))
2025-07-19 git: emd-paper main #af8cfb86