Comparison with other criteria#
\(\require{mathtools} \newcommand{\notag}{} \newcommand{\tag}{} \newcommand{\label}[1]{} \newcommand{\sfrac}[2]{#1/#2} \newcommand{\bm}[1]{\boldsymbol{#1}} \newcommand{\num}[1]{#1} \newcommand{\qty}[2]{#1\,#2} \renewenvironment{align} {\begin{aligned}} {\end{aligned}} \renewenvironment{alignat} {\begin{alignedat}} {\end{alignedat}} \newcommand{\pdfmspace}[1]{} % Ignore PDF-only spacing commands \newcommand{\htmlmspace}[1]{\mspace{#1}} % Ignore PDF-only spacing commands \newcommand{\scaleto}[2]{#1} % Allow to use scaleto from scalerel package \newcommand{\RR}{\mathbb R} \newcommand{\NN}{\mathbb N} \newcommand{\PP}{\mathbb P} \newcommand{\EE}{\mathbb E} \newcommand{\XX}{\mathbb X} \newcommand{\ZZ}{\mathbb Z} \newcommand{\QQ}{\mathbb Q} \newcommand{\fF}{\mathcal F} \newcommand{\dD}{\mathcal D} \newcommand{\lL}{\mathcal L} \newcommand{\gG}{\mathcal G} \newcommand{\hH}{\mathcal H} \newcommand{\nN}{\mathcal N} \newcommand{\pP}{\mathcal P} \newcommand{\BB}{\mathbb B} \newcommand{\Exp}{\operatorname{Exp}} \newcommand{\Binomial}{\operatorname{Binomial}} \newcommand{\Poisson}{\operatorname{Poisson}} \newcommand{\linop}{\mathcal{L}(\mathbb{B})} \newcommand{\linopell}{\mathcal{L}(\ell_1)} \DeclareMathOperator{\trace}{trace} \DeclareMathOperator{\Var}{Var} \DeclareMathOperator{\Span}{span} \DeclareMathOperator{\proj}{proj} \DeclareMathOperator{\col}{col} \DeclareMathOperator*{\argmin}{arg\,min} \DeclareMathOperator*{\argmax}{arg\,max} \DeclareMathOperator*{\gt}{>} \definecolor{highlight-blue}{RGB}{0,123,255} % definition, theorem, proposition \definecolor{highlight-yellow}{RGB}{255,193,7} % lemma, conjecture, example \definecolor{highlight-orange}{RGB}{253,126,20} % criterion, corollary, property \definecolor{highlight-red}{RGB}{220,53,69} % criterion \newcommand{\logL}{\ell} \newcommand{\eE}{\mathcal{E}} \newcommand{\oO}{\mathcal{O}} \newcommand{\defeq}{\stackrel{\mathrm{def}}{=}} \newcommand{\Bspec}{\mathcal{B}} % Spectral radiance \newcommand{\X}{\mathcal{X}} % X space 
\newcommand{\Y}{\mathcal{Y}} % Y space \newcommand{\M}{\mathcal{M}} % Model \newcommand{\Tspace}{\mathcal{T}} \newcommand{\Vspace}{\mathcal{V}} \newcommand{\Mtrue}{\mathcal{M}_{\mathrm{true}}} \newcommand{\MP}{\M_{\mathrm{P}}} \newcommand{\MRJ}{\M_{\mathrm{RJ}}} \newcommand{\qproc}{\mathfrak{Q}} \newcommand{\D}{\mathcal{D}} % Data (true or generic) \newcommand{\Dt}{\tilde{\mathcal{D}}} \newcommand{\Phit}{\widetilde{\Phi}} \newcommand{\Phis}{\Phi^*} \newcommand{\qt}{\tilde{q}} \newcommand{\qs}{q^*} \newcommand{\qh}{\hat{q}} \newcommand{\AB}[1]{\mathtt{AB}~\mathtt{#1}} \newcommand{\LP}[1]{\mathtt{LP}~\mathtt{#1}} \newcommand{\NML}{\mathrm{NML}} \newcommand{\iI}{\mathcal{I}} \newcommand{\true}{\mathrm{true}} \newcommand{\dist}{D} \newcommand{\Mtheo}[1]{\mathcal{M}_{#1}} % Model (theoretical model); index: param set \newcommand{\DL}[1][L]{\mathcal{D}^{(#1)}} % Data (RV or generic) \newcommand{\DLp}[1][L]{\mathcal{D}^{(#1')}} % Data (RV or generic) \newcommand{\DtL}[1][L]{\tilde{\mathcal{D}}^{(#1)}} % Data (RV or generic) \newcommand{\DpL}[1][L]{{\mathcal{D}'}^{(#1)}} % Data (RV or generic) \newcommand{\Dobs}[1][]{\mathcal{D}_{\mathrm{obs}}^{#1}} % Data (observed) \newcommand{\calibset}{\mathcal{C}} \newcommand{\N}{\mathcal{N}} % Normal distribution \newcommand{\Z}{\mathcal{Z}} % Partition function \newcommand{\VV}{\mathbb{V}} % Variance \newcommand{\T}{\mathsf{T}} % Transpose \newcommand{\EMD}{\mathrm{EMD}} \newcommand{\dEMD}{d_{\mathrm{EMD}}} \newcommand{\dEMDtilde}{\tilde{d}_{\mathrm{EMD}}} \newcommand{\dEMDsafe}{d_{\mathrm{EMD}}^{\text{(safe)}}} \newcommand{\e}{ε} % Model confusion threshold \newcommand{\falsifythreshold}{ε} \newcommand{\bayes}[1][]{B_{#1}} \newcommand{\bayesthresh}[1][]{B_{0}} \newcommand{\bayesm}[1][]{B^{\mathcal{M}}_{#1}} \newcommand{\bayesl}[1][]{B^l_{#1}} \newcommand{\bayesphys}[1][]{B^{{p}}_{#1}} \newcommand{\Bconf}[1]{B^{\mathrm{epis}}_{#1}} \newcommand{\Bemd}[1]{B^{\mathrm{EMD}}_{#1}} \newcommand{\Bconfbin}[1][]{\bar{B}^{\mathrm{conf}}_{#1}} 
\newcommand{\Bemdbin}[1][]{\bar{B}_{#1}^{\mathrm{EMD}}} \newcommand{\bin}{\mathcal{B}} \newcommand{\Bconft}[1][]{\tilde{B}^{\mathrm{conf}}_{#1}} \newcommand{\fc}{f_c} \newcommand{\fcbin}{\bar{f}_c} \newcommand{\paramphys}[1][]{Θ^{{p}}_{#1}} \newcommand{\paramobs}[1][]{Θ^{ε}_{#1}} \newcommand{\test}{\mathrm{test}} \newcommand{\train}{\mathrm{train}} \newcommand{\synth}{\mathrm{synth}} \newcommand{\rep}{\mathrm{rep}} \newcommand{\MNtrue}{\mathcal{M}^{{p}}_{\text{true}}} \newcommand{\MN}[1][]{\mathcal{M}^{{p}}_{#1}} \newcommand{\MNA}{\mathcal{M}^{{p}}_{Θ_A}} \newcommand{\MNB}{\mathcal{M}^{{p}}_{Θ_B}} \newcommand{\Me}[1][]{\mathcal{M}^ε_{#1}} \newcommand{\Metrue}{\mathcal{M}^ε_{\text{true}}} \newcommand{\Meobs}{\mathcal{M}^ε_{\text{obs}}} \newcommand{\Meh}[1][]{\hat{\mathcal{M}}^ε_{#1}} \newcommand{\MNa}{\mathcal{M}^{\mathcal{N}}_a} \newcommand{\MeA}{\mathcal{M}^ε_A} \newcommand{\MeB}{\mathcal{M}^ε_B} \newcommand{\Ms}{\mathcal{M}^*} \newcommand{\MsA}{\mathcal{M}^*_A} \newcommand{\MsB}{\mathcal{M}^*_B} \newcommand{\Msa}{\mathcal{M}^*_a} \newcommand{\MsAz}{\mathcal{M}^*_{A,z}} \newcommand{\MsBz}{\mathcal{M}^*_{B,z}} \newcommand{\Msaz}{\mathcal{M}^*_{a,z}} \newcommand{\MeAz}{\mathcal{M}^ε_{A,z}} \newcommand{\MeBz}{\mathcal{M}^ε_{B,z}} \newcommand{\Meaz}{\mathcal{M}^ε_{a,z}} \newcommand{\zo}{z^{0}} \renewcommand{\lL}[2][]{\mathcal{L}_{#1|{#2}}} % likelihood \newcommand{\Lavg}[2][]{\mathcal{L}^{/#2}_{#1}} % Geometric average of likelihood \newcommand{\lLphys}[2][]{\mathcal{L}^{{p}}_{#1|#2}} \newcommand{\Lavgphys}[2][]{\mathcal{L}^{{p}/#2}_{#1}} % Geometric average of likelihood \newcommand{\lLL}[3][]{\mathcal{L}^{(#3)}_{#1|#2}} \newcommand{\lLphysL}[3][]{\mathcal{L}^{{p},(#3)}_{#1|#2}} \newcommand{\lnL}[2][]{l_{#1|#2}} % Per-sample log likelihood \newcommand{\lnLt}[2][]{\widetilde{l}_{#1|#2}} \newcommand{\lnLtt}{\widetilde{l}} % Used only in path_sampling \newcommand{\lnLh}[1][]{\hat{l}_{#1}} \newcommand{\lnLphys}[2][]{l^{{p}}_{#1|#2}} 
\newcommand{\lnLphysL}[3][]{l^{{p},(#3)}_{#1|#2}} \newcommand{\Elmu}[2][1]{μ_{{#2}}^{(#1)}} \newcommand{\Elmuh}[2][1]{\hat{μ}_{{#2}}^{(#1)}} \newcommand{\Elsig}[2][1]{Σ_{{#2}}^{(#1)}} \newcommand{\Elsigh}[2][1]{\hat{Σ}_{{#2}}^{(#1)}} \newcommand{\pathP}{\mathop{{p}}} % Path-sampling process (generic) \newcommand{\pathPhb}{\mathop{{p}}_{\mathrm{Beta}}} % Path-sampling process (hierarchical beta) \newcommand{\interval}{\mathcal{I}} \newcommand{\Phiset}[1]{\{\Phi\}^{\small (#1)}} \newcommand{\Phipart}[1]{\{\mathcal{I}_Φ\}^{\small (#1)}} \newcommand{\qhset}[1]{\{\qh\}^{\small (#1)}} \newcommand{\Dqpart}[1]{\{Δ\qh_{2^{#1}}\}} \newcommand{\LsAzl}{\mathcal{L}_{\smash{{}^{\,*}_A},z,L}} \newcommand{\LsBzl}{\mathcal{L}_{\smash{{}^{\,*}_B},z,L}} \newcommand{\lsA}{l_{\smash{{}^{\,*}_A}}} \newcommand{\lsB}{l_{\smash{{}^{\,*}_B}}} \newcommand{\lsAz}{l_{\smash{{}^{\,*}_A},z}} \newcommand{\lsAzj}{l_{\smash{{}^{\,*}_A},z_j}} \newcommand{\lsAzo}{l_{\smash{{}^{\,*}_A},z^0}} \newcommand{\leAz}{l_{\smash{{}^{\,ε}_A},z}} \newcommand{\lsAez}{l_{\smash{{}^{*ε}_A},z}} \newcommand{\lsBz}{l_{\smash{{}^{\,*}_B},z}} \newcommand{\lsBzj}{l_{\smash{{}^{\,*}_B},z_j}} \newcommand{\lsBzo}{l_{\smash{{}^{\,*}_B},z^0}} \newcommand{\leBz}{l_{\smash{{}^{\,ε}_B},z}} \newcommand{\lsBez}{l_{\smash{{}^{*ε}_B},z}} \newcommand{\LaszL}{\mathcal{L}_{\smash{{}^{*}_a},z,L}} \newcommand{\lasz}{l_{\smash{{}^{*}_a},z}} \newcommand{\laszj}{l_{\smash{{}^{*}_a},z_j}} \newcommand{\laszo}{l_{\smash{{}^{*}_a},z^0}} \newcommand{\laez}{l_{\smash{{}^{ε}_a},z}} \newcommand{\lasez}{l_{\smash{{}^{*ε}_a},z}} \newcommand{\lhatasz}{\hat{l}_{\smash{{}^{*}_a},z}} \newcommand{\pasz}{p_{\smash{{}^{*}_a},z}} \newcommand{\paez}{p_{\smash{{}^{ε}_a},z}} \newcommand{\pasez}{p_{\smash{{}^{*ε}_a},z}} \newcommand{\phatsaz}{\hat{p}_{\smash{{}^{*}_a},z}} \newcommand{\phateaz}{\hat{p}_{\smash{{}^{ε}_a},z}} \newcommand{\phatseaz}{\hat{p}_{\smash{{}^{*ε}_a},z}} \newcommand{\Phil}[2][]{Φ_{#1|#2}} % Φ_{\la} 
\newcommand{\Philt}[2][]{\widetilde{Φ}_{#1|#2}} % Φ_{\la} \newcommand{\Philhat}[2][]{\hat{Φ}_{#1|#2}} % Φ_{\la} \newcommand{\Philsaz}{Φ_{\smash{{}^{*}_a},z}} % Φ_{\lasz} \newcommand{\Phileaz}{Φ_{\smash{{}^{ε}_a},z}} % Φ_{\laez} \newcommand{\Philseaz}{Φ_{\smash{{}^{*ε}_a},z}} % Φ_{\lasez} \newcommand{\mus}[1][1]{μ^{(#1)}_*} \newcommand{\musA}[1][1]{μ^{(#1)}_{\smash{{}^{\,*}_A}}} \newcommand{\SigsA}[1][1]{Σ^{(#1)}_{\smash{{}^{\,*}_A}}} \newcommand{\musB}[1][1]{μ^{(#1)}_{\smash{{}^{\,*}_B}}} \newcommand{\SigsB}[1][1]{Σ^{(#1)}_{\smash{{}^{\,*}_B}}} \newcommand{\musa}[1][1]{μ^{(#1)}_{\smash{{}^{*}_a}}} \newcommand{\Sigsa}[1][1]{Σ^{(#1)}_{\smash{{}^{*}_a}}} \newcommand{\Msah}{{\color{highlight-red}\mathcal{M}^{*}_a}} \newcommand{\Msazh}{{\color{highlight-red}\mathcal{M}^{*}_{a,z}}} \newcommand{\Meah}{{\color{highlight-blue}\mathcal{M}^{ε}_a}} \newcommand{\Meazh}{{\color{highlight-blue}\mathcal{M}^{ε}_{a,z}}} \newcommand{\lsazh}{{\color{highlight-red}l_{\smash{{}^{*}_a},z}}} \newcommand{\leazh}{{\color{highlight-blue}l_{\smash{{}^{ε}_a},z}}} \newcommand{\lseazh}{{\color{highlight-orange}l_{\smash{{}^{*ε}_a},z}}} \newcommand{\Philsazh}{{\color{highlight-red}Φ_{\smash{{}^{*}_a},z}}} % Φ_{\lasz} \newcommand{\Phileazh}{{\color{highlight-blue}Φ_{\smash{{}^{ε}_a},z}}} % Φ_{\laez} \newcommand{\Philseazh}{{\color{highlight-orange}Φ_{\smash{{}^{*ε}_a},z}}} % Φ_{\lasez} \newcommand{\emdstd}{\tilde{σ}} \DeclareMathOperator{\Mvar}{Mvar} \DeclareMathOperator{\AIC}{AIC} \DeclareMathOperator{\epll}{epll} \DeclareMathOperator{\elpd}{elpd} \DeclareMathOperator{\MDL}{MDL} \DeclareMathOperator{\comp}{COMP} \DeclareMathOperator{\Lognorm}{Lognorm} \DeclareMathOperator{\erf}{erf} \DeclareMathOperator*{\argmax}{arg\,max} \DeclareMathOperator{\Image}{Image} \DeclareMathOperator{\sgn}{sgn} \DeclareMathOperator{\SE}{SE} % standard error \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator{\Poisson}{Poisson} \DeclareMathOperator{\SkewNormal}{SkewNormal} 
\DeclareMathOperator{\TruncNormal}{TruncNormal} \DeclareMathOperator{\Exponential}{Exponential} \DeclareMathOperator{\exGaussian}{exGaussian} \DeclareMathOperator{\IG}{IG} \DeclareMathOperator{\NIG}{NIG} \DeclareMathOperator{\Gammadist}{Gamma} \DeclareMathOperator{\Lognormal}{Lognormal} \DeclareMathOperator{\Beta}{Beta} \newcommand{\sinf}{{s_{\infty}}}\)
import other_criteria
Notebook parameters#
For the comparison with other criteria, we use variations of the dataset shown in Fig. 2.6:
We add an intermediate dataset size
L_med
We increase the upper wavelength range so that the data go into the microwave range.
We use three different noise levels (with the middle noise closest to the data in Fig. 2.6).
We use two bias conditions: unbiased, or bias equal to Fig. 2.6 / 8. (We need to reduce the bias because at large wavelengths the radiance is smaller.)
# Dataset-size, noise, and window variations used for the criteria-comparison table
L_list = [L_small, L_med, L_large]
Lᑊ = 2**12                                      # test-set size L′
s_list = 2.**array([12, 16, 20]) * Bunits**-1   # data_noise_s is about 2**16.5
table_λwindow1 = (20*μm, 1000*μm)               # extended window, stretching into the microwave range
table_λwindow2 = (data_λ_min, data_λ_max)       # original window from Fig. 2.6
table_T = data_T
table_B0 = data_B0/8   # reduced bias: at large wavelengths the radiance is smaller
# One dataset per combination of noise × size × window × bias condition
# (iteration order matters: later code assumes this ordering when reshaping the table)
dataset_list = [Dataset("Criterion comparison", L, λmin, λmax, s, T=table_T, B0=B0)
                for s in s_list
                for L in L_list
                for (λmin, λmax) in [table_λwindow1, table_λwindow2]
                for B0 in (table_B0, 0*Bunits)]
To provide a reference, we compute for each dataset the ratio between the variance (controlled by \(s\)) and the maximum value of the radiance. This gives a sense of how the noise compares with the scale of the data.
The variance is actually \(λ\)-dependent; to give a single value, we average the expression for the variance given in the paper over \(λ\):
We compare this against \(\max_λ\Bigl(\Bspec_{\mathrm{P}}(λ;T)\Bigr) = \Bspec_{\mathrm{P}}(λ_{min};T) + \Bspec_0\):
# Ratio of the (λ-averaged) noise standard deviation to the maximum radiance:
# a single number summarizing how the noise compares with the scale of the data.
noise_fraction = {}
for 𝒟 in dataset_list:
    λ, B = 𝒟.get_data()
    Ba = 𝒟.data_model[1](𝒟.L)[1]          # noise-free radiance curve
    noise_sd = np.sqrt((Ba/𝒟.s).mean())   # λ-dependent variance Ba/s, averaged over λ
    noise_fraction[𝒟.B0, 𝒟.s] = noise_sd / (Ba+𝒟.B0).max()
Criteria definitions#
For comparability, we write all criteria as ratios of probabilities, so a criterion \(B^C_{AB}\) is understood as
Model \(A\) is \(B^C_{AB}\) times more probable than model \(B\).
where \(C\) can be “model evidence”, “relative likelihood”, etc. Thus, if \(P(A)\) is the “probability of model \(A\)” and \(P(B)\) the “probability of model \(B\)”, then a criterion corresponds to
@dataclasses.dataclass
class Criterion:
    """Pair of log probabilities representing a probability-ratio criterion."""
    lognum : float
    logdenom: float

    @property
    def ratio(self):
        """Probability ratio, in linear space."""
        logdiff = self.lognum - self.logdenom
        return exp(logdiff)

    @property
    def logratio(self):
        """Log of the ratio, rescaled to base 10.

        `base_x_to_10` (defined elsewhere in the notebook) converts from the
        working log base to log₁₀.
        """
        logdiff = self.lognum - self.logdenom
        return logdiff * base_x_to_10
Criteria are often presented this way because the notion of “probability of a model” is ill-defined, and the quantities \(P(A)\) and \(P(B)\) often diverge or go to zero – making the ratio the only quantity with some degree of stability. Of course, ratios of the form \(\tfrac{0}{0}\) or \(\tfrac{\infty}{\infty}\) are known to lead to all kinds of pathologies, which is what we want to illustrate with this section.
The \(\Bemd{}\) attempts to resolve this by defining a proper probability which does not require a ratio to obtain a finite number:
We consider this a better assessment quantity, less prone to over/underconfidence than a ratio with diverging denominator. However, since other criteria have no equivalent form, for the purpose of comparability, we will convert \(\Bemd{}\) into a ratio-like quantity \(\underline{B}^{\mathrm{EMD}}\) (we use \(\approx^*\) to denote “conceptual equivalence”, rather than a strict mathematical equality):
Note
\(\Bemd{}\) does not require specifying priors on the parameters or reserving test data.
Note
We need to crank up the EMD sampling parameters here, because below we compute small tail probabilities.
@memory.cache
def Bemd(𝒟):
    """EMD criterion for dataset `𝒟`, packaged as a `Criterion` ratio.

    The EMD sampling parameters (`res`, `M`, `max_M`) are cranked up because
    small tail probabilities are computed downstream.
    """
    # Earlier experiment: c = 2**-1 if 𝒟.λmax == 30*μm else 2**0
    sensitivity = c_chosen
    mixed_ppf, synth_ppf = get_ppfs(𝒟)
    prob_A = emd.Bemd(mixed_ppf["Planck"], mixed_ppf["Rayleigh-Jeans"],
                      synth_ppf["Planck"], synth_ppf["Rayleigh-Jeans"],
                      c=sensitivity, res=8, M=2**12, max_M=2**14,
                      progbarA=None, progbarB=None,
                      # Multiprocessing off, to allow computing multiple Bemd's in parallel
                      use_multiprocessing=False)
    return Criterion(log(prob_A), log(1 - prob_A))
Symbol |
Variable |
Meaning |
---|---|---|
\(L\) |
|
Number of data samples used to fit the model. |
\(L'\) |
|
Number of new data samples used only to test the model |
\(\mathcal{D}\) |
|
Dataset |
\(\hat{σ}\), \(\hat{T}\) |
|
Maximum likelihood parameters |
- \(\logL\): Log likelihood function
Recall that the likelihood is a function of model parameters, so we can write
since we defined the loss \(Q\) to be the negative log probability.
To assign a likelihood to a model, some criteria use \(\max_{σ, T} \logL_a(σ, T)\); i.e. they evaluate the likelihood at the fitted parameters. In the following we denote this \(\logL_a(\hat{σ}_a, \hat{T}_a)\).
def lₐ(a, σ, T, 𝒟):
    """Log likelihood of model `a` with parameters (σ, T) on dataset `𝒟`.

    Parameters are reshaped to columns so that arrays of parameter values
    broadcast against the data. The result is rescaled with `base_e_to_x`
    (defined elsewhere) to the notebook's working log base.
    """
    σcol = array(σ).reshape(-1, 1)
    Tcol = array(T).reshape(-1, 1)
    total_loss = Q(a, σ=σcol, T=Tcol)(𝒟.get_data()).sum().squeeze()
    return -total_loss * base_e_to_x
Note
When we check the fitted temperatures, they are all in the range 3200-4100 K. So these are reasonable datasets
Show code cell source
# Fit maximum-likelihood parameters (σˆ, Tˆ) for each model and dataset.
σˆ = {}; Tˆ = {}

def f(log2σ_T, a, 𝒟):
    """Objective for the fit: negative log likelihood, in log₂ parameter space."""
    σ, T = 2**log2σ_T
    return -lₐ(a, σ, T, 𝒟)

class res:
    # Sentinel so `res.success` exists even before the first optimizer call
    success = False

for 𝒟 in dataset_list:
    for a in ["Rayleigh-Jeans", "Planck"]:
        _𝒟 = replace(𝒟, L=L_med)  # Always use a reasonable number of samples to fit σ and T => Too large can fail, and too small adds a different source variability than what we want to compare
        s0 = np.sqrt(_𝒟.get_data()[1].mean()/𝒟.s).m  # Initial s guess with moment matching
        res = optimize.minimize(f, np.log2([s0, 4000]), (a, _𝒟), tol=1e-3); #assert res.success
        # FIX: was `np.log2(D.s.m)` — `D` is undefined and raised NameError on any failed fit
        if not res.success: print(np.log2(𝒟.s.m))
        σˆ[(a,𝒟)] = 2**res.x[0]; Tˆ[(a,𝒟)] = 2**res.x[1]
- \(R\): Expected risk
In our paper we treat the expected risk as a random variable, and the EMD distribution is a way to induce a distribution on \(R\). In this section however we mean the expected risk in the usual sense, where it is a scalar obtained by averaging the loss on test samples:
Note that the log likelihood is evaluated on training samples, while the expected risk is evaluated on test samples. That said, because the models are very simple and we can make \(L\) and \(L'\) as large as we want, the difference between the two sides of this equation should be negligible.
@cache
def R(a, 𝒟):
    """Expected risk: mean loss of model `a`, at its fitted parameters, on fresh test data."""
    loss = Q(a, σ=σˆ[a, 𝒟], T=Tˆ[a, 𝒟])
    test_data = replace(𝒟, purpose="test", L=Lᑊ).get_data()
    return loss(test_data).mean() * base_e_to_x
- \(\eE\): Model evidence
The Bayesian model evidence is obtained by integrating the posterior. For this we need to set priors on the parameters \(T\) and \(σ\); a common practice is to use “uninformative priors”, to “let the data speak” [22]. (In other words, we choose broad priors which let the posterior concentrate near the likelihood.) For this example we use the following priors:
(The “prior” over \(λ\) reflects that we sample the wavelengths at regular intervals between \(20 \mathrm{μm}\) and \(30 \mathrm{μm}\). Thanks to the choice of a uniform distribution it drops out from the calculations.) For simplicity and to emulate modelling errors, we don’t define a prior over the bias \(\Bspec_0\): we take all candidate models to be unbiased.
Sensitivity to the choice of prior
The use of uninformative priors is well motivated in the context of inference, since with enough data the posterior becomes insensitive to the choice of prior. This is not the case however for the computation of the evidence, where sensitivity to the prior is strong for all dataset sizes \(L\). Thus we could get numerically different results with different choices of the prior. However at least in this example, simply tightening the priors (i.e. using \(π_2\) above) results in numerical differences on the order of 1%.
Show code cell source
# To use joblib.Memory, we need objects which can be hashed & pickled consistently
# Stats distributions don’t do this, but we can store their arguments instead
@dataclasses.dataclass(frozen=True)
class Prior:
    """Hashable, picklable description of a frozen scipy distribution.

    joblib.Memory needs cache keys which hash and pickle consistently;
    scipy's frozen distributions don't, so we store only the constructor
    arguments and rebuild the distribution on demand.
    """
    distname: str                 # name of a distribution in `stats` (scipy.stats)
    args: tuple                   # positional args forwarded to the distribution
    rng: int|str|tuple[int|str]   # seed material forwarded to utils.get_rng

    @property
    def rv(self):
        """Rebuild the frozen scipy distribution, with a reproducibly seeded RNG."""
        frozen = getattr(stats, self.distname)(*self.args)
        frozen.random_state = utils.get_rng(self.rng)
        return frozen

    @property
    def rvs(self):
        return self.rv.rvs

    @property
    def pdf(self):
        return self.rv.pdf

    @property
    def logpdf(self):
        return self.rv.logpdf
# Priors over log σ and log T: π1 = broad "uninformative" priors, π2 = tightened variants
# used to test sensitivity of the evidence to the prior.
# NOTE(review): scipy's `uniform(a, b)` takes (loc, scale), i.e. support [a, a+b],
# not bounds [a, b] — confirm these argument pairs are intended as (loc, scale).
π1logσ = Prior("uniform", (log(2**9), log(2**14)), "prior σ")
π2logσ = Prior("uniform", (log(2**9), log(2**10)), "prior σ")
π1logT = Prior("uniform", (log(1000), log(5000)), "prior T")
π2logT = Prior("uniform", (log(3900), log(4100)), "prior T")
With these choices, the evidence for model \(a\) can be written
with \(p(\{λ_i, \Bspec_i\}_{i=1}^L \mid a, σ, T)\) given by the Gaussian observation model and \(π(λ_i) = \frac{1}{10}\):
The evidence in this example is therefore computed with a double integral, which can be done directly with scipy.dblquad
if \(L\) is small. When \(L\) is large however, the probability is too small and we run into numerical underflow; in that case we can resort to evaluating the integral by generating a large number \(M\) of prior samples:
We can then replace the probabilities with their logs and use logsumexp
to get
where \(\logL_a^{(j)} = \logL_a(σ_j, T_j)\), \((σ_j, T_j) \sim π(σ) π(T)\) and \(j = 1, 2, \dotsc, L_{\eE}\).
@memory.cache
def logℰ(a, 𝒟, πlogσ, πlogT):
    """Log model evidence of model `a` on dataset `𝒟`, by Monte Carlo
    integration of the likelihood over the priors on σ and T.

    Constant terms (including the -log Lℰ normalization of the Monte Carlo
    average) are omitted: they cancel in the Bayes-factor ratio.
    NOTE: this definition is shadowed by the `other_criteria`-based
    implementation of the same name further down in the file.
    """
    Lℰ = 2**14  # Number of Monte Carlo samples for integral. Use giant number so result is effectively exact
    rng = utils.get_rng("uv", "evidence")
    return logsumexp([lₐ(a, σj, Tj, 𝒟)  # NB: We omit the `-L log 10`, which eventually drops out
                      for σj, Tj in zip(exp(πlogσ.rvs(Lℰ, random_state=rng)),
                                        exp(πlogT.rvs(Lℰ, random_state=rng)))
                      ])
# `other_criteria` expects a loss function which can be indexed as Q[θ]
class QMeta(type):
    """Metaclass making the *class itself* act as a parameterizable loss:
    both `C[Θ]` (indexing) and `C(σ, T)` (calling) return `Q(C.physmodel, ...)`.
    Note that `__call__` here replaces normal instantiation of the class.
    """
    def __getitem__(cls, Θ):
        # Index form: Θ is an iterable of parameters, unpacked positionally
        return Q(cls.physmodel, *Θ)
    def __call__(cls, σ, T):
        # Call form: explicit (σ, T) parameters
        return Q(cls.physmodel, σ, T)
class QPlanck(metaclass=QMeta):
    physmodel="Planck"
class QRJ(metaclass=QMeta):
    physmodel="Rayleigh-Jeans"
# Lookup from model label to its indexable loss
Qdict = {"Planck": QPlanck, "Rayleigh-Jeans": QRJ}
def logℰ(a, 𝒟, πlogσ, πlogT):
    """Log model evidence of model `a` on dataset `𝒟`, delegated to
    `other_criteria.logℰ`. Shadows the Monte Carlo implementation above.
    The returned error estimate is discarded.
    """
    prior = other_criteria.FactorizedPrior(
        distnames=[πlogσ.distname, πlogT.distname],
        args=[πlogσ.args, πlogT.args],
        rng="uv - evidence",
    )
    log_evidence, _log_evidence_err = other_criteria.logℰ(Qdict[a], prior, 𝒟)
    return log_evidence
- \(\mathrm{elpd}\): Expected log pointwise predictive density, WAIC, LOO
The expected log pointwise predictive density (\(\mathrm{elpd}\)) on unseen data is often considered a gold standard when it comes to comparing models. Directly estimating the elpd requires putting aside a large amount of data for testing, which is rarely feasible; hence approximations have been developed, like the widely applicable information criterion (WAIC) and leave-one-out (LOO) cross-validation [31]. In this example however we can generate data at will, so there is no need for approximations: we can compute the \(\mathrm{elpd}\) directly.
In the following we use \(λ_i\), \(\Bspec_i\) to denote the original data points used to fit the model, and \(λ_j'\), \(\Bspec_j'\) (with \(j \in \{1,\dotsc,L'\}\)) to denote the new data points on which we evaluate the \(\mathrm{elpd}\). The posterior over parameters is denoted \(p^*(T, σ \mid \{λ_i, \Bspec_i\}_{i=1}^L\bigr)\).
The integrand involves only probabilities of single samples, and so can be computed directly with scipy.dblquad
.
Since \(π(T)\) and \(π(σ)\) are uniform in log space, we can further simplify the integral by a change of variables:
The \(\mathrm{elpd}\) is closely related to the expected risk \(R\); in fact, if we define \(Q\) to be the log posterior instead of the log likelihood, it becomes equivalent.
@memory.cache
def elpd(a, 𝒟, πlogσ, πlogT):
    """Expected log pointwise predictive density of model `a` on dataset `𝒟`,
    estimated by Monte Carlo over the priors and a large fresh test set.

    NOTE: this definition is shadowed by the `other_criteria`-based
    implementation of the same name further down in the file.
    """
    Lℰ = 2**14  # Number of Monte Carlo samples for integral. Use giant number so result is effectively exact
    rng = utils.get_rng("uv", "elpd")
    # h: training log likelihood minus the loss on one test point,
    # evaluated at prior samples (σ, T); defaults bind a and 𝒟 at definition time
    def h(σ, T, λ_ℬ, a=a, 𝒟=𝒟): return lₐ(a, σ, T, 𝒟) - Q(a, σ, T)(λ_ℬ)*base_e_to_x
    σarr = exp(πlogσ.rvs(Lℰ, random_state=rng))
    Tarr = exp(πlogT.rvs(Lℰ, random_state=rng))
    λ_test, ℬ_test = replace(𝒟, L=Lᑊ, purpose="test").get_data()
    # logsumexp over prior samples (axis 0), averaged over test points,
    # normalized by the log evidence; constant terms omitted (cancel in ratios)
    return logsumexp(h(σarr, Tarr, (λ_test[:,None], ℬ_test[:,None])), axis=0
                     ).mean() - logℰ(a, 𝒟, πlogσ, πlogT)  # NB: We omit constant terms
def elpd(a, 𝒟, πlogσ, πlogT):
    """elpd of model `a` on dataset `𝒟`, delegated to `other_criteria.elpd`.
    Shadows the Monte Carlo implementation above; the standard error is discarded.
    """
    prior = other_criteria.FactorizedPrior(
        distnames=[πlogσ.distname, πlogT.distname],
        args=[πlogσ.args, πlogT.args],
        # NOTE(review): same rng tag as the evidence computation — confirm this reuse is intended
        rng="uv - evidence",
    )
    elpd_value, _elpd_stderr = other_criteria.elpd(Qdict[a], prior, 𝒟, purpose="uv - elpd")
    return elpd_value
Slower version using dblquad
def elpd(a, L):
    """Slower reference implementation of the elpd using scipy's dblquad.

    NOTE(review): this cell appears stale and is kept for reference only:
    the signature `(a, L)` and the call `lₐ(a, L, σ..., T...)` do not match the
    current `lₐ(a, σ, T, 𝒟)`, and `dblquad` returns a `(value, abserr)` tuple,
    so `log(dblquad(...))` would fail as written — confirm before running.
    """
    dblquad = scipy.integrate.dblquad
    def h(logσ, logT, λ_ℬ, a=a, L=L):
        # Integrand in log-parameter space: exp(train log likelihood − test-point loss)
        return exp( lₐ(a, L, σ:=exp(logσ), T:=exp(logT)) - Q(a, σ, T)(λ_ℬ) )
    λ_test, ℬ_test = replace(𝒟, L=Lᑊ, purpose="test").get_data()
    # Average log marginal predictive density over the test points, minus the log evidence
    return sum( log( dblquad(h, log(1000), log(5000), log(2), log(32), [(λj, ℬj)]) )
                for λj, ℬj in tqdm(zip(λ_test, ℬ_test), total=Lᑊ)) / Lᑊ \
           - logℰ(a, L)  # We omit the constant terms, which cancel out in the ratio
- \(B^l\): Relative likelihood / AIC
Computing the ratio of likelihoods is equivalent to the difference of log likelihoods:
~ The Akaike information criterion (AIC) for a model with \(k\) parameters reads
Since the Rayleigh-Jeans and Planck models both have the same number of parameters, this is equivalent to the likelihood ratio (up to a factor of 2).
@cache
def Bl(𝒟):
    """Relative-likelihood criterion: log likelihoods at the fitted (MLE) parameters."""
    models = ("Planck", "Rayleigh-Jeans")
    loglik = {a: lₐ(a, σˆ[(a, 𝒟)], Tˆ[(a, 𝒟)], 𝒟) for a in models}
    return Criterion(loglik["Planck"], loglik["Rayleigh-Jeans"])
- \(B^{\mathrm{Bayes}}\): Bayes factor
The Bayes factor is the ratio of the evidence for each model:
def BBayes(𝒟, πlogσ, πlogT):
    """Bayes factor: ratio of model evidences (Planck over Rayleigh-Jeans)."""
    log_evidences = [logℰ(a, 𝒟, πlogσ, πlogT) for a in ("Planck", "Rayleigh-Jeans")]
    return Criterion(*log_evidences)
- \(B^{\mathrm{elpd}}\): \(\mathrm{elpd}\) criterion
The \(\mathrm{elpd}\) is usually reported as-is, but since it does scale like a log probability, we can make it comparable to other criteria by defining
In practice, a large positive value for \(\log B^{\mathrm{elpd}}_{AB}\) would be interpreted as strong evidence for model \(A\).
def Belpd(𝒟, πlogσ, πlogT):
    """elpd-based criterion, expressed as a ratio like the other criteria."""
    num = elpd("Planck", 𝒟, πlogσ, πlogT)
    denom = elpd("Rayleigh-Jeans", 𝒟, πlogσ, πlogT)
    return Criterion(num, denom)
- \(\underline{B}^R\): Ratio of expected risk
As with the \(\mathrm{elpd}\), the expected risk is more commonly given directly. However since we chose \(Q\) to be the negative log likelihood, it is reasonable to present it as a ratio to make it comparable with other criteria:
def BR(𝒟):
    """Expected-risk ratio criterion; risks are negated so that, like a log
    probability, larger values favour the model."""
    risk_P = R("Planck", 𝒟)
    risk_RJ = R("Rayleigh-Jeans", 𝒟)
    return Criterion(-risk_P, -risk_RJ)
Comparison table#
import shelve

# Evaluate every criterion on every dataset, caching the (slow) results in a
# shelve file keyed by "dataset-condition,criterion-label".
# The five near-identical copy-pasted cache blocks are replaced by one
# (label, thunk) table; thunks are only called on a cache miss.
table_data = {}
with shelve.open(str(config.paths.data/"criteria-compare-table")) as shelf:
    for 𝒟 in tqdm(dataset_list):
        Dkey = (f"{𝒟.λmin.m}–{𝒟.λmax.m}", 𝒟.L, 𝒟.B0.m, 𝒟.s.m, noise_fraction[𝒟.B0, 𝒟.s].m)
        Dskey = ",".join(str(k) for k in Dkey)
        table_data[Dkey] = {}
        # (column label, deferred computation) — lambdas bind the current 𝒟
        criteria = [
            (r"$\underline{B}^{\mathrm{EMD}}_{\mathrm{P,RJ}}$", lambda: Bemd(𝒟)),
            (r"$\underline{B}^R_{\mathrm{P,RJ}}$",              lambda: BR(𝒟)),
            (r"$B^l_{\mathrm{P,RJ}}$",                          lambda: Bl(𝒟)),
            (r"$B^{\mathrm{Bayes}}_{\mathrm{P,RJ};π}$",         lambda: BBayes(𝒟, π1logσ, π1logT)),
            #(r"$B^{\mathrm{Bayes}}_{\mathrm{P,RJ};π_2}$",      lambda: BBayes(𝒟, π2logσ, π2logT)),
            (r"$B^{\mathrm{elpd}}_{\mathrm{P,RJ};π}$",          lambda: Belpd(𝒟, π1logσ, π1logT)),
            #(r"$B^{\mathrm{elpd}}_{\mathrm{P,RJ};π_2}$",       lambda: Belpd(𝒟, π2logσ, π2logT)),
        ]
        for Bkey, compute in criteria:
            Bskey = f"{Dskey},{Bkey}"
            if Bskey not in shelf:           # compute only on cache miss
                shelf[Bskey] = compute().logratio
            table_data[Dkey][Bkey] = shelf[Bskey]
# Earlier, non-cached construction of the table, kept for reference:
#df = pd.DataFrame({(f"{𝒟.λmin.m}–{𝒟.λmax.m}", 𝒟.L, 𝒟.B0.m, 𝒟.s.m, noise_fraction[𝒟.B0, 𝒟.s].m):
#    {r"$\underline{B}^{\mathrm{EMD}}_{\mathrm{P,RJ}}$": Bemd(𝒟).logratio,
#     r"$\underline{B}^R_{\mathrm{P,RJ}}$": BR(𝒟).logratio,
#     r"$B^l_{\mathrm{P,RJ}}$": Bl(𝒟).logratio,
#     r"$B^{\mathrm{Bayes}}_{\mathrm{P,RJ};π}$": BBayes(𝒟, π1logσ, π1logT).logratio,
#     #r"$B^{\mathrm{Bayes}}_{\mathrm{P,RJ};π_2}$": BBayes(𝒟, π2logσ, π2logT).logratio,
#     r"$B^{\mathrm{elpd}}_{\mathrm{P,RJ};π}$": Belpd(𝒟, π1logσ, π1logT).logratio,
#     #r"$B^{\mathrm{elpd}}_{\mathrm{P,RJ};π_2}$": Belpd(𝒟, π2logσ, π2logT).logratio,
#     }
#    for 𝒟 in tqdm(dataset_list)})
# Assemble cached values into a DataFrame: rows = criteria, columns = dataset conditions
df = pd.DataFrame.from_dict(table_data)
df.columns.names = ["λ", "L", "B0", "s", "rel. σ"]
df.index.name = "Criterion"
# Move λ and L into the row index, leaving (B0, s, rel. σ) as column levels
df = df.stack(["λ", "L"], future_stack=True)
# Put biased data before unbiased, to match figure
df = df.sort_index(axis="columns", ascending=[False, True]) \
       .sort_index(axis="index", ascending=True)
# Undo the sorting along λ, so the longer wavelengths case comes first (same order as the figure)
# Rows come in groups of 6 per criterion: 2 λ windows × 3 dataset sizes
_index = []
for i in range(0, len(df.index), 6):
    _index.extend(df.index[i+3:i+6])
    _index.extend(df.index[i:i+3])
df = df.loc[_index]
# Make a MultiIndex so the heading shows at the top instead of to the left
#df.columns = pd.MultiIndex.from_product([["noise $(s^{-1})$"], df.columns.values])
Show code cell source
# Colorscheme for shading the table. Values will be transformed with log10;
# scheme goes from 0.01 (-2) to 100 (+2);
# See https://matplotlib.org/stable/api/_as_gen/matplotlib.colors.LinearSegmentedColormap.html#matplotlib.colors.LinearSegmentedColormap
# and https://matplotlib.org/stable/users/explain/colors/colormap-manipulation.html
shade_planck = config.figures.colors["pale"]["cyan"]
shade_rj = config.figures.colors["pale"]["red"]
cdict = {"red": [], "green": [], "blue": [], "alpha": []}
# Anchor points (position, rgb, alpha); the repeated per-channel appends of the
# original are folded into one loop.
_anchors = [
    (0.0, mpl.colors.to_rgb(shade_rj),     1.0),  # strong evidence for Rayleigh-Jeans
    (0.5, (1.0, 1.0, 1.0),                 0.0),  # center: white, fully transparent
    (1.0, mpl.colors.to_rgb(shade_planck), 1.0),  # strong evidence for Planck
]
for pos, (r, g, b), alpha in _anchors:
    cdict["red"].append((pos, r, r))
    cdict["green"].append((pos, g, g))
    cdict["blue"].append((pos, b, b))
    cdict["alpha"].append((pos, alpha, alpha))
table_cmap = mpl.colors.LinearSegmentedColormap("rb_diverging", cdict)
del r, g, b, pos, alpha, _anchors   # keep the notebook namespace clean
The colour scheme was picked to saturate at ratios of 30:1, which is around where common practice would interpret the evidence of a Bayes factor to be “strong” or “very strong”.
# Axis labels for the rendered table
df = df \
    .rename_axis(["Criterion", r"$λ (\mathrm{μm})$", "$L$"], axis="index") \
    .rename_axis(["", "$s$", "rel. $σ$"], axis="columns")
# Common options
# log₁₀ 30 ≈ 1.47: colour saturates where a Bayes factor would be called "very strong"
df_styled_html = df.copy().style \
    .background_gradient(table_cmap, axis=0, vmin=-1.5, vmax=+1.5)
df_styled_latex = df.copy().style \
    .background_gradient(table_cmap, axis=0, vmin=-1.5, vmax=+1.5)
# HTML options
# NOTE(review): escape="latex" on an HTML-targeted styler looks inconsistent
# with the escape="html" used on the next line — confirm this is intended.
# FIX: the B0 labels are now raw strings — "\m" in a non-raw string is an
# invalid escape sequence (SyntaxWarning on Python ≥3.12); the runtime value
# is unchanged.
df_styled_html = df_styled_html \
    .format_index(lambda L: viz.format_pow2(L), axis="index", level=2, escape="latex") \
    .format_index(lambda s: viz.format_pow2(s), axis="columns", level=1, escape="html") \
    .format_index(lambda nf: f"{nf*100:2.0f}%", axis="columns", level=2) \
    .format_index(lambda B0: r"$\mathcal{B}_0 > 0$" if B0>0 else r"$\mathcal{B}_0 = 0$", axis="columns", level=0, escape=None) \
    .format(partial(format_B_value, format="unicode")) \
    .set_table_styles([{"selector": "td", "props": "padding: 0.5em;"},
                       {"selector": "th", "props": "text-align: center; border-bottom: 1px black"}])
df_styled_html
ℬ0 > 0 | ℬ0 = 0 | |||||||
---|---|---|---|---|---|---|---|---|
s | 2¹² | 2¹⁶ | 2²⁰ | 2¹² | 2¹⁶ | 2²⁰ | ||
rel. σ | 35% | 9% | 2% | 36% | 9% | 2% | ||
Criterion | λ(μm) | L | ||||||
BlP, RJ | 20–1000 | 2⁹ | -0.72 | -0.74 | 5.64 | -0.72 | -0.82 | 4.62 |
2¹² | 0.64 | -0.41 | 14.49 | 0.62 | -1.05 | 7.94 | ||
2¹⁵ | -3.02 | 10.33 | 40.41 | -3.23 | 8.11 | -8.50 | ||
15–30 | 2⁹ | 0.06 | -0.35 | 3.27 | 0.06 | -0.39 | 2.61 | |
2¹² | 0.52 | 3.86 | 52.20 | 0.49 | 3.53 | 47.12 | ||
2¹⁵ | 3.04 | 23.02 | 389.62 | 2.87 | 20.33 | 348.85 | ||
BBayesP, RJ;π | 20–1000 | 2⁹ | 0.02 | -0.01 | 0.02 | -0.06 | 0.04 | -6×10⁻³ |
2¹² | 0.06 | -0.03 | -0.02 | 0.04 | -0.03 | 0.07 | ||
2¹⁵ | -0.05 | -0.05 | 0.04 | 0.03 | -0.03 | -0.02 | ||
15–30 | 2⁹ | -6×10⁻³ | -0.03 | 0.01 | -0.02 | -7×10⁻³ | -0.05 | |
2¹² | -4×10⁻³ | -5×10⁻³ | 0.04 | -0.04 | -0.06 | -0.01 | ||
2¹⁵ | 0.02 | 0.06 | -0.05 | 9×10⁻³ | 0.05 | -0.01 | ||
BelpdP, RJ;π | 20–1000 | 2⁹ | -4×10⁻³ | 3×10⁻⁵ | 3×10⁻⁴ | 2×10⁻⁴ | 2×10⁻³ | 5×10⁻⁴ |
2¹² | -9×10⁻⁴ | 1×10⁻³ | -4×10⁻³ | -3×10⁻³ | -3×10⁻³ | 7×10⁻⁴ | ||
2¹⁵ | 2×10⁻⁴ | 2×10⁻³ | -2×10⁻³ | 1×10⁻⁵ | -7×10⁻⁵ | -2×10⁻³ | ||
15–30 | 2⁹ | -4×10⁻³ | -2×10⁻³ | -2×10⁻³ | 6×10⁻⁴ | 2×10⁻³ | -4×10⁻⁴ | |
2¹² | -3×10⁻³ | 1×10⁻³ | 6×10⁻⁴ | 6×10⁻⁴ | -8×10⁻⁴ | -2×10⁻³ | ||
2¹⁵ | 6×10⁻⁴ | -2×10⁻⁴ | -1×10⁻³ | -2×10⁻³ | -2×10⁻³ | -1×10⁻³ | ||
ḆRP, RJ | 20–1000 | 2⁹ | 2×10⁻⁵ | -5×10⁻⁴ | 5×10⁻³ | 2×10⁻⁵ | -7×10⁻⁴ | 4×10⁻³ |
2¹² | 2×10⁻⁵ | -5×10⁻⁴ | 5×10⁻³ | 2×10⁻⁵ | -7×10⁻⁴ | 4×10⁻³ | ||
2¹⁵ | 2×10⁻⁵ | -5×10⁻⁴ | 5×10⁻³ | 2×10⁻⁵ | -7×10⁻⁴ | 4×10⁻³ | ||
15–30 | 2⁹ | 8×10⁻⁵ | 6×10⁻⁴ | 0.01 | 8×10⁻⁵ | 6×10⁻⁴ | 9×10⁻³ | |
2¹² | 8×10⁻⁵ | 6×10⁻⁴ | 0.01 | 8×10⁻⁵ | 6×10⁻⁴ | 9×10⁻³ | ||
2¹⁵ | 8×10⁻⁵ | 6×10⁻⁴ | 0.01 | 8×10⁻⁵ | 6×10⁻⁴ | 9×10⁻³ | ||
ḆEMDP, RJ | 20–1000 | 2⁹ | -0.04 | 0.11 | 0.33 | -0.07 | 0.12 | 0.41 |
2¹² | -0.01 | 0.07 | 0.34 | -0.01 | 0.07 | 0.39 | ||
2¹⁵ | -1×10⁻³ | -0.04 | 0.26 | -1×10⁻² | -0.05 | 0.29 | ||
15–30 | 2⁹ | -0.01 | 0.31 | 3.08 | 0.05 | 0.34 | 3.50 | |
2¹² | -0.03 | 0.40 | 1.87 | -0.02 | 0.39 | 1.80 | ||
2¹⁵ | -0.05 | 0.24 | 1.74 | -0.04 | 0.25 | 1.66 |
# Table label and captions for the LaTeX export.
label = "tbl_uv_criteria-comparison"
short_caption = "Comparison of different model selection criteria."
# '@' is a hack to prevent our own preprocessors from substituting \cref with {numref}
# «name» placeholders in the caption are filled in by the .replace chain below,
# keeping the caption in sync with the notebook variables.
caption = r"""
\textbf{Comparison of different model selection criteria} for variations of the datasets shown in
\@cref{fig_uv-example_r-distributions}. Criteria compare the Planck model ($\MP$) against the
Rayleigh-Jeans model ($\MRJ$) and are evaluated for different dataset sizes
($L$), different levels of noise ($s$) and different wavelength windows ($λ$).
To allow for comparisons, all criteria have been transformed into ratios of probabilities.
\textbf{Values reported are the $\log_{10}$ of those ratios.}
Positive (blue shaded) values indicate evidence in favour of the Planck model, while negative (red shaded) values indicate the converse.
For example, a value of +1 is interpreted as $\MP$ being 10 times more likely than $\MRJ$,
while a value of -2 would suggest that $\MP$ is 100 times \emph{less} likely than $\MRJ$.
Noise levels are reported in two ways: $s$ is the actual value used to
generate the data,
while "rel. $σ$" reports the resulting standard deviation as a fraction of the maximum radiance within
the data window.
The \qtyrange{«λmicrowave_low»}{«λmicrowave_high»}{\um} window for $λ$ stretches into the far infrared range,
where the two models are nearly indistinguishable;
hence higher positive values are expected for zero bias, low noise, and the \qtyrange{«λinfrared_low»}{«λinfrared_high»}{\um} window.
As in \@cref{fig_uv-example_r-distributions}, calculations were done for both positive
and null bias conditions (resp. ($\mathcal{B}_0 > 0$ and $\mathcal{B}_0 = 0$);
the former emulates a situation where neither model can fit the data perfectly.
Expressions for all criteria are given in the supplementary text.
For the $\underline{B}^{\mathrm{EMD}}_{\mathrm{P,RJ}}$ criteria, we used $c=\num{«c_chosen»}$.
Although this is the same value as we used for neuron model, it was determined through a separate calibration with different epistemic distributions.
""" \
    .replace("@", "") \
    .replace("«λmicrowave_low»", str(table_λwindow1[0].m)) \
    .replace("«λmicrowave_high»", str(table_λwindow1[1].m)) \
    .replace("«λinfrared_low»", str(table_λwindow2[0].m)) \
    .replace("«λinfrared_high»", str(table_λwindow2[1].m)) \
    .replace("«c_chosen»", str(c_chosen)) \
    .replace(r". ", ". ")  # NOTE(review): renders as a no-op — possibly meant to normalize a special space character; confirm against the repository source
Show code cell source
def tex2md(text):
    """Convert the LaTeX caption into MyST-flavoured Markdown.

    Applies a couple of targeted literal substitutions first (cases the
    generic regexes below would mishandle), then a sequence of regex
    rewrites, and finally appends a link back to the source cell.
    """
    # Hacks
    text = text.replace(r"\textbf{Values reported are the $\log_{10}$ of those ratios.}",
                        r"__Values reported are the $\log_{10}$ of those ratios.__")
    text = text.replace(r"{eq}`eq_def_Bemd-BR-table`",
                        r"{eq}`eq_def_Bemd-BR-table`")
    # "Proper" (but buggy) substitutions, applied in order
    rewrite_rules = [
        (r"\\emph{([^}]*?)}", r"_\1_"),
        (r"\\textbf{([^}]*?)}", r"__\1__"),
        (r"\{numref}`([^`,]+?),([^},]+?)}", r"{numref}`\1` and {numref}`\2`"),
        (r"\{numref}`([^`]*?)}", r"{numref}`\1`"),
        (r"\\qtyrange{(\d*)}{(\d*)}{([^}]*)}", r"\1—\2 $\\mathrm{\3}$"),
        (r"\\um", r"\\mu m"),
        (r"\\num{([^}]*?)}", r"\1"),
    ]
    for pattern, repl in rewrite_rules:
        text = re.sub(pattern, repl, text)
    text = text.replace("~", " ")
    return text + "[[source]](#code_uv-comparison-table)"
2025-03-15 git: emd-paper develop #452d0bad