Comparison with other criteria – large sample asymptotics#
\(\require{mathtools} \newcommand{\notag}{} \newcommand{\tag}{} \newcommand{\label}[1]{} \newcommand{\sfrac}[2]{#1/#2} \newcommand{\bm}[1]{\boldsymbol{#1}} \newcommand{\num}[1]{#1} \newcommand{\qty}[2]{#1\,#2} \renewenvironment{align} {\begin{aligned}} {\end{aligned}} \renewenvironment{alignat} {\begin{alignedat}} {\end{alignedat}} \newcommand{\pdfmspace}[1]{} % Ignore PDF-only spacing commands \newcommand{\htmlmspace}[1]{\mspace{#1}} % Ignore PDF-only spacing commands \newcommand{\scaleto}[2]{#1} % Allow to use scaleto from scalerel package \newcommand{\RR}{\mathbb R} \newcommand{\NN}{\mathbb N} \newcommand{\PP}{\mathbb P} \newcommand{\EE}{\mathbb E} \newcommand{\XX}{\mathbb X} \newcommand{\ZZ}{\mathbb Z} \newcommand{\QQ}{\mathbb Q} \newcommand{\fF}{\mathcal F} \newcommand{\dD}{\mathcal D} \newcommand{\lL}{\mathcal L} \newcommand{\gG}{\mathcal G} \newcommand{\hH}{\mathcal H} \newcommand{\nN}{\mathcal N} \newcommand{\pP}{\mathcal P} \newcommand{\BB}{\mathbb B} \newcommand{\Exp}{\operatorname{Exp}} \newcommand{\Binomial}{\operatorname{Binomial}} \newcommand{\Poisson}{\operatorname{Poisson}} \newcommand{\linop}{\mathcal{L}(\mathbb{B})} \newcommand{\linopell}{\mathcal{L}(\ell_1)} \DeclareMathOperator{\trace}{trace} \DeclareMathOperator{\Var}{Var} \DeclareMathOperator{\Span}{span} \DeclareMathOperator{\proj}{proj} \DeclareMathOperator{\col}{col} \DeclareMathOperator*{\argmin}{arg\,min} \DeclareMathOperator*{\argmax}{arg\,max} \DeclareMathOperator*{\gt}{>} \definecolor{highlight-blue}{RGB}{0,123,255} % definition, theorem, proposition \definecolor{highlight-yellow}{RGB}{255,193,7} % lemma, conjecture, example \definecolor{highlight-orange}{RGB}{253,126,20} % criterion, corollary, property \definecolor{highlight-red}{RGB}{220,53,69} % criterion \newcommand{\logL}{\ell} \newcommand{\eE}{\mathcal{E}} \newcommand{\oO}{\mathcal{O}} \newcommand{\defeq}{\stackrel{\mathrm{def}}{=}} \newcommand{\Bspec}{\mathcal{B}} % Spectral radiance \newcommand{\X}{\mathcal{X}} % X space 
\newcommand{\Y}{\mathcal{Y}} % Y space \newcommand{\M}{\mathcal{M}} % Model \newcommand{\Tspace}{\mathcal{T}} \newcommand{\Vspace}{\mathcal{V}} \newcommand{\Mtrue}{\mathcal{M}_{\mathrm{true}}} \newcommand{\MP}{\M_{\mathrm{P}}} \newcommand{\MRJ}{\M_{\mathrm{RJ}}} \newcommand{\qproc}{\mathfrak{Q}} \newcommand{\D}{\mathcal{D}} % Data (true or generic) \newcommand{\Dt}{\tilde{\mathcal{D}}} \newcommand{\Phit}{\widetilde{\Phi}} \newcommand{\Phis}{\Phi^*} \newcommand{\qt}{\tilde{q}} \newcommand{\qs}{q^*} \newcommand{\qh}{\hat{q}} \newcommand{\AB}[1]{\mathtt{AB}~\mathtt{#1}} \newcommand{\LP}[1]{\mathtt{LP}~\mathtt{#1}} \newcommand{\NML}{\mathrm{NML}} \newcommand{\iI}{\mathcal{I}} \newcommand{\true}{\mathrm{true}} \newcommand{\dist}{D} \newcommand{\Mtheo}[1]{\mathcal{M}_{#1}} % Model (theoretical model); index: param set \newcommand{\DL}[1][L]{\mathcal{D}^{(#1)}} % Data (RV or generic) \newcommand{\DLp}[1][L]{\mathcal{D}^{(#1')}} % Data (RV or generic) \newcommand{\DtL}[1][L]{\tilde{\mathcal{D}}^{(#1)}} % Data (RV or generic) \newcommand{\DpL}[1][L]{{\mathcal{D}'}^{(#1)}} % Data (RV or generic) \newcommand{\Dobs}[1][]{\mathcal{D}_{\mathrm{obs}}^{#1}} % Data (observed) \newcommand{\calibset}{\mathcal{C}} \newcommand{\N}{\mathcal{N}} % Normal distribution \newcommand{\Z}{\mathcal{Z}} % Partition function \newcommand{\VV}{\mathbb{V}} % Variance \newcommand{\T}{\mathsf{T}} % Transpose \newcommand{\EMD}{\mathrm{EMD}} \newcommand{\dEMD}{d_{\mathrm{EMD}}} \newcommand{\dEMDtilde}{\tilde{d}_{\mathrm{EMD}}} \newcommand{\dEMDsafe}{d_{\mathrm{EMD}}^{\text{(safe)}}} \newcommand{\e}{ε} % Model confusion threshold \newcommand{\falsifythreshold}{ε} \newcommand{\bayes}[1][]{B_{#1}} \newcommand{\bayesthresh}[1][]{B_{0}} \newcommand{\bayesm}[1][]{B^{\mathcal{M}}_{#1}} \newcommand{\bayesl}[1][]{B^l_{#1}} \newcommand{\bayesphys}[1][]{B^{{p}}_{#1}} \newcommand{\Bconf}[1]{B^{\mathrm{epis}}_{#1}} \newcommand{\Bemd}[1]{B^{\mathrm{EMD}}_{#1}} \newcommand{\BQ}[1]{B^{Q}_{#1}} 
\newcommand{\Bconfbin}[1][]{\bar{B}^{\mathrm{conf}}_{#1}} \newcommand{\Bemdbin}[1][]{\bar{B}_{#1}^{\mathrm{EMD}}} \newcommand{\bin}{\mathcal{B}} \newcommand{\Bconft}[1][]{\tilde{B}^{\mathrm{conf}}_{#1}} \newcommand{\fc}{f_c} \newcommand{\fcbin}{\bar{f}_c} \newcommand{\paramphys}[1][]{Θ^{{p}}_{#1}} \newcommand{\paramobs}[1][]{Θ^{ε}_{#1}} \newcommand{\test}{\mathrm{test}} \newcommand{\train}{\mathrm{train}} \newcommand{\synth}{\mathrm{synth}} \newcommand{\rep}{\mathrm{rep}} \newcommand{\MNtrue}{\mathcal{M}^{{p}}_{\text{true}}} \newcommand{\MN}[1][]{\mathcal{M}^{{p}}_{#1}} \newcommand{\MNA}{\mathcal{M}^{{p}}_{Θ_A}} \newcommand{\MNB}{\mathcal{M}^{{p}}_{Θ_B}} \newcommand{\Me}[1][]{\mathcal{M}^ε_{#1}} \newcommand{\Metrue}{\mathcal{M}^ε_{\text{true}}} \newcommand{\Meobs}{\mathcal{M}^ε_{\text{obs}}} \newcommand{\Meh}[1][]{\hat{\mathcal{M}}^ε_{#1}} \newcommand{\MNa}{\mathcal{M}^{\mathcal{N}}_a} \newcommand{\MeA}{\mathcal{M}^ε_A} \newcommand{\MeB}{\mathcal{M}^ε_B} \newcommand{\Ms}{\mathcal{M}^*} \newcommand{\MsA}{\mathcal{M}^*_A} \newcommand{\MsB}{\mathcal{M}^*_B} \newcommand{\Msa}{\mathcal{M}^*_a} \newcommand{\MsAz}{\mathcal{M}^*_{A,z}} \newcommand{\MsBz}{\mathcal{M}^*_{B,z}} \newcommand{\Msaz}{\mathcal{M}^*_{a,z}} \newcommand{\MeAz}{\mathcal{M}^ε_{A,z}} \newcommand{\MeBz}{\mathcal{M}^ε_{B,z}} \newcommand{\Meaz}{\mathcal{M}^ε_{a,z}} \newcommand{\zo}{z^{0}} \renewcommand{\lL}[2][]{\mathcal{L}_{#1|{#2}}} % likelihood \newcommand{\Lavg}[2][]{\mathcal{L}^{/#2}_{#1}} % Geometric average of likelihood \newcommand{\lLphys}[2][]{\mathcal{L}^{{p}}_{#1|#2}} \newcommand{\Lavgphys}[2][]{\mathcal{L}^{{p}/#2}_{#1}} % Geometric average of likelihood \newcommand{\lLL}[3][]{\mathcal{L}^{(#3)}_{#1|#2}} \newcommand{\lLphysL}[3][]{\mathcal{L}^{{p},(#3)}_{#1|#2}} \newcommand{\lnL}[2][]{l_{#1|#2}} % Per-sample log likelihood \newcommand{\lnLt}[2][]{\widetilde{l}_{#1|#2}} \newcommand{\lnLtt}{\widetilde{l}} % Used only in path_sampling \newcommand{\lnLh}[1][]{\hat{l}_{#1}} 
\newcommand{\lnLphys}[2][]{l^{{p}}_{#1|#2}} \newcommand{\lnLphysL}[3][]{l^{{p},(#3)}_{#1|#2}} \newcommand{\Elmu}[2][1]{μ_{{#2}}^{(#1)}} \newcommand{\Elmuh}[2][1]{\hat{μ}_{{#2}}^{(#1)}} \newcommand{\Elsig}[2][1]{Σ_{{#2}}^{(#1)}} \newcommand{\Elsigh}[2][1]{\hat{Σ}_{{#2}}^{(#1)}} \newcommand{\pathP}{\mathop{{p}}} % Path-sampling process (generic) \newcommand{\pathPhb}{\mathop{{p}}_{\mathrm{Beta}}} % Path-sampling process (hierarchical beta) \newcommand{\interval}{\mathcal{I}} \newcommand{\Phiset}[1]{\{\Phi\}^{\small (#1)}} \newcommand{\Phipart}[1]{\{\mathcal{I}_Φ\}^{\small (#1)}} \newcommand{\qhset}[1]{\{\qh\}^{\small (#1)}} \newcommand{\Dqpart}[1]{\{Δ\qh_{2^{#1}}\}} \newcommand{\LsAzl}{\mathcal{L}_{\smash{{}^{\,*}_A},z,L}} \newcommand{\LsBzl}{\mathcal{L}_{\smash{{}^{\,*}_B},z,L}} \newcommand{\lsA}{l_{\smash{{}^{\,*}_A}}} \newcommand{\lsB}{l_{\smash{{}^{\,*}_B}}} \newcommand{\lsAz}{l_{\smash{{}^{\,*}_A},z}} \newcommand{\lsAzj}{l_{\smash{{}^{\,*}_A},z_j}} \newcommand{\lsAzo}{l_{\smash{{}^{\,*}_A},z^0}} \newcommand{\leAz}{l_{\smash{{}^{\,ε}_A},z}} \newcommand{\lsAez}{l_{\smash{{}^{*ε}_A},z}} \newcommand{\lsBz}{l_{\smash{{}^{\,*}_B},z}} \newcommand{\lsBzj}{l_{\smash{{}^{\,*}_B},z_j}} \newcommand{\lsBzo}{l_{\smash{{}^{\,*}_B},z^0}} \newcommand{\leBz}{l_{\smash{{}^{\,ε}_B},z}} \newcommand{\lsBez}{l_{\smash{{}^{*ε}_B},z}} \newcommand{\LaszL}{\mathcal{L}_{\smash{{}^{*}_a},z,L}} \newcommand{\lasz}{l_{\smash{{}^{*}_a},z}} \newcommand{\laszj}{l_{\smash{{}^{*}_a},z_j}} \newcommand{\laszo}{l_{\smash{{}^{*}_a},z^0}} \newcommand{\laez}{l_{\smash{{}^{ε}_a},z}} \newcommand{\lasez}{l_{\smash{{}^{*ε}_a},z}} \newcommand{\lhatasz}{\hat{l}_{\smash{{}^{*}_a},z}} \newcommand{\pasz}{p_{\smash{{}^{*}_a},z}} \newcommand{\paez}{p_{\smash{{}^{ε}_a},z}} \newcommand{\pasez}{p_{\smash{{}^{*ε}_a},z}} \newcommand{\phatsaz}{\hat{p}_{\smash{{}^{*}_a},z}} \newcommand{\phateaz}{\hat{p}_{\smash{{}^{ε}_a},z}} \newcommand{\phatseaz}{\hat{p}_{\smash{{}^{*ε}_a},z}} \newcommand{\Phil}[2][]{Φ_{#1|#2}} % Φ_{\la} 
\newcommand{\Philt}[2][]{\widetilde{Φ}_{#1|#2}} % Φ_{\la} \newcommand{\Philhat}[2][]{\hat{Φ}_{#1|#2}} % Φ_{\la} \newcommand{\Philsaz}{Φ_{\smash{{}^{*}_a},z}} % Φ_{\lasz} \newcommand{\Phileaz}{Φ_{\smash{{}^{ε}_a},z}} % Φ_{\laez} \newcommand{\Philseaz}{Φ_{\smash{{}^{*ε}_a},z}} % Φ_{\lasez} \newcommand{\mus}[1][1]{μ^{(#1)}_*} \newcommand{\musA}[1][1]{μ^{(#1)}_{\smash{{}^{\,*}_A}}} \newcommand{\SigsA}[1][1]{Σ^{(#1)}_{\smash{{}^{\,*}_A}}} \newcommand{\musB}[1][1]{μ^{(#1)}_{\smash{{}^{\,*}_B}}} \newcommand{\SigsB}[1][1]{Σ^{(#1)}_{\smash{{}^{\,*}_B}}} \newcommand{\musa}[1][1]{μ^{(#1)}_{\smash{{}^{*}_a}}} \newcommand{\Sigsa}[1][1]{Σ^{(#1)}_{\smash{{}^{*}_a}}} \newcommand{\Msah}{{\color{highlight-red}\mathcal{M}^{*}_a}} \newcommand{\Msazh}{{\color{highlight-red}\mathcal{M}^{*}_{a,z}}} \newcommand{\Meah}{{\color{highlight-blue}\mathcal{M}^{ε}_a}} \newcommand{\Meazh}{{\color{highlight-blue}\mathcal{M}^{ε}_{a,z}}} \newcommand{\lsazh}{{\color{highlight-red}l_{\smash{{}^{*}_a},z}}} \newcommand{\leazh}{{\color{highlight-blue}l_{\smash{{}^{ε}_a},z}}} \newcommand{\lseazh}{{\color{highlight-orange}l_{\smash{{}^{*ε}_a},z}}} \newcommand{\Philsazh}{{\color{highlight-red}Φ_{\smash{{}^{*}_a},z}}} % Φ_{\lasz} \newcommand{\Phileazh}{{\color{highlight-blue}Φ_{\smash{{}^{ε}_a},z}}} % Φ_{\laez} \newcommand{\Philseazh}{{\color{highlight-orange}Φ_{\smash{{}^{*ε}_a},z}}} % Φ_{\lasez} \newcommand{\emdstd}{\tilde{σ}} \DeclareMathOperator{\Mvar}{Mvar} \DeclareMathOperator{\AIC}{AIC} \DeclareMathOperator{\epll}{epll} \DeclareMathOperator{\elpd}{elpd} \DeclareMathOperator{\MDL}{MDL} \DeclareMathOperator{\comp}{COMP} \DeclareMathOperator{\Lognorm}{Lognorm} \DeclareMathOperator{\erf}{erf} \DeclareMathOperator*{\argmax}{arg\,max} \DeclareMathOperator{\Image}{Image} \DeclareMathOperator{\sgn}{sgn} \DeclareMathOperator{\SE}{SE} % standard error \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator{\Poisson}{Poisson} \DeclareMathOperator{\SkewNormal}{SkewNormal} 
\DeclareMathOperator{\TruncNormal}{TruncNormal} \DeclareMathOperator{\Exponential}{Exponential} \DeclareMathOperator{\exGaussian}{exGaussian} \DeclareMathOperator{\IG}{IG} \DeclareMathOperator{\NIG}{NIG} \DeclareMathOperator{\Gammadist}{Gamma} \DeclareMathOperator{\Lognormal}{Lognormal} \DeclareMathOperator{\Beta}{Beta} \newcommand{\sinf}{{s_{\infty}}}\)
from emdcmp import draw_R_samples
from other_criteria import FactorizedPrior, get_ppfs, R, AIC, BIC, DIC, logℰ, elpd, 𝓁
import mdl_uv
Show code cell source
import viz # Local module
from paul_tol_rainbow import discrete_rainbow_scheme # Local module
import matplotlib as mpl
import matplotlib.pyplot as plt
import panel as pn
import holoviews as hv
from cycler import cycler
hv.extension("matplotlib")
Use sans-serif for LaTeX axis labels.
STIX is maybe not the prettiest font, but it is very legible.
Bitstream Vera Sans is the matplotlib default for non-TeX labels.
Show code cell source
# Matplotlib font configuration: render math with the built-in mathtext engine
# (no external TeX) using the STIX fonts, which are legible if not pretty.
params = {
    'text.usetex': False,
    'font.family': 'STIXGeneral',
    'mathtext.fontset': 'stixsans',
}
mpl.rcParams.update(params)
We need to extend the color scheme from Ex_UV to support 4 candidates.
For this we use the discrete scheme proposed by Paul Tol [78, Fig. 22], which supports sequences between 1 and 23 steps. This is implemented by our custom function discrete_rainbow_scheme, which is essentially just a table lookup.
Some other decent options for candidates are listed below, from most to least discriminable. The greater variety of colours in the most discriminable palettes makes them a bit jarring.
# Alternative candidate palettes, listed from most to least discriminable.
# These are bare expressions: in the notebook each renders as a colour swatch.
hv.Palette("TolRainbow", range=(0.1, 1.), reverse=True)
hv.Palette("Sunset", range=(0., 1.), reverse=True)
hv.Palette("Magma", range=(0., .9), reverse=False)
hv.Palette("TolYlOrBr", range=(0.3, 1.), reverse=True)
@dataclass
class colors(colors):
    # Extend the colour scheme inherited from Ex_UV with a 4-step discrete
    # rainbow (Paul Tol's scheme); reversed so candidate order matches hue order.
    candidates : hv.Cycle = hv.Cycle(values = discrete_rainbow_scheme(4)[::-1])
colors
| scale | data | RJ | Planck | calib_curves | candidates |
|---|---|---|---|---|---|
Note
See Fig. 2.8 for a more complete version of this table
Criterion |
Not restricted to nested models |
No restriction on noise model |
Not restricted to log likelihood risk |
Not asymptotic |
Allows singular models |
Allows Bayesian models |
Symmetric |
Information-theoretic |
Consistent |
Robust wrt. misspecification |
Source of variability |
|---|---|---|---|---|---|---|---|---|---|---|---|
\(\Bemd{}\) |
✔ |
✔ |
✔ |
✔ |
✔ |
✔ |
✔ |
✘ |
✔ |
✔ |
Imperfect replications |
WAIC |
✔ |
✔ |
✔ |
✔ |
✔ |
✔ |
✔ |
✘ |
✔ |
✘⁽²⁾ |
Posterior |
Bayes factor |
✔ |
✔ |
✘ |
✔ |
✘ |
✔ |
✔ |
✘ |
✔ |
✘⁽²⁾ |
Posterior |
MDL |
✔/✘⁽¹⁾ |
✔ |
✔/✘⁽¹⁾ |
✔ |
✘ |
✔ |
✘ |
✔ |
✔ |
✘⁽³⁾ |
Uniform dist on event space |
(BIC) |
✔ |
✔ |
✘ |
✘ |
✘ |
✔ |
✔ |
✘ |
✔/✘ |
✘ |
Posterior |
DIC |
✘ |
✘ |
✘ |
✘ |
✘ |
✔ |
✘ |
✔ |
✔ |
✘ |
Posterior |
AIC |
✘ |
✘ |
✘ |
✘ |
✘ |
✘ |
✘ |
✔ |
✘ |
✘ |
Perfect replications |
⁽¹⁾ Some entries are marked ✔/✘ for MDL because although the general form may not have these restrictions, they are usually required to obtain tractable approximations.
⁽²⁾ Bayesian methods are sometimes described as accounting for epistemic errors, but only for errors which are within the hypothesis class. So specifically not misspecification.
⁽³⁾ There has been at least one attempt to make MDL robust in the case of misspecified models, but it is far from standardized. It is also unclear whether it would be broadly applicable, since it depends on an ad hoc scaling of the likelihood, which itself introduces an additional hyperparameter.
Data#
For this experiment we want one of the candidates to be the true model. Also, in order to compute NML complexity exactly, we want to be able to enumerate all possible datasets. This is possible thanks to how we set the problem:
The use of Poisson noise means that possible values are discretized. Technically there are infinitely many possible values, but their probability quickly vanishes, so we need to consider only a dozen or so.
Fixed, exact \(λ\) values: for a given dataset size \(L\), we always generate data with the abscissa \(λ_k = λ_{\mathrm{min}} + k \frac{λ_{\mathrm{max}} - λ_{\mathrm{min}}}{L-1}\), where \(k=0, \dotsc, L-1\).
Hint
This is implemented as the EnumerableDataset class within the MDL utilities.
This means we can implement enumeration as follows:
For each \(λ_k\), determine a set of possible values for \(\Bspec_k\). Below we use an interval containing 99.98% of the probability mass. Each \(\Bspec_k\) is a sequential array with equal steps.
A dataset is generated by picking a random value from each \(\Bspec_k\): \(\{(λ_k, b_k) : b_k \sim \Bspec_k\}\). The total number of datasets is therefore \(\prod_{k=0}^{L-1} \lvert \Bspec_k \rvert\).
In practice there are still an unfeasibly large number of datasets. Therefore we generate them from most to least likely: the hope is that the integrand in an NML integral (which in this case is a series) should correlate with the true data likelihood, and that we can truncate the series once we have enough dominant terms.
# True data-generating process: an enumerable dataset (discretized radiance
# values on a fixed λ grid), so that all possible datasets can be enumerated
# when computing the NML complexity exactly.
𝒟 = mdl_uv.EnumerableDataset(
    "comparison with other criteria",
    L   =100,
    λmin=0.6*μm,
    λmax=2  *μm,
    s   =16*Bunits**-1,   # noise scale (inverse radiance units)
    T   =data_T)
Show code cell source
#data = Dataset("", L=4000, λmin=0.1*μm, λmax=2*μm, s=(2e-3*Bunits)**-1, T=3000*K)
# Near-noiseless copy of the dataset, used to draw the mean curve
𝒟_mean = replace(𝒟, s=(4e-5*Bunits)**-1)
dims = dict(kdims=hv.Dimension("λ", label="λ — wavelength", unit="μm"),
            vdims=hv.Dimension("B", label="$\\tilde{B}$ – radiance", unit="kW/m² nm sr"))
# Overlay: scatter of sampled data + the (almost) noise-free mean curve
fig = hv.Scatter(𝒟.get_data(strip_units=True), **dims) * hv.Curve(𝒟_mean.get_data(strip_units=True), **dims)
fig.opts(
    hv.opts.Curve(color="#AA0022"),
    hv.opts.Scatter(size=1.5, backend="bokeh"),     # bokeh and matplotlib use different size kwargs
    hv.opts.Scatter(s=12, backend="matplotlib"),
    hv.opts.Overlay(fig_inches=tuple(2*((40, 34)*ureg.mm).to("in").m),  # 80×68 mm → inches
                    fontscale=2, backend="matplotlib"),
    hv.opts.Overlay(width=500, backend="bokeh")
)
hv.save(fig, config.paths.figures/"compare-other-methods_sample-data.svg")
Note that in contrast to the main setup, we consider a different regime, with wavelength window closer to the distribution’s peak and lower relative noise. This makes the likelihood essentially Gaussian, up to a normalization factor.
Candidate models#
Most established model comparison methods are meant to compare nested models, where preferably one of the alternatives is the true model. For criteria typically interpreted in terms of information theory, like AIC and MDL, that interpretation usually only holds when the true model is also the simplest. Therefore, to give existing criteria the fairest comparison, we set up a model comparison problem where the model alternatives are variants of the Planck radiance model augmented with a polynomial:
Note that in contrast to the main setup, the difference between Poisson and Gaussian noise is not too important, for two reasons:
We increased the signal to noise ratio, partly by choosing a regime (i.e. a \(λ\) window) which contains the largest radiance values.
We scale the variance in accordance with the Poisson distribution of the underlying data.
This ensures that whether we fit the model using Poisson or Gaussian likelihood, the true model is recovered when \(m=0\).
It is worth emphasizing that in order to accommodate some criteria, we are posing a different problem than the one we actually want to solve. First, instead of comparing the Planck (\(\MP\)) and Rayleigh-Jeans (\(\MRJ\)) models, we compare some artificially augmented version of \(\MP\). Second, we need to be in the regime where the Poisson is effectively a discretized Gaussian. This allows us to substitute the Poisson’s discrete PMF with the Gaussian’s continuous PDF for evaluating the loss. (If we tried to fit a discrete model, its PMF would vanish as soon as one data point is not in the support.) This also ensures that the asymptotic criteria, which assume that we have reached the regime where the likelihood is “Gaussian-like”, are valid. Finally, the discreteness of the data distribution nevertheless allows us to enumerate all possible datasets, a feature which is convenient for computing the NML distribution.
Making these choices of regime and data-generating model ensures no method is disadvantaged, but of course in practice a practitioner does not get to choose their data, and should choose their model(s) based on what makes sense for those data, not what simplifies the statistical analysis.
In practice of course we would like to be able to compare models without restriction on their structure or the type of noise, as we did in Fig. 2.7 comparing the Planck and Rayleigh-Jeans models with \(\Bemd{}\). Bayesian methods are also agnostic to the choice of model. Bayesian methods however consider a different type of epistemic error, and in particular ignore errors due to misspecification. We illustrate this with a comparison table in Table 6.1. For completeness, this table also includes values obtained with the other criteria – since in practice they are often used even when they are invalid.
@dataclass(frozen=True)
class CandidateModel:
    """A candidate model: a physical radiance model (Planck or Rayleigh-Jeans)
    augmented with a spurious polynomial of order ``m = len(coeffs)``, plus a
    Gaussian observation model whose variance scales with the *physical*
    radiance (mimicking the Poisson scaling of the underlying count data).
    """
    physmodel: Literal["Planck","Rayleigh-Jeans"]  # FIX: was the typo "Plank", inconsistent with the "Planck" key used everywhere else
    σ : float|PintQuantity   # noise scale; plain floats are coerced to Bunits
    T : PintQuantity         # temperature; plain floats are coerced to kelvin
    coeffs: list[float]      # coefficients of the spurious polynomial

    @property
    def m(self):
        """Returns the order of the model’s spurious polynomial."""
        return len(self.coeffs)

    def __post_init__(self):
        # If parameters were given as plain floats, convert to default units
        if not isinstance(self.σ, pint.Quantity):
            object.__setattr__(self, "σ", self.σ * Bunits)  # IMO bypassing frozen dataclass in __post_init__ is acceptable
        if not isinstance(self.T, pint.Quantity):
            object.__setattr__(self, "T", self.T * ureg.kelvin)

    def _noise_scale(self, λ):
        """Standard deviation (magnitude only) of the observation noise at `λ`.

        The variance scales with the *physical* (un-augmented) radiance.
        Shared by `Q` and `apply_obsmodel` so the loss always matches the
        observation model (previously duplicated — see old HACK comments).
        """
        Bphys = phys_models[self.physmodel](λ, self.T).to(Bunits)
        return self.σ.m * np.sqrt(Bphys.m)

    def Q(self, λ_B):
        """
        The loss is computed from differences between the observed data `λ_B`
        and predictions of the _physical_ model. The logpdf of the _observation_
        model is used to convert those differences into a loss.
        """
        λ, Bdata = λ_B
        _, Btheory = self.apply_physmodel(λ)
        # IMPORTANT: the noise scale must match the one used in `apply_obsmodel`
        return -stats.norm.logpdf((Bdata - Btheory).m, loc=0, scale=self._noise_scale(λ))

    def apply_obsmodel(self, λ_B, rng=None):
        """Add Gaussian observation noise to the radiances in `λ_B`."""
        λ, Bdata = λ_B
        noise = stats.norm.rvs(size=len(Bdata), loc=0,
                               scale=self._noise_scale(λ), random_state=rng)
        return λ, Bdata + noise*Bunits

    def apply_physmodel(self, λarr: ArrayLike, rng=None) -> tuple[ArrayLike, ArrayLike]:
        """Evaluate the deterministic part: physical model + spurious polynomial."""
        if isinstance(λarr, tuple):  # Allow passing output of data directly
            assert len(λarr) == 2, "CandidateModel expects either a single array `λarr`, or a tuple `(λarr, Barr)`."
            λarr = λarr[0]
        Barr = phys_models[self.physmodel](λarr, self.T).to(Bunits)
        if self.coeffs:
            Barr += Polynomial(self.coeffs)(λarr.m) * Bunits
        return λarr, Barr

    def gen_data(self, L, λmin=data_λ_min, λmax=data_λ_max, rng=None):
        """Generate a synthetic dataset of `L` evenly spaced wavelengths."""
        λarr = np.linspace(λmin, λmax, L)
        return self.apply_obsmodel(self.apply_physmodel(λarr), rng=rng)

    def __call__(self, λarr, rng=None):
        # Convenience: model(λarr) == noisy prediction at λarr
        return self.apply_obsmodel(self.apply_physmodel(λarr), rng=rng)
Criteria & loss definitions#
Risk functional#
We use the negative log likelihood as our risk functional, with a Gaussian observation model.
(For technical reasons the risk functional is defined above as a method of the candidate model. The comparison functions expect a separate functional, so Q[phys,σ,T,coeffs] is a wrapper which calls Q from a properly parameterized model.)
Show code cell source
class QMeta(type):
    """Metaclass giving the Q-classes a parameterization syntax:
    ``Qcls[σ, T, c0, c1, ...]`` (subscript) or ``Qcls(σ, T, coeffs)`` (call),
    both returning the risk functional of a properly parameterized model.
    """
    def __getitem__(cls, Θ):
        # Subscript form: first two entries are (σ, T), the rest are coefficients.
        σ, T, *coeffs = Θ
        return cls.__call__(σ, T, coeffs)

    def __call__(cls, σ, T, coeffs=()):
        # Accept either Qcls(σ, T, c0, c1) or Qcls(σ, T, [c0, c1]).
        if len(coeffs) == 1 and isinstance(coeffs[0], (list, tuple)):
            (coeffs,) = coeffs
        candidate = CandidateModel(cls.physmodel, σ, T, coeffs)
        return candidate.Q
#@dataclass
class QPlanck(metaclass=QMeta):
    # Risk-functional factory for the Planck physical model
    physmodel="Planck"
class QRJ(metaclass=QMeta):
    # Risk-functional factory for the Rayleigh-Jeans physical model
    physmodel="Rayleigh-Jeans"
# Dispatch table: look up the risk-functional factory by physical-model name
Q = {"Planck": QPlanck, "Rayleigh-Jeans": QRJ}
Fitting function#
Given some observed data and the order of the extra polynomial, returns a tuple (σ, T, coeffs) of fitted parameters.
Parameters are fitted by minimizing \(Q\), subject to some mild regularization on \(σ\) and \(T\).
Show code cell source
def _fitΘ(λℬ, physmodel, m):
    """Fit (σ, T, coeffs) to the data `λℬ` by minimizing the risk Q,
    subject to mild regularization on σ and T.

    σ and T are optimized in log₂ space so they remain positive.
    Returns a tuple ``(σ, T, coeffs)`` of plain floats.
    """
    log2_T0 = np.log2(data_T.m)
    priorT_std = 12  # NOTE(review): original comment read "|log₂ 4000 - log₂ 5000| ≈ 12",
                     # but that difference is ≈ 0.32; log₂ 4000 ≈ 12, so this is a very
                     # loose prior on log₂ T — confirm intended width
    def f(Θtilde, physmodel=physmodel, λℬ=λℬ):
        log2σ, log2T, *coeffs = Θtilde
        σ = 2**log2σ; T = 2**log2T
        risk = Q[physmodel](σ, T, coeffs)(λℬ).mean()
        priorσ = 2**(-log2σ/128)  # Soft floor on σ so it cannot go too low
        priorT = (log2T - log2_T0)**2 / (2*priorT_std**2)  # Gaussian penalty in log₂ space
        #priorσ = 0
        #priorT = 0
        return risk + priorσ + priorT
    # Initial guess: σ=0.1, T=data_T, all polynomial coefficients zero
    res = optimize.minimize(f, np.r_[np.log2([.1, data_T.m]), [0]*m], tol=1e-5)
    log2σ, log2T, *coeffs = res.x
    σ = 2**log2σ; T = 2**log2T
    return σ, T, tuple(coeffs)
# Memoization table for fitΘ, keyed on the data values + model family + poly order
_fitΘ_cache = {}

def fitΘ(λℬ, physmodel, m):
    """Memoized wrapper around `_fitΘ`.

    The cache key is built from the data magnitudes, the physical-model name
    and the polynomial order, so refitting identical problems is free.
    """
    key = (tuple(λℬ[0].m), tuple(λℬ[1].m), physmodel, m)
    res = _fitΘ_cache.get(key)
    if res is None:
        res = _fitΘ(λℬ, physmodel, m)
        _fitΘ_cache[key] = res  # FIX: the result was never stored, so the cache was inert
    return res
Bayesian prior#
# Loose prior on (σ, T): several octaves of σ and a wide temperature range
π_phys_loose = FactorizedPrior(["loguniform", "loguniform"],
                               [(2**-10, 2**-1), (1000, 10000)],
                               rng="prior - compare - models")
# Tight prior on (σ, T): concentrated around the data-generating values
π_phys_tight = FactorizedPrior(["loguniform", "loguniform"],
                               [(2**-5, 2**-4), (2000, 5000)],
                               rng="prior - compare - models")
def π_coeffs_loose(m):
    """Loose prior over the m spurious-polynomial coefficients: iid Normal(0, 5)."""
    return FactorizedPrior(["norm"]*m, [(0, 5)]*m,
                           rng="prior - compare - models - coeffs")
def π_coeffs_tight(m):
    """Tight prior over the m spurious-polynomial coefficients: iid Normal(0, 2)."""
    return FactorizedPrior(["norm"]*m, [(0, 2)]*m,
                           rng="prior - compare - models - coeffs")
NML distribution#
def logL_mdl(*Θ, Q):
    """Return a log-likelihood function for MDL: λB ↦ −Q(*Θ)(λB)."""
    def logL(λB):
        return -Q(*Θ)(λB)
    return logL
def MDL_criterion(Q, θˆ, 𝒟, physmodel, m):
    """MDL = −(max log likelihood at θˆ) + NML complexity.

    Returns None when the complexity term is unavailable (e.g. when
    `no_compute=refresh_shelves` prevents recomputation).
    """
    #return Q[θˆ](𝒟.get_data()) \
    # NOTE: cache_key spells "Plank" (sic) — kept as-is, since changing it
    # would invalidate existing on-disk caches
    comp = mdl_uv.comp(𝒟, r=30, m=m, logL=partial(logL_mdl, Q=Q),
                       fitΘ=lambda λℬ, m: fitΘ(λℬ, physmodel, m),
                       cache_key="UV - Plank + poly", rng=3014,
                       no_compute=refresh_shelves)
    return None if comp is None else -𝓁(θˆ, Q, 𝒟) + comp
Compute criteria#
Note: We don’t change the dataset’s seed, so there is correlation in the data, but not exactly the same \((x_i, y_i)\) pairs are drawn. (Because the \(x_i\) are distributed evenly across \([λ_{min}, λ_{max}]\).)
mlist = [0,2,4,6]  # polynomial orders of the candidate models
Llist = [2**6, 2**7, 2**8, 2**9, 2**10, 2**11, 2**12] # np.logspace(6, 12, 7, base=2)
Llist_short = [2**6, 2**8, 2**10, 2**12]
# NOTE: deliberately overrides the coarse Llist above with a denser 25-point log grid
Llist = np.logspace(6, 12, 25, base=2).astype(int)
# One candidate colour per polynomial order
assert len(mlist) <= len(colors.candidates)
The PPF curves are useful diagnostics in and of themselves, to check that the fitted model behaves as expected.
Show code cell source
Φarr = np.linspace(0, 1)
ppf_panels = {}      # (physmodel, L, m) -> mixed/synth PPF curves
scatter_panels = {}  # (physmodel, L, m) -> model vs true data scatter

def no_units(args):
    """Strip pint units, returning the plain magnitudes as a tuple."""
    return tuple(a.m for a in args)

# FIX: rebuild each dataset from a pristine base. Previously `𝒟` was rebound to
# `replace(𝒟, ...)` inside the loop, so the purpose string (and hence the data
# seed) compounded " - L=..." suffixes across iterations — contradicting the
# stated intent of keeping the dataset seed fixed.
𝒟_base = 𝒟
for physmodel in ["Planck", "Rayleigh-Jeans"]:
    for L in Llist:
        for m in mlist:
            𝒟 = replace(𝒟_base, L=L, purpose=𝒟_base.purpose + f" - {L=}")
            θˆ = fitΘ(𝒟.get_data(), physmodel, m=m)
            𝓜 = CandidateModel(physmodel, *θˆ)
            # NOTE: same rng purpose every iteration — deliberate, see note above
            mixed_ppf, synth_ppf = get_ppfs(𝓜, 𝒟, rng=utils.get_rng("synth ppf - compare - models"))
            ppf_panels[physmodel, L, m] = (
                hv.Curve((Φarr, mixed_ppf(Φarr)), kdims="Φ", vdims="PPF", label="mixed")
                * hv.Curve((Φarr, synth_ppf(Φarr)), kdims="Φ", vdims="PPF", label="synth"))
            scatter_panels[physmodel, L, m] = (
                # FIX: was `D.L, λmin=D.λmin, λmax=D.λmax` — `D` is undefined here (typo for 𝒟)
                hv.Scatter(no_units(𝓜.gen_data(𝒟.L, λmin=𝒟.λmin, λmax=𝒟.λmax)), kdims="λ", vdims="B", label="model")
                * hv.Scatter(no_units(𝒟.get_data()), kdims="λ", vdims="B", label="true"))