# Standard library
import os

# Third-party
import numpy as np
from astropy.utils.decorators import deprecated_renamed_argument

# Project
from .data_helpers import validate_prepare_data
from .likelihood_helpers import get_trend_design_matrix
from .logging import logger
from .prior import JokerPrior, _validate_model
from .prior_helpers import validate_n_offsets, validate_poly_trend
from .samples import JokerSamples
from .samples_analysis import is_P_unimodal
from .src.fast_likelihood import CJokerHelper

__all__ = ["TheJoker"]

class TheJoker:
"""
A custom Monte-Carlo sampler for two-body systems.
Parameters
----------
prior : `~thejoker.JokerPrior`
The specification of the prior probability distribution over all
parameters used in The Joker.
pool : `schwimmbad.BasePool` (optional)
A processing pool (default is a `schwimmbad.SerialPool` instance).
rng : `numpy.random.Generator` (optional)
A `numpy.random.Generator` instance for controlling random number
generation.
tempfile_path : str (optional)
        A location on disk where The Joker may store temporary files. Any
        files written here by The Joker are cleaned up automatically: if
        files in this path persist, something has likely gone wrong within
        The Joker. Default: ``~/.thejoker``
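
    Examples
    --------
    A minimal construction sketch (the prior ranges below are illustrative,
    not recommendations):

    >>> import astropy.units as u
    >>> import numpy as np
    >>> import thejoker as tj
    >>> prior = tj.JokerPrior.default(
    ...     P_min=2 * u.day, P_max=256 * u.day,
    ...     sigma_K0=30 * u.km / u.s, sigma_v=100 * u.km / u.s)
    >>> joker = tj.TheJoker(prior, rng=np.random.default_rng(seed=42))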
"""
@deprecated_renamed_argument(
"random_state", "rng", since="v1.3", warning_type=DeprecationWarning
)
def __init__(self, prior, pool=None, rng=None, tempfile_path=None):
# set the processing pool
if pool is None:
import schwimmbad
pool = schwimmbad.SerialPool()
elif not hasattr(pool, "map") or not hasattr(pool, "close"):
raise TypeError(
"Input pool object must have .map() and .close() "
"methods. We recommend using `schwimmbad` pools."
)
self.pool = pool
# Set the parent random state - child processes get different states
# based on the parent
if rng is None:
rng = np.random.default_rng()
elif not isinstance(rng, np.random.Generator):
msg = (
"The input random number generator must be a numpy.random.Generator "
"instance."
)
raise TypeError(msg)
self.rng = rng
        # Validate that the input prior is a JokerPrior instance
if not isinstance(prior, JokerPrior):
raise TypeError("The input prior must be a JokerPrior instance.")
self.prior = prior
if tempfile_path is None:
self._tempfile_path = os.path.expanduser("~/.thejoker/")
else:
self._tempfile_path = os.path.abspath(os.path.expanduser(tempfile_path))
@property
def tempfile_path(self):
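        """The path where The Joker may write temporary files (the directory
        is created on first access)."""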
os.makedirs(self._tempfile_path, exist_ok=True)
return self._tempfile_path
def _make_joker_helper(self, data):
all_data, ids, trend_M = validate_prepare_data(
data, self.prior.poly_trend, self.prior.n_offsets
)
return CJokerHelper(all_data, self.prior, trend_M)
def marginal_ln_likelihood(
self, data, prior_samples, n_batches=None, in_memory=False
):
"""
Compute the marginal log-likelihood at each of the input prior samples.
Parameters
----------
data : `thejoker.RVData`, iterable, dict
The radial velocity data, or an iterable containing ``RVData``
objects for each data source.
prior_samples : str, `thejoker.JokerSamples`
Either a path to a file containing prior samples generated from The
Joker, or a `~thejoker.JokerSamples` instance containing the prior
samples.
n_batches : int (optional)
The number of batches to split the prior samples into before
distributing for computation. If using the (default) serial
computation pool, this doesn't have any impact. If using
multiprocessing or MPI, this determines how many batches to split
the samples into before scattering over all workers.
        in_memory : bool (optional)
            If True, load or keep all prior samples in memory and run all
            calculations without creating a temporary cache file.
Returns
-------
ln_likelihood : `numpy.ndarray`
The marginal log-likelihood computed at the location of each prior
sample.
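
        Examples
        --------
        A brief sketch, assuming ``data`` is an `~thejoker.RVData` instance
        defined elsewhere:

        >>> prior_samples = joker.prior.sample(size=100_000)  # doctest: +SKIP
        >>> ln_ll = joker.marginal_ln_likelihood(
        ...     data, prior_samples, in_memory=True)  # doctest: +SKIP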
"""
from .likelihood_helpers import marginal_ln_likelihood_inmem
from .multiproc_helpers import marginal_ln_likelihood_helper
joker_helper = self._make_joker_helper(data) # also validates data
if in_memory:
if isinstance(prior_samples, JokerSamples):
prior_samples, _ = prior_samples.pack(
units=joker_helper.internal_units, names=joker_helper.packed_order
)
return marginal_ln_likelihood_inmem(joker_helper, prior_samples)
return marginal_ln_likelihood_helper(
joker_helper, prior_samples, pool=self.pool, n_batches=n_batches
)
def rejection_sample(
self,
data,
prior_samples,
n_prior_samples=None,
max_posterior_samples=None,
n_linear_samples=1,
return_logprobs=False,
return_all_logprobs=False,
n_batches=None,
randomize_prior_order=False,
in_memory=False,
):
"""
Run The Joker's rejection sampling on prior samples to get posterior
samples for the input data.
        You must either specify the number of prior samples to generate and
        use for rejection sampling, or pass in prior samples directly (as a
        file path or a `~thejoker.JokerSamples` instance), via the
        ``prior_samples`` argument.
Parameters
----------
data : `~thejoker.RVData`
The radial velocity data, or an iterable containing ``RVData``
objects for each data source.
        prior_samples : str, `~thejoker.JokerSamples`, int
            Either a path to a file containing prior samples generated from
            The Joker, a `~thejoker.JokerSamples` instance containing the
            prior samples, or an integer number of prior samples to generate
            on the fly.
n_prior_samples : int (optional)
The number of prior samples to run on. This is only used if passing
in a string filename: If the file contains a large number of prior
samples, you may want to set this to only run on a subset.
        max_posterior_samples : int (optional)
            The maximum number of posterior samples to generate. If using a
            large library of prior samples and running on uninformative data,
            you may want to set this to a small but reasonable number (e.g.,
            256).
n_linear_samples : int (optional)
The number of linear parameter samples to generate for each
nonlinear parameter sample returned from the rejection sampling
step.
return_logprobs : bool (optional)
Also return the log-prior and (marginal) log-likelihood values
evaluated at each sample.
return_all_logprobs : bool (optional)
This will return the marginal log-likelihood values at every prior
sample used in this sampling! Use at your own (memory) risk.
n_batches : int (optional)
The number of batches to split the prior samples into before
distributing for computation. If using the (default) serial
computation pool, this doesn't have any impact. If using
multiprocessing or MPI, this determines how many batches to split
the samples into before scattering over all workers.
randomize_prior_order : bool (optional)
Randomly shuffle the prior samples before reading and running the
rejection sampler. This is only useful if you are using a large
library of prior samples, and choosing to run on a subset of those
samples.
        in_memory : bool (optional)
            If True, load or keep all prior samples in memory and run all
            calculations without creating a temporary cache file.
Returns
-------
samples : `~thejoker.JokerSamples`
The posterior samples produced from The Joker.
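
        Examples
        --------
        A brief sketch, assuming ``data`` is an `~thejoker.RVData` instance
        defined elsewhere (passing an integer generates that many prior
        samples on the fly):

        >>> samples = joker.rejection_sample(
        ...     data, prior_samples=100_000,
        ...     max_posterior_samples=256)  # doctest: +SKIP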
"""
from .likelihood_helpers import rejection_sample_inmem
from .multiproc_helpers import rejection_sample_helper
joker_helper = self._make_joker_helper(data) # also validates data
if isinstance(prior_samples, int):
# If an integer, generate that many prior samples first
N = prior_samples
prior_samples = self.prior.sample(size=N, return_logprobs=return_logprobs)
if in_memory:
if isinstance(prior_samples, JokerSamples):
ln_prior = None
if return_logprobs:
ln_prior = prior_samples["ln_prior"]
prior_samples, _ = prior_samples.pack(
units=joker_helper.internal_units, names=joker_helper.packed_order
)
else:
ln_prior = return_logprobs
samples = rejection_sample_inmem(
joker_helper,
prior_samples,
rng=self.rng,
ln_prior=ln_prior,
max_posterior_samples=max_posterior_samples,
n_linear_samples=n_linear_samples,
return_all_logprobs=return_all_logprobs,
)
else:
samples = rejection_sample_helper(
joker_helper,
prior_samples,
pool=self.pool,
rng=self.rng,
n_prior_samples=n_prior_samples,
max_posterior_samples=max_posterior_samples,
n_linear_samples=n_linear_samples,
return_logprobs=return_logprobs,
n_batches=n_batches,
randomize_prior_order=randomize_prior_order,
return_all_logprobs=return_all_logprobs,
)
return samples
def iterative_rejection_sample(
self,
data,
prior_samples,
n_requested_samples,
max_prior_samples=None,
n_linear_samples=1,
return_logprobs=False,
n_batches=None,
randomize_prior_order=False,
init_batch_size=None,
growth_factor=128,
in_memory=False,
):
"""This is an experimental sampling method that adaptively generates
posterior samples given a large library of prior samples. The advantage
of this function over the standard ``rejection_sample`` method is that
it will try to adaptively figure out how many prior samples it needs to
evaluate the likelihood at in order to return the desired number of
posterior samples.
Parameters
----------
data : `~thejoker.RVData`
The radial velocity data, or an iterable containing ``RVData``
objects for each data source.
prior_samples : str, `~thejoker.JokerSamples`
Either a path to a file containing prior samples generated from The
Joker, or a `~thejoker.JokerSamples` instance containing the prior
samples.
        n_requested_samples : int
            The number of posterior samples desired.
max_prior_samples : int (optional)
The maximum number of prior samples to process.
n_linear_samples : int (optional)
The number of linear parameter samples to generate for each
nonlinear parameter sample returned from the rejection sampling
step.
return_logprobs : bool (optional)
Also return the log-prior and (marginal) log-likelihood values
evaluated at each sample.
n_batches : int (optional)
The number of batches to split the prior samples into before
distributing for computation. If using the (default) serial
computation pool, this doesn't have any impact. If using
multiprocessing or MPI, this determines how many batches to split
the samples into before scattering over all workers.
randomize_prior_order : bool (optional)
Randomly shuffle the prior samples before reading and running the
rejection sampler. This is only useful if you are using a large
library of prior samples, and choosing to run on a subset of those
samples.
init_batch_size : int (optional)
The initial batch size of likelihoods to compute, before growing
the batches using the multiplicative growth factor, below.
growth_factor : int (optional)
A factor used to adaptively grow the number of prior samples to
evaluate on. Larger numbers make the trial batches grow faster.
        in_memory : bool (optional)
            If True, load or keep all prior samples in memory and run all
            calculations without creating a temporary cache file.
Returns
-------
samples : `~thejoker.JokerSamples`
The posterior samples produced from The Joker.
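
        Examples
        --------
        A brief sketch, assuming ``data`` and a large ``prior_samples``
        library are defined elsewhere:

        >>> samples = joker.iterative_rejection_sample(
        ...     data, prior_samples, n_requested_samples=256)  # doctest: +SKIP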
"""
from .likelihood_helpers import iterative_rejection_inmem
from .multiproc_helpers import iterative_rejection_helper
joker_helper = self._make_joker_helper(data) # also validates data
if in_memory:
if isinstance(prior_samples, JokerSamples):
ln_prior = None
if return_logprobs:
ln_prior = prior_samples["ln_prior"]
prior_samples, _ = prior_samples.pack(
units=joker_helper.internal_units, names=joker_helper.packed_order
)
else:
ln_prior = return_logprobs
samples = iterative_rejection_inmem(
joker_helper,
prior_samples,
rng=self.rng,
n_requested_samples=n_requested_samples,
ln_prior=ln_prior,
init_batch_size=init_batch_size,
growth_factor=growth_factor,
n_linear_samples=n_linear_samples,
)
else:
samples = iterative_rejection_helper(
joker_helper,
prior_samples,
init_batch_size=init_batch_size,
growth_factor=growth_factor,
pool=self.pool,
rng=self.rng,
n_requested_samples=n_requested_samples,
max_prior_samples=max_prior_samples,
n_linear_samples=n_linear_samples,
return_logprobs=return_logprobs,
n_batches=n_batches,
randomize_prior_order=randomize_prior_order,
)
return samples
def setup_mcmc(self, data, joker_samples, model=None, custom_func=None):
"""
        Set up the model to run MCMC using pymc.
Parameters
----------
data : `~thejoker.RVData`
The radial velocity data, or an iterable containing ``RVData``
objects for each data source.
joker_samples : `~thejoker.JokerSamples`
If a single sample is passed in, this is packed into a pymc
initialization dictionary and returned after setting up. If
multiple samples are passed in, the median (along period) sample is
taken and returned after setting up for MCMC.
model : `pymc.Model`
This is either required, or this function must be called within a
pymc model context.
        custom_func : callable (optional)
            A function called as ``custom_func(mcmc_init, MAP_sample, model)``
            that should return an updated ``mcmc_init`` dictionary.

        Returns
        -------
        mcmc_init : dict
            A dictionary of initial parameter values, constructed from the
            input (MAP) sample, for initializing the MCMC sampling.
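
        Examples
        --------
        A sketch of the intended workflow (assumes ``data`` and posterior
        ``samples`` from ``rejection_sample`` are defined elsewhere; the
        prior parameters shown are illustrative):

        >>> import astropy.units as u  # doctest: +SKIP
        >>> import pymc as pm  # doctest: +SKIP
        >>> import thejoker as tj  # doctest: +SKIP
        >>> with pm.Model() as model:  # doctest: +SKIP
        ...     prior = tj.JokerPrior.default(
        ...         P_min=2 * u.day, P_max=256 * u.day,
        ...         sigma_K0=30 * u.km / u.s, sigma_v=100 * u.km / u.s)
        ...     joker = tj.TheJoker(prior)
        ...     mcmc_init = joker.setup_mcmc(data, samples)
        ...     trace = pm.sample(tune=500, draws=500, initvals=mcmc_init)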
"""
import pymc as pm
import pytensor.tensor as pt
import thejoker.units as xu
from thejoker._keplerian_orbit import KeplerianOrbit
model = _validate_model(model)
# Reduce data, strip units:
data, ids, _ = validate_prepare_data(
data, self.prior.poly_trend, self.prior.n_offsets
)
x = data._t_bmjd - data._t_ref_bmjd
y = data.rv.value
err = data.rv_err.to_value(data.rv.unit)
# First, prepare the joker_samples:
if not isinstance(joker_samples, JokerSamples):
raise TypeError(
"You must pass in a JokerSamples instance to the "
"joker_samples argument."
)
if len(joker_samples) > 1:
# check if unimodal in P, if not, warn
if not is_P_unimodal(joker_samples, data):
logger.warn("TODO: samples ain't unimodal")
MAP_sample = joker_samples.median_period()
else:
MAP_sample = joker_samples
mcmc_init = {}
for name in self.prior.par_names:
unit = getattr(self.prior.pars[name], xu.UNIT_ATTR_NAME)
mcmc_init[name] = MAP_sample[name].to_value(unit)
if custom_func is not None:
mcmc_init = custom_func(mcmc_init, MAP_sample, model)
mcmc_init = {k: np.squeeze(v) for k, v in mcmc_init.items()}
p = self.prior.pars
if "t_peri" not in model.named_vars:
with model:
pm.Deterministic("t_peri", p["P"] * p["M0"] / (2 * np.pi))
if "obs" in model.named_vars:
return mcmc_init
with model:
# Set up the orbit model
orbit = KeplerianOrbit(
period=p["P"],
ecc=p["e"],
omega=p["omega"],
t_periastron=model.named_vars["t_peri"],
)
        # Design matrix for the linear velocity-trend and offset terms
M = get_trend_design_matrix(data, ids, self.prior.poly_trend)
# deal with v0_offsets, trend here:
_, offset_names = validate_n_offsets(self.prior.n_offsets)
_, vtrend_names = validate_poly_trend(self.prior.poly_trend)
with model:
            v_pars = (
                [p["v0"]]
                + [p[name] for name in offset_names]
                + [p[name] for name in vtrend_names[1:]]  # skip v0, included above
            )
v_trend_vec = pt.stack(v_pars, axis=0)
trend = pt.dot(M, v_trend_vec)
rv_model = orbit.get_radial_velocity(x, K=p["K"]) + trend
pm.Deterministic("model_rv", rv_model)
err = pt.sqrt(err**2 + p["s"] ** 2)
pm.Normal("obs", mu=rv_model, sigma=err, observed=y)
pm.Deterministic("logp", model.logp())
dist = pm.Normal.dist(model.model_rv, data.rv_err.value)
lnlike = pm.Deterministic(
"ln_likelihood", pm.logp(dist, data.rv.value).sum(axis=-1)
)
pm.Deterministic("ln_prior", model.logp() - lnlike)
return mcmc_init