import numpy as np
import scipy as sp
import os
import scipy.optimize as op
import cPickle as cpkl
import emcee
import matplotlib as mpl
mpl.use('PS')
import matplotlib.pyplot as plt
import chippr
from chippr import defaults as d
from chippr import plot_utils as pu
from chippr import utils as u
from chippr import stat_utils as s
from chippr import log_z_dens_plots as plots
[docs]class log_z_dens(object):
def __init__(self, catalog, hyperprior, truth=None, loc='.', prepend='', vb=True):
"""
An object representing the redshift density function (normalized
redshift distribution function)
Parameters
----------
catalog: chippr.catalog object
dict containing bin endpoints, interim prior bin values, and
interim posterior PDF bin values
hyperprior: chippr.mvn object
multivariate Gaussian distribution for hyperprior distribution
truth: chippr.gmix object, optional
true redshift density function expressed as univariate Gaussian
mixture
loc: string, optional
directory into which to save results and plots made along the way
prepend: str, optional
prepend string to file names
vb: boolean, optional
True to print progress messages to stdout, False to suppress
"""
self.info = {}
self.add_text = prepend + '_'
self.bin_ends = np.array(catalog['bin_ends'])
self.bin_range = self.bin_ends[:-1]-self.bin_ends[0]
self.bin_mids = (self.bin_ends[1:]+self.bin_ends[:-1])/2.
self.bin_difs = self.bin_ends[1:]-self.bin_ends[:-1]
self.log_bin_difs = u.safe_log(self.bin_difs)
self.n_bins = len(self.bin_mids)
self.info['bin_ends'] = self.bin_ends
self.log_int_pr = np.array(catalog['log_interim_prior'])
self.int_pr = np.exp(self.log_int_pr)
self.info['log_interim_prior'] = self.log_int_pr
self.log_pdfs = np.array(catalog['log_interim_posteriors'])
self.pdfs = np.exp(self.log_pdfs)
self.n_pdfs = len(self.log_pdfs)
self.info['log_interim_posteriors'] = self.log_pdfs
if vb:
print(str(self.n_bins) + ' bins, ' + str(len(self.log_pdfs)) + ' interim posterior PDFs')
self.hyper_prior = hyperprior
self.truth = truth
self.info['truth'] = None
if self.truth is not None:
self.info['truth'] = {}
self.tru_nz = np.zeros(self.n_bins)
self.fine_zs = []
self.fine_nz = []
for b in range(self.n_bins):
fine_z = np.linspace(self.bin_ends[b], self.bin_ends[b+1], self.n_bins)
self.fine_zs.extend(fine_z)
fine_dz = (self.bin_ends[b+1] - self.bin_ends[b]) / self.n_bins
fine_n = self.truth.evaluate(fine_z)
self.fine_nz.extend(fine_n)
coarse_nz = np.sum(fine_n) * fine_dz
self.tru_nz[b] += coarse_nz
self.tru_nz /= np.dot(self.tru_nz, self.bin_difs)
self.log_tru_nz = u.safe_log(self.tru_nz)
self.info['log_tru_nz'] = self.log_tru_nz
self.info['truth']['z_grid'] = np.array(self.fine_zs)
self.info['truth']['nz_grid'] = np.array(self.fine_nz)
self.info['estimators'] = {}
self.info['stats'] = {}
self.dir = loc
self.data_dir = os.path.join(loc, 'data')
self.plot_dir = os.path.join(loc, 'plots')
if not os.path.exists(self.plot_dir):
os.makedirs(self.plot_dir)
self.res_dir = os.path.join(loc, 'results')
if not os.path.exists(self.res_dir):
os.makedirs(self.res_dir)
return
#
# def precompute(self):
# """
# Function to precompute values that show up in posterior that are independent of n(z) params
#
# Returns
# -------
# precomputed: float
# log-probability component independent of test params
# """
# integrated_int_pr = np.log(np.dot(self.int_pr, self.bin_difs))
# integrated_int_posts = np.log(np.dot(self.pdfs, axis=0)
# precomputed = integrated_int_posts - integrated_int_pr
# return precomputed
[docs] def evaluate_log_hyper_likelihood(self, log_nz):
"""
Function to evaluate log hyperlikelihood
Parameters
----------
log_nz: numpy.ndarray, float
vector of logged redshift density bin values at which to evaluate
the hyperlikelihood
Returns
-------
log_hyper_likelihood: float
log likelihood probability associated with parameters in log_nz
"""
nz = np.exp(log_nz)
norm_nz = nz / np.dot(nz, self.bin_difs)
# testing whether the norm step is still necessary
hyper_lfs = np.sum(norm_nz[None,:] * self.pdfs / self.int_pr[None,:] * self.bin_difs, axis=1)
log_hyper_likelihood = np.sum(u.safe_log(hyper_lfs)) - u.safe_log(np.dot(norm_nz, self.bin_difs))
# this used to work...
# log_hyper_likelihood = np.dot(np.exp(log_nz + self.precomputed), self.bin_difs)
return log_hyper_likelihood
[docs] def evaluate_log_hyper_prior(self, log_nz):
"""
Function to evaluate log hyperprior
Parameters
----------
log_nz: numpy.ndarray, float
vector of logged redshift density bin values at which to evaluate
the hyperprior
Returns
-------
log_hyper_prior: float
log prior probability associated with parameters in log_nz
"""
log_hyper_prior = u.safe_log(self.hyper_prior.evaluate_one(log_nz))
return log_hyper_prior
[docs] def evaluate_log_hyper_posterior(self, log_nz):
"""
Function to evaluate log hyperposterior
Parameters
----------
log_nz: numpy.ndarray, float
vector of logged redshift density bin values at which to evaluate
the full posterior
Returns
-------
log_hyper_posterior: float
log hyperposterior probability associated with parameters in log_nz
"""
log_hyper_likelihood = self.evaluate_log_hyper_likelihood(log_nz)
log_hyper_prior = self.evaluate_log_hyper_prior(log_nz)
log_hyper_posterior = log_hyper_likelihood + log_hyper_prior
return log_hyper_posterior
[docs] def optimize(self, start, no_data, no_prior, vb=True):
"""
Maximizes the hyperposterior of the redshift density
Parameters
----------
start: numpy.ndarray, float
array of log redshift density function bin values at which to begin
optimization
no_data: boolean
True to exclude data contribution to hyperposterior
no_prior: boolean
True to exclude prior contribution to hyperposterior
vb: boolean, optional
True to print progress messages to stdout, False to suppress
Returns
-------
res.x: numpy.ndarray, float
array of logged redshift density function bin values maximizing
hyperposterior
"""
if no_data:
if vb: print('only optimizing prior')
def _objective(log_nz):
return -2. * self.evaluate_log_hyper_prior(log_nz)
elif no_prior:
if vb: print('only optimizing likelihood')
def _objective(log_nz):
return -2. * self.evaluate_log_hyper_likelihood(log_nz)
else:
if vb: print('optimizing posterior')
def _objective(log_nz):
return -2. * self.evaluate_log_hyper_posterior(log_nz)
if vb:
print(self.dir + ' starting at ', start, _objective(start))
res = op.minimize(_objective, start, method="Nelder-Mead", options={"maxfev": 1e5, "maxiter":1e5})
if vb:
print(self.dir + ': ' + str(res))
return res.x
[docs] def calculate_mmle(self, start, vb=True, no_data=0, no_prior=0):
"""
Calculates the marginalized maximum likelihood estimator of the
redshift density function
Parameters
----------
start: numpy.ndarray, float
array of log redshift density function bin values at which to begin
optimization
vb: boolean, optional
True to print progress messages to stdout, False to suppress
no_data: boolean, optional
True to exclude data contribution to hyperposterior
no_prior: boolean, optional
True to exclude prior contribution to hyperposterior
Returns
-------
log_mle_nz: numpy.ndarray, float
array of logged redshift density function bin values maximizing
hyperposterior
"""
# self.precomputed = self.precompute()
if 'log_mmle_nz' not in self.info['estimators']:
log_mle = self.optimize(start, no_data=no_data, no_prior=no_prior, vb=vb)
mle_nz = np.exp(log_mle)
self.mle_nz = mle_nz / np.dot(mle_nz, self.bin_difs)
self.log_mle_nz = u.safe_log(self.mle_nz)
self.info['estimators']['log_mmle_nz'] = self.log_mle_nz
else:
self.log_mle_nz = self.info['estimators']['log_mmle_nz']
self.mle_nz = np.exp(self.log_mle_nz)
return self.log_mle_nz
[docs] def calculate_stacked(self, vb=True):
"""
Calculates the stacked estimator of the redshift density function
Parameters
----------
vb: boolean, optional
True to print progress messages to stdout, False to suppress
Returns
-------
log_stk_nz: ndarray, float
array of logged redshift density function bin values
"""
if 'log_stacked_nz' not in self.info['estimators']:
self.stk_nz = np.sum(self.pdfs, axis=0)
self.stk_nz /= np.dot(self.stk_nz, self.bin_difs)
self.log_stk_nz = u.safe_log(self.stk_nz)
self.info['estimators']['log_stacked_nz'] = self.log_stk_nz
else:
self.log_stk_nz = self.info['estimators']['log_stacked_nz']
self.stk_nz = np.exp(self.log_stk_nz)
return self.log_stk_nz
[docs] def calculate_mmap(self, vb=True):
"""
Calculates the marginalized maximum a posteriori estimator of the
redshift density function
Parameters
----------
vb: boolean, optional
True to print progress messages to stdout, False to suppress
Returns
-------
log_map_nz: ndarray, float
array of logged redshift density function bin values
"""
if 'log_mmap_nz' not in self.info['estimators']:
self.map_nz = np.zeros(self.n_bins)
mappreps = [np.argmax(l) for l in self.log_pdfs]
for m in mappreps:
self.map_nz[m] += 1.
self.map_nz /= self.bin_difs[m] * self.n_pdfs
self.log_map_nz = u.safe_log(self.map_nz)
self.info['estimators']['log_mmap_nz'] = self.log_map_nz
else:
self.log_map_nz = self.info['estimators']['log_mmap_nz']
self.map_nz = np.exp(self.log_map_nz)
return self.log_map_nz
[docs] def calculate_mexp(self, vb=True):
"""
Calculates the marginalized expected value estimator of the redshift
density function
Parameters
----------
vb: boolean, optional
True to print progress messages to stdout, False to suppress
Returns
-------
log_exp_nz: ndarray, float
array of logged redshift density function bin values
"""
if 'log_mexp_nz' not in self.info['estimators']:
expprep = [sum(z) for z in self.bin_mids * self.pdfs * self.bin_difs]
self.exp_nz = np.zeros(self.n_bins)
for z in expprep:
for k in range(self.n_bins):
if z > self.bin_ends[k] and z < self.bin_ends[k+1]:
self.exp_nz[k] += 1.
self.exp_nz /= self.bin_difs * self.n_pdfs
self.log_exp_nz = u.safe_log(self.exp_nz)
self.info['estimators']['log_mexp_nz'] = self.log_exp_nz
else:
self.log_exp_nz = self.info['estimators']['log_mexp_nz']
self.exp_nz = np.exp(self.log_exp_nz)
return self.log_exp_nz
[docs] def sample(self, ivals, n_samps, vb=True):
"""
Samples the redshift density hyperposterior
Parameters
----------
ivals: numpy.ndarray, float
initial values of the walkers
n_samps: int
number of samples to accept before stopping
vb: boolean, optional
True to print progress messages to stdout, False to suppress
Returns
-------
mcmc_outputs: dict
dictionary containing array of sampled redshift density function
bin values as well as posterior probabilities, acceptance
fractions, and autocorrelation times
"""
self.sampler.reset()
pos, prob, state = self.sampler.run_mcmc(ivals, n_samps)
chains = self.sampler.chain
probs = self.sampler.lnprobability
fracs = self.sampler.acceptance_fraction
acors = s.acors(chains)
mcmc_outputs = {}
mcmc_outputs['chains'] = chains
mcmc_outputs['probs'] = probs
mcmc_outputs['fracs'] = fracs
mcmc_outputs['acors'] = acors
return mcmc_outputs
[docs] def calculate_samples(self, ivals, n_accepted=d.n_accepted, n_burned=d.n_burned, vb=True, n_procs=1, no_data=0, no_prior=0, gr_threshold=d.gr_threshold):
"""
Calculates samples estimating the redshift density function
Parameters
----------
ivals: numpy.ndarray, float
initial values of log n(z) for each walker
n_accepted: int, optional
log10 number of samples to accept per walker
n_burned: int, optional
log10 number of samples between tests of burn-in condition
n_procs: int, optional
number of processors to use, defaults to single-thread
vb: boolean, optional
True to print progress messages to stdout, False to suppress
no_data: boolean, optional
True to exclude data contribution to hyperposterior
no_prior: boolean, optional
True to exclude prior contribution to hyperposterior
Returns
-------
log_samples_nz: ndarray, float
array of sampled log redshift density function bin values
"""
# self.precomputed = self.precompute()
if 'log_mean_sampled_nz' not in self.info['estimators']:
self.n_walkers = len(ivals)
if no_data:
def distribution(log_nz):
return self.evaluate_log_hyper_prior(log_nz)
elif no_prior:
def distribution(log_nz):
return self.evaluate_log_hyper_likelihood(log_nz)
else:
def distribution(log_nz):
return self.evaluate_log_hyper_posterior(log_nz)
self.sampler = emcee.EnsembleSampler(self.n_walkers, self.n_bins, distribution, threads=n_procs)
self.burn_ins = 0
if n_burned == 0:
self.burning_in = False
else:
self.burning_in = True
vals = ivals
vals -= u.safe_log(np.sum(np.exp(ivals) * self.bin_difs[np.newaxis, :], axis=1))[:, np.newaxis]
if vb:
plots.plot_ivals(vals, self.info, self.plot_dir, prepend=self.add_text)
canvas = plots.set_up_burn_in_plots(self.n_bins, self.n_walkers)
full_chain = np.array([[vals[w]] for w in range(self.n_walkers)])
while self.burning_in:
if vb:
print('beginning sampling '+str(self.burn_ins))
burn_in_mcmc_outputs = self.sample(vals, 10**n_burned)
chain = burn_in_mcmc_outputs['chains']
burn_in_mcmc_outputs['chains'] -= u.safe_log(np.sum(np.exp(chain) * self.bin_difs[np.newaxis, np.newaxis, :], axis=2))[:, :, np.newaxis]
with open(os.path.join(self.res_dir, 'mcmc'+str(self.burn_ins)+'.p'), 'wb') as file_location:
cpkl.dump(burn_in_mcmc_outputs, file_location)
full_chain = np.concatenate((full_chain, burn_in_mcmc_outputs['chains']), axis=1)
if vb:
canvas = plots.plot_sampler_progress(canvas, burn_in_mcmc_outputs, full_chain, self.burn_ins, self.plot_dir, prepend=self.add_text)
self.burning_in = s.gr_test(full_chain, gr_threshold)
vals = np.array([item[-1] for item in burn_in_mcmc_outputs['chains']])
self.burn_ins += 1
mcmc_outputs = self.sample(vals, 10**n_accepted)
chain = mcmc_outputs['chains']
mcmc_outputs['chains'] -= u.safe_log(np.sum(np.exp(chain) * self.bin_difs[np.newaxis, np.newaxis, :], axis=2))[:, :, np.newaxis]
full_chain = np.concatenate((full_chain, mcmc_outputs['chains']), axis=1)
with open(os.path.join(self.res_dir, 'full_chain.p'), 'wb') as file_location:
cpkl.dump(full_chain, file_location)
self.log_smp_nz = mcmc_outputs['chains']
self.smp_nz = np.exp(self.log_smp_nz)
self.info['log_sampled_nz_meta_data'] = mcmc_outputs
self.log_bfe_nz = s.norm_fit(self.log_smp_nz)[0]
self.bfe_nz = np.exp(self.log_bfe_nz)
self.info['estimators']['log_mean_sampled_nz'] = self.log_bfe_nz
else:
self.log_smp_nz = self.info['log_sampled_nz_meta_data']
self.smp_nz = np.exp(self.log_smp_nz)
self.log_bfe_nz = self.info['estimators']['log_mean_sampled_nz']
self.bfe_nz = np.exp(self.log_smp_nz)
# if vb:
# plots.plot_samples(self.info, self.plot_dir)
return self.log_smp_nz
[docs] def compare(self, vb=True):
"""
Calculates all available goodness of fit measures
Parameters
----------
vb: boolean, optional
True to print progress messages to stdout, False to suppress
Returns
-------
out_info: dict
dictionary of all available statistics
"""
self.info['stats']['kld'], self.info['stats']['log_kld'] = {}, {}
self.info['stats']['rms'], self.info['stats']['log_rms'] = {}, {}
if self.truth is not None:
for key in self.info['estimators']:
self.info['stats']['kld'][key] = s.calculate_kld(np.exp(self.info['log_tru_nz']), np.exp(self.info['estimators'][key]))
# self.info['stats']['log_kld'][key] = s.calculate_kld(self.log_tru_nz, self.info['estimators'][key])
self.info['stats']['rms']['true_nz' + '__' + key[4:]] = s.calculate_rms(np.exp(self.info['log_tru_nz']), np.exp(self.info['estimators'][key]))
self.info['stats']['log_rms']['log_true_nz'+ '__' + key] = s.calculate_rms(self.info['log_tru_nz'], self.info['estimators'][key])
for i in range(len(self.info['estimators'].keys())):
key_1 = self.info['estimators'].keys()[i]
for j in range(len(self.info['estimators'].keys()[:i])):
key_2 = self.info['estimators'].keys()[j]
# print(((i,j), (key_1, key_2)))
self.info['stats']['log_rms'][key_1 + '__' + key_2] = s.calculate_rms(self.info['estimators'][key_1], self.info['estimators'][key_2])
self.info['stats']['rms'][key_1[4:] + '__' + key_2[4:]] = s.calculate_rms(np.exp(self.info['estimators'][key_1]), np.exp(self.info['estimators'][key_2]))
out_info = self.info['stats']
if vb:
print(out_info)
return out_info
[docs] def plot_estimators(self, log=True, mini=True):
"""
Plots all available estimators of the redshift density function.
"""
if mini:
also = 'mini'
else:
also = ''
if log:
plots.plot_estimators(self.info, self.plot_dir, prepend=self.add_text+also+'log_', mini=mini)
else:
plots.plot_estimators(self.info, self.plot_dir, log=False, prepend=self.add_text+also+'lin_', mini=mini)
return
[docs] def read(self, read_loc, style='pickle', vb=True):
"""
Function to load inferred quantities from files.
Parameters
----------
read_loc: string
filepath where inferred redshift density function is stored
style: string, optional
keyword for file format, currently only 'pickle' supported
vb: boolean, optional
True to print progress messages to stdout, False to suppress
Returns
-------
self.info: dict
returns the log_z_dens information dictionary object
"""
with open(os.path.join(self.res_dir, read_loc), 'rb') as file_location:
self.info = cpkl.load(file_location)
if vb:
print('The following quantities were read from '+read_loc+' in the '+style+' format:')
for key in self.info:
print(key)
if 'estimators' in self.info:
print(self.info['estimators'].keys())
return self.info
[docs] def write(self, write_loc, style='pickle', vb=True):
"""
Function to write results of inference to files.
Parameters
----------
write_loc: string
filepath where results of inference should be saved.
style: string, optional
keyword for file format, currently only 'pickle' supported
vb: boolean, optional
True to print progress messages to stdout, False to suppress
"""
with open(os.path.join(self.res_dir, write_loc), 'wb') as file_location:
cpkl.dump(self.info, file_location)
if vb:
print('The following quantities were written to '+write_loc+' in the '+style+' format:')
for key in self.info:
print(key)
return