# Source code for log_z_dens

import numpy as np
import scipy as sp
import os
import scipy.optimize as op
import cPickle as cpkl
import emcee

import matplotlib as mpl
mpl.use('PS')
import matplotlib.pyplot as plt

import chippr
from chippr import defaults as d
from chippr import plot_utils as pu
from chippr import utils as u
from chippr import stat_utils as s
from chippr import log_z_dens_plots as plots

class log_z_dens(object):

    def __init__(self, catalog, hyperprior, truth=None, loc='.', prepend='', vb=True):
        """
        An object representing the redshift density function (normalized redshift
        distribution function)

        Parameters
        ----------
        catalog: chippr.catalog object
            dict containing bin endpoints, interim prior bin values, and interim
            posterior PDF bin values
        hyperprior: chippr.mvn object
            multivariate Gaussian distribution for hyperprior distribution
        truth: chippr.gmix object, optional
            true redshift density function expressed as univariate Gaussian
            mixture
        loc: string, optional
            directory into which to save results and plots made along the way
        prepend: str, optional
            prepend string to file names
        vb: boolean, optional
            True to print progress messages to stdout, False to suppress
        """
        # running store of everything inferred/estimated, written out by write()
        self.info = {}
        # prefix used by the plotting helpers when naming output files
        self.add_text = prepend + '_'

        # binning of the redshift axis
        self.bin_ends = np.array(catalog['bin_ends'])
        # NOTE(review): `[:-1]` here yields an array of offsets from the first
        # endpoint; the name suggests the scalar span bin_ends[-1]-bin_ends[0]
        # was intended -- confirm before relying on bin_range
        self.bin_range = self.bin_ends[:-1]-self.bin_ends[0]
        self.bin_mids = (self.bin_ends[1:]+self.bin_ends[:-1])/2.
        self.bin_difs = self.bin_ends[1:]-self.bin_ends[:-1]
        self.log_bin_difs = u.safe_log(self.bin_difs)
        self.n_bins = len(self.bin_mids)
        self.info['bin_ends'] = self.bin_ends

        # interim prior and per-object interim posteriors, kept in both log
        # and linear form
        self.log_int_pr = np.array(catalog['log_interim_prior'])
        self.int_pr = np.exp(self.log_int_pr)
        self.info['log_interim_prior'] = self.log_int_pr

        self.log_pdfs = np.array(catalog['log_interim_posteriors'])
        self.pdfs = np.exp(self.log_pdfs)
        self.n_pdfs = len(self.log_pdfs)
        self.info['log_interim_posteriors'] = self.log_pdfs

        if vb:
            print(str(self.n_bins) + ' bins, ' + str(len(self.log_pdfs)) + ' interim posterior PDFs')

        self.hyper_prior = hyperprior

        self.truth = truth
        self.info['truth'] = None
        if self.truth is not None:
            self.info['truth'] = {}
            self.tru_nz = np.zeros(self.n_bins)
            self.fine_zs = []
            self.fine_nz = []
            # numerically integrate the true n(z) over each coarse bin on a
            # finer grid of n_bins points per bin
            for b in range(self.n_bins):
                fine_z = np.linspace(self.bin_ends[b], self.bin_ends[b+1], self.n_bins)
                self.fine_zs.extend(fine_z)
                fine_dz = (self.bin_ends[b+1] - self.bin_ends[b]) / self.n_bins
                fine_n = self.truth.evaluate(fine_z)
                self.fine_nz.extend(fine_n)
                coarse_nz = np.sum(fine_n) * fine_dz
                self.tru_nz[b] += coarse_nz
            # normalize the binned truth so it integrates to unity
            self.tru_nz /= np.dot(self.tru_nz, self.bin_difs)
            self.log_tru_nz = u.safe_log(self.tru_nz)
            self.info['log_tru_nz'] = self.log_tru_nz
            self.info['truth']['z_grid'] = np.array(self.fine_zs)
            self.info['truth']['nz_grid'] = np.array(self.fine_nz)

        self.info['estimators'] = {}
        self.info['stats'] = {}

        # output directories; plots/ and results/ are created on demand
        self.dir = loc
        self.data_dir = os.path.join(loc, 'data')
        self.plot_dir = os.path.join(loc, 'plots')
        if not os.path.exists(self.plot_dir):
            os.makedirs(self.plot_dir)
        self.res_dir = os.path.join(loc, 'results')
        if not os.path.exists(self.res_dir):
            os.makedirs(self.res_dir)

        return

    # # def precompute(self):
    # #     """
    # #     Function to precompute values that show up in posterior that are independent of n(z) params
    # #
    # #     Returns
    # #     -------
    # #     precomputed: float
    # #         log-probability component independent of test params
    # #     """
    # #     integrated_int_pr = np.log(np.dot(self.int_pr, self.bin_difs))
    # #     integrated_int_posts = np.log(np.dot(self.pdfs, axis=0)
    # #     precomputed = integrated_int_posts - integrated_int_pr
    # #     return precomputed
[docs] def evaluate_log_hyper_likelihood(self, log_nz): """ Function to evaluate log hyperlikelihood Parameters ---------- log_nz: numpy.ndarray, float vector of logged redshift density bin values at which to evaluate the hyperlikelihood Returns ------- log_hyper_likelihood: float log likelihood probability associated with parameters in log_nz """ nz = np.exp(log_nz) norm_nz = nz / np.dot(nz, self.bin_difs) # testing whether the norm step is still necessary hyper_lfs = np.sum(norm_nz[None,:] * self.pdfs / self.int_pr[None,:] * self.bin_difs, axis=1) log_hyper_likelihood = np.sum(u.safe_log(hyper_lfs)) - u.safe_log(np.dot(norm_nz, self.bin_difs)) # this used to work... # log_hyper_likelihood = np.dot(np.exp(log_nz + self.precomputed), self.bin_difs) return log_hyper_likelihood
[docs] def evaluate_log_hyper_prior(self, log_nz): """ Function to evaluate log hyperprior Parameters ---------- log_nz: numpy.ndarray, float vector of logged redshift density bin values at which to evaluate the hyperprior Returns ------- log_hyper_prior: float log prior probability associated with parameters in log_nz """ log_hyper_prior = u.safe_log(self.hyper_prior.evaluate_one(log_nz)) return log_hyper_prior
[docs] def evaluate_log_hyper_posterior(self, log_nz): """ Function to evaluate log hyperposterior Parameters ---------- log_nz: numpy.ndarray, float vector of logged redshift density bin values at which to evaluate the full posterior Returns ------- log_hyper_posterior: float log hyperposterior probability associated with parameters in log_nz """ log_hyper_likelihood = self.evaluate_log_hyper_likelihood(log_nz) log_hyper_prior = self.evaluate_log_hyper_prior(log_nz) log_hyper_posterior = log_hyper_likelihood + log_hyper_prior return log_hyper_posterior
[docs] def optimize(self, start, no_data, no_prior, vb=True): """ Maximizes the hyperposterior of the redshift density Parameters ---------- start: numpy.ndarray, float array of log redshift density function bin values at which to begin optimization no_data: boolean True to exclude data contribution to hyperposterior no_prior: boolean True to exclude prior contribution to hyperposterior vb: boolean, optional True to print progress messages to stdout, False to suppress Returns ------- res.x: numpy.ndarray, float array of logged redshift density function bin values maximizing hyperposterior """ if no_data: if vb: print('only optimizing prior') def _objective(log_nz): return -2. * self.evaluate_log_hyper_prior(log_nz) elif no_prior: if vb: print('only optimizing likelihood') def _objective(log_nz): return -2. * self.evaluate_log_hyper_likelihood(log_nz) else: if vb: print('optimizing posterior') def _objective(log_nz): return -2. * self.evaluate_log_hyper_posterior(log_nz) if vb: print(self.dir + ' starting at ', start, _objective(start)) res = op.minimize(_objective, start, method="Nelder-Mead", options={"maxfev": 1e5, "maxiter":1e5}) if vb: print(self.dir + ': ' + str(res)) return res.x
[docs] def calculate_mmle(self, start, vb=True, no_data=0, no_prior=0): """ Calculates the marginalized maximum likelihood estimator of the redshift density function Parameters ---------- start: numpy.ndarray, float array of log redshift density function bin values at which to begin optimization vb: boolean, optional True to print progress messages to stdout, False to suppress no_data: boolean, optional True to exclude data contribution to hyperposterior no_prior: boolean, optional True to exclude prior contribution to hyperposterior Returns ------- log_mle_nz: numpy.ndarray, float array of logged redshift density function bin values maximizing hyperposterior """ # self.precomputed = self.precompute() if 'log_mmle_nz' not in self.info['estimators']: log_mle = self.optimize(start, no_data=no_data, no_prior=no_prior, vb=vb) mle_nz = np.exp(log_mle) self.mle_nz = mle_nz / np.dot(mle_nz, self.bin_difs) self.log_mle_nz = u.safe_log(self.mle_nz) self.info['estimators']['log_mmle_nz'] = self.log_mle_nz else: self.log_mle_nz = self.info['estimators']['log_mmle_nz'] self.mle_nz = np.exp(self.log_mle_nz) return self.log_mle_nz
[docs] def calculate_stacked(self, vb=True): """ Calculates the stacked estimator of the redshift density function Parameters ---------- vb: boolean, optional True to print progress messages to stdout, False to suppress Returns ------- log_stk_nz: ndarray, float array of logged redshift density function bin values """ if 'log_stacked_nz' not in self.info['estimators']: self.stk_nz = np.sum(self.pdfs, axis=0) self.stk_nz /= np.dot(self.stk_nz, self.bin_difs) self.log_stk_nz = u.safe_log(self.stk_nz) self.info['estimators']['log_stacked_nz'] = self.log_stk_nz else: self.log_stk_nz = self.info['estimators']['log_stacked_nz'] self.stk_nz = np.exp(self.log_stk_nz) return self.log_stk_nz
[docs] def calculate_mmap(self, vb=True): """ Calculates the marginalized maximum a posteriori estimator of the redshift density function Parameters ---------- vb: boolean, optional True to print progress messages to stdout, False to suppress Returns ------- log_map_nz: ndarray, float array of logged redshift density function bin values """ if 'log_mmap_nz' not in self.info['estimators']: self.map_nz = np.zeros(self.n_bins) mappreps = [np.argmax(l) for l in self.log_pdfs] for m in mappreps: self.map_nz[m] += 1. self.map_nz /= self.bin_difs[m] * self.n_pdfs self.log_map_nz = u.safe_log(self.map_nz) self.info['estimators']['log_mmap_nz'] = self.log_map_nz else: self.log_map_nz = self.info['estimators']['log_mmap_nz'] self.map_nz = np.exp(self.log_map_nz) return self.log_map_nz
[docs] def calculate_mexp(self, vb=True): """ Calculates the marginalized expected value estimator of the redshift density function Parameters ---------- vb: boolean, optional True to print progress messages to stdout, False to suppress Returns ------- log_exp_nz: ndarray, float array of logged redshift density function bin values """ if 'log_mexp_nz' not in self.info['estimators']: expprep = [sum(z) for z in self.bin_mids * self.pdfs * self.bin_difs] self.exp_nz = np.zeros(self.n_bins) for z in expprep: for k in range(self.n_bins): if z > self.bin_ends[k] and z < self.bin_ends[k+1]: self.exp_nz[k] += 1. self.exp_nz /= self.bin_difs * self.n_pdfs self.log_exp_nz = u.safe_log(self.exp_nz) self.info['estimators']['log_mexp_nz'] = self.log_exp_nz else: self.log_exp_nz = self.info['estimators']['log_mexp_nz'] self.exp_nz = np.exp(self.log_exp_nz) return self.log_exp_nz
[docs] def sample(self, ivals, n_samps, vb=True): """ Samples the redshift density hyperposterior Parameters ---------- ivals: numpy.ndarray, float initial values of the walkers n_samps: int number of samples to accept before stopping vb: boolean, optional True to print progress messages to stdout, False to suppress Returns ------- mcmc_outputs: dict dictionary containing array of sampled redshift density function bin values as well as posterior probabilities, acceptance fractions, and autocorrelation times """ self.sampler.reset() pos, prob, state = self.sampler.run_mcmc(ivals, n_samps) chains = self.sampler.chain probs = self.sampler.lnprobability fracs = self.sampler.acceptance_fraction acors = s.acors(chains) mcmc_outputs = {} mcmc_outputs['chains'] = chains mcmc_outputs['probs'] = probs mcmc_outputs['fracs'] = fracs mcmc_outputs['acors'] = acors return mcmc_outputs
[docs] def calculate_samples(self, ivals, n_accepted=d.n_accepted, n_burned=d.n_burned, vb=True, n_procs=1, no_data=0, no_prior=0, gr_threshold=d.gr_threshold): """ Calculates samples estimating the redshift density function Parameters ---------- ivals: numpy.ndarray, float initial values of log n(z) for each walker n_accepted: int, optional log10 number of samples to accept per walker n_burned: int, optional log10 number of samples between tests of burn-in condition n_procs: int, optional number of processors to use, defaults to single-thread vb: boolean, optional True to print progress messages to stdout, False to suppress no_data: boolean, optional True to exclude data contribution to hyperposterior no_prior: boolean, optional True to exclude prior contribution to hyperposterior Returns ------- log_samples_nz: ndarray, float array of sampled log redshift density function bin values """ # self.precomputed = self.precompute() if 'log_mean_sampled_nz' not in self.info['estimators']: self.n_walkers = len(ivals) if no_data: def distribution(log_nz): return self.evaluate_log_hyper_prior(log_nz) elif no_prior: def distribution(log_nz): return self.evaluate_log_hyper_likelihood(log_nz) else: def distribution(log_nz): return self.evaluate_log_hyper_posterior(log_nz) self.sampler = emcee.EnsembleSampler(self.n_walkers, self.n_bins, distribution, threads=n_procs) self.burn_ins = 0 if n_burned == 0: self.burning_in = False else: self.burning_in = True vals = ivals vals -= u.safe_log(np.sum(np.exp(ivals) * self.bin_difs[np.newaxis, :], axis=1))[:, np.newaxis] if vb: plots.plot_ivals(vals, self.info, self.plot_dir, prepend=self.add_text) canvas = plots.set_up_burn_in_plots(self.n_bins, self.n_walkers) full_chain = np.array([[vals[w]] for w in range(self.n_walkers)]) while self.burning_in: if vb: print('beginning sampling '+str(self.burn_ins)) burn_in_mcmc_outputs = self.sample(vals, 10**n_burned) chain = burn_in_mcmc_outputs['chains'] burn_in_mcmc_outputs['chains'] -= 
u.safe_log(np.sum(np.exp(chain) * self.bin_difs[np.newaxis, np.newaxis, :], axis=2))[:, :, np.newaxis] with open(os.path.join(self.res_dir, 'mcmc'+str(self.burn_ins)+'.p'), 'wb') as file_location: cpkl.dump(burn_in_mcmc_outputs, file_location) full_chain = np.concatenate((full_chain, burn_in_mcmc_outputs['chains']), axis=1) if vb: canvas = plots.plot_sampler_progress(canvas, burn_in_mcmc_outputs, full_chain, self.burn_ins, self.plot_dir, prepend=self.add_text) self.burning_in = s.gr_test(full_chain, gr_threshold) vals = np.array([item[-1] for item in burn_in_mcmc_outputs['chains']]) self.burn_ins += 1 mcmc_outputs = self.sample(vals, 10**n_accepted) chain = mcmc_outputs['chains'] mcmc_outputs['chains'] -= u.safe_log(np.sum(np.exp(chain) * self.bin_difs[np.newaxis, np.newaxis, :], axis=2))[:, :, np.newaxis] full_chain = np.concatenate((full_chain, mcmc_outputs['chains']), axis=1) with open(os.path.join(self.res_dir, 'full_chain.p'), 'wb') as file_location: cpkl.dump(full_chain, file_location) self.log_smp_nz = mcmc_outputs['chains'] self.smp_nz = np.exp(self.log_smp_nz) self.info['log_sampled_nz_meta_data'] = mcmc_outputs self.log_bfe_nz = s.norm_fit(self.log_smp_nz)[0] self.bfe_nz = np.exp(self.log_bfe_nz) self.info['estimators']['log_mean_sampled_nz'] = self.log_bfe_nz else: self.log_smp_nz = self.info['log_sampled_nz_meta_data'] self.smp_nz = np.exp(self.log_smp_nz) self.log_bfe_nz = self.info['estimators']['log_mean_sampled_nz'] self.bfe_nz = np.exp(self.log_smp_nz) # if vb: # plots.plot_samples(self.info, self.plot_dir) return self.log_smp_nz
[docs] def compare(self, vb=True): """ Calculates all available goodness of fit measures Parameters ---------- vb: boolean, optional True to print progress messages to stdout, False to suppress Returns ------- out_info: dict dictionary of all available statistics """ self.info['stats']['kld'], self.info['stats']['log_kld'] = {}, {} self.info['stats']['rms'], self.info['stats']['log_rms'] = {}, {} if self.truth is not None: for key in self.info['estimators']: self.info['stats']['kld'][key] = s.calculate_kld(np.exp(self.info['log_tru_nz']), np.exp(self.info['estimators'][key])) # self.info['stats']['log_kld'][key] = s.calculate_kld(self.log_tru_nz, self.info['estimators'][key]) self.info['stats']['rms']['true_nz' + '__' + key[4:]] = s.calculate_rms(np.exp(self.info['log_tru_nz']), np.exp(self.info['estimators'][key])) self.info['stats']['log_rms']['log_true_nz'+ '__' + key] = s.calculate_rms(self.info['log_tru_nz'], self.info['estimators'][key]) for i in range(len(self.info['estimators'].keys())): key_1 = self.info['estimators'].keys()[i] for j in range(len(self.info['estimators'].keys()[:i])): key_2 = self.info['estimators'].keys()[j] # print(((i,j), (key_1, key_2))) self.info['stats']['log_rms'][key_1 + '__' + key_2] = s.calculate_rms(self.info['estimators'][key_1], self.info['estimators'][key_2]) self.info['stats']['rms'][key_1[4:] + '__' + key_2[4:]] = s.calculate_rms(np.exp(self.info['estimators'][key_1]), np.exp(self.info['estimators'][key_2])) out_info = self.info['stats'] if vb: print(out_info) return out_info
[docs] def plot_estimators(self, log=True, mini=True): """ Plots all available estimators of the redshift density function. """ if mini: also = 'mini' else: also = '' if log: plots.plot_estimators(self.info, self.plot_dir, prepend=self.add_text+also+'log_', mini=mini) else: plots.plot_estimators(self.info, self.plot_dir, log=False, prepend=self.add_text+also+'lin_', mini=mini) return
[docs] def read(self, read_loc, style='pickle', vb=True): """ Function to load inferred quantities from files. Parameters ---------- read_loc: string filepath where inferred redshift density function is stored style: string, optional keyword for file format, currently only 'pickle' supported vb: boolean, optional True to print progress messages to stdout, False to suppress Returns ------- self.info: dict returns the log_z_dens information dictionary object """ with open(os.path.join(self.res_dir, read_loc), 'rb') as file_location: self.info = cpkl.load(file_location) if vb: print('The following quantities were read from '+read_loc+' in the '+style+' format:') for key in self.info: print(key) if 'estimators' in self.info: print(self.info['estimators'].keys()) return self.info
[docs] def write(self, write_loc, style='pickle', vb=True): """ Function to write results of inference to files. Parameters ---------- write_loc: string filepath where results of inference should be saved. style: string, optional keyword for file format, currently only 'pickle' supported vb: boolean, optional True to print progress messages to stdout, False to suppress """ with open(os.path.join(self.res_dir, write_loc), 'wb') as file_location: cpkl.dump(self.info, file_location) if vb: print('The following quantities were written to '+write_loc+' in the '+style+' format:') for key in self.info: print(key) return