
"""
Module for feature extraction on supernova light curves.
"""

from __future__ import division, print_function
import numpy as np
from . import parametric_models
import sys, pywt, time, subprocess, os, sncosmo
from scipy.interpolate import interp1d
from astropy.table import Table, vstack, hstack, join
from multiprocessing import Pool
from functools import partial
from scipy import stats
import scipy.optimize as op
from iminuit import Minuit, describe
import traceback

try:
    import pymultinest
    from pymultinest.analyse import Analyzer
    has_multinest=True
except ImportError:
    has_multinest=False
    
try:
    import emcee
    has_emcee=True
except ImportError:
    has_emcee=False

try:
    import george
    has_george=True
except ImportError:
    has_george=False

try:
    from gapp import dgp
    has_gapp=True
except ImportError:
    has_gapp=False

def _GP(obj, d, ngp, xmin, xmax, initheta, save_output, output_root, gpalgo='george'):
    """
    Fit a Gaussian process curve at specific evenly spaced points along a light curve.

    Parameters
    ----------
    obj : str
        Name of the object
    d : Dataset-like object
        Dataset
    ngp : int
        Number of points to evaluate Gaussian Process at
    xmin : float
        Minimum time to evaluate at
    xmax : float
        Maximum time to evaluate at
    initheta : list-like
        Initial values for theta parameters. These should be roughly the scale length in the y & x directions.
    save_output : str
        Whether to save the output; one of 'none', 'gp' or 'all'
    output_root : str
        Output directory
    gpalgo : str
        Which GP package to use for the Gaussian process regression, 'gapp' (GaPP) or 'george'

    Returns
    -------
    astropy.table.Table
        Table with evaluated Gaussian process curve and errors

    """

    print(obj)
    if gpalgo=='gapp' and not has_gapp:
        print('No GP module gapp. Defaulting to george instead.')
        gpalgo='george'
    sys.stdout.flush()
    lc=d.data[obj]
    filters=np.unique(lc['filter'])
    #Store the output in another astropy table
    output=[]
    for fil in d.filter_set:
        if fil in filters:
            x=lc['mjd'][lc['filter']==fil]
            y=lc['flux'][lc['filter']==fil]
            err=lc['flux_error'][lc['filter']==fil]
            if gpalgo=='gapp':
                #GaPP is verbose; suppress its output while fitting, then restore stdout
                sys.stdout = open(os.devnull, "w")
                g=dgp.DGaussianProcess(x, y, err, cXstar=(xmin, xmax, ngp))
                sys.stdout=sys.__stdout__
                rec, theta=g.gp(theta=initheta)
            elif gpalgo=='george':
                g=george.GP(initheta[0]**2*george.kernels.ExpSquaredKernel(metric=initheta[1]**2))
                g.compute(x,err)

                # Define the objective function (negative log-likelihood in this case).
                def nll(p):
                    g.set_parameter_vector(p)
                    ll = g.log_likelihood(y, quiet=True)
                    return -ll if np.isfinite(ll) else 1e25

                # And the gradient of the objective function.
                def grad_nll(p):
                    g.set_parameter_vector(p)
                    return -g.grad_log_likelihood(y, quiet=True)

                p0 = g.get_parameter_vector()
                results = op.minimize(nll, p0, jac=grad_nll, method="L-BFGS-B")
                g.set_parameter_vector(results.x)
                xstar=np.linspace(xmin,xmax,ngp)
                mu,cov=g.predict(y,xstar)
                std=np.sqrt(np.diag(cov))
                rec=np.column_stack((xstar,mu,std))
        else:
            rec=np.zeros([ngp, 3]) #Fill filters with no data with zeros
        newtable=Table([rec[:, 0], rec[:, 1], rec[:, 2], [fil]*ngp], names=['mjd', 'flux', 'flux_error', 'filter'])
        if len(output)==0:
            output=newtable
        else:
            output=vstack((output, newtable))
    if save_output=='gp' or save_output=='all':
        output.write(os.path.join(output_root, 'gp_'+obj), format='ascii')        
    return output
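
# The following is an illustrative sketch (not part of the pipeline) of the george-based
# regression that _GP performs on each band: a GP with an ExpSquaredKernel is conditioned
# on noisy points and evaluated on a regular grid. The helper name _example_george_fit
# and the synthetic data are assumptions for demonstration only.
def _example_george_fit():
    """Minimal sketch of the per-band GP fit in _GP, on synthetic data."""
    x=np.sort(np.random.uniform(0, 100, 20)) #Observation times
    y=np.exp(-0.5*((x-50)/10)**2)+np.random.normal(0, 0.05, len(x)) #Noisy 'light curve'
    err=0.05*np.ones_like(x)
    initheta=[500, 20] #Rough scale lengths in the y and x directions
    g=george.GP(initheta[0]**2*george.kernels.ExpSquaredKernel(metric=initheta[1]**2))
    g.compute(x, err) #Factorise the covariance matrix for the observed points

    def nll(p):
        #Negative log-likelihood of the data given kernel parameters p
        g.set_parameter_vector(p)
        ll=g.log_likelihood(y, quiet=True)
        return -ll if np.isfinite(ll) else 1e25

    results=op.minimize(nll, g.get_parameter_vector(), method="L-BFGS-B")
    g.set_parameter_vector(results.x)
    xstar=np.linspace(0, 100, 100) #Regular evaluation grid, as in _GP
    mu, cov=g.predict(y, xstar) #GP mean and covariance on the grid
    return np.column_stack((xstar, mu, np.sqrt(np.diag(cov))))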



def _run_leastsq(obj, d, model, n_attempts, seed=-1):
    """
    Minimises the chi2 on all the filter bands of a given light curve, fitting the model to each one and
    extracting the best-fitting parameters.

    Parameters
    ----------
    obj : str
        Object name
    d : Dataset
        Dataset object
    model : parametric model object
        Parametric model
    n_attempts : int
        We run the fit multiple times from random starting points to try to avoid local minima, and keep the
        best-fitting values.
    seed : int, optional
        Seed for numpy's random number generator; -1 (default) leaves the state untouched.

    Returns
    -------
    astropy.table.Table
        Table of best fitting parameters for all filters

    """
    lc=d.data[obj]
    filts=np.unique(lc['filter'])

    n_params=len(model.param_names)

    #How many times to keep trying to fit each object to obtain a good fit (defined as reduced chi2 of less than 2)
    if n_attempts<1:
        n_attempts=1

    labels=['Object']
    for f in d.filter_set:
        pams=model.param_names
        for p in pams:
            labels.append(f+'-'+p)
    output=Table(names=labels, dtype=['U32']+['f']*(len(labels)-1))

    if seed!=-1:
        np.random.seed(seed)

    t1=time.time()
    row=[obj]
    for f in d.filter_set:
        if f in filts:
            x=np.array(lc['mjd'][lc['filter']==f])
            y=np.array(lc['flux'][lc['filter']==f])
            err=np.array(lc['flux_error'][lc['filter']==f])

            def mini_func(*params):
                #This function changes with each object, so it must be redefined here (Minuit does not allow extra arguments to be passed through)
                for i in range(len(params)):
                    p=params[i]
                    if p<model.limits[model.param_names[i]][0] or p>model.limits[model.param_names[i]][1]:
                        return np.inf
                ynew=model.evaluate(x, params)
                chi2=np.sum((y-ynew)*(y-ynew)/err/err)
                return chi2

            #For the sake of speed, we stop as soon as we get to reduced chi2<2, failing that we use the
            #best fit found so far
            fmin=np.inf
            min_params=[]

            for i in range(n_attempts):
                if i==0:
                    #Try the default starting point
                    input_args=model.initial.copy()
                else:
                    #Pick a new, random starting point
                    input_args={}
                    for p in model.param_names:
                        val=np.random.uniform(model.limits[p][0], model.limits[p][1])
                        input_args[p]=val
                for p in model.param_names:
                    input_args['limit_'+p]=model.limits[p]

                m=Minuit(mini_func, pedantic=False, print_level=0,forced_parameters=model.param_names, **input_args)

                m.migrad()
                parm=[]
                for p in model.param_names:
                    parm.append(m.values[p])

                rchi2=m.fval/len(x)
                if rchi2<2:
                    fmin=m.fval
                    min_params=parm
                    break
                elif m.fval<fmin:
                    fmin=m.fval
                    min_params=parm

            with open('out', 'a') as outfl:
                outfl.write('%s\t%s\t%f\t%d\n' %(obj, f, fmin/len(x), i))
            row+=min_params
        else:
            row+=[0]*len(model.param_names) #Fill missing values with zeros
    output.add_row(row)
    return output
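
# Illustrative sketch (an assumption, not part of the pipeline) of the restart strategy
# _run_leastsq uses: minimise the chi2 from the default starting point, then from random
# draws within the parameter bounds, stopping early once reduced chi2 < 2. It assumes a
# model exposing param_names, limits, initial and evaluate, as used above, and swaps
# Minuit for scipy.optimize purely to keep the example self-contained.
def _example_multistart_chi2(model, x, y, err, n_attempts=20):
    """Return the best-fitting parameter values found over several restarts."""
    names=model.param_names
    bounds=[model.limits[p] for p in names]

    def chi2(params):
        ynew=model.evaluate(x, params)
        return np.sum((y-ynew)**2/err**2)

    fmin=np.inf
    best=None
    for i in range(n_attempts):
        if i==0:
            p0=[model.initial[p] for p in names] #Default starting point
        else:
            p0=[np.random.uniform(lo, hi) for (lo, hi) in bounds] #Random restart
        res=op.minimize(chi2, p0, bounds=bounds, method="L-BFGS-B")
        if res.fun/len(x)<2: #Good enough; stop early, as the pipeline does
            return res.x
        if res.fun<fmin:
            fmin, best=res.fun, res.x
    return best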





def _run_multinest(obj, d, model, chain_directory,  nlp, convert_to_binary, n_iter, restart=False, seed=-1):
    """
    Runs multinest on all the filter bands of a given light curve, fitting the model to each one and
    extracting the best fitting parameters.

    Parameters
    ----------
    obj : str
        Object name
    d : Dataset
        Dataset object
    model : parametric model object
        Parametric model
    chain_directory : str
        Path to where the output files go
    nlp : int
        Number of live points
    convert_to_binary : bool
        Whether or not to convert the ascii output files to binary
    n_iter : int
        Maximum number of iterations
    restart : bool
        Whether to restart from existing chain files.
    seed : int, optional
        Random seed for multinest; -1 (default) seeds from the system clock.

    Returns
    -------
    astropy.table.Table
        Table of best fitting parameters and their errors for all filters

    """

    try:
        def prior_multinest(cube, ndim, nparams):
            """Prior function specifically for multinest. This would be called for one filter, for one object.
            @param cube A ctypes pointer to the parameter cube (actually just the current parameter values).
            @param ndim Number of dimensions
            @param nparams Number of parameters. Usually the same as ndim unless you have unsampled (e.g. calculated) parameters. These
            are assumed to be the first (ndim-nparams) parameters.
            """
            up=model.upper_limit
            low=model.lower_limit
            
            for i in range(nparams):
                cube[i]=cube[i]*(up[i]-low[i])+low[i]
            return cube
            
            
        lc=d.data[obj]
        filts=np.unique(lc['filter'])
        
        n_params=len(model.param_names)
        
        labels=['Object']
        for f in d.filter_set:
            pams=model.param_names
            for p in pams:
                labels.append(f+'-'+p)

        output=Table(names=labels, dtype=['U32']+['f']*(len(labels)-1))
    
        
        row=[obj]
        for f in d.filter_set:
            t1=time.time()
            if f in filts:
                #current values for x,y and err are set each time multinest is called
                x, y, err=np.column_stack((lc['mjd'][lc['filter']==f], lc['flux'][lc['filter']==f], lc['flux_error'][lc['filter']==f])).T
                
                def loglike_multinest(cube, ndim, nparams):
                    """Loglikelihood function specifically for multinest. This would be called for one filter, for one object.
                    @param cube A ctypes pointer to the parameter cube (actually just the current parameter values).
                    @param ndim Number of dimensions
                    @param nparams Number of parameters. Usually the same as ndim unless you have unsampled (e.g. calculated) parameters. These
                    are assumed to be the first (ndim-nparams) parameters.
                    """
                    params=np.zeros(nparams)
                    #This is the only obvious way to convert a ctypes pointer to a numpy array
                    for i in range(nparams):
                        params[i]=cube[i]
                    ynew=model.evaluate(x, params)
                    chi2=np.sum(((y-ynew)*(y-ynew))/err/err)
                    return -chi2/2.
                
                chain_name=os.path.join(chain_directory, '%s-%s-%s-' %(obj.split('.')[0], f, model.model_name))
                
                if not restart or not os.path.exists(chain_name+'stats.dat'):
                    #Gives the ability to restart from existing chains if they exist
                    pymultinest.run(loglike_multinest, prior_multinest, n_params, importance_nested_sampling = False, init_MPI=False, 
                    resume = False, verbose = False, sampling_efficiency = 'parameter', n_live_points = nlp, outputfiles_basename=chain_name, 
                    multimodal=False, max_iter=n_iter, seed=seed)
                    
                best_params=get_MAP(chain_name)

                if convert_to_binary and not restart:
                    ext=['ev.dat', 'phys_live.points', 'live.points', '.txt', 'post_equal_weights.dat'] #These are the files we can convert
                    for e in ext:
                        infile=os.path.join(chain_directory,'%s-%s-%s-%s' %(obj.split('.')[0], f, model.model_name, e))
                        outfile=infile+'.npy'
                        try:
                            dat=np.loadtxt(infile)
                            np.save(outfile, dat)
                            os.remove(infile)
                        except Exception:
                            print ('ERROR reading file', infile)
                            print ('File unconverted')

                row+=best_params
            else:
                row+=[0]*len(model.param_names) #Fill missing values with zeros
            np.savetxt(os.path.join(chain_directory,'%s-%s-%s-.time' %(obj.split('.')[0], f, model.model_name)), [time.time()-t1])
            
        output.add_row(row)
            
        return output
    
    except Exception:
        #Sometimes things just break
        print ('ERROR in', obj)
        print (sys.exc_info()[0])
        print (sys.exc_info()[1])
        traceback.print_tb(sys.exc_info()[2])
        
        return None
        

    
    
def _run_leastsq_templates(obj, d, model_name, use_redshift, bounds, seed=-1):
    """
    Fit template-based supernova models using least squares.

    Parameters
    ----------
    obj : str
        Object name
    d : Dataset
        Dataset object
    model_name : str
        Name of model to use (to be passed to sncosmo)
    use_redshift : bool
        Whether or not to use provided redshift information
    bounds : dict
        Bounds on parameters
    seed : int, optional
        Included for interface consistency with the samplers; unused by the least-squares fit.

    Returns
    -------
    astropy.table.Table
        Table of best fitting parameters
    """

    lc=d.data[obj]

    if model_name=='mlcs2k2':
        dust=sncosmo.CCM89Dust()
        model=sncosmo.Model(model_name,effects=[dust],effect_names=['host'], effect_frames=['rest'])
    else:
        model=sncosmo.Model(model_name)


    labels = ['Object'] + model.param_names
    output=Table(names=labels, dtype=['U32']+['f']*(len(model.param_names)))

    t1=time.time()
    row=[obj]

    if use_redshift:
        model.set(z=lc.meta['z'])
        prms=model.param_names
        prms=prms[1:]
        bnds=bounds.copy()
        bnds.pop('z', None)
        res, fitted_model=sncosmo.fit_lc(lc, model, vparam_names=prms,
            bounds=bnds, minsnr=0)
    else:
        res, fitted_model=sncosmo.fit_lc(lc, model, vparam_names=model.param_names,
            bounds=bounds, minsnr=0)
    best=res['parameters']
    best=best.tolist()
    row+=best

    output.add_row(row)

    return output
        

def _run_multinest_templates(obj, d, model_name, bounds, chain_directory='./',  nlp=1000, convert_to_binary=True, use_redshift=False, short_name='', restart=False, seed=-1):
    """

    Fit template-based supernova models using multinest.

    Parameters
    ----------
    obj : str
        Object name
    d : Dataset
        Dataset object
    model_name : str
        Name of model to use (to be passed to sncosmo)
    use_redshift : bool
        Whether or not to use provided redshift information
    bounds : dict
        Bounds on parameters

    Returns
    -------
    astropy.table.Table
        Table of best fitting parameters

    Parameters
    ----------
    obj : str
        Object name
    d : Dataset
        Dataset object
    model_name : str
        Name of model to use (to be passed to sncosmo)
    bounds : dict
        Bounds on parameters
    chain_directory : str
        Path to output directory for chains
    nlp : int
        Number of live points
    convert_to_binary : bool
        Whether or not to convert ascii Multinest files to binary
    use_redshift : bool
        Whether or not to use provided redshift information
    short_name : str
        A shorter name for the chains (to overcome Multinest's character limitation)
    restart : bool
        Whether or not to restart from existing chains

    Returns
    -------
    array-like
        List of best-fitting parameters
    """
    try:
        def prior_multinest(cube, ndim, nparams):
            """Prior function specifically for multinest. This would be called for one filter, for one object.
            @param cube A ctypes pointer to the parameter cube (actually just the current parameter values).
            @param ndim Number of dimensions
            @param nparams Number of parameters. Usually the same as ndim unless you have unsampled (e.g. calculated) parameters. These
            are assumed to be the first (ndim-nparams) parameters.
            """

            params=model.param_names
            if use_redshift:
                params=params[1:]
            for i in range(ndim):
                p=params[i]
                cube[i]=cube[i]*(bounds[p][1]-bounds[p][0])+bounds[p][0]

            return cube

        def loglike_multinest(cube, ndim, nparams):
            """Loglikelihood function specifically for multinest. This would be called for one filter, for one object.
            @param cube A ctypes pointer to the parameter cube (actually just the current parameter values).
            @param ndim Number of dimensions
            @param nparams Number of parameters. Usually the same as ndim unless you have unsampled (e.g. calculated) parameters. These
            are assumed to be the first (ndim-nparams) parameters.
            """
            dic = {}
            # This is the only obvious way to convert a ctypes pointer to a numpy array
            params = model.param_names
            if use_redshift:
                params = params[1:]
                dic['z'] = lc.meta['z']
            for i in range(nparams):
                dic[params[i]] = cube[i]

            model.set(**dic)
            chi2 = 0
            for filt in filts:
                yfit = model.bandflux(filt, X[filt], zp=27.5, zpsys='ab')
                chi2 += np.sum(((Y[filt] - yfit) / E[filt]) ** 2)
            return -chi2 / 2.

        lc=d.data[obj]
        filts=np.unique(lc['filter'])
        if model_name=='mlcs2k2':
            dust=sncosmo.CCM89Dust()
            model=sncosmo.Model(model_name,effects=[dust],effect_names=['host'], effect_frames=['rest'])
        else:
            model=sncosmo.Model(model_name)
        
        n_params=len(model.param_names)
        
        labels=['Object']+model.param_names
        
        t1=time.time()
        
        #Convert the astropy table to numpy array outside the likelihood function to avoid repeated calls
        X={}
        Y={}
        E={}
        for filt in filts:
            x, y, err=np.column_stack((lc['mjd'][lc['filter']==filt], lc['flux'][lc['filter']==filt], lc['flux_error'][lc['filter']==filt])).T
            X[filt]=x
            Y[filt]=y
            E[filt]=err
        

        
        chain_name=os.path.join(chain_directory, '%s-%s-' %(obj.split('.')[0], short_name))
        
        if use_redshift:
            ndim=len(model.param_names)-1
        else:
            ndim=len(model.param_names)
        
        
        if not restart or not os.path.exists(chain_name+'stats.dat'):
            pymultinest.run(loglike_multinest, prior_multinest, ndim, importance_nested_sampling = False, init_MPI=False,
            resume = False, verbose = False, sampling_efficiency = 'parameter', n_live_points = nlp, outputfiles_basename=chain_name, 
            multimodal=False, seed=seed)
        
        best_params=get_MAP(chain_name)
        
        if use_redshift:
            best_params=[lc.meta['z']]+best_params
            

        if convert_to_binary and not restart:
            ext=['ev.dat', 'phys_live.points', 'live.points', '.txt', 'post_equal_weights.dat'] #These are the files we can convert
            for e in ext:
                infile=os.path.join(chain_directory,'%s-%s-%s' %(obj.split('.')[0], short_name,  e))
                outfile=infile+'.npy'
                dat=np.loadtxt(infile)
                np.save(outfile, dat)
                os.remove(infile)
                

        np.savetxt(os.path.join(chain_directory,'%s-%s-.time' %(obj.split('.')[0], short_name)), [time.time()-t1])

        return np.array(best_params)
    
    except Exception:
        #Sometimes things just break
        print ('ERROR in', obj)
        print (sys.exc_info()[0])
        print (sys.exc_info()[1])
        traceback.print_tb(sys.exc_info()[2])
        
        return None


def output_time(tm):
    """
    Simple function to output the time nicely formatted.

    Parameters
    ----------
    tm : float
        Input time in seconds.
    """
    hrs=tm/(60*60)
    mins=tm/60
    secs=tm
    if hrs>=1:
        out='%.2f hours' %(hrs)
    elif mins>=1:
        out='%.2f minutes' %(mins)
    else:
        out='%.2f seconds' %(secs)
    print ('Time taken is', out)

def get_MAP(chain_name):
    """
    Read maximum posterior parameters from a stats file of multinest.

    Parameters
    ----------
    chain_name : str
        Root for the chain files

    Returns
    -------
    list-like
        Best-fitting parameters
    """
    stats_file=chain_name+'stats.dat'
    with open(stats_file, 'r') as fl:
        lines=fl.readlines()
    ind=[i for i in range(len(lines)) if 'MAP' in lines[i]][0]
    params=[float(l.split()[1]) for l in lines[ind+2:]]
    return params


class Features:
    """Base class to define basic functionality for extracting features from supernova datasets. Users are not
    restricted to inheriting from this class, but any Features class must contain the functions extract_features
    and fit_sn.
    """
    def __init__(self):
        self.p_limit=0.05 #At what point do we suggest a model has been a bad fit

    def extract_features(self):
        pass

    def fit_sn(self):
        pass

    def goodness_of_fit(self, d):
        """
        Test (for any feature set) how well the reconstruction from the features fits each of the objects in
        the dataset.

        Parameters
        ----------
        d : Dataset
            Dataset object.

        Returns
        -------
        astropy.table.Table
            Table with the reduced Chi2 for each object
        """
        if len(d.models)==0:
            print ('Call Dataset.set_models first.')
            return None
        filts=np.unique(d.data[d.object_names[0]]['filter'])
        filts=np.array(filts).tolist()
        rcs=Table(names=['Object']+filts, dtype=['U32']+['float64']*len(filts)) #Reduced chi2
        for obj in d.object_names:
            #Go through each filter
            chi2=[]
            lc=d.data[obj]
            mod=d.models[obj]
            for filt in filts:
                l=lc[lc['filter']==filt]
                m=mod[mod['filter']==filt]
                x=l['mjd']
                y=l['flux']
                e=l['flux_error']
                xmod=m['mjd']
                ymod=m['flux']
                #Interpolate the model onto the data and compute the reduced chi2
                fit=interp1d(xmod, ymod)
                yfit=fit(x)
                chi2.append(sum((yfit-y)**2/e**2)/(len(x)-1))
            rcs.add_row([obj]+chi2)
        return rcs

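    # Example usage of goodness_of_fit (an illustrative sketch; `d` is assumed to be a
    # Dataset whose set_models method has already been called with fitted models):
    #
    #     feats = Features()
    #     rcs = feats.goodness_of_fit(d)  #One reduced-chi2 column per filter
    #     print(rcs)
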
    def posterior_predictor(self, lc, nparams, chi2):
        """
        Computes the posterior predictive p-value to see if the model fits sufficiently well. ***UNTESTED***

        Parameters
        ----------
        lc : astropy.table.Table
            Light curve
        nparams : int
            The number of parameters in the model. For the wavelets, this will be the number of PCA coefficients.
            For the parametric models, this will be the number of parameters per model multiplied by the number
            of filters. For the template models, this is simply the number of parameters in the model.
        chi2 : array-like
            An array of chi2 values for each set of parameters in the parameter samples. This is easy to obtain
            as -2*loglikelihood output from a multinest or mcmc chain. For features such as the wavelets, this
            will have to be separately calculated by drawing thousands of curves consistent with the coefficients
            and their errors and then computing the chi2.

        Returns
        -------
        float
            The posterior predictive p-value. If this number is too close to 0 or 1 it implies the model is a
            poor fit.
        """
        #Count the number of data points over all filters
        ndata=len(lc['mjd'])
        dof=ndata-nparams-1
        if dof<=0:
            dof=1
        chi2_limit=stats.chi2.ppf(1-self.p_limit, dof)
        print (chi2_limit)
        print (chi2.min(), chi2.max())
        chi2=np.sort(chi2)
        p_value=np.count_nonzero(chi2>chi2_limit)/len(chi2)
        if p_value>self.p_limit:
            print ('Model fit unsatisfactory, p value=', p_value)
        return p_value

    def convert_astropy_array(self, tab):
        """
        Convenience function to convert an astropy table (floats only) into a numpy array.
        """
        out_array=np.array([tab[c] for c in tab.columns]).T
        return out_array


class TemplateFeatures(Features):
    """
    Calls sncosmo to fit a variety of templates to the data. The number of features will depend on the templates
    chosen (e.g. salt2, nugent2p etc.)
    """
    def __init__(self, model=['Ia'], sampler='leastsq', lsst_bands=False, lsst_dir='../lsst_bands/'):
        """
        To initialise, provide a list of models to fit for (defaults to salt2 Ia templates).

        Parameters
        ----------
        model : list-like, optional
            List of models. In theory you can fit Ia and non-Ia models and use all those as features. So far
            only tested with SALT2.
        sampler : str, optional
            A choice of 'mcmc', which uses the emcee sampler, 'nested', which uses pymultinest, or 'leastsq'
            (default).
        lsst_bands : bool, optional
            Whether or not the LSST bands are required. Only needed for LSST simulations to register bands with
            sncosmo.
        lsst_dir : str, optional
            Directory where LSST bands are stored.
        """
        Features.__init__(self)
        if lsst_bands:
            self.registerBands(lsst_dir, prefix='approxLSST_', suffix='_total.dat')
        self.model_names=model
        self.templates={'Ia':'salt2-extended', 'salt2':'salt2-extended', 'mlcs2k2':'mlcs2k2',
                        'II':'nugent-sn2n', 'IIn':'nugent-sn2n', 'IIp':'nugent-sn2p', 'IIl':'nugent-sn2l',
                        'Ibc':'nugent-sn1bc', 'Ib':'nugent-sn1bc', 'Ic':'nugent-sn1bc'}
        #Short names because of limitations in Multinest
        self.short_names={'Ia':'salt2', 'mlcs2k2':'mlcs'}
        if sampler=='nested':
            try:
                import pymultinest
            except ImportError:
                print ('Nested sampling selected but pymultinest is not installed. Defaulting to least squares.')
                sampler='leastsq'
        elif sampler=='mcmc':
            try:
                import emcee
            except ImportError:
                print ('MCMC sampling selected but emcee is not installed. Defaulting to least squares.')
                sampler='leastsq'
        self.sampler=sampler
        self.bounds={'salt2-extended':{'z':(0.01, 1.5), 't0':(-100, 100), 'x0':(-1e-3, 1e-3), 'x1':(-3, 3),
                                       'c':(-0.5, 0.5)},
                     'mlcs2k2':{'z':(0.01, 1.5), 't0':(-100, 100), 'amplitude':(0, 1e-17),
                                'delta':(-1.0, 1.8), 'hostebv':(0, 1), 'hostr_v':(-7.0, 7.0)},
                     'nugent-sn2n':{'z':(0.01, 1.5)},
                     'nugent-sn2p':{'z':(0.01, 1.5)},
                     'nugent-sn2l':{'z':(0.01, 1.5)},
                     'nugent-sn1bc':{'z':(0.01, 1.5)}}

    def extract_features(self, d, save_chains=False, chain_directory='chains', use_redshift=False, nprocesses=1,
                         restart=False, seed=-1):
        """
        Extract template features for a dataset.

        Parameters
        ----------
        d : Dataset object
            Dataset
        save_chains : bool
            Whether or not to save the intermediate output (if Bayesian inference is used instead of least squares)
        chain_directory : str
            Where to save the chains
        use_redshift : bool
            Whether or not to use provided redshift when fitting objects
        nprocesses : int, optional
            Number of processors to use for parallelisation (shared memory only)
        restart : bool
            Whether or not to restart from multinest chains
        seed : int, optional
            Random seed propagated to the samplers; -1 (default) leaves seeding untouched.

        Returns
        -------
        astropy.table.Table
            Table of fitted model parameters.
        """
        subprocess.call(['mkdir', chain_directory])
        print ('Fitting templates using', self.sampler, '...')
        all_output=[]
        t1=time.time()
        for mod_name in self.model_names:
            if mod_name=='mlcs2k2':
                dust=sncosmo.CCM89Dust()
                self.model=sncosmo.Model(self.templates[mod_name], effects=[dust], effect_names=['host'],
                                         effect_frames=['rest'])
            else:
                self.model=sncosmo.Model(self.templates[mod_name])
            params=['['+mod_name+']'+pname for pname in self.model.param_names]
            labels = ['Object'] + params
            output = Table(names=labels, dtype=['U32'] + ['f'] * (len(labels) - 1))
            k=0
            if nprocesses<2:
                for obj in d.object_names:
                    if k%100==0:
                        print (k, 'objects fitted')
                    lc=d.data[obj]
                    if self.sampler=='mcmc':
                        if seed!=-1:
                            np.random.seed(seed)
                        res, fitted_model = sncosmo.mcmc_lc(lc, self.model, self.model.param_names,
                                                            bounds=self.bounds[self.templates[mod_name]],
                                                            nwalkers=20, nsamples=1500, nburn=300)
                        chain=res.samples
                        if save_chains:
                            tab=Table(chain, names=self.model.param_names)
                            tab.write(os.path.join(chain_directory, obj.split('.')[0]+'_emcee_'+mod_name),
                                      format='ascii')
                        best=res['parameters'].flatten('F').tolist()
                    elif self.sampler=='nested':
                        best=_run_multinest_templates(obj, d, self.templates[mod_name],
                                                      self.bounds[self.templates[mod_name]],
                                                      chain_directory=chain_directory, nlp=1000,
                                                      convert_to_binary=False, use_redshift=use_redshift,
                                                      short_name=self.short_names[mod_name], restart=restart,
                                                      seed=seed)
                        best=best.tolist()
                    elif self.sampler=='leastsq':
                        if use_redshift:
                            self.model.set(z=lc.meta['z'])
                            prms=self.model.param_names
                            prms=prms[1:]
                            bnds=self.bounds[self.templates[mod_name]].copy()
                            bnds.pop('z', None)
                            res, fitted_model=sncosmo.fit_lc(lc, self.model, vparam_names=prms,
                                                             bounds=bnds, minsnr=0)
                        else:
                            res, fitted_model=sncosmo.fit_lc(lc, self.model, vparam_names=self.model.param_names,
                                                             bounds=self.bounds[self.templates[mod_name]], minsnr=0)
                        best=res['parameters'].flatten('F').tolist()
                    row=[obj]+best
                    output.add_row(row)
                    k+=1
                if len(all_output)==0:
                    all_output=output
                else:
                    all_output=join(all_output, output)
            else:
                if self.sampler=='leastsq':
                    p=Pool(nprocesses, maxtasksperchild=1)
                    partial_func=partial(_run_leastsq_templates, d=d, model_name=self.templates[mod_name],
                                         use_redshift=use_redshift, bounds=self.bounds[self.templates[mod_name]])
                    out=p.map(partial_func, d.object_names)
                    output=out[0]
                    for i in range(1, len(out)):
                        output=vstack((output, out[i]))
                    if len(all_output)==0:
                        all_output=output
                    else:
                        all_output=vstack((all_output, output))
                elif self.sampler=='nested':
                    p=Pool(nprocesses, maxtasksperchild=1)
                    partial_func=partial(_run_multinest_templates, d=d, model_name=self.templates[mod_name],
                                         bounds=self.bounds[self.templates[mod_name]],
                                         chain_directory=chain_directory, nlp=1000, convert_to_binary=True,
                                         use_redshift=use_redshift, short_name=self.short_names[mod_name],
                                         restart=restart, seed=seed)
                    out=p.map(partial_func, d.object_names)
                    for i in range(len(out)):
                        output.add_row([d.object_names[i]]+out[i].tolist())
                    if len(all_output)==0:
                        all_output=output
                    else:
                        all_output=vstack((all_output, output))
        print (len(all_output), 'objects fitted')
        output_time(time.time()-t1)
        return all_output

    def fit_sn(self, lc, features):
        """
        Fits the chosen template model to a given light curve.

        Parameters
        ----------
        lc : astropy.table.Table
            Light curve
        features : astropy.table.Table
            Model parameters

        Returns
        -------
        astropy.table.Table
            Fitted light curve
        """
        obj=lc.meta['name']
        tab=features[features['Object']==obj]
        params=np.array([tab[c] for c in tab.columns[1:]]).flatten()
        if len(params)==0:
            print ('No feature set found for', obj)
            return None
        model_name=self.templates[self.model_names[0]]
        if model_name=='mlcs2k2':
            dust=sncosmo.CCM89Dust()
            model=sncosmo.Model(model_name, effects=[dust], effect_names=['host'], effect_frames=['rest'])
        else:
            model=sncosmo.Model(model_name)
        param_dict={}
        for i in range(len(model.param_names)):
            param_dict[model.param_names[i]]=params[i]
        model.set(**param_dict)
        filts=np.unique(lc['filter'])
        labels=['mjd', 'flux', 'filter']
        output=Table(names=labels, dtype=['f', 'f', 'U32'], meta={'name':obj})
        for filt in filts:
            x=lc['mjd'][lc['filter']==filt]
            xnew=np.linspace(0, x.max()-x.min(), 100)
            ynew=model.bandflux(filt, xnew, zp=27.5, zpsys='ab')
            newtable=Table([xnew+x.min(), ynew, [filt]*len(xnew)], names=labels)
            output=vstack((output, newtable))
        return output

    def registerBands(self, dirname, prefix=None, suffix=None):
        """Register LSST bandpasses with sncosmo. Courtesy of Rahul Biswas."""
        bands=['u', 'g', 'r', 'i', 'z', 'y']
        for band in bands:
            fname=os.path.join(dirname, prefix+band+suffix)
            data=np.loadtxt(fname)
            bp=sncosmo.Bandpass(wave=data[:, 0], trans=data[:, 1], name='lsst'+band)
            sncosmo.registry.register(bp, force=True)


class ParametricFeatures(Features):
    """
    Fits a few options of generalised, parametric models to the data.
    """
    def __init__(self, model_choice, sampler='leastsq', limits=None):
        """
        Initialisation

        Parameters
        ----------
        model_choice : str
            Which parametric model to use
        sampler : str, optional
            Choice of 'mcmc' (requires emcee), 'nested' (requires pymultinest) or 'leastsq'
        limits : dict, optional
            Parameter bounds if something other than the default is needed.
        """
        Features.__init__(self)
        self.model_choices={'newling':parametric_models.NewlingModel, 'karpenka':parametric_models.KarpenkaModel}
        try:
            self.model_name=model_choice #Used for labelling output files
            if limits is not None:
                self.model=self.model_choices[model_choice](limits=limits)
            else:
                self.model=self.model_choices[model_choice]()
        except KeyError:
            print ('Your selected model is not in the parametric_models package. Either choose an existing model, or implement a new one in that package.')
            print ('Make sure any new models are included in the model_choices dictionary in the ParametricFeatures class.')
            sys.exit()
        if sampler=='nested' and not has_multinest:
            print ('Nested sampling selected but pymultinest is not installed. Defaulting to least squares.')
            sampler='leastsq'
        elif sampler=='mcmc' and not has_emcee:
            print ('MCMC sampling selected but emcee is not installed. Defaulting to least squares.')
            sampler='leastsq'
        self.sampler=sampler

    def extract_features(self, d, chain_directory='chains', save_output=True, n_attempts=20, nprocesses=1,
                         n_walkers=100, n_steps=500, walker_spread=0.1, burn=50, nlp=1000, starting_point=None,
                         convert_to_binary=True, n_iter=0, restart=False, seed=-1):
        """
        Fit parametric models and return best-fitting parameters as features.

        Parameters
        ----------
        d : Dataset object
            Dataset
        chain_directory : str
            Where to save the chains
        save_output : bool
            Whether or not to save the intermediate output (if Bayesian inference is used instead of least squares)
        n_attempts : int
            Allow the minimiser to start in new random locations if the fit is bad. Put n_attempts=1 to fit only
            once with the default starting position.
        nprocesses : int, optional
            Number of processors to use for parallelisation (shared memory only)
        n_walkers : int
            emcee parameter - number of walkers to use
        n_steps : int
            emcee parameter - total number of steps
        walker_spread : float
            emcee parameter - standard deviation of distribution of starting points of walkers
        burn : int
            emcee parameter - length of burn-in
        nlp : int
            multinest parameter - number of live points
        starting_point : None or array-like
            Starting points of parameters for leastsq or emcee
        convert_to_binary : bool
            multinest parameter - whether or not to convert ascii output files to binary
        n_iter : int
            multinest parameter - maximum number of iterations (0 for unlimited)
        restart : bool
            Whether or not to restart from existing multinest chains
        seed : int, optional
            Random seed propagated to the samplers; -1 (default) leaves seeding untouched.

        Returns
        -------
        astropy.table.Table
            Best-fitting parameters
        """
        subprocess.call(['mkdir', chain_directory])
        self.chain_directory=chain_directory
        t1=time.time()
        output=[]

        if nprocesses<2:
            k=0
            for obj in d.object_names:
                if k%100==0:
                    print (k, 'objects fitted')
                if self.sampler=='leastsq':
                    newtable=_run_leastsq(obj, d, self.model, n_attempts, seed=seed)
                elif self.sampler=='mcmc':
                    if seed!=-1:
                        np.random.seed(seed)
                    newtable=self.run_emcee(d, obj, save_output, chain_directory, n_walkers, n_steps,
                                            walker_spread, burn, starting_point)
                else:
                    newtable=_run_multinest(obj, d, self.model, chain_directory, nlp, convert_to_binary,
                                            n_iter, restart, seed=seed)
                if len(output)==0:
                    output=newtable
                else:
                    output=vstack((output, newtable))
                k+=1
        else:
            if self.sampler=='leastsq':
                p=Pool(nprocesses, maxtasksperchild=1)
                partial_func=partial(_run_leastsq, d=d, model=self.model, n_attempts=n_attempts, seed=seed)
                out=p.map(partial_func, d.object_names)
                output=out[0]
                for i in range(1, len(out)):
                    output=vstack((output, out[i]))
            elif self.sampler=='nested':
                p=Pool(nprocesses, maxtasksperchild=1)
                partial_func=partial(_run_multinest, d=d, model=self.model, chain_directory=chain_directory,
                                     nlp=nlp, convert_to_binary=convert_to_binary, n_iter=n_iter,
                                     restart=restart, seed=seed)
                #Pool starts a number of threads, all of which may try to tackle all of the data.
                #Better to take it in chunks
                output=[]
                k=0
                objs=d.object_names
                while k<len(objs):
                    objs_subset=objs[k:k+nprocesses]
                    out=p.map(partial_func, objs_subset)
                    for i in range(0, len(out)):
                        if out[i] is None:
                            print ('Fitting failed for', objs_subset[i])
                        else:
                            if len(output)==0:
                                output=out[i]
                            else:
                                output=vstack((output, out[i]))
                    k+=len(objs_subset)
        print (len(output), 'objects fitted')
        output_time(time.time()-t1)
        return output

    def fit_sn(self, lc, features):
        """
        Fits the chosen parametric model to a given light curve.

        Parameters
        ----------
        lc : astropy.table.Table
            Light curve
        features : astropy.table.Table
            Model parameters

        Returns
        -------
        astropy.table.Table
            Fitted light curve
        """
        obj=lc.meta['name']
        params=features[features['Object']==obj]
        if len(params)==0:
            print ('No feature set found for', obj)
            return None
        filts=np.unique(lc['filter'])
        labels=['mjd', 'flux', 'filter']
        output=Table(names=labels, dtype=['f', 'f', 'U32'], meta={'name':obj})
        cols=params.columns[1:]
        prms=np.array([params[c] for c in cols])
        for filt in filts:
            x=lc['mjd'][lc['filter']==filt]
            xnew=np.linspace(0, x.max()-x.min(), 100)
            inds=np.where([filt in s for s in cols])[0]
            P=np.array(prms[inds], dtype='float')
            ynew=self.model.evaluate(xnew, P)
            newtable=Table([xnew+x.min(), ynew, [filt]*len(xnew)], names=labels)
            output=vstack((output, newtable))
        return output

    def run_emcee(self, d, obj, save_output, chain_directory, n_walkers, n_steps, walker_spread, burn,
                  starting_point, seed=-1):
        """
        Runs emcee on all the filter bands of a given light curve, fitting the model to each one and extracting
        the best-fitting parameters.

        Parameters
        ----------
        d : Dataset object
            Dataset
        obj : str
            Object name
        save_output : bool
            Whether or not to save the intermediate output
        chain_directory : str
            Where to save the chains
        n_walkers : int
            emcee parameter - number of walkers to use
        n_steps : int
            emcee parameter - total number of steps
        walker_spread : float
            emcee parameter - standard deviation of distribution of starting points of walkers
        burn : int
            emcee parameter - length of burn-in
        starting_point : None or array-like
            Starting points of parameters
        seed : int, optional
            Seed for numpy's random number generator; -1 (default) leaves the state untouched.

        Returns
        -------
        astropy.table.Table
            Best-fitting parameters (at the maximum posterior)
        """
        def get_params(starting_point, obj, filt):
            #Helper function to get parameters from the features astropy table
            X=starting_point[starting_point['Object']==obj]
            cols=X.columns
            inds=[s for s in cols if filt in s]
            P=X[inds]
            P=np.array([P[c] for c in P.columns]).flatten()
            return P

        lc=d.data[obj]
        filts=np.unique(lc['filter'])
        if seed!=-1:
            np.random.seed(seed)

        n_params=len(self.model.param_names)
        labels=['Object']
        for f in d.filter_set:
            pams=self.model.param_names
            for p in pams:
                labels.append(f+'-'+p)
        output=Table(names=labels, dtype=['U32']+['f']*(len(labels)-1))

        t1=time.time()
        row=[obj]
        for f in d.filter_set:
            if f in filts:
                #This is pretty specific to the current setup
                chain_name=os.path.join(self.chain_directory, '%s-%s-%s-' %(obj.split('.')[0], f, self.model_name))
                x=np.array(lc['mjd'][lc['filter']==f])
                y=np.array(lc['flux'][lc['filter']==f])
                yerr=np.array(lc['flux_error'][lc['filter']==f])
                if starting_point is None:
                    #Initialise randomly in parameter space
                    iniparams=np.random.rand(n_params)*(self.model.upper_limit-self.model.lower_limit)+self.model.lower_limit
                else:
                    #A starting point from a least squares run can be given as an astropy table
                    iniparams=get_params(starting_point, obj, f)
                pos=[iniparams+walker_spread*np.random.randn(n_params) for i in range(n_walkers)]
                sampler=emcee.EnsembleSampler(n_walkers, n_params, self.lnprob_emcee, args=(x, y, yerr))
                pos, prob, state=sampler.run_mcmc(pos, burn)
                #Remove burn-in
                sampler.reset()
                pos, prob, state=sampler.run_mcmc(pos, n_steps)
                samples=sampler.flatchain
                lnpost=sampler.flatlnprobability
                if save_output:
                    np.savetxt(os.path.join(chain_directory, 'emcee_chain_%s_%s_%s' %(self.model_name, f, str(obj))),
                               np.column_stack((samples, lnpost)))
                #Maximum posterior params
                ind=lnpost.argmax()
                best_params=samples[ind, :]
                row+=best_params
            else:
                row+=[0]*len(self.model.param_names) #Fill missing values with zeros
        output.add_row(row)
        print ('Time per filter', (time.time()-t1)/len(d.filter_set))
        return output

    def lnprob_emcee(self, params, x, y, yerr):
        """
        Log posterior function for emcee, with a uniform prior within the model limits.

        Parameters
        ----------
        params : array-like
            Model parameters
        x : array-like
            Time values of the light curve
        y : array-like
            Flux values of the light curve
        yerr : array-like
            Errors on the flux

        Returns
        -------
        float
            The log posterior (up to a constant)
        """
        #Uniform prior. Directly compares arrays
        if (np.any(params>self.model.upper_limit)) or (np.any(params<self.model.lower_limit)):
            return -np.inf
        else:
            ynew=self.model.evaluate(x, params)
            chi2=np.sum((y-ynew)*(y-ynew)/yerr/yerr)
            return -chi2/2.


class WaveletFeatures(Features):
    """
    Uses wavelets to decompose the data and then reduces dimensionality of the feature space using PCA.
    """
    def __init__(self, wavelet='sym2', ngp=100, **kwargs):
        """
        Initialises the pywt wavelet object and sets the maximum depth for deconstruction.

        Parameters
        ----------
        wavelet : str, optional
            String for which wavelet family to use.
        ngp : int, optional
            Number of points on the Gaussian process curve
        level : int, optional
            The maximum depth for wavelet deconstruction. If not provided, will use the maximum depth possible
            given the number of points in the Gaussian process curve.
        """
        Features.__init__(self)
        self.wavelet_list=pywt.wavelist() #All possible families
        if wavelet not in self.wavelet_list:
            print ('Wavelet not recognised in pywt')
            sys.exit()
        self.wav=pywt.Wavelet(wavelet)
        self.ngp=ngp #Number of points to use on the Gaussian process curve
        #If the user does not specify a level of depth for the wavelet, automatically calculate it
        if 'level' in kwargs:
            self.mlev=kwargs['level']
        else:
            self.mlev=pywt.swt_max_level(self.ngp)

    def extract_features(self, d, initheta=[500, 20], save_output='none', output_root='features', nprocesses=1,
                         restart='none', gpalgo='george', xmin=None, xmax=None):
        """
        Applies a wavelet transform followed by PCA dimensionality reduction to extract wavelet coefficients as
        features.

        Parameters
        ----------
        d : Dataset object
            Dataset
        initheta : list-like, optional
            Initial values for theta parameters. These should be roughly the scale length in the y & x directions.
        save_output : str, optional
            Whether to save intermediate output; one of 'none', 'gp', 'wavelet' or 'all'
        output_root : str, optional
            Output directory
        nprocesses : int, optional
            Number of processors to use for parallelisation (shared memory only)
        restart : str, optional
            Either 'none' to start from scratch, 'gp' to restart from saved Gaussian processes, or 'wavelet' to
            restart from saved wavelet decompositions (will look in output_root for the previously saved outputs).
        gpalgo : str, optional
            Which GP package to use for the Gaussian process regression, 'gapp' (GaPP) or 'george'
        xmin : float, optional
            Minimum time to evaluate at (defaults to 0)
        xmax : float, optional
            Maximum time to evaluate at (defaults to the longest light curve in the dataset)

        Returns
        -------
        astropy.table.Table
            Table of features (first column object names, the rest are the PCA coefficient values)
        """
        if save_output!='none':
            subprocess.call(['mkdir', output_root])
        if xmin is None:
            xmin=0
        if xmax is None:
            xmax=d.get_max_length()
        if restart=='wavelet':
            wavout, wavout_err=self.restart_from_wavelets(d, output_root)
        else:
            if restart=='gp':
                self.restart_from_gp(d, output_root)
            else:
                self.extract_GP(d, self.ngp, xmin, xmax, initheta, save_output, output_root, nprocesses,
                                gpalgo=gpalgo)
            wavout, wavout_err=self.extract_wavelets(d, self.wav, self.mlev, nprocesses, save_output, output_root)
        self.features, vals, vec, mn=self.extract_pca(d.object_names.copy(), wavout)

        #Save the PCA information
        self.PCA_eigenvals=vals
        self.PCA_eigenvectors=vec
        self.PCA_mean=mn
        return self.features

    def fit_sn(self, lc, comps, vec, mn, xmin, xmax, filter_set):
        """
        Fits a single object using previously run PCA components. Performs the full inverse wavelet transform.

        Parameters
        ----------
        lc : astropy.table.Table
            Light curve
        comps : astropy.table.Table
            The PCA coefficients for each object (i.e. the astropy table of wavelet features from
            extract_features).
        vec : array-like
            PCA component vectors as array (each column is a vector, ordered from most to least significant)
        mn : array-like
            Mean vector
        xmin : float
            The minimum on the x axis (as defined for the original GP decomposition)
        xmax : float
            The maximum on the x axis (as defined for the original GP decomposition)
        filter_set : list-like
            The full set of filters of the original dataset

        Returns
        -------
        astropy.table.Table
            Fitted light curve
        """
        obj=lc.meta['name']
        filts=np.unique(lc['filter'])
        #The PCA will have been done over the full coefficient set, across all filters
        try:
            pca_comps=comps[comps['Object']==obj]
        except KeyError:
            print ('No feature set found for', obj)
            return None
        new_comps=np.array([pca_comps[c] for c in pca_comps.columns[1:]]).flatten()
        ncomp=len(new_comps)
        eigs=vec[:, :ncomp]
        coeffs=np.array(np.dot(new_comps, eigs.T)+mn).flatten()
        n=self.mlev*2*self.ngp
        xnew=np.linspace(xmin, xmax, self.ngp)
        output=[]
        for i in range(len(filter_set)):
            if filter_set[i] in filts:
                filt_coeffs=coeffs[i*n:(i+1)*n]
                filt_coeffs=filt_coeffs.reshape(self.mlev, 2, self.ngp, order='C')
                ynew=self.iswt(filt_coeffs, self.wav)
                newtable=Table([xnew, ynew, [filter_set[i]]*self.ngp], names=['mjd', 'flux', 'filter'],
                               dtype=['f', 'f', 'U32'])
                if len(output)==0:
                    output=newtable
                else:
                    output=vstack((output, newtable))
        return output

    def restart_from_gp(self, d, output_root):
        """
        Allows the feature extraction process to restart from previously saved Gaussian process curves.

        Parameters
        ----------
        d : Dataset object
            The same dataset (object) on which the previous GP analysis was performed.
        output_root : str
            Location of GP objects
        """
        print ('Restarting from stored Gaussian Processes...')
        for obj in d.object_names:
            fname=os.path.join(output_root, 'gp_'+obj)
            try:
                tab=Table.read(fname, format='ascii')
                d.models[obj]=tab
            except IOError:
                print ('IOError, file ', fname, 'does not exist.')

    def restart_from_wavelets(self, d, output_root):
        """
        Allows the feature extraction process to restart from previously saved wavelet decompositions. This
        allows you to quickly try different dimensionality reduction (e.g. PCA) algorithms on the wavelets.

        Parameters
        ----------
        d : Dataset object
            The same dataset (object) on which the previous wavelet analysis was performed.
        output_root : str
            Location of previously decomposed wavelet coefficients

        Returns
        -------
        wavout : array
            A numpy array of the wavelet coefficients where each row is an object and each column a different
            coefficient
        wavout_err : array
            A similar numpy array storing the (assumed Gaussian) error on each coefficient.
        """
        print ('Restarting from stored wavelets...')
        nfilts=len(d.filter_set)
        #This is just a very big array holding all the coefficients in memory
        wavout=np.zeros([len(d.object_names), self.ngp*2*self.mlev*nfilts])
        wavout_err=np.zeros([len(d.object_names), self.ngp*2*self.mlev*nfilts])
        for i in range(len(d.object_names)):
            obj=d.object_names[i]
            fname=os.path.join(output_root, 'wavelet_'+obj)
            try:
                out=Table.read(fname, format='ascii')
                cols=out.colnames[:-1]
                n=self.ngp*2*self.mlev
                for j in range(nfilts):
                    x=out[out['filter']==d.filter_set[j]]
                    coeffs=x[cols[:self.mlev*2]]
                    coeffs_err=x[cols[self.mlev*2:]]
                    newcoeffs=np.array([coeffs[c] for c in coeffs.columns]).T
                    newcoeffs_err=np.array([coeffs_err[c] for c in coeffs_err.columns]).T
                    wavout[i, j*n:(j+1)*n]=newcoeffs.flatten('F')
                    wavout_err[i, j*n:(j+1)*n]=newcoeffs_err.flatten('F')
            except IOError:
                print ('IOError, file ', fname, 'does not exist.')
        return wavout, wavout_err

    def extract_GP(self, d, ngp, xmin, xmax, initheta, save_output, output_root, nprocesses, gpalgo='george'):
        """
        Runs Gaussian process code on entire dataset. The result is stored inside the models attribute of the
        dataset object.

        Parameters
        ----------
        d : Dataset object
            Dataset
        ngp : int
            Number of points to evaluate Gaussian Process at
        xmin : float
            Minimum time to evaluate at
        xmax : float
            Maximum time to evaluate at
        initheta : list-like
            Initial values for theta parameters. These should be roughly the scale length in the y & x directions.
        save_output : str
            Whether to save the output; one of 'none', 'gp' or 'all'
        output_root : str
            Output directory
        nprocesses : int, optional
            Number of processors to use for parallelisation (shared memory only)
        gpalgo : str, optional
            Which GP package to use for the Gaussian process regression, 'gapp' (GaPP) or 'george'
        """
        print ('Performing Gaussian process regression')
        t1=time.time()
        #Check for parallelisation
        if nprocesses==1:
            for i in range(len(d.object_names)):
                obj=d.object_names[i]
                out=_GP(obj, d=d, ngp=ngp, xmin=xmin, xmax=xmax, initheta=initheta, save_output=save_output,
                        output_root=output_root, gpalgo=gpalgo)
                d.models[obj]=out
                if save_output!='none':
                    out.write(os.path.join(output_root, 'gp_'+obj), format='ascii')
        else:
            p=Pool(nprocesses, maxtasksperchild=1)
            #Pool and map can only really work with single-valued functions
            partial_GP=partial(_GP, d=d, ngp=ngp, xmin=xmin, xmax=xmax, initheta=initheta,
                               save_output=save_output, output_root=output_root, gpalgo=gpalgo)
            out=p.map(partial_GP, d.object_names)
            for i in range(len(out)):
                obj=d.object_names[i]
                d.models[obj]=out[i]
        print ('Time taken for Gaussian process regression', time.time()-t1)

    def GP(self, obj, d, ngp=200, xmin=0, xmax=170, initheta=[500, 20], gpalgo='george'):
        """
        Fit a Gaussian process curve at specific evenly spaced points along a light curve.

        Parameters
        ----------
        obj : str
            Object name
        d : Dataset object
            Dataset
        ngp : int, optional
            Number of points to evaluate Gaussian Process at
        xmin : float, optional
            Minimum time to evaluate at
        xmax : float, optional
            Maximum time to evaluate at
        initheta : list-like, optional
            Initial values for theta parameters. These should be roughly the scale length in the y & x directions.
        gpalgo : str, optional
            Which GP package to use for the Gaussian process regression, 'gapp' (GaPP) or 'george'

        Returns
        -------
        astropy.table.Table
            Table with evaluated Gaussian process curve and errors

        Notes
        -----
        Wraps the internal module-level function in order to circumvent multiprocessing module limitations in
        dealing with objects when parallelising.
        """
        #Pass save_output and output_root explicitly so that gpalgo is not silently consumed as save_output
        return _GP(obj, d, ngp, xmin, xmax, initheta, save_output='none', output_root='', gpalgo=gpalgo)

    def wavelet_decomp(self, lc, wav, mlev):
        """
        Perform a wavelet decomposition on a single light curve.

        Parameters
        ----------
        lc : astropy.table.Table
            Light curve
        wav : str or swt.Wavelet object
            Which wavelet family to use
        mlev : int
            Max depth

        Returns
        -------
        astropy.table.Table
            Decomposed coefficients in each filter.
        """
        filters=np.unique(lc['filter'])
        #Store the output in another astropy table
        output=None
        for fil in filters:
            y=lc['flux'][lc['filter']==fil]
            err=lc['flux_error'][lc['filter']==fil]
            coeffs=np.array(pywt.swt(y, wav, level=mlev))
            #This actually gives a slight overestimate of the error
            coeffs_err=np.array(pywt.swt(err, wav, level=mlev))
            #Create the column names (follows pywt convention)
            labels=[]
            for i in range(len(coeffs)):
                labels.append('cA%d' %(len(coeffs)-i))
                labels.append('cD%d' %(len(coeffs)-i))
            #For the errors
            err_labels=[]
            for i in range(len(labels)):
                err_labels.append(labels[i]+'_err')
            npoints=len(coeffs[0, 0, :])
            c=coeffs.reshape(mlev*2, npoints).T
            c_err=coeffs_err.reshape(mlev*2, npoints).T
            newtable1=Table(c, names=labels)
            newtable2=Table(c_err, names=err_labels)
            joined_table=hstack([newtable1, newtable2])
            #Add the filters
            joined_table['filter']=[fil]*npoints
            if output is None:
                output=joined_table
            else:
                output=vstack((output, joined_table))
        return output

    def extract_wavelets(self, d, wav, mlev, nprocesses, save_output, output_root):
        """
        Perform wavelet decomposition on all objects in dataset. Output is stored as an astropy table for each
        object.

        Parameters
        ----------
        d : Dataset object
            Dataset
        wav : str or swt.Wavelet object
            Which wavelet family to use
        mlev : int
            Max depth
        nprocesses : int, optional
            Number of processors to use for parallelisation (shared memory only)
        save_output : str, optional
            Whether to save the output; one of 'none', 'wavelet' or 'all'
        output_root : str, optional
            Output directory

        Returns
        -------
        wavout : array
            A numpy array of the wavelet coefficients where each row is an object and each column a different
            coefficient
        wavout_err : array
            A numpy array storing the (assumed Gaussian) error on each coefficient.
        """
        print ('Performing wavelet decomposition')
        nfilts=len(d.filter_set)
        #This is just a big array holding all the coefficients in memory
        wavout=np.zeros([len(d.object_names), self.ngp*2*mlev*nfilts])
        wavout_err=np.zeros([len(d.object_names), self.ngp*2*mlev*nfilts])
        t1=time.time()
        for i in range(len(d.object_names)):
            obj=d.object_names[i]
            lc=d.models[obj]
            out=self.wavelet_decomp(lc, wav, mlev)
            if save_output=='wavelet' or save_output=='all':
                out.write(os.path.join(output_root, 'wavelet_'+obj), format='ascii')
            #We go by filter, then by set of coefficients
            cols=out.colnames[:-1]
            n=self.ngp*2*mlev
            filts=np.unique(lc['filter'])
            for j in range(nfilts):
                if d.filter_set[j] in filts:
                    x=out[out['filter']==d.filter_set[j]]
                    coeffs=x[cols[:mlev*2]]
                    coeffs_err=x[cols[mlev*2:]]
                    newcoeffs=np.array([coeffs[c] for c in coeffs.columns]).T
                    newcoeffs_err=np.array([coeffs_err[c] for c in coeffs_err.columns]).T
                    wavout[i, j*n:(j+1)*n]=newcoeffs.flatten('F')
                    wavout_err[i, j*n:(j+1)*n]=newcoeffs_err.flatten('F')
        print ('Time for wavelet decomposition', time.time()-t1)
        return wavout, wavout_err

    def pca(self, X):
        """
        Performs PCA decomposition of a feature array X.

        Parameters
        ----------
        X : array
            Array of features to perform PCA on.

        Returns
        -------
        vals : list-like
            Ordered array of eigenvalues
        vec : array
            Ordered array of eigenvectors, where each column is an eigenvector.
        mn : array
            The mean of the dataset, which is subtracted before PCA is performed.

        Notes
        -----
        Although SVD is considerably more efficient than eigh, it seems more numerically unstable and results
        in many more components being required to adequately describe the dataset, at least for the wavelet
        feature sets considered.
        """
        #Subtract the mean and normalise each feature vector
        X=X.transpose()
        mn=np.mean(X, axis=1)
        mn.shape=(len(mn), 1)
        X=X-mn
        nor=np.sqrt(np.sum(X**2, axis=0))
        x_norm=X/nor
        #Construct the covariance matrix and find its eigenvalues and eigenvectors
        C=np.dot(x_norm, x_norm.T)
        C=np.mat(C)
        vals, vec=np.linalg.eigh(C)
        #Sort from largest to smallest eigenvalue
        inds=np.argsort(vals)[::-1]
        return vals[inds], vec[:, inds], mn

    def best_coeffs(self, vals, tol=0.98):
        """
        Determine the minimum number of PCA components required to adequately describe the dataset.

        Parameters
        ----------
        vals : list-like
            List of eigenvalues (ordered largest to smallest)
        tol : float, optional
            How much 'energy' or information must be retained in the dataset.

        Returns
        -------
        int
            The required number of coefficients to retain the requested amount of "information".
        """
        tot=np.sum(vals)
        tot2=0
        for i in range(len(vals)):
            tot2+=vals[i]
            if tot2>=tol*tot:
                #i is an index, so i+1 components are needed to reach the tolerance
                return i+1
        print ("No dimensionality reduction achieved. All components of the PCA are required.")
        return -1

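    # The loop in best_coeffs is equivalent to thresholding the cumulative eigenvalue
    # 'energy'. A vectorised sketch (an illustrative assumption, not used by the class):
    #
    #     ncomp = int(np.searchsorted(np.cumsum(vals), tol*np.sum(vals)) + 1)
    #
    # np.searchsorted returns the first index at which the running total reaches
    # tol*np.sum(vals), so adding 1 converts that index into a component count.
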
    def project_pca(self, X, eig_vec):
        """
        Project a vector onto a PCA axis (i.e. transform data to PCA space).

        Parameters
        ----------
        X : array
            Vector of original data (for one object).
        eig_vec : array
            Array of eigenvectors, first column most significant.

        Returns
        -------
        array
            Coefficients of the eigenvectors
        """
        A=np.linalg.lstsq(np.mat(eig_vec), np.mat(X).T)[0].flatten()
        return np.array(A)

    def extract_pca(self, object_names, wavout):
        """
        Dimensionality reduction of wavelet coefficients using PCA.

        Parameters
        ----------
        object_names : list-like
            Object names corresponding to each row of the wavelet coefficient array.
        wavout : array
            Wavelet coefficient array, each row corresponds to an object, each column is a coefficient.

        Returns
        -------
        astropy.table.Table
            Astropy table containing PCA features.
        """
        print ('Running PCA...')
        t1=time.time()
        #We now run PCA on this big matrix
        vals, vec, mn=self.pca(wavout)
        mn=mn.flatten()
        #Actually fit the components
        ncomp=self.best_coeffs(vals)
        eigs=vec[:, :ncomp]
        comps=np.zeros([len(wavout), ncomp])
        for i in range(len(wavout)):
            coeffs=wavout[i]
            A=self.project_pca(coeffs-mn, eigs)
            comps[i]=A
        labels=['C%d' %i for i in range(ncomp)]
        wavs=Table(comps, names=labels)
        object_names.shape=(len(object_names), 1)
        objnames=Table(object_names, names=['Object'])
        wavs=hstack((objnames, wavs))
        print ('Time for PCA', time.time()-t1)
        return wavs, vals, vec, mn

    def iswt(self, coefficients, wavelet):
        """
        Performs the inverse stationary wavelet transform. Written by M. G. Marino to complement pyWavelets'
        swt.

        Parameters
        ----------
        coefficients : array
            approx and detail coefficients, arranged in level value exactly as output from swt:
            e.g. [(cA1, cD1), (cA2, cD2), ..., (cAn, cDn)]
        wavelet : str or swt.Wavelet
            Either the name of a wavelet or a Wavelet object

        Returns
        -------
        array
            The inverse transformed array
        """
        output=coefficients[0][0].copy() #Avoid modification of input data
        #num_levels is equivalent to the decomposition level, n
        num_levels=len(coefficients)
        for j in range(num_levels, 0, -1):
            step_size=int(2**(j-1))
            last_index=step_size
            _, cD=coefficients[num_levels-j]
            for first in range(last_index): #0 to last_index - 1
                #Getting the indices that we will transform
                indices=np.arange(first, len(cD), step_size)
                #Select the even and the odd indices
                even_indices=indices[0::2]
                odd_indices=indices[1::2]
                #Perform the inverse dwt on the selected indices, making sure to use periodic boundary conditions
                x1=pywt.idwt(output[even_indices], cD[even_indices], wavelet, 'per')
                x2=pywt.idwt(output[odd_indices], cD[odd_indices], wavelet, 'per')
                #Perform a circular shift right
                x2=np.roll(x2, 1)
                #Average and insert into the correct indices
                output[indices]=(x1+x2)/2.
        return output

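
# Illustrative round-trip sketch (an assumption, not part of the class): pywt.swt
# decomposes a signal into stationary wavelet coefficients, exactly as wavelet_decomp
# does per filter, and iswt above inverts that decomposition. The helper name
# _example_swt_roundtrip and the synthetic signal are for demonstration only; swt
# requires the signal length to be a multiple of 2**level.
def _example_swt_roundtrip():
    """Decompose a signal with pywt.swt and reconstruct it with WaveletFeatures.iswt."""
    y=np.sin(np.linspace(0, 4*np.pi, 128)) #128 points, a multiple of 2**2
    wf=WaveletFeatures(wavelet='sym2', ngp=128, level=2)
    coeffs=pywt.swt(y, wf.wav, level=2) #Coefficient pairs, as returned by pywt
    yrec=wf.iswt(coeffs, wf.wav)
    #The reconstruction should match the input to numerical precision, given the
    #coefficient ordering this module assumes
    return np.allclose(y, yrec)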