Source code for txpipe.source_selector

from .base_stage import PipelineStage
from .data_types import ShearCatalog, YamlFile, PhotozPDFFile, TomographyCatalog, HDFFile, TextFile
from .utils import SourceNumberDensityStats
from .utils.calibration_tools import read_shear_catalog_type, apply_metacal_response
from .utils.calibration_tools import metacal_variants, band_variants, MetacalCalculator, LensfitCalculator, HSCCalculator
import numpy as np
import warnings

class TXSourceSelector(PipelineStage):
    """
    This pipeline stage selects objects to be used as the source sample for
    the shear-shear and shear-position calibrations.

    It applies some general cuts based on the flags that metacal gives for
    the objects, and size and S/N cuts based on the configuration file.

    It also splits those objects into tomographic bins according to the
    choice the user makes in the input file, from the information in the
    photo-z PDF file.

    Once these selections are made it constructs the quantities needed to
    calibrate each bin - this consists of two shear response quantities.

    TODO: add option to use lensfit catalogs, which would be much much simpler.
    """
    name = 'TXSourceSelector'

    inputs = [
        ('shear_catalog', ShearCatalog),
        ('calibration_table', TextFile),
    ]

    outputs = [
        ('shear_tomography_catalog', TomographyCatalog),
    ]

    config_options = {
        'input_pz': False,
        'true_z': False,
        'bands': 'riz',  # bands from metacal to use
        'verbose': False,
        'T_cut': float,
        's2n_cut': float,
        'delta_gamma': float,
        'chunk_rows': 10000,
        'source_zbin_edges': [float],
        'random_seed': 42,
        'shear_prefix': 'mcal_',
    }

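    # For reference, a minimal sketch of how this stage might be configured
    # in a pipeline YAML file.  The option names mirror config_options above;
    # the cut values and bin edges are purely illustrative, not recommended
    # defaults:
    #
    #     TXSourceSelector:
    #         input_pz: false
    #         true_z: false
    #         bands: riz
    #         T_cut: 0.5
    #         s2n_cut: 10.0
    #         delta_gamma: 0.02
    #         chunk_rows: 10000
    #         source_zbin_edges: [0.3, 0.6, 0.9, 1.2]
    #         shear_prefix: mcal_
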
    def run(self):
        """
        Run the analysis for this stage.

        - Collect the list of columns to read
        - Create iterators to read chunks of those columns
        - Loop through chunks:
            - select objects for each bin
            - write them out
            - accumulate selection bias values
        - Average the selection biases
        - Write out biases and close the output
        """
        import astropy.table
        import sklearn.ensemble

        # Suppress some warnings from numpy that are not relevant
        original_warning_settings = np.seterr(all='ignore')

        # Are we using a metacal or lensfit catalog?
        shear_catalog_type = read_shear_catalog_type(self)

        # The output file we will put the tomographic
        # information into
        output_file = self.setup_output()

        # various config options
        bands = self.config['bands']
        chunk_rows = self.config['chunk_rows']
        delta_gamma = self.config['delta_gamma']
        shear_prefix = self.config['shear_prefix']

        # Columns we need from the shear catalog; will need to modify for lensfit catalogs
        shear_cols = [f'{shear_prefix}flags', f'{shear_prefix}psf_T_mean', 'weight']
        shear_cols += band_variants(bands,
                                    f'{shear_prefix}mag',
                                    f'{shear_prefix}mag_err',
                                    shear_catalog_type=shear_catalog_type)

        if shear_catalog_type == 'metacal':
            shear_cols += metacal_variants('mcal_T', 'mcal_s2n', 'mcal_g1', 'mcal_g2')
        elif shear_catalog_type == 'lensfit':
            shear_cols += ['T', 's2n', 'g1', 'g2', 'weight', 'm']
        elif shear_catalog_type == 'hsc':
            shear_cols += ['T', 's2n', 'g1', 'g2', 'weight', 'm', 'c1', 'c2', 'sigma_e']

        if self.config['input_pz'] and self.config['shear_catalog_type'] == 'metacal':
            shear_cols += ['mean_z']
            shear_cols += ['mean_z_1p']
            shear_cols += ['mean_z_1m']
            shear_cols += ['mean_z_2p']
            shear_cols += ['mean_z_2m']
        elif self.config['input_pz'] and self.config['shear_catalog_type'] != 'metacal':
            shear_cols += ['mean_z']
        elif self.config['true_z']:
            shear_cols += ['redshift_true']
        else:
            # Build a classifier used to put objects into tomographic bins
            classifier, features = self.build_tomographic_classifier()

        # this bit is for metacal if we want to use it later

        # Input data.  These are iterators - they lazily load chunks
        # of the data one by one later when we do the for loop.
        # This code can be run in parallel, and different processes will
        # each get different chunks of the data
        iter_shear = self.iterate_hdf('shear_catalog', 'shear', shear_cols, chunk_rows)

        # We will collect the selection biases for each bin
        # as a matrix.  We will collect together the different
        # matrices for each chunk and do a weighted average at the end.
        nbin_source = len(self.config['source_zbin_edges']) - 1

        selection_biases = []
        number_density_stats = SourceNumberDensityStats(nbin_source,
                                                        comm=self.comm,
                                                        shear_type=self.config['shear_catalog_type'])

        if shear_catalog_type == 'metacal':
            calibrators = [MetacalCalculator(self.select, delta_gamma) for i in range(nbin_source)]
            # 2d calibrator
            calibrators.append(MetacalCalculator(self.select_2d, delta_gamma))
        elif shear_catalog_type == 'lensfit':
            calibrators = [LensfitCalculator(self.select, self.config['input_m_is_weighted'])
                           for i in range(nbin_source)]
            calibrators.append(LensfitCalculator(self.select_2d, self.config['input_m_is_weighted']))
        elif shear_catalog_type == 'hsc':
            calibrators = [HSCCalculator(self.select) for i in range(nbin_source)]
            calibrators.append(HSCCalculator(self.select_2d))
        else:
            raise ValueError("Unknown shear catalog type. "
                             "Please specify one of 'metacal', 'lensfit', or 'hsc'.")

        # Loop through the input data, processing it chunk by chunk
        for (start, end, shear_data) in iter_shear:
            print(f"Process {self.rank} running selection for rows {start:,}-{end:,}")

            if self.config['true_z'] or self.config['input_pz']:
                pz_data = self.apply_simple_redshift_cut(shear_data)
            else:
                # Select most likely tomographic source bin
                pz_data = self.apply_classifier(classifier, features, shear_data)

            # Combine this selection with size and snr cuts to produce a source selection
            # and calculate the shear bias it would generate
            tomo_bin, R, counts = self.calculate_tomography(pz_data, shear_data, calibrators)

            # Save the tomography for this chunk
            self.write_tomography(output_file, start, end, tomo_bin, R)

            # Accumulate information on the number counts and the selection biases.
            # These will be brought together at the end.
            number_density_stats.add_data(shear_data, tomo_bin)  # check this

        # Do the selection bias averaging and output that too.
        self.write_global_values(output_file, calibrators, number_density_stats)

        # Save and complete
        output_file.close()

        # Restore the original warning settings in case we are being called from a library
        np.seterr(**original_warning_settings)

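    # For orientation, the column-name helpers imported above expand base
    # names into the five metacalibration variants.  A sketch of the expected
    # expansion - an assumption about those helpers, not a quote of them:
    #
    #     metacal_variants('mcal_g1')
    #     # -> ['mcal_g1', 'mcal_g1_1p', 'mcal_g1_1m', 'mcal_g1_2p', 'mcal_g1_2m']
    #
    #     band_variants('ri', 'mcal_mag', shear_catalog_type='metacal')
    #     # -> ['mcal_mag_r', 'mcal_mag_r_1p', ..., 'mcal_mag_i_2m']
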
    def build_tomographic_classifier(self):
        # Load the training data and build the random forest classifier from it
        from astropy.table import Table
        from sklearn.ensemble import RandomForestClassifier

        if self.rank > 0:
            classifier = self.comm.bcast(None)
            features = self.comm.bcast(None)
            return classifier, features

        # Load the training data
        training_file = self.get_input("calibration_table")
        training_data_table = Table.read(training_file, format='ascii')

        # Pull out the appropriate columns and combinations of the data
        bands = self.config['bands']
        print(f"Using these bands to train the tomography selector: {bands}")

        # Generate the training data that we will use.
        # We record both the name of the column and the data itself
        features = []
        training_data = []
        for b1 in bands:
            # First we use the magnitudes themselves
            features.append(b1)
            training_data.append(training_data_table[b1])
            # We also use the colours as training data, even the redundant ones
            for b2 in bands:
                if b1 < b2:
                    features.append(f'{b1}-{b2}')
                    training_data.append(training_data_table[b1] - training_data_table[b2])
        training_data = np.array(training_data).T

        print("Training data for bin classifier has shape ", training_data.shape)

        # Now put the training data into redshift bins.
        # We use -1 to indicate that we are outside the desired ranges
        z = training_data_table['sz']
        training_bin = np.repeat(-1, len(z))
        print("Using these bin edges:", self.config['source_zbin_edges'])
        for i, zmin in enumerate(self.config['source_zbin_edges'][:-1]):
            zmax = self.config['source_zbin_edges'][i + 1]
            training_bin[(z > zmin) & (z < zmax)] = i
            ntrain_bin = ((z > zmin) & (z < zmax)).sum()
            print(f"Training set: {ntrain_bin} objects in tomographic bin {i}")

        # Can be replaced with any classifier
        classifier = RandomForestClassifier(max_depth=10,
                                            max_features=None,
                                            n_estimators=20,
                                            random_state=self.config['random_seed'])
        classifier.fit(training_data, training_bin)

        # Sklearn fitters can be pickled, which means they can also be sent
        # through mpi4py
        if self.is_mpi():
            self.comm.bcast(classifier)
            self.comm.bcast(features)

        return classifier, features

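    # As a worked example of the feature construction above: for bands = 'riz'
    # the loop yields
    #
    #     features = ['r', 'r-z', 'i', 'i-r', 'i-z', 'z']
    #
    # i.e. each magnitude plus each colour whose band letters are in
    # alphabetical order (the b1 < b2 test compares the letters, which is why
    # 'i-r' appears rather than 'r-i').
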
    def apply_classifier(self, classifier, features, shear_data):
        """Apply the classifier to the measured magnitudes
        """
        bands = self.config['bands']
        shear_prefix = self.config['shear_prefix']

        if self.config['shear_catalog_type'] == 'metacal':
            variants = ['', '_1p', '_2p', '_1m', '_2m']
        else:
            variants = ['']

        pz_data = {}
        for v in variants:
            # Pull out the columns that we have trained this bin selection
            # model on.
            data = []
            for f in features:
                # may be a single band
                if len(f) == 1:
                    col = shear_data[f'{shear_prefix}mag_{f}{v}']
                # or a colour
                else:
                    b1, b2 = f.split('-')
                    col = (shear_data[f'{shear_prefix}mag_{b1}{v}']
                           - shear_data[f'{shear_prefix}mag_{b2}{v}'])
                if np.all(~np.isfinite(col)):
                    # entire column is NaN.  Hopefully this will get deselected elsewhere
                    col[:] = 30.0
                else:
                    ok = np.isfinite(col)
                    col[~ok] = col[ok].max()
                data.append(col)
            data = np.array(data).T

            # Run the random forest on this data chunk
            pz_data[f'zbin{v}'] = classifier.predict(data)

        return pz_data

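    # In the metacal case the dictionary returned above holds one bin
    # assignment per sheared variant, e.g.
    #
    #     {'zbin': ..., 'zbin_1p': ..., 'zbin_2p': ..., 'zbin_1m': ..., 'zbin_2m': ...}
    #
    # so that the calibrators can re-evaluate the selection on each variant
    # when computing the selection response.
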
    def apply_simple_redshift_cut(self, shear_data):
        pz_data = {}

        if self.config['input_pz'] and self.config['shear_catalog_type'] == 'metacal':
            # this bit is for metacal, if we need it later
            variants = ['', '_1p', '_2p', '_1m', '_2m']
            for v in variants:
                zz = shear_data[f'mean_z{v}']
                pz_data_v = np.zeros(len(zz), dtype=int) - 1
                for zi in range(len(self.config['source_zbin_edges']) - 1):
                    mask_zbin = ((zz >= self.config['source_zbin_edges'][zi])
                                 & (zz < self.config['source_zbin_edges'][zi + 1]))
                    pz_data_v[mask_zbin] = zi
                pz_data[f'zbin{v}'] = pz_data_v
        else:
            if self.config['input_pz']:
                zz = shear_data['mean_z']
            else:
                zz = shear_data['redshift_true']
            pz_data_bin = np.zeros(len(zz), dtype=int) - 1
            for zi in range(len(self.config['source_zbin_edges']) - 1):
                mask_zbin = ((zz >= self.config['source_zbin_edges'][zi])
                             & (zz < self.config['source_zbin_edges'][zi + 1]))
                pz_data_bin[mask_zbin] = zi
            pz_data['zbin'] = pz_data_bin

        return pz_data

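    # The binning loops above are equivalent to a single digitize call, which
    # may be a clearer mental model (a sketch with made-up edges, not a
    # drop-in replacement):
    #
    #     edges = np.array([0.3, 0.6, 0.9, 1.2])
    #     zz = np.array([0.1, 0.4, 0.95, 2.0])
    #     zbin = np.digitize(zz, edges) - 1      # -> [-1, 0, 2, 3]
    #     zbin[zbin == len(edges) - 1] = -1      # objects above the top edge
    #     # zbin is now [-1, 0, 2, -1], matching the loop output
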
    def calculate_tomography(self, pz_data, shear_data, calibrators):
        """
        Select objects to go in each tomographic bin and their calibration.

        Parameters
        ----------
        pz_data: table or dict of arrays
            A chunk of input photo-z data containing mean values for each object

        shear_data: table or dict of arrays
            A chunk of input shear data with metacalibration variants.

        calibrators: list
            One calibrator per tomographic bin, plus a final one for the
            2D (non-tomographic) sample, which accumulate selection biases.
        """
        delta_gamma = self.config['delta_gamma']
        nbin = len(self.config['source_zbin_edges']) - 1
        shear_prefix = self.config['shear_prefix']
        n = len(shear_data[f'{shear_prefix}g1'])

        # The main output data - the tomographic
        # bin index for each object, or -1 for no bin.
        tomo_bin = np.repeat(-1, n)
        if self.config['shear_catalog_type'] == 'metacal':
            R = np.zeros((n, 2, 2))
        else:
            R = np.zeros((n,))

        # We also keep count of the total number of objects in each bin
        counts = np.zeros(nbin + 1, dtype=int)

        data = {**pz_data, **shear_data}

        # TODO: Emily - do we want to call the calibration tools for this?
        if self.config['shear_catalog_type'] == 'metacal':
            R[:, 0, 0] = (data['mcal_g1_1p'] - data['mcal_g1_1m']) / delta_gamma
            R[:, 0, 1] = (data['mcal_g1_2p'] - data['mcal_g1_2m']) / delta_gamma
            R[:, 1, 0] = (data['mcal_g2_1p'] - data['mcal_g2_1m']) / delta_gamma
            R[:, 1, 1] = (data['mcal_g2_2p'] - data['mcal_g2_2m']) / delta_gamma
        elif self.config['shear_catalog_type'] == 'lensfit':
            R = 1.0
        else:
            w_tot = np.sum(data['weight'])
            R[:] = 1.0 - np.sum(data['weight'] * data['sigma_e']) / w_tot

        for i in range(nbin):
            sel_00 = calibrators[i].add_data(data, i)
            tomo_bin[sel_00] = i
            nsum = sel_00.sum()
            counts[i] = nsum
            # also count up the 2D sample
            counts[-1] += nsum

        # and calibrate the 2D sample.
        # This calibrator refers to self.select_2d
        calibrators[-1].add_data(data)

        return tomo_bin, R, counts

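    # The response matrix R filled in above is the standard metacalibration
    # finite difference: each element is estimated from shears remeasured on
    # artificially sheared images,
    #
    #     R_ij = (g_i^(j+) - g_i^(j-)) / delta_gamma
    #
    # where delta_gamma is the full difference between the two applied shears
    # (e.g. 0.02 for applied shears of +/-0.01).  With hypothetical values
    # mcal_g1_1p = 0.031 and mcal_g1_1m = 0.011, that gives
    # R_00 = (0.031 - 0.011) / 0.02 = 1.0.
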
    def setup_output(self):
        """
        Set up the output data file.

        Creates the data sets and groups to put module output
        in the shear_tomography_catalog output file.
        """
        n = self.open_input('shear_catalog')['shear/ra'].size
        zbins = self.config['source_zbin_edges']
        nbin_source = len(zbins) - 1

        outfile = self.open_output('shear_tomography_catalog', parallel=True)

        group = outfile.create_group('tomography')
        group.create_dataset('source_bin', (n,), dtype='i')
        group.create_dataset('source_counts', (nbin_source,), dtype='i')
        group.create_dataset('source_counts_2d', (1,), dtype='i')
        group.create_dataset('sigma_e', (nbin_source,), dtype='f')
        group.create_dataset('sigma_e_2d', (1,), dtype='f')
        group.create_dataset('mean_e1', (nbin_source,), dtype='f')
        group.create_dataset('mean_e2', (nbin_source,), dtype='f')
        group.create_dataset('mean_e1_2d', (1,), dtype='f')
        group.create_dataset('mean_e2_2d', (1,), dtype='f')
        group.create_dataset('N_eff', (nbin_source,), dtype='f')
        group.create_dataset('N_eff_2d', (1,), dtype='f')

        group.attrs['nbin_source'] = nbin_source
        group.attrs['catalog_type'] = self.config["shear_catalog_type"]
        for i in range(nbin_source):
            group.attrs[f'source_zmin_{i}'] = zbins[i]
            group.attrs[f'source_zmax_{i}'] = zbins[i + 1]

        # group = outfile.create_group('multiplicative_bias')  # why is this called "multiplicative_bias"?
        if self.config['shear_catalog_type'] == 'metacal':
            group = outfile.create_group('metacal_response')
            group.create_dataset('R_gamma', (n, 2, 2), dtype='f')
            group.create_dataset('R_S', (nbin_source, 2, 2), dtype='f')
            group.create_dataset('R_gamma_mean', (nbin_source, 2, 2), dtype='f')
            group.create_dataset('R_total', (nbin_source, 2, 2), dtype='f')
            group.create_dataset('R_S_2d', (2, 2), dtype='f')
            group.create_dataset('R_gamma_mean_2d', (2, 2), dtype='f')
            group.create_dataset('R_total_2d', (2, 2), dtype='f')
        elif self.config['shear_catalog_type'] == 'lensfit':
            group = outfile.create_group('response')
            group.create_dataset('K', (nbin_source,), dtype='f')
            group.create_dataset('C', (nbin_source, 2), dtype='f')
            group.create_dataset('K_2d', (1,), dtype='f')
            group.create_dataset('C_2d', (2,), dtype='f')
        else:
            group = outfile.create_group('response')
            group.create_dataset('R', (n,), dtype='f')
            group.create_dataset('K', (nbin_source,), dtype='f')
            group.create_dataset('C', (nbin_source, 2), dtype='f')
            group.create_dataset('R_mean', (nbin_source,), dtype='f')
            group.create_dataset('K_2d', (1,), dtype='f')
            group.create_dataset('C_2d', (2,), dtype='f')
            group.create_dataset('R_mean_2d', (1,), dtype='f')

        return outfile

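    # For reference, the HDF5 layout created above in the metacal case
    # (n = number of catalog rows, nbin = number of source bins):
    #
    #     tomography/
    #         source_bin (n,), source_counts (nbin,), source_counts_2d (1,)
    #         mean_e1, mean_e2, sigma_e, N_eff (nbin,) and their *_2d scalars
    #     metacal_response/
    #         R_gamma (n, 2, 2)
    #         R_S, R_gamma_mean, R_total (nbin, 2, 2)
    #         R_S_2d, R_gamma_mean_2d, R_total_2d (2, 2)
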
    def write_tomography(self, outfile, start, end, source_bin, R):
        """
        Write out a chunk of tomography and response.

        Parameters
        ----------
        outfile: h5py.File

        start: int
            The index into the output this chunk starts at

        end: int
            The index into the output this chunk ends at

        source_bin: array of shape (nrow,)
            The bin index for each output object

        R: array of shape (nrow,2,2)
            Multiplicative bias calibration factor for each object
            (a scalar response per object in the HSC case)
        """
        group = outfile['tomography']
        group['source_bin'][start:end] = source_bin
        if self.config['shear_catalog_type'] == 'metacal':
            group = outfile['metacal_response']
            group['R_gamma'][start:end, :, :] = R
        elif self.config['shear_catalog_type'] == 'hsc':
            group = outfile['response']
            group['R'][start:end] = R

    def write_global_values(self, outfile, calibrators, number_density_stats):
        """
        Write out overall selection biases

        Parameters
        ----------
        outfile: h5py.File

        calibrators: list
            The per-bin calibrators, with the 2D (non-tomographic) one last

        number_density_stats: SourceNumberDensityStats
            Accumulated shear means and variances per bin
        """
        nbin_source = len(calibrators) - 1

        R = np.zeros((nbin_source, 2, 2))
        S = np.zeros((nbin_source, 2, 2))
        K = np.zeros(nbin_source)
        C = np.zeros((nbin_source, 2))
        N = np.zeros(nbin_source)
        R_scalar = np.zeros(nbin_source)
        mean_e1 = np.zeros(nbin_source)
        mean_e2 = np.zeros(nbin_source)
        sigma_e = np.zeros(nbin_source)

        means, variances, means_2d, variances_2d = number_density_stats.collect()

        # Loop through the tomographic calibrators.
        # (The last calibrator is for the non-tomographic selection)
        for i in range(nbin_source):
            cal = calibrators[i]
            mu1 = np.array([means[i, 0]])
            mu2 = np.array([means[i, 1]])

            # We now have to calibrate both the mean shear and the
            # sigma_e estimator
            if self.config['shear_catalog_type'] == 'metacal':
                # Collect the total calibration factor
                R[i], S[i], N[i] = cal.collect(self.comm)

                # Apply it to the means
                mean_e1[i], mean_e2[i] = apply_metacal_response(R[i], S[i], g1=mu1, g2=mu2)

                # Inverse of the square of the response, taking the
                # diagonal because we don't have the covariance
                # and it should be very small
                P = np.diag(np.linalg.inv(R[i] @ R[i]))

                # Apply to the variances to get sigma_e
                sigma_e[i] = np.sqrt(0.5 * P @ variances[i])

            elif self.config['shear_catalog_type'] == 'lensfit':
                # Collect the overall calibration
                K[i], C[i], N[i] = cal.collect(self.comm)
                mean_e1[i] = C[i][0]
                mean_e2[i] = C[i][1]

                # This also needs checking.
                sigma_e[i] = np.sqrt(0.5 * (variances[i, 0] + variances[i, 1])) / (1 + K[i])

            elif self.config['shear_catalog_type'] == 'hsc':
                # As in the 2D branch below, HSCCalculator.collect is taken
                # to also return the mean scalar response; without unpacking
                # it here the R_mean output written at the end would be left
                # as zeros.
                R_scalar[i], K[i], C[i], N[i] = cal.collect(self.comm)
                mean_e1[i] = C[i][0]
                mean_e2[i] = C[i][1]

                # This also needs checking.
                sigma_e[i] = np.sqrt(0.5 * (variances[i, 0] + variances[i, 1])) / (1 + K[i])
            else:
                raise ValueError("Unknown calibration type in mean g / sigma_e calc")

        # The non-tomographic parts
        cal2d = calibrators[-1]
        mu1 = np.array([means_2d[0]])
        mu2 = np.array([means_2d[1]])

        # Non-tomo metacal
        if self.config['shear_catalog_type'] == 'metacal':
            R_2d, S_2d, N_2d = cal2d.collect(self.comm)
            mean_e1_2d, mean_e2_2d = apply_metacal_response(R_2d, S_2d, g1=mu1, g2=mu2)

            # non-tomo sigma_e in metacal
            P = np.diag(np.linalg.inv(R_2d @ R_2d))
            sigma_e_2d = np.sqrt(0.5 * P @ variances_2d)

        # Non-tomo lensfit
        elif self.config['shear_catalog_type'] == 'lensfit':
            K_2d, C_2d, N_2d = cal2d.collect(self.comm)
            # should probably use one of the calibration_tools functions
            mean_e1_2d = C_2d[0]
            mean_e2_2d = C_2d[1]

            # non-tomo sigma_e in lensfit
            sigma_e_2d = np.sqrt(0.5 * (variances_2d[0] + variances_2d[1])) / (1 + K_2d)

        # Non-tomo HSC
        elif self.config['shear_catalog_type'] == 'hsc':
            print("(also check in the 2D bit!)")
            R_scalar_2d, K_2d, C_2d, N_2d = cal2d.collect(self.comm)
            # should probably use one of the calibration_tools functions
            mean_e1_2d = C_2d[0][0]
            mean_e2_2d = C_2d[0][1]

            # non-tomo sigma_e in hsc
            sigma_e_2d = np.sqrt(0.5 * (variances_2d[0] + variances_2d[1])) / (1 + K_2d[0])

        if self.rank == 0:
            if self.config['shear_catalog_type'] == 'metacal':
                group = outfile['metacal_response']
                # Tomographic outputs
                group['R_S'][:, :, :] = S
                group['R_gamma_mean'][:, :, :] = R
                group['R_total'][:, :, :] = R + S
                # Non-tomographic outputs
                group['R_S_2d'][:, :] = S_2d
                group['R_gamma_mean_2d'][:, :] = R_2d
                group['R_total_2d'][:, :] = R_2d + S_2d
            elif self.config['shear_catalog_type'] == 'lensfit':
                group = outfile['response']
                # Tomographic outputs
                group['C'][:] = C
                group['K'][:] = K
                # Non-tomographic outputs
                group['C_2d'][:] = C_2d
                group['K_2d'][:] = K_2d
            else:
                group = outfile['response']
                # Tomographic outputs
                group['R_mean'][:] = R_scalar
                group['C'][:] = C
                group['K'][:] = K
                # Non-tomographic outputs
                group['R_mean_2d'][:] = R_scalar_2d
                group['C_2d'][:] = C_2d
                group['K_2d'][:] = K_2d

            # These are the same in the two methods
            group = outfile['tomography']
            group['source_counts'][:] = N
            group['N_eff'][:] = N
            group['mean_e1'][:] = mean_e1
            group['mean_e2'][:] = mean_e2
            group['sigma_e'][:] = sigma_e

            # and the non-tomographic versions of the same things
            group['source_counts_2d'][:] = N_2d
            group['N_eff_2d'][:] = N_2d
            group['mean_e1_2d'][:] = mean_e1_2d
            group['mean_e2_2d'][:] = mean_e2_2d
            group['sigma_e_2d'][:] = sigma_e_2d

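    # A worked sketch of the metacal sigma_e calculation above: with
    # P = diag((R R)^-1) the calibrated dispersion is
    #
    #     sigma_e = sqrt(0.5 * (P[0] * var_e1 + P[1] * var_e2))
    #
    # For a diagonal response R = 0.7 * I and var_e1 = var_e2 = 0.07 (made-up
    # numbers), P = [1/0.49, 1/0.49] and sigma_e = sqrt(0.07 / 0.49) ~= 0.378.
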
    def select(self, data, bin_index):
        zbin = data['zbin']
        verbose = self.config['verbose']

        sel = self.select_2d(data, calling_from_select=True)
        sel &= zbin == bin_index
        f4 = sel.sum() / sel.size

        if verbose:
            print(f"{f4:.2%} z for bin {bin_index}")
            print("total tomo", sel.sum())

        return sel

    def select_2d(self, data, calling_from_select=False):
        # Select any objects that pass general WL cuts.
        # The calling_from_select option just specifies whether we
        # are calling this function from within the select
        # method above, because the useful printed verbose
        # output is different in each case
        shear_prefix = self.config['shear_prefix']
        s2n_cut = self.config['s2n_cut']
        T_cut = self.config['T_cut']
        verbose = self.config['verbose']
        variant = data.suffix

        s2n = data[f'{shear_prefix}s2n']
        T = data[f'{shear_prefix}T']
        Tpsf = data[f'{shear_prefix}psf_T_mean']
        flag = data[f'{shear_prefix}flags']

        n0 = len(flag)
        sel = flag == 0
        f1 = sel.sum() / n0
        sel &= (T / Tpsf) > T_cut
        f2 = sel.sum() / n0
        sel &= s2n > s2n_cut
        f3 = sel.sum() / n0
        sel &= data['zbin'] >= 0
        f4 = sel.sum() / n0

        # Print out a message.  If we are selecting a 2D sample
        # this is the complete message.  Otherwise, if we are also about
        # to apply a redshift bin cut, the message is continued in
        # select, above.
        if verbose and calling_from_select:
            print(f"Tomo selection ({variant}) {f1:.2%} flag, {f2:.2%} size, "
                  f"{f3:.2%} SNR, ", end="")
        elif verbose:
            print(f"2D selection ({variant}) {f1:.2%} flag, {f2:.2%} size, "
                  f"{f3:.2%} SNR, {f4:.2%} any z bin")
            print("total 2D", sel.sum())

        return sel

def flatten_list(lst):
    return [item for sublist in lst for item in sublist]


if __name__ == '__main__':
    PipelineStage.main()