from .base_stage import PipelineStage
from .data_types import YamlFile, TomographyCatalog, HDFFile, TextFile
from .utils import LensNumberDensityStats
from .utils import Splitter
import numpy as np
import warnings
class TXBaseLensSelector(PipelineStage):
    """
    This pipeline stage selects objects to be used
    as the lens sample for the galaxy clustering and
    shear-position calibrations.

    This class is abstract: subclasses must define a ``data_iterator``
    method yielding ``(start, end, data)`` chunks containing the
    g/r/i magnitudes and a point redshift estimate under the key 'z'.
    """
    name = 'TXBaseLensSelector'

    outputs = [
        ('lens_tomography_catalog', TomographyCatalog),
    ]

    config_options = {
        'verbose': False,
        'chunk_rows': 10000,
        'lens_zbin_edges': [float],
        # Mag cuts
        # Default photometry cuts based on the BOSS Galaxy Target Selection:
        # http://www.sdss3.org/dr9/algorithms/boss_galaxy_ts.php
        'cperp_cut': 0.2,
        'r_cpar_cut': 13.5,
        'r_lo_cut': 16.0,
        'r_hi_cut': 19.6,
        'i_lo_cut': 17.5,
        'i_hi_cut': 19.9,
        'r_i_cut': 2.0,
        'random_seed': 42,
    }

    def run(self):
        """
        Run the analysis for this stage.

        - Collect the list of columns to read
        - Create iterators to read chunks of those columns
        - Loop through chunks:
            - select objects for each bin
            - write them out
            - accumulate selection bias values
        - Average the selection biases
        - Write out biases and close the output
        """
        # NOTE(review): these two imports appear unused in this class;
        # they are kept so that a missing dependency fails fast at the
        # start of the run rather than deep inside a subclass - confirm.
        import astropy.table
        import sklearn.ensemble

        if self.name == "TXBaseLensSelector":
            raise ValueError("Do not run TXBaseLensSelector - run a sub-class")

        # Suppress some warnings from numpy that are not relevant
        original_warning_settings = np.seterr(all='ignore')

        # The output file we will put the tomographic
        # information into
        output_file = self.setup_output()

        iterator = self.data_iterator()

        # We will collect the number counts for each bin.
        # These are accumulated chunk by chunk and combined
        # (across processes too) at the end.
        nbin_lens = len(self.config['lens_zbin_edges']) - 1
        number_density_stats = LensNumberDensityStats(nbin_lens, self.comm)

        # Loop through the input data, processing it chunk by chunk
        for (start, end, phot_data) in iterator:
            print(f"Process {self.rank} running selection for rows {start:,}-{end:,}")

            # Assign a redshift bin index to every object in the chunk
            pz_data = self.apply_redshift_cut(phot_data)

            # Select lens bin objects
            lens_gals = self.select_lens(phot_data)

            # Combine the redshift binning with the colour/magnitude
            # selection to produce a per-object tomographic bin index
            tomo_bin, counts = self.calculate_tomography(pz_data, phot_data, lens_gals)

            # Save the tomography for this chunk
            self.write_tomography(output_file, start, end, tomo_bin)

            # Accumulate information on the number counts.
            # These will be brought together at the end.
            number_density_stats.add_data(tomo_bin)

        # Collect the global number counts and write them out too.
        self.write_global_values(output_file, number_density_stats)

        # Save and complete
        output_file.close()

        # Restore the original warning settings in case we are being called from a library
        np.seterr(**original_warning_settings)

    def apply_redshift_cut(self, phot_data):
        """
        Assign each object to a tomographic redshift bin.

        Uses the 'z' column of phot_data and the configured
        lens_zbin_edges; objects outside every bin get index -1.

        Returns a dict with the single key 'zbin'.
        """
        pz_data = {}
        nbin = len(self.config['lens_zbin_edges']) - 1

        z = phot_data['z']

        # -1 marks "not in any bin"
        zbin = np.repeat(-1, len(z))
        for zi in range(nbin):
            # Bins are half-open: [edge_i, edge_{i+1})
            mask_zbin = (
                (z >= self.config['lens_zbin_edges'][zi])
                & (z < self.config['lens_zbin_edges'][zi + 1])
            )
            zbin[mask_zbin] = zi

        pz_data['zbin'] = zbin
        return pz_data

    def setup_output(self):
        """
        Set up the output data file.

        Creates the data sets and groups to put module output
        in the tomography_catalog output file.
        """
        # Use a context manager so the input file handle is closed
        # again after reading the catalog size (the previous version
        # left it open).
        with self.open_input('photometry_catalog') as f:
            n = f['photometry/ra'].size
        nbin_lens = len(self.config['lens_zbin_edges']) - 1

        outfile = self.open_output('lens_tomography_catalog', parallel=True)
        group = outfile.create_group('tomography')
        group.create_dataset('lens_bin', (n,), dtype='i')
        group.create_dataset('lens_weight', (n,), dtype='f')
        group.create_dataset('lens_counts', (nbin_lens,), dtype='i')
        group.create_dataset('lens_counts_2d', (1,), dtype='i')

        group.attrs['nbin_lens'] = nbin_lens
        group.attrs['lens_zbin_edges'] = self.config['lens_zbin_edges']

        return outfile

    def write_tomography(self, outfile, start, end, lens_bin):
        """
        Write out a chunk of tomography information.

        Parameters
        ----------
        outfile: h5py.File
        start: int
            The index into the output this chunk starts at
        end: int
            The index into the output this chunk ends at
        lens_bin: array of shape (nrow,)
            The lens bin index for each output object (-1 = unselected)
        """
        group = outfile['tomography']
        group['lens_bin'][start:end] = lens_bin
        # All selected objects currently get unit weight
        group['lens_weight'][start:end] = 1.0

    def write_global_values(self, outfile, number_density_stats):
        """
        Write out overall lens number counts.

        Parameters
        ----------
        outfile: h5py.File
        number_density_stats: LensNumberDensityStats
            Accumulated per-bin and 2D counts (collected across processes)
        """
        lens_counts, lens_counts_2d = number_density_stats.collect()

        # Only the root process writes; collect() gathers from all ranks
        if self.rank == 0:
            group = outfile['tomography']
            group['lens_counts'][:] = lens_counts
            group['lens_counts_2d'][:] = lens_counts_2d

    def select_lens(self, phot_data):
        """Photometry cuts based on the BOSS Galaxy Target Selection:
        http://www.sdss3.org/dr9/algorithms/boss_galaxy_ts.php
        """
        mag_i = phot_data['mag_i']
        mag_r = phot_data['mag_r']
        mag_g = phot_data['mag_g']

        # Mag cuts
        cperp_cut_val = self.config['cperp_cut']
        r_cpar_cut_val = self.config['r_cpar_cut']
        r_lo_cut_val = self.config['r_lo_cut']
        r_hi_cut_val = self.config['r_hi_cut']
        i_lo_cut_val = self.config['i_lo_cut']
        i_hi_cut_val = self.config['i_hi_cut']
        r_i_cut_val = self.config['r_i_cut']

        n = len(mag_i)

        # HDF does not support bools, so we prepare a binary array
        # where 1 marks a lens and 0 marks a non-lens.
        # (The previous comment stated this the wrong way round.)
        lens_gals = np.repeat(0, n)

        # BOSS colour combinations
        cpar = 0.7 * (mag_g - mag_r) + 1.2 * ((mag_r - mag_i) - 0.18)
        cperp = (mag_r - mag_i) - ((mag_g - mag_r) / 4.0) - 0.18

        # LOWZ
        cperp_cut = np.abs(cperp) < cperp_cut_val  # 0.2
        r_cpar_cut = mag_r < r_cpar_cut_val + cpar / 0.3
        r_lo_cut = mag_r > r_lo_cut_val  # 16.0
        r_hi_cut = mag_r < r_hi_cut_val  # 19.6
        lowz_cut = (cperp_cut) & (r_cpar_cut) & (r_lo_cut) & (r_hi_cut)

        # CMASS
        i_lo_cut = mag_i > i_lo_cut_val  # 17.5
        i_hi_cut = mag_i < i_hi_cut_val  # 19.9
        r_i_cut = (mag_r - mag_i) < r_i_cut_val  # 2.0
        # The dperp cut was disabled because it did not return any sources:
        # dperp = (mag_r - mag_i) - ((mag_g - mag_r) / 8.0)
        # dperp_cut = dperp > 0.55
        cmass_cut = (i_lo_cut) & (i_hi_cut) & (r_i_cut)

        # If a galaxy passes either the LOWZ or CMASS selection, flag it
        lens_mask = lowz_cut | cmass_cut
        lens_gals[lens_mask] = 1

        return lens_gals

    def calculate_tomography(self, pz_data, phot_data, lens_gals):
        """
        Combine the redshift binning and the lens selection flag
        into a single tomographic bin index per object.

        Returns
        -------
        tomo_bin: array of shape (nrow,)
            Bin index per object, or -1 for unselected objects
        counts: array of shape (nbin,)
            Number of selected objects in each bin for this chunk
        """
        nbin = len(self.config['lens_zbin_edges']) - 1
        n = len(phot_data['mag_i'])

        # The main output data - the tomographic
        # bin index for each object, or -1 for no bin.
        tomo_bin = np.repeat(-1, n)

        # We also keep count of total count of objects in each bin
        counts = np.zeros(nbin, dtype=int)

        for i in range(nbin):
            # In this bin AND flagged as a lens galaxy
            sel_00 = (pz_data['zbin'] == i) & (lens_gals == 1)
            tomo_bin[sel_00] = i
            counts[i] = sel_00.sum()

        return tomo_bin, counts
class TXTruthLensSelector(TXBaseLensSelector):
    """
    Lens selection driven by the true redshift column of the
    photometry catalog (for simulated data only).
    """
    name = "TXTruthLensSelector"

    inputs = [
        ('photometry_catalog', HDFFile),
    ]

    def data_iterator(self):
        """
        Yield (start, end, data) chunks of the photometry catalog,
        with the true redshift copied to the 'z' key that the base
        class selection expects.
        """
        # Plain string; the original used an f-string with no placeholders
        print("We are cheating and using the true redshift.")
        chunk_rows = self.config['chunk_rows']
        phot_cols = ['mag_i', 'mag_r', 'mag_g', 'redshift_true']
        # Input data. These are iterators - they lazily load chunks
        # of the data one by one later when we do the for loop.
        # This code can be run in parallel, and different processes will
        # each get different chunks of the data
        for s, e, data in self.iterate_hdf('photometry_catalog', 'photometry', phot_cols, chunk_rows):
            data['z'] = data['redshift_true']
            yield s, e, data
class TXMeanLensSelector(TXBaseLensSelector):
    """
    Lens selection using the photo-z PDF mean (z_mean) as the
    point redshift estimate.
    """
    name = "TXMeanLensSelector"

    inputs = [
        ('photometry_catalog', HDFFile),
        ('lens_photoz_pdfs', HDFFile),
    ]

    def data_iterator(self):
        """
        Yield (start, end, data) chunks combining photometry with the
        z_mean point estimate, exposed under the 'z' key.
        """
        rows_per_chunk = self.config['chunk_rows']

        # Two parallel iterators over matching row ranges of the
        # photometry catalog and the photo-z point estimates.
        photometry = self.iterate_hdf(
            'photometry_catalog', 'photometry',
            ['mag_i', 'mag_r', 'mag_g'], rows_per_chunk,
        )
        point_estimates = self.iterate_hdf(
            'lens_photoz_pdfs', 'point_estimates',
            ['z_mean'], rows_per_chunk,
        )

        for (start, end, chunk), (_, _, pz_chunk) in zip(photometry, point_estimates):
            chunk['z'] = pz_chunk['z_mean']
            yield start, end, chunk
class TXModeLensSelector(TXBaseLensSelector):
    """
    Lens selection using the photo-z PDF mode (z_mode) as the
    point redshift estimate.
    """
    name = "TXModeLensSelector"

    inputs = [
        ('photometry_catalog', HDFFile),
        ('lens_photoz_pdfs', HDFFile),
    ]

    def data_iterator(self):
        """
        Yield (start, end, data) chunks combining photometry with the
        z_mode point estimate, exposed under the 'z' key.
        """
        rows_per_chunk = self.config['chunk_rows']

        # Iterate the photometry and the photo-z point estimates in
        # lockstep so matching row ranges are paired up.
        photometry = self.iterate_hdf(
            'photometry_catalog', 'photometry',
            ['mag_i', 'mag_r', 'mag_g'], rows_per_chunk,
        )
        point_estimates = self.iterate_hdf(
            'lens_photoz_pdfs', 'point_estimates',
            ['z_mode'], rows_per_chunk,
        )

        for (start, end, chunk), (_, _, pz_chunk) in zip(photometry, point_estimates):
            chunk['z'] = pz_chunk['z_mode']
            yield start, end, chunk
class TXLensCatalogSplitter(PipelineStage):
    """
    Split a lens catalog file into a new file with separate bins.
    """
    name = 'TXLensCatalogSplitter'

    inputs = [
        ('lens_tomography_catalog', TomographyCatalog),
        ('photometry_catalog', HDFFile),
    ]

    outputs = [
        ("binned_lens_catalog", HDFFile),
    ]

    config_options = {
        "initial_size": 100_000,
        "chunk_rows": 100_000,
    }

    # Subclasses override these two attributes to split a different
    # input catalog (see TXExternalLensCatalogSplitter).
    lens_cat_tag = "photometry_catalog"
    lens_cat_sec = "photometry"

    def run(self):
        # Pull the per-bin object counts out of the tomography file
        with self.open_input("lens_tomography_catalog") as f:
            nbin = f["tomography"].attrs["nbin_lens"]
            counts = f["tomography/lens_counts"][:]
            count2d = f["tomography/lens_counts_2d"][:]

        cols = ["ra", "dec", "weight"]

        # Object we use to make the separate lens bins catalog
        cat_output = self.open_output("binned_lens_catalog", parallel=True)
        cat_group = cat_output.create_group("lens")
        cat_group.attrs['nbin'] = len(counts)
        cat_group.attrs['nbin_lens'] = len(counts)

        # One entry per tomographic bin, plus the combined 2D sample
        bins = dict(enumerate(counts))
        bins["all"] = count2d

        splitter = Splitter(cat_group, "bin", cols, bins)

        # Each process takes responsibility for a subset of the bins
        my_bins = list(self.split_tasks_by_rank(bins))
        if my_bins:
            bin_list = ", ".join(str(x) for x in my_bins)
            print(f"Process {self.rank} collating bins: [{bin_list}]")
        else:
            print(f"Note: Process {self.rank} will not do anything.")

        it = self.combined_iterators(
            self.config["chunk_rows"],
            # first file
            "lens_tomography_catalog",
            "tomography",
            ["lens_bin", "lens_weight"],
            # second file
            self.lens_cat_tag,
            self.lens_cat_sec,
            ["ra", "dec"],
            parallel=False,
        )

        for start, end, data in it:
            if self.rank == 0:
                print(f"Process 0 binning data in range {start:,} - {end:,}")
            data["weight"] = data["lens_weight"]

            for b in my_bins:
                # "all" collects every object that landed in any bin
                if b == "all":
                    hits = np.where(data["lens_bin"] >= 0)
                else:
                    hits = np.where(data["lens_bin"] == b)
                subset = {name: col[hits] for name, col in data.items()}
                splitter.write_bin(subset, b)

        splitter.finish(my_bins)
        cat_output.close()
class TXExternalLensCatalogSplitter(TXLensCatalogSplitter):
    """
    Split an external lens catalog into bins.
    """
    name = "TXExternalLensCatalogSplitter"

    inputs = [
        ('lens_tomography_catalog', TomographyCatalog),
        ('lens_catalog', HDFFile),
    ]

    # Read positions from the external lens catalog rather than
    # the photometry catalog used by the parent class.
    lens_cat_tag = "lens_catalog"
    lens_cat_sec = "lens"
# Allow this module to be executed directly as a pipeline script.
if __name__ == '__main__':
    PipelineStage.main()