Source code for txpipe.photoz_stack

from .base_stage import PipelineStage
from .data_types import PhotozPDFFile, TomographyCatalog, HDFFile, PNGFile, NOfZFile
from .utils.mpi_utils import in_place_reduce
import numpy as np
import warnings

class Stack:
    def __init__(self, name, z, nbin):
        """
        Create an n(z) stacker

        Parameters
        ----------
        name: str
            Name of the stack, for when we save it
        z: array
            redshift edges array
        nbin: int
            number of tomographic bins
        """
        self.name = name
        self.z = z
        self.nbin = nbin
        self.nz = z.size
        self.stack = np.zeros((nbin, z.size))
        self.counts = np.zeros(nbin)

    def add_pdfs(self, bins, pdfs):
        """
        Add a set of PDFs to the stack, per tomographic bin

        Parameters
        ----------
        bins: array[int]
            The tomographic bin for each object
        pdfs: 2d array[float]
            The p(z) per object
        """
        for b in range(self.nbin):
            w = np.where(bins==b)
            self.stack[b] += pdfs[w].sum(axis=0)
            self.counts[b] += w[0].size

    def add_delta_function(self, bins, z):
        """
        Add a set of objects to the stack whose redshift
        is known perfectly

        Parameters
        ----------
        bins: array[int]
            The tomographic bin for each object
        z: array[float]
            The redshift per object
        """
        # here self.z are the edges of the bins the nz will have. 
        stack_bin = np.digitize(z, self.z)
        for b in range(self.nbin):
            w = np.where(bins==b)
            stack_bin_b = stack_bin[w]
            for i in stack_bin_b:
                if 0 <= i < self.nz:
                    # digitize returns numbers between 1 and len(self.z)-1
                    # that is why we need to subtract 1 here, since the
                    # minimum value of i will be 1. 
                    self.stack[b][i-1] += 1
                    self.counts[b] += 1

    def save(self, outfile, comm=None):
        """
        Write this stack to a new group in the output file.
        Collect the stack from all processors if comm is provided

        Parameters
        ----------
        outfile: h5py.File
            Output file, already open

        comm: mpi4py communicator
            Optional, default=0
        """
        # stack the results from different comms
        if comm is not None:
            in_place_reduce(self.stack, comm)
            in_place_reduce(self.counts, comm)

            # only root saves output
            if comm.Get_rank() != 0:
                return

        # normalize each n(z)
        for b in range(self.nbin):
            if self.counts[b] > 0:
                self.stack[b] /= self.counts[b]

        # Create a group inside for the n_of_z data we made here.
        group = outfile.create_group(f"n_of_z/{self.name}")

        # HDF has "attributes" which are for small metadata like this
        group.attrs["nbin"] = self.nbin
        group.attrs["nz"] = len(self.z)

        # Save the redshift sampling. Adding half a bin here to save the mean z.
        # remove the last item when converting from edges to mean.
        group.create_dataset("z", data=self.z[:-1]+(self.z[2]-self.z[1])/2.)
        
        # And all the bins separately
        for b in range(self.nbin):
            group.attrs[f"count_{b}"] = self.counts[b]
            # remove the last item
            group.create_dataset(f"bin_{b}", data=self.stack[b][:-1])



[docs]class TXPhotozSourceStack(PipelineStage):
    """
    Naively stack photo-z PDFs in bins according to previous selections.

    This parent class does only the source bins.
    """
    name = "TXPhotozSourceStack"

    inputs = [
        ('source_photoz_pdfs', PhotozPDFFile),
        ('shear_tomography_catalog', TomographyCatalog),
    ]
    outputs = [
        ('shear_photoz_stack', NOfZFile),            
    ]
    config_options = {
        'chunk_rows': 5000,  # number of rows to read at once
    }

[docs]    def run(self):
        """
        Run the analysis for this stage.
        
         - Get metadata and allocate space for output
         - Set up iterators to loop through tomography and PDF input files
         - Accumulate the PDFs for each object in each bin
         - Divide by the counts to get the stacked PDF
        """

        # Create the stack objects
        outputs = self.prepare_outputs('source')
        warnings.warn("WEIGHTS/RESPONSE ARE NOT CURRENTLY INCLUDED CORRECTLY in PZ STACKING")

        # So we just do a single loop through the pair of files.
        for (s, e, data) in self.data_iterator():
            # Feed back on our progress
            print(f"Process {self.rank} read data chunk {s:,} - {e:,}")
            # Add data to the stacks
            self.stack_data('source', data, outputs)
        # Save the stacks
        self.write_outputs("shear_photoz_stack", outputs)

    def prepare_outputs(self, name):
        z, nbin_source = self.get_metadata()
        # For this class we do two stacks, and main one and a 2d one
        stack = Stack(name, z, nbin_source)
        stack2d = Stack(f'{name}2d', z, 1)
        return stack, stack2d


    def data_iterator(self):
        # This collects together matching inputs from the different
        # input files and returns an iterator to them which yields
        # start, end, data
        return self.combined_iterators(
                self.config['chunk_rows'],
                'source_photoz_pdfs', # tag of input file to iterate through
                'pdf', # data group within file to look at
                ['pdf'], # column(s) to read

                'shear_tomography_catalog', # tag of input file to iterate through
                'tomography', # data group within file to look at
                ['source_bin'], # column(s) to read
        )

    def stack_data(self, name, data, outputs):
        # add the data we have loaded into the stacks
        stack, stack2d = outputs
        stack.add_pdfs(data[f'{name}_bin'], data['pdf'])
        # -1 indicates no selection.  For the non-tomo 2d case
        # we just say anything that is >=0 is set to bin zero, like this
        bin2d = data[f'{name}_bin'].clip(-1, 0)
        stack2d.add_pdfs(bin2d, data['pdf'])


    def write_outputs(self, tag, outputs):
        stack, stack2d = outputs
        # only the root process opens the file.
        # The others don't use that so we have to
        # give them something in place
        # (i.e. inside the save method the non-root procs
        # will not reference the first arg)
        if self.rank == 0:
            f = self.open_output(tag)
            stack.save(f, self.comm)
            stack2d.save(f, self.comm)
            f.close()
        else:
            stack.save(None, self.comm)
            stack2d.save(None, self.comm)


[docs]    def get_metadata(self):
        """
        Load the z column and the number of bins

        Returns
        -------
        z: array
            Redshift column for photo-z PDFs
        nbin:
            Number of different redshift bins
            to split into.

        """
        # It's a bit odd but we will just get this from the file and 
        # then close it again, because we're going to use the 
        # built-in iterator method to get the rest of the data

        # open_input is a method defined on the superclass.
        # it knows about different file formats (pdf, fits, etc)
        with self.open_input('source_photoz_pdfs') as photoz_file:
            # This is the syntax for reading a complete HDF column
            z = photoz_file['pdf/zgrid'][:]

        # Save again but for the number of bins in the tomography catalog
        with self.open_input('shear_tomography_catalog') as tomo_file:
            nbin_source = tomo_file['tomography'].attrs['nbin_source']

        return z, nbin_source



[docs]class TXPhotozLensStack(TXPhotozSourceStack):
    """
    Naively stack photo-z PDFs in bins according to previous selections.

    This parent class does only the source bins.
    """
    name = "TXPhotozLensStack"
    inputs = [
        ('lens_photoz_pdfs', PhotozPDFFile),
        ('lens_tomography_catalog', TomographyCatalog),
    ]
    outputs = [
        ('lens_photoz_stack', NOfZFile),            
    ]
    config_options = {
        'chunk_rows': 5000,  # number of rows to read at once
    }

[docs]    def run(self):
        """
        Run the analysis for this stage.
        
         - Get metadata and allocate space for output
         - Set up iterators to loop through tomography and PDF input files
         - Accumulate the PDFs for each object in each bin
         - Divide by the counts to get the stacked PDF
        """

        # Create the stack objects
        outputs = self.prepare_outputs('lens')
        warnings.warn("WEIGHTS/RESPONSE ARE NOT CURRENTLY INCLUDED CORRECTLY in PZ STACKING")

        # So we just do a single loop through the pair of files.
        for (s, e, data) in self.data_iterator():
            # Feed back on our progress
            print(f"Process {self.rank} read data chunk {s:,} - {e:,}")
            # Add data to the stacks
            self.stack_data('lens', data, outputs)
        # Save the stacks
        self.write_outputs("lens_photoz_stack", outputs)



    def data_iterator(self):
        # This collects together matching inputs from the different
        # input files and returns an iterator to them which yields
        # start, end, data
        return self.combined_iterators(
                self.config['chunk_rows'],
                'lens_photoz_pdfs', # tag of input file to iterate through
                'pdf', # data group within file to look at
                ['pdf'], # column(s) to read

                'lens_tomography_catalog', # tag of input file to iterate through
                'tomography', # data group within file to look at
                ['lens_bin'], # column(s) to read
        )

[docs]    def get_metadata(self):
        """
        Load the z column and the number of bins

        Returns
        -------
        z: array
            Redshift column for photo-z PDFs
        nbin:
            Number of different redshift bins
            to split into.

        """
        # It's a bit odd but we will just get this from the file and
        # then close it again, because we're going to use the
        # built-in iterator method to get the rest of the data

        # open_input is a method defined on the superclass.
        # it knows about different file formats (pdf, fits, etc)
        with self.open_input('lens_photoz_pdfs') as photoz_file:
            # This is the syntax for reading a complete HDF column
            z = photoz_file['pdf/zgrid'][:]

        # Save again but for the number of bins in the tomography catalog
        with self.open_input('lens_tomography_catalog') as tomo_file:
            nbin_lens = tomo_file['tomography'].attrs['nbin_lens']

        return z, nbin_lens


[docs]class TXPhotozPlots(PipelineStage):
    """
    Make n(z) plots

    """
    name = "TXPhotozPlots"
    inputs = [
        ('shear_photoz_stack', NOfZFile),
        ('lens_photoz_stack', NOfZFile)
    ]
    outputs = [
        ('nz_lens', PNGFile),
        ('nz_source', PNGFile),
    ]

    config_options = {
    }


    def run(self):
        import matplotlib
        matplotlib.use('agg')
        import matplotlib.pyplot as plt
        f = self.open_input('lens_photoz_stack', wrapper=True)
        
        out1 = self.open_output('nz_lens', wrapper=True)
        f.plot('lens')
        plt.legend(frameon=False)
        plt.title("Lens n(z)")
        plt.xlim(xmin=0)
        out1.close()

        f = self.open_input('shear_photoz_stack', wrapper=True)
        out2 = self.open_output('nz_source', wrapper=True)
        f.plot('source')
        plt.legend(frameon=False)
        plt.title("Source n(z)")
        plt.xlim(xmin=0)
        out2.close()

[docs]class TXSourceTrueNumberDensity(TXPhotozSourceStack):
    """
    Fake an n(z) by histogramming the true redshift values for each object.
    Uses the same method as its parent but loads the data
    differently and uses the truth redshift as a delta function PDF
    """
    name = "TXSourceTrueNumberDensity"
    inputs = [
        ('shear_catalog', HDFFile),
        ('shear_tomography_catalog', TomographyCatalog),
    ]
    outputs = [
        ('shear_photoz_stack', NOfZFile),
    ]
    config_options = {
        'chunk_rows': 5000,  # number of rows to read at once
        'zmax': float,
        'nz': int,
    }


    def data_iterator(self):
        return self.combined_iterators(
                self.config['chunk_rows'],
                'shear_catalog', # tag of input file to iterate through
                'shear', # data group within file to look at
                ['redshift_true'], # column(s) to read

                'shear_tomography_catalog', # tag of input file to iterate through
                'tomography', # data group within file to look at
                ['source_bin'], # column(s) to read
            )

    def stack_data(self, name, data, outputs):
        stack, stack2d = outputs
        stack.add_delta_function(data[f'{name}_bin'], data['redshift_true'])
        bin2d = data[f'{name}_bin'].clip(-1, 0)
        stack2d.add_delta_function(bin2d, data['redshift_true'])


[docs]    def get_metadata(self):
        # Check we are running on a photo file with redshift_true
        with self.open_input("shear_catalog") as photo_file:
            has_z = 'redshift_true' in photo_file['shear'].keys()
            if not has_z:
                msg = ("The photometry_catalog file you supplied does not have a redshift_true column. "
                       "If you're running on sims you need to make sure to ingest that column from GCR. "
                       "If you're running on real data then sadly this isn't going to work. "
                       "Use a different stacking stage."
                    )
                raise ValueError(msg)

        zmax = self.config['zmax']
        nz = self.config['nz']
        z = np.linspace(0, zmax, nz)

        shear_tomo_file = self.open_input('shear_tomography_catalog')
        nbin_source = shear_tomo_file['tomography'].attrs['nbin_source']
        shear_tomo_file.close()

        return z, nbin_source


[docs]class TXLensTrueNumberDensity(TXPhotozLensStack, TXSourceTrueNumberDensity):
    name = "TXLensTrueNumberDensity"
    inputs = [
        ('photometry_catalog', HDFFile),
        ('lens_tomography_catalog', TomographyCatalog),
    ]
    outputs = [
        ('lens_photoz_stack', NOfZFile),
    ]
    config_options = {
        'chunk_rows': 5000,  # number of rows to read at once
        'zmax': float,
        'nz': int,
    }

    def data_iterator(self):
        # This collects together matching inputs from the different
        # input files and returns an iterator to them which yields
        # start, end, data
        return self.combined_iterators(
                self.config['chunk_rows'],
                'photometry_catalog', # tag of input file to iterate through
                'photometry', # data group within file to look at
                ['redshift_true'], # column(s) to read

                'lens_tomography_catalog', # tag of input file to iterate through
                'tomography', # data group within file to look at
                ['lens_bin'], # column(s) to read
        )

[docs]    def get_metadata(self):
        zmax = self.config['zmax']
        nz = self.config['nz']
        z = np.linspace(0, zmax, nz)
        # Save again but for the number of bins in the tomography catalog
        with self.open_input('lens_tomography_catalog') as tomo_file:
            nbin_lens = tomo_file['tomography'].attrs['nbin_lens']

        return z, nbin_lens