Source code for snat_sim.utils.time_series

"""The ``time_series`` module extends ``pandas`` functionality
for manipulating time series data. It is intended support tasks particular to
dealing with atmospheric / weather data.

Usage Example
-------------

As an example, we create a ``pandas.Series`` object with missing data and
fill in the missing data using the ``periodic_interpolation`` method.

.. doctest:: python

   >>> import numpy as np
   >>> import pandas as pd

   >>> # Create a Series with missing data
   >>> demo_series = pd.Series(np.arange(10, 21))
   >>> demo_series.iloc[[0, -1]] = np.nan
   >>> print(demo_series)
   0      NaN
   1     11.0
   2     12.0
   3     13.0
   4     14.0
   5     15.0
   6     16.0
   7     17.0
   8     18.0
   9     19.0
   10     NaN
   dtype: float64


   >>> # Interpolate for the missing data using periodic boundary conditions
   >>> print(demo_series.tsu.periodic_interpolation())
   0     13.666667
   1     11.000000
   2     12.000000
   3     13.000000
   4     14.000000
   5     15.000000
   6     16.000000
   7     17.000000
   8     18.000000
   9     19.000000
   10    16.333333
   dtype: float64

For information on what other methods are incorporated under the ``tsu``
accessor attribut, see the ``TSUAccessor`` class.

Module Docs
-----------
"""

import datetime as dt
import warnings
from typing import *

import numpy as np
import pandas as pd
from astropy import units as u

from .. import constants as const
from .. import types


[docs]def datetime_to_sec_in_year(date: types.DateColl) -> types.NumpyLike:
    """Calculate number of seconds elapsed modulo 1 year.

    Accurate to within a microsecond.

    Args:
        date: Date(s) to calculate seconds for

    Returns:
        A single float if the input is a single datetime, or a numpy array if the input is a collection.
    """

    # Using ``atleast_1d`` with ``to_datetime`` guarantees a ``DatetimeIndex``
    # object is returned, otherwise we get a ``TimeStamp`` object for scalars
    # which has different attributes names than the ones we use below
    pandas_dates = pd.to_datetime(np.atleast_1d(date))

    # The ``values`` attributes returns a numpy array. Pandas objects
    # are not generically compatible with astropy units
    seconds = (
            (pandas_dates.dayofyear.values - 1) * u.day +
            pandas_dates.hour.values * u.hour +
            pandas_dates.second.values * u.s +
            pandas_dates.microsecond.values * u.ms
    ).to(u.s).value

    # If the argument was a scalar, return a scalar
    if np.ndim(date) == 0:
        seconds = seconds.item()

    return seconds


@np.vectorize
def datetime_to_season(date: types.DateColl) -> np.ndarray:
    """Determine the calendar season corresponding to a given datetime

    Seasons are labeled as 'winter', 'spring', 'summer', or 'fall'.
    All season names are defined as being in the southern hemisphere.

    Args:
        date: Datetime value(s)

    Returns:
        An array of strings
    """

    dummy_year = const.jun_solstice.year

    seasons = [
        ('summer', (dt.date(dummy_year, 1, 1), const.mar_equinox.date())),
        ('fall', (const.mar_equinox.date(), const.jun_solstice.date())),
        ('winter', (const.jun_solstice.date(), const.sep_equinox.date())),
        ('spring', (const.sep_equinox.date(), const.dec_solstice.date())),
        ('summer', (const.dec_solstice.date(), dt.date(dummy_year + 1, 1, 1)))
    ]

    date = date.date().replace(year=dummy_year)
    return cast(np.ndarray, next(season for season, (start, end) in seasons if start <= date < end))


[docs]@pd.api.extensions.register_series_accessor('tsu')
class TSUAccessor:
    """Pandas Series accessor for time series utilities"""

[docs]    def __init__(self, pandas_obj: pd.Series) -> None:
        """Extends ``pandas`` support for time series data

        DO NOT USE THIS CLASS DIRECTLY! This class is registered as a pandas accessor.
        See the module level usage example for more information.
        """

        self._obj = pandas_obj

[docs]    def supplemented_data(self, year: int, supp_years: Collection[int] = tuple()) -> pd.Series:
        """Return the supplemented subset of the series corresponding to a given year

        Data for the given year is supplemented with any available data from
        supplementary years by asserting that the measured values from
        supplementary years are exactly the same as they would be if taken during
        the primary year. Priority is given to supplementary years in the order
        specified by the ``supp_years`` argument.

        Args:
            year: Year to supplement data for
            supp_years: Years to supplement data with when missing from ``year``

        Returns:
            A pandas Series object
        """

        input_data = self._obj.dropna().sort_index()
        years = np.array([year, *supp_years])

        # Check for years with no available data
        missing_years = years[~np.isin(years, input_data.index.year)]
        if missing_years:
            raise ValueError(f'No data for years: {missing_years}')

        # Keep only data for the given years while maintaining priority order
        stacked_pwv = pd.concat(
            [input_data[input_data.index.year == yr] for yr in years]
        )

        # Make all dates have the same year and keep only unique dates
        stacked_pwv.index = [date_idx.replace(year=years[0]) for date_idx in stacked_pwv.index]
        return stacked_pwv[~stacked_pwv.index.duplicated(keep='first')]

[docs]    def periodic_interpolation(self) -> pd.Series:
        """Linearly interpolate the series using periodic boundary conditions

        Similar to the default linear interpolation used by pandas, but
        missing values at the beginning and end of the series are
        interpolated assuming a periodic boundary condition.

        Returns:
            An interpolated copy of the passed series
        """

        if self._obj.dtype is np.dtype('O'):
            warnings.warn('Interpolation may not work for object data types', RuntimeWarning)

        # Identify non-NAN values closest to the edges of the series
        series = self._obj.sort_index()
        delta = series.index[1] - series.index[0]
        start_idx, end_idx = series.iloc[[0, -1]].index
        first_not_nan, last_not_nan = series.dropna().iloc[[0, -1]]

        # Extend the series with temporary values so we can interpolate any missing values
        series.loc[start_idx - 2 * delta] = last_not_nan
        series.loc[start_idx - delta] = np.NAN
        series.loc[end_idx + delta] = np.NAN
        series.loc[end_idx + 2 * delta] = first_not_nan

        # Drop the temporary values
        return series.sort_index().interpolate().truncate(start_idx, end_idx)

[docs]    def resample_data_across_year(self) -> pd.Series:
        """Resample the series evenly from the beginning of the
        earliest year through the end of the latest year.

        Returns:
            A copy of the passed series interpolated for January first through December 31st
        """

        start_time = self._obj.index[0].replace(month=1, day=1, hour=0, minute=0, second=0)
        end_time = self._obj.index[-1].replace(month=12, day=31, hour=23, minute=59, second=59)
        delta = self._obj.index[1] - self._obj.index[0]

        # Modulo operation to determine any linear offset in the temporal sampling
        offset = self._obj.index[0] - start_time
        while offset >= delta:
            offset -= delta

        index_values = np.arange(start_time, end_time, delta).astype(pd.Timestamp) + offset
        new_index = pd.to_datetime(index_values).tz_localize(self._obj.index.tz)
        return self._obj.reindex(new_index)
Table Of Contents

Source code for snat_sim.utils.time_series