"""The ``time_series`` module extends ``pandas`` functionality
for manipulating time series data. It is intended support tasks particular to
dealing with atmospheric / weather data.
Usage Example
-------------
As an example, we create a ``pandas.Series`` object with missing data and
fill in the missing data using the ``periodic_interpolation`` method.
.. doctest:: python

    >>> import numpy as np
    >>> import pandas as pd

    >>> # Create a Series with missing data
    >>> demo_series = pd.Series(np.arange(10, 21))
    >>> demo_series.iloc[[0, -1]] = np.nan
    >>> print(demo_series)
    0      NaN
    1     11.0
    2     12.0
    3     13.0
    4     14.0
    5     15.0
    6     16.0
    7     17.0
    8     18.0
    9     19.0
    10     NaN
    dtype: float64

    >>> # Interpolate for the missing data using periodic boundary conditions
    >>> print(demo_series.tsu.periodic_interpolation())
    0     13.666667
    1     11.000000
    2     12.000000
    3     13.000000
    4     14.000000
    5     15.000000
    6     16.000000
    7     17.000000
    8     18.000000
    9     19.000000
    10    16.333333
    dtype: float64

For information on the other methods available under the ``tsu`` accessor
attribute, see the ``TSUAccessor`` class.

Module Docs
-----------
"""
import datetime as dt
import warnings
from typing import Collection, cast
import numpy as np
import pandas as pd
from astropy import units as u
from .. import constants as const
from .. import types
def datetime_to_sec_in_year(date: types.DateColl) -> types.NumpyLike:
    """Calculate number of seconds elapsed modulo 1 year.

    Accurate to within a microsecond.

    Args:
        date: Date(s) to calculate seconds for

    Returns:
        A single float if the input is a single datetime, or a numpy
        array if the input is a collection.
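
    Example:
        A minimal, illustrative doctest (assumes this function is imported
        into the current namespace):

        >>> import datetime as dt
        >>> datetime_to_sec_in_year(dt.datetime(2021, 1, 2, 0, 1, 30))
        86490.0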
"""
# Using ``atleast_1d`` with ``to_datetime`` guarantees a ``DatetimeIndex``
# object is returned, otherwise we get a ``TimeStamp`` object for scalars
# which has different attributes names than the ones we use below
pandas_dates = pd.to_datetime(np.atleast_1d(date))
# The ``values`` attributes returns a numpy array. Pandas objects
# are not generically compatible with astropy units
seconds = (
(pandas_dates.dayofyear.values - 1) * u.day +
pandas_dates.hour.values * u.hour +
pandas_dates.second.values * u.s +
pandas_dates.microsecond.values * u.ms
).to(u.s).value
# If the argument was a scalar, return a scalar
if np.ndim(date) == 0:
seconds = seconds.item()
return seconds
@np.vectorize
def datetime_to_season(date: types.DateColl) -> np.ndarray:
    """Determine the calendar season corresponding to a given datetime

    Seasons are labeled as 'winter', 'spring', 'summer', or 'fall'.
    Season names are defined with respect to the southern hemisphere.

    Args:
        date: Datetime value(s)

    Returns:
        An array of strings
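
    Example:
        A minimal, illustrative doctest (assumes this function is imported
        into the current namespace):

        >>> import datetime as dt
        >>> print(datetime_to_season(dt.datetime(2021, 1, 15)))
        summer
        >>> print(datetime_to_season([dt.datetime(2021, 1, 15), dt.datetime(2021, 7, 1)]))
        ['summer' 'winter']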
"""
dummy_year = const.jun_solstice.year
seasons = [
('summer', (dt.date(dummy_year, 1, 1), const.mar_equinox.date())),
('fall', (const.mar_equinox.date(), const.jun_solstice.date())),
('winter', (const.jun_solstice.date(), const.sep_equinox.date())),
('spring', (const.sep_equinox.date(), const.dec_solstice.date())),
('summer', (const.dec_solstice.date(), dt.date(dummy_year + 1, 1, 1)))
]
date = date.date().replace(year=dummy_year)
return cast(np.ndarray, next(season for season, (start, end) in seasons if start <= date < end))
@pd.api.extensions.register_series_accessor('tsu')
class TSUAccessor:
    """Pandas Series accessor for time series utilities"""

    def __init__(self, pandas_obj: pd.Series) -> None:
        """Extends ``pandas`` support for time series data

        DO NOT USE THIS CLASS DIRECTLY! This class is registered as a pandas accessor.
        See the module level usage example for more information.
        """
        self._obj = pandas_obj
    def supplemented_data(self, year: int, supp_years: Collection[int] = tuple()) -> pd.Series:
        """Return the supplemented subset of the series corresponding to a given year

        Data for the given year is supplemented with any available data from
        supplementary years by assuming that measurements taken during a
        supplementary year are the same as they would be if taken during the
        primary year. Priority is given to supplementary years in the order
        specified by the ``supp_years`` argument.

        Args:
            year: Year to supplement data for
            supp_years: Years to supplement data with when missing from ``year``

        Returns:
            A pandas Series object
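
        Example:
            A minimal, illustrative doctest (assumes this module has been
            imported so the ``tsu`` accessor is registered; the series below
            is hypothetical demo data):

            >>> import pandas as pd
            >>> dates = pd.to_datetime(['2020-01-01', '2020-01-02', '2019-01-02', '2019-01-03'])
            >>> demo = pd.Series([1.0, 2.0, 20.0, 30.0], index=dates)
            >>> result = demo.tsu.supplemented_data(2020, supp_years=[2019])
            >>> result.tolist()
            [1.0, 2.0, 30.0]
            >>> list(result.index.strftime('%Y-%m-%d'))
            ['2020-01-01', '2020-01-02', '2020-01-03']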
"""
input_data = self._obj.dropna().sort_index()
years = np.array([year, *supp_years])
# Check for years with no available data
missing_years = years[~np.isin(years, input_data.index.year)]
if missing_years:
raise ValueError(f'No data for years: {missing_years}')
# Keep only data for the given years while maintaining priority order
stacked_pwv = pd.concat(
[input_data[input_data.index.year == yr] for yr in years]
)
# Make all dates have the same year and keep only unique dates
stacked_pwv.index = [date_idx.replace(year=years[0]) for date_idx in stacked_pwv.index]
return stacked_pwv[~stacked_pwv.index.duplicated(keep='first')]
    def periodic_interpolation(self) -> pd.Series:
        """Linearly interpolate the series using periodic boundary conditions

        Similar to the default linear interpolation used by pandas, but
        missing values at the beginning and end of the series are
        interpolated assuming a periodic boundary condition.

        Returns:
            An interpolated copy of the passed series
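
        Example:
            A minimal, illustrative doctest (assumes this module has been
            imported so the ``tsu`` accessor is registered; values are
            rounded only to keep the expected output short):

            >>> import numpy as np
            >>> import pandas as pd
            >>> demo = pd.Series([np.nan, 1.0, 5.0, np.nan])
            >>> demo.tsu.periodic_interpolation().round(2).tolist()
            [2.33, 1.0, 5.0, 3.67]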
"""
if self._obj.dtype is np.dtype('O'):
warnings.warn('Interpolation may not work for object data types', RuntimeWarning)
# Identify non-NAN values closest to the edges of the series
series = self._obj.sort_index()
delta = series.index[1] - series.index[0]
start_idx, end_idx = series.iloc[[0, -1]].index
first_not_nan, last_not_nan = series.dropna().iloc[[0, -1]]
# Extend the series with temporary values so we can interpolate any missing values
series.loc[start_idx - 2 * delta] = last_not_nan
series.loc[start_idx - delta] = np.NAN
series.loc[end_idx + delta] = np.NAN
series.loc[end_idx + 2 * delta] = first_not_nan
# Drop the temporary values
return series.sort_index().interpolate().truncate(start_idx, end_idx)
    def resample_data_across_year(self) -> pd.Series:
        """Resample the series evenly from the beginning of the
        earliest year through the end of the latest year.

        Returns:
            A copy of the passed series re-indexed onto an evenly spaced grid
            running from January 1st through December 31st
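
        Example:
            An illustrative doctest (assumes this module has been imported so
            the ``tsu`` accessor is registered and that the series has an
            evenly sampled, timezone-naive ``DatetimeIndex``):

            >>> import pandas as pd
            >>> dates = pd.to_datetime(['2020-01-10 00:00', '2020-01-10 12:00'])
            >>> demo = pd.Series([1.0, 2.0], index=dates)
            >>> resampled = demo.tsu.resample_data_across_year()
            >>> len(resampled)
            732
            >>> str(resampled.index[0]), str(resampled.index[-1])
            ('2020-01-01 00:00:00', '2020-12-31 12:00:00')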
"""
start_time = self._obj.index[0].replace(month=1, day=1, hour=0, minute=0, second=0)
end_time = self._obj.index[-1].replace(month=12, day=31, hour=23, minute=59, second=59)
delta = self._obj.index[1] - self._obj.index[0]
# Modulo operation to determine any linear offset in the temporal sampling
offset = self._obj.index[0] - start_time
while offset >= delta:
offset -= delta
index_values = np.arange(start_time, end_time, delta).astype(pd.Timestamp) + offset
new_index = pd.to_datetime(index_values).tz_localize(self._obj.index.tz)
return self._obj.reindex(new_index)