Source code for bdranalytics.sklearn.tests.test_model_selection

import numpy as np
import pandas as pd
import unittest

from bdranalytics.sklearn.model_selection import GrowingWindow, IntervalGrowingWindow


def create_time_series_data_set(start_date=pd.Timestamp(year=2000, month=1, day=1), n_rows=100):
    """Build a random daily time-series data set for the splitter tests.

    Parameters
    ----------
    start_date : datetime-like, optional
        First timestamp of the index; defaults to 2000-01-01.
    n_rows : int, optional
        Number of consecutive daily rows to generate; defaults to 100.

    Returns
    -------
    X : pandas.DataFrame
        Single column ``'variable'`` of uniform random floats, indexed by a
        daily ``DatetimeIndex`` running from ``start_date`` for ``n_rows`` days.
    y : numpy.ndarray
        Random 0/1 integer labels of shape ``(n_rows,)``.
    """
    # NOTE: the default uses pd.Timestamp because the ``pd.datetime`` alias was
    # removed in pandas 2.0; Timestamp is a datetime subclass, so callers that
    # pass a plain ``datetime`` keep working.
    end_date = start_date + pd.Timedelta(days=n_rows - 1)
    ds = np.random.rand(n_rows)
    X = pd.DataFrame(
        ds,
        columns=['variable'],
        index=pd.date_range(start_date, end_date),
    )
    y = np.random.randint(2, size=(n_rows,))
    return X, y
class TestGrowingWindow(unittest.TestCase):
    """Checks on the number and size of the folds produced by ``GrowingWindow(4)``."""

    def test_n_splits(self):
        """``get_n_splits`` reports 4 for a splitter constructed with 4."""
        X = np.arange(15).reshape(3, 5)
        self.assertEqual(GrowingWindow(4).get_n_splits(X), 4)

    def test_n_splits_returned(self):
        """``split`` yields exactly 4 (train, test) pairs."""
        X = np.arange(15).reshape(3, 5)
        y = np.arange(3)
        folds = list(GrowingWindow(4).split(X, y))
        self.assertEqual(len(folds), 4)

    def test_n_splits_testsize(self):
        """With 5 rows and 4 splits, every test fold holds a single row."""
        X = np.arange(15).reshape(5, 3)
        y = np.arange(5)
        for _, test in GrowingWindow(4).split(X, y):
            self.assertEqual(len(test), 1)

    def test_n_splits_testsize2(self):
        """With 5 rows and 4 splits, the k-th train fold holds k rows (1-based)."""
        X = np.arange(15).reshape(5, 3)
        y = np.arange(5)
        folds = GrowingWindow(4).split(X, y)
        for expected_train_size, (train, _) in enumerate(folds, start=1):
            self.assertEqual(len(train), expected_train_size)
class TestIntervalGrowingWindow(unittest.TestCase):
    """Checks on ``IntervalGrowingWindow`` over a daily series starting 2000-01-01.

    All tests use the test interval 2000-02-01 .. 2000-03-01 with 7-day test
    folds, for which four splits are expected.
    """

    # pd.Timestamp replaces the ``pd.datetime`` alias, which was removed in
    # pandas 2.0; Timestamp is a datetime subclass.
    TEST_START_DATE = pd.Timestamp(year=2000, month=2, day=1)
    TEST_END_DATE = pd.Timestamp(year=2000, month=3, day=1)

    def test_split_on_index(self):
        """Splitting without explicit timestamps and a ``'7D'`` test size yields 4 folds."""
        X, y = create_time_series_data_set()
        cv = IntervalGrowingWindow(
            test_start_date=self.TEST_START_DATE,
            test_end_date=self.TEST_END_DATE,
            test_size='7D',
        )
        self.assertEqual(len(list(cv.split(X, y))), 4)

    def test_split_on_array(self):
        """Splitting with timestamps passed explicitly as an array yields 4 folds."""
        X, y = create_time_series_data_set()
        test_size_in_days = 7
        cv = IntervalGrowingWindow(
            timestamps=X.index.values,
            test_start_date=self.TEST_START_DATE,
            test_end_date=self.TEST_END_DATE,
            test_size=pd.Timedelta(days=test_size_in_days),
        )
        self.assertEqual(len(list(cv.split(X, y))), 4)

    def test_split_test_size(self):
        """Each test fold contains as many daily rows as the test size in days."""
        X, y = create_time_series_data_set()
        test_size_in_days = 7
        cv = IntervalGrowingWindow(
            test_start_date=self.TEST_START_DATE,
            test_end_date=self.TEST_END_DATE,
            test_size=pd.Timedelta(days=test_size_in_days),
        )
        for _, test in cv.split(X, y):
            self.assertEqual(len(test), test_size_in_days)

    def test_split_with_train_size(self):
        """With ``train_size`` set, each train fold contains that many daily rows."""
        X, y = create_time_series_data_set()
        train_size_in_days = 14
        cv = IntervalGrowingWindow(
            test_start_date=self.TEST_START_DATE,
            test_end_date=self.TEST_END_DATE,
            test_size=pd.Timedelta(days=7),
            train_size=pd.Timedelta(days=train_size_in_days),
        )
        for train, _ in cv.split(X, y):
            self.assertEqual(len(train), train_size_in_days)

    def test_n_splits(self):
        """``get_n_splits`` reports 4 for the configured interval and test size."""
        X, y = create_time_series_data_set()
        cv = IntervalGrowingWindow(
            test_start_date=self.TEST_START_DATE,
            test_end_date=self.TEST_END_DATE,
            test_size=pd.Timedelta(days=7),
        )
        self.assertEqual(cv.get_n_splits(X), 4)