Source code for bdranalytics.sklearn.model_selection

import numpy as np
import pandas as pd
from abc import ABCMeta
from sklearn.externals.six import with_metaclass
from sklearn.utils.validation import _num_samples


[docs]class GrowingWindow(with_metaclass(ABCMeta)):
    """Growing Window cross validator

    Provides train/test indices to split data in train/test sets.
    Divides the data in n_folds+1 slices.
    For split i [1..n_folds], slices [0..i} are train, slice i is test

    Parameters:
        n_folds : int, default=3
            Number of folds. Must be at least 1
    """

    def __init__(self, n_folds=3):
        self.n_folds = n_folds

    def __repr__(self):
        return _build_repr(self)

[docs]    def split(self, X, y=None, labels=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, of length n_samples
            The target variable for supervised learning problems.
            ignored
        labels : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
            ignored
        Returns
        -------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        n = _num_samples(X)
        n_slices = self.n_folds + 1
        # loop from the first 2 folds to the total number of folds
        for i in range(2, n_slices + 1):
            # the split is the percentage at which to split the folds into train
            # and test. For example when i = 2 we are taking the first 2 folds out
            # of the total available. In this specific case we have to split the
            # two of them in half (train on the first, test on the second),
            # so split = 1/2 = 0.5 = 50%. When i = 3 we are taking the first 3 folds
            # out of the total available, meaning that we have to split the three of them
            # in two at split = 2/3 = 0.66 = 66% (train on the first 2 and test on the
            # following)
            split = float(i - 1) / i
            # as we loop over the folds X and y are updated and increase in size.
            # This is the data that is going to be split and it increases in size
            # in the loop as we account for more folds. If k = 300, with i starting from 2
            # the result is the following in the loop
            # i = 2
            # X = X_train[:(600)]
            # y = y_train[:(600)]
            #
            # i = 3
            # X = X_train[:(900)]
            # y = y_train[:(900)]
            # ....
            n_sub = int(np.floor(float(n * i) / n_slices))
            subset = range(0, n_sub)
            # X and y contain both the folds to train and the fold to test.
            # index is the integer telling us where to split, according to the
            # split percentage we have set above
            n_train = int(np.floor(n_sub * split))
            train_index = np.arange(0, n_train)
            test_index = np.arange(n_train, n_sub)
            yield train_index, test_index

[docs]    def get_n_splits(self, X, y=None, labels=None):
        """Returns the number of splitting iterations in the cross-validator
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : object
            Always ignored, exists for compatibility.
        labels : object
            Always ignored, exists for compatibility.
        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        if X is None:
            raise ValueError("The X parameter should not be None")
        return self.n_folds


[docs]class IntervalGrowingWindow(with_metaclass(ABCMeta)):
    """Growing Window cross-validator based on time intervals"""

    def __init__(self, test_start_date, timestamps='index', test_end_date=None,
                 test_size=None, train_size=None):

        self.test_start_date = pd.to_datetime(test_start_date)
        self.test_end_date = pd.to_datetime(test_end_date)
        self.test_size = pd.to_timedelta(test_size)
        self.train_size = pd.to_timedelta(train_size)

        self.timestamps = timestamps
        if timestamps is not 'index':
            self.timestamps = pd.to_datetime(timestamps)

[docs]    def generate_intervals(self, timestamps):

        # infer test interval end date if not specified
        # has to be done here to work with timestamps from DataFrame index
        # NOTE: test_end_date is NOT included
        if self.test_end_date is None:
            # can be overridden for reuse
            self.test_end_date = max(timestamps)

        # determine start date of the test intervals
        intervals_start = pd.to_datetime(pd.date_range(self.test_start_date,
                                                       self.test_end_date,
                                                       freq=self.test_size)
                                         .values)

        # convert to (start, end) tuples
        intervals = list(zip(intervals_start[:-1], intervals_start[1:]))

        return intervals

[docs]    def get_timeseries(self, X):
        """Returns the numpy array of timestamps for the given dataset"""
        if self.timestamps is 'index':
            return pd.to_datetime(X.index.values)
        else:
            return self.timestamps

[docs]    def split(self, X, y=None, labels=None):
        """Generate indices to split data into training and test sets based on time stamps"""
        if X is None:
            raise ValueError("The X parameter should not be None")

        # extract timestamps from DataFrame index, if needed
        timestamps = self.get_timeseries(X)
        intervals = self.generate_intervals(timestamps)

        # extract first sample for unlimited train size
        first_sample_date = min(timestamps)

        # number of samples
        n = _num_samples(X)

        # list of indices, to convert booleans later on
        index = np.arange(n)

        # loop over each interval
        for test_start, test_end in intervals:

            if self.train_size is not None:
                train_start = test_start - self.train_size
            else:
                train_start = first_sample_date

            train_interval_bool = np.array(list(map(lambda date:
                                                    train_start <= date < test_start,
                                                    timestamps)))

            test_interval_bool = np.array(list(map(lambda date:
                                                   test_start <= date < test_end,
                                                   timestamps)))

            # convert boolean to integer indices
            train_index = index[train_interval_bool]
            test_index = index[test_interval_bool]

            yield train_index, test_index

[docs]    def get_n_splits(self, X, y=None, labels=None):
        if X is None:
            raise ValueError("The X parameter should not be None")

        # extract timestamps from DataFrame index, if needed
        timestamps = self.get_timeseries(X)
        intervals = self.generate_intervals(timestamps)

        # compute number of folds
        return len(intervals)