Source code for bdranalytics.sklearn.preprocessing.tests.test_encoding

import random
import unittest

import numpy as np
import pandas as pd

from bdranalytics.sklearn.preprocessing import WeightOfEvidenceEncoder


[docs]class TestEncoding(unittest.TestCase):
[docs] def verify_numeric(self, X_test): for dt in X_test.dtypes: numeric = False if np.issubdtype(dt, int) or np.issubdtype(dt, float): numeric = True
self.assertTrue(numeric)
[docs] @staticmethod def create_dataset(n_rows=1000): """ Creates a data set with some categorical variables """ ds = [[ random.random(), random.random(), random.choice(['A', 'B', 'C']), random.choice(['A', 'B', 'C']), random.choice(['A', 'B', 'C', None]), random.choice(['A', 'B', 'C']) ] for _ in range(n_rows)] X = pd.DataFrame(ds, columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']) y = np.random.randint(2, size=(n_rows,))
return X, y
[docs] def test_weight_of_evidence(self): """ Unit test for WeightOfEvidenceEncoder class """ # generate some training data cols = ['c3', 'c4', 'c5', 'c6'] n_rows = 100 X_train, y_train = self.create_dataset(n_rows=n_rows) # independent data set to-be-transformed X_test, _ = self.create_dataset(n_rows=10) # add unseen category to catch NaN filling behavior X_test.loc[0, 'c3'] = 'Z' # data frame case enc = WeightOfEvidenceEncoder(verbose=1, cols=cols) enc.fit(X_train, y_train) self.verify_numeric(enc.transform(X_test)) # numpy array case enc_np = WeightOfEvidenceEncoder(verbose=0, return_df=False, cols=cols) enc_np.fit(X_train, y_train) output_array_enc_np = enc_np.transform( X_test) # save for following tests self.assertTrue(isinstance(output_array_enc_np, np.ndarray)) # external dep var, DIFFERENT from y_train enc_ext = WeightOfEvidenceEncoder(verbose=1, cols=cols, return_df=False, dependent_variable_values=np.random.randint(2, size=(n_rows,))) enc_ext.fit(X_train, y_train) self.assertTrue(np.array_equal(output_array_enc_np, enc_ext.transform(X_test)) is False) # external dep var, SAME y_train enc_ext = WeightOfEvidenceEncoder(verbose=1, cols=cols, return_df=False, dependent_variable_values=y_train) enc_ext.fit(X_train, y_train) self.assertTrue(np.array_equal(output_array_enc_np,
enc_ext.transform(X_test)) is True) if __name__ == '__main__': unittest.main()