Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 80 additions & 2 deletions tests/test_func_api_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,9 +828,13 @@ def test_exceptions(self):
X_train, y_train, X_test, verbose=25)

# Internal function model_action
assert_raises(ValueError, model_action, LinearRegression(),
X_train, y_train, X_test, sample_weight=None,
assert_raises(ValueError, model_action, LinearRegression(),
X_train, y_train, X_test, sample_weight=None,
action='abc', transform=None)

# X_test is None when mode != 'oof'
assert_raises(ValueError, stacking, [LinearRegression()],
X_train, y_train, None, mode='oof_pred_bag')

#---------------------------------------------------------------------------
# Testing parameter warnings
Expand Down Expand Up @@ -940,6 +944,80 @@ def test_small_input(self):
assert_array_equal(S_train_1, S_train_3)
assert_array_equal(S_test_1, S_test_3)

#---------------------------------------------------------------------------
# Mode 'oof', X_test=None
#---------------------------------------------------------------------------

def test_oof_mode_with_none(self):
    """Check that mode='oof' accepts X_test=None.

    The OOF predictions returned by ``stacking`` and the ones stored in
    the ``.npy`` file written to ``temp_dir`` must both equal the output
    of ``cross_val_predict``; the test-set part must be ``None`` in both.
    """
    # Reference values: cross-validated predictions as a single column,
    # and no test-set predictions at all.
    reference_oof = cross_val_predict(
        LinearRegression(), X_train, y = y_train, cv = n_folds,
        n_jobs = 1, verbose = 0, method = 'predict')
    S_train_1 = reference_oof.reshape(-1, 1)
    S_test_1 = None

    # Values returned directly by stacking (X_test passed as None)
    S_train_2, S_test_2 = stacking(
        [LinearRegression()], X_train, y_train, None,
        regression = True, n_folds = n_folds, shuffle = False,
        save_dir=temp_dir, mode = 'oof', random_state = 0, verbose = 0)

    # Values loaded from file.
    # Normally, if cleaning is performed, there is only one .npy file at
    # any given moment. Without cleaning there may be more than one file,
    # so the last one in name order is taken (presumably the newest --
    # this relies on the file naming scheme, which is not visible here).
    npy_files = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))
    saved = np.load(npy_files[-1], allow_pickle=True)
    S_train_3 = saved[0]
    S_test_3 = saved[1]

    # Compare the reference against the returned and the saved results
    for got_train, got_test in ((S_train_2, S_test_2),
                                (S_train_3, S_test_3)):
        assert_array_equal(S_train_1, got_train)
        assert_array_equal(S_test_1, got_test)

#---------------------------------------------------------------------------
# All default values (mode='oof_pred_bag')
#---------------------------------------------------------------------------

def test_all_defaults(self):
    """Check ``stacking`` called with all default parameter values.

    Expected values are built by hand: OOF predictions come from
    ``cross_val_predict`` and test-set predictions are the mean of the
    per-fold model predictions on ``X_test``. Both the values returned by
    ``stacking`` and the ones stored in the ``.npy`` file written to
    ``temp_dir`` must match them.
    """
    # Override global n_folds=5, because default value in stacking function is 4
    n_folds = 4

    # Expected test predictions: fit one model per fold, predict X_test,
    # then average over folds.
    S_test_temp = np.zeros((X_test.shape[0], n_folds))
    # random_state is deliberately not passed: it has no effect when
    # shuffle=False, and scikit-learn >= 0.24 raises ValueError for that
    # combination. The resulting splits are the same.
    kf = KFold(n_splits = n_folds, shuffle = False)
    for fold_counter, (tr_index, _) in enumerate(kf.split(X_train, y_train)):
        # Only the training part of the fold is needed here
        model = LinearRegression()
        model.fit(X_train[tr_index], y_train[tr_index])
        S_test_temp[:, fold_counter] = model.predict(X_test)
    S_test_1 = np.mean(S_test_temp, axis = 1).reshape(-1, 1)

    # Expected OOF predictions as a single column
    model = LinearRegression()
    S_train_1 = cross_val_predict(model, X_train, y = y_train, cv = n_folds,
                                  n_jobs = 1, verbose = 0,
                                  method = 'predict').reshape(-1, 1)

    # Values returned by stacking with every optional parameter left at
    # its default (save_dir is only set so the result file can be checked)
    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test, save_dir=temp_dir)

    # Load OOF from file.
    # Normally, if cleaning is performed, there is only one .npy file at
    # any given moment. Without cleaning there may be more than one file,
    # so the last one in name order is taken (presumably the newest --
    # this relies on the file naming scheme, which is not visible here).
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)


#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------

Expand Down
14 changes: 9 additions & 5 deletions vecstack/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,14 @@ def stacking(models, X_train, y_train, X_test,
y_train : numpy 1d array
Target values

X_test : numpy array or sparse matrix of N-dim shape, e.g. 2-dim [n_test_samples, n_features]
X_test : numpy array or sparse matrix of N-dim shape, e.g. 2-dim [n_test_samples, n_features], or None
Test data
Note: X_test can be set to None when mode='oof'

sample_weight : numpy array of shape [n_train_samples]
sample_weight : numpy array of shape [n_train_samples], default None
Individual weights for each sample (passed to fit method of the model).
Note: sample_weight has length of full training set X_train and it would be
split automatically for each fold.
Note: sample_weight must have the same length as full training set X_train.
It will be split automatically for each fold.

regression : boolean, default True
If True - perform stacking for regression task,
Expand Down Expand Up @@ -188,7 +189,7 @@ def stacking(models, X_train, y_train, X_test,

mode: str, default 'oof_pred_bag' (alias 'A')
Note: for details see terminology below
'oof' - return only oof
'oof' - return only oof. X_test can be set to None
'oof_pred' (alias 'B') - return oof and pred
'oof_pred_bag' (alias 'A') - return oof and bagged pred
'pred' - return pred only
Expand Down Expand Up @@ -406,6 +407,9 @@ def your_metric(y_true, y_pred):
# If empty <models> list
if 0 == len(models):
raise ValueError('List of models is empty')
# X_test can be None only if mode='oof'
if X_test is None and mode != 'oof':
raise ValueError("X_test can be None only if mode='oof'")
# Check arrays
# y_train and sample_weight must be 1d ndarrays (i.e. row, not column)
X_train, y_train = check_X_y(X_train,
Expand Down