#all_no_test
import os
import time
import json
import math
from typing import Callable
from functools import reduce
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm.callback import record_evaluation, CallbackEnv, EarlyStopException, _format_eval_result
from lightgbm.basic import _log_info
from opt_utils import *
os.system('pip install neptune-client')
os.system('pip install neptune-lightgbm')
import neptune.new as neptune
from neptune.new.integrations.lightgbm import create_booster_summary
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
NEPTUNE_API_TOKEN = user_secrets.get_secret("NEPTUNE_API_TOKEN")
# Callback for dart early stopping: LightGBM's built-in early stopping is
# disabled in dart mode (see the UserWarning in the logs below), so this
# callback tracks the best validation score itself.
def early_stopping_dart(dart_dict: dict, stopping_rounds: int = None, model_file: str = None) -> Callable:
    """Create a callback that saves the model with the best validation score at ``model_file``,
    and stops training if there are ``stopping_rounds`` rounds without improvement.

    Parameters
    ----------
    dart_dict : dict
        A dictionary to save the best iteration, train score, and valid score of that iteration.
        May also supply ``stopping_rounds`` and ``model_file``, which override the arguments.
    stopping_rounds : int
        The number of rounds allowed without the score improving.
    model_file : str
        Path used to save the best scoring model.

    Returns
    -------
    callback : callable
        The callback that saves the best scoring model to file, and collects the
        best iteration, train score, and valid score.
    """
    stopping_rounds = dart_dict.get('stopping_rounds', stopping_rounds)
    model_file = dart_dict.get('model_file', model_file)
    best_score = [math.inf]
    best_iter = [0]

    def _callback(env: CallbackEnv) -> None:
        # evaluation_result_list is ordered [train rmse, train RMSPE, valid rmse,
        # valid RMSPE] here (train first, as in the printed logs), so [-1] is the
        # validation RMSPE and [1] is the train RMSPE.
        score = env.evaluation_result_list[-1][2]
        if score < best_score[0]:
            best_score[0] = score
            env.model.save_model(model_file)
            dart_dict['best_iteration'] = best_iter[0] = env.iteration  # env.iteration is 0-based
            dart_dict['valid_score'] = score
            dart_dict['train_score'] = env.evaluation_result_list[1][2]
        if env.iteration - best_iter[0] >= stopping_rounds:
            _log_info(f"Early stopping, best iteration is:\n[{best_iter[0] + 1}]\t{best_score[0]}")
            raise EarlyStopException(best_iter[0], env.evaluation_result_list)
    return _callback
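# `rmspe`, `feval_rmspe`, `encode_cols`, `get_dumb_features`, and `get_top_features`
# come from the star import of opt_utils above and are not shown in this script.
# For reference, a minimal sketch of what the two metric helpers presumably compute
# (hypothetical `_sketch` names so the real imports are not shadowed):
def _rmspe_sketch(y_true, y_pred):
    # Root mean squared percentage error, the competition metric
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

def _feval_rmspe_sketch(y_pred, dataset):
    # LightGBM custom eval signature: returns (name, value, is_higher_better)
    return 'RMSPE', _rmspe_sketch(dataset.get_label(), y_pred), False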
cfg = {
"script_name": 'opt_train_v2_op_89',
"path_features": '../input/generate-train-features-script-p13/p13_train.pkl', # Used in train mode
"path_models": '',
"path_data_raw": '../input/optiver-realized-volatility-prediction/',
"neptune_project": 'chrisrichardmiles/optiver',
"neptune_description": 'p13 encoding top columns with shake .3',
"encode_time_cols": ['real_vol_ratio_5_10', 'real_vol_mean_decay_0.9_-1', 'order_norm_momentum', 'real_vol_mean_decay_0.85_-1', 'real_vol_mean_decay_0.99_1', 'real_vol_mean_decay_0.95_1', 'abs_price_wap_diff_mean_decay_flip', 'abs_price_wap_diff_mean_decay', 'order_norm_sum', 'real_vol_mean_decay_0.85_-1_2', 'spread_mean_decay_flip_95', 'real_vol_mean_decay_0.99_-1', 'real_vol_mean_decay_0.75_-1', 'spread_mean', 'order_count_sum', 'bid_price_diff_count_unique', 'spread_momentum', 'size_mean', 'real_vol_mean_decay_0.95_-1', 'order_norm_mean_decay', 'spread_2_mean_decay_95', 'order_size_mean', 'spread_mean_decay_95',
'spread_pct_momentum'],
"encode_stock_cols": ['real_vol_ratio_5_10', 'real_vol_mean_decay_0.9_-1', 'order_norm_momentum', 'real_vol_mean_decay_0.85_-1', 'real_vol_mean_decay_0.99_1', 'real_vol_mean_decay_0.95_1', 'abs_price_wap_diff_mean_decay_flip', 'abs_price_wap_diff_mean_decay', 'order_norm_sum', 'real_vol_mean_decay_0.85_-1_2', 'spread_mean_decay_flip_95', 'real_vol_mean_decay_0.99_-1', 'real_vol_mean_decay_0.75_-1', 'spread_mean', 'order_count_sum', 'bid_price_diff_count_unique', 'spread_momentum', 'size_mean', 'real_vol_mean_decay_0.95_-1', 'order_norm_mean_decay', 'spread_2_mean_decay_95', 'order_size_mean', 'spread_mean_decay_95',
'spread_pct_momentum'],
"drop_cols": ['row_id', 'time_id', 'stock_id', 'target'] + ['real_momentum_0.55_-1', 'wap_2_last-first', 'sum_bid_ask_momentum', 'abs_log_return_momentum', 'order_size_sqaure_weighted_sum', 'size_mean_decay_95', 'size_norm_momentum_95', 'size_norm_momentum', 'size_spread_mean_decay_flip_95', 'log_return_2_realized_volatility', 'abs_price_wap_diff_2_std', 'sum_bid_ask_mean', 'real_momentum_0.85_-1', 'real_vol_mean_decay_0.55_-1', 'abs_price_wap_diff_amax', 'size_mean_decay', 'real_vol_mean_decay_0.9_-1_2', 'abs_price_wap_diff_2_mean', 'real_momentum_0.65_-1_2', 'price_wap_diff_2_max_sub_min', 'real_vol_mean_decay_0.75_1', 'real_vol_min_5', 'real_vol_mean_decay_0.95_-1_2', 'real_vol_min_6', 'size_spread_mean', 'size_spread_mean_decay_95', 'real_vol_mean_decay_0.75_1_2', 'sum_bid_ask_std', 'price_wap_diff_mean_decay', 'wap_2_real_vol', 'abs_price_wap_diff_2_mean_decay', 'size_momentum_95', 'real_momentum_0.65_-1', 'size_spread_momentum_95', 'real_vol_min_2', 'sum_bid_ask_mean_decay_flip', 'real_vol_mean_decay_0.65_1', 'size_spread_mean_decay_flip', 'real_momentum_0.75_-1_2', 'abs_price_wap_diff_2_max_sub_min', 'price_wap_diff_momentum', 'real_momentum_0.55_-1_2', 'real_momentum_0.45_-1_2', 'spread_pct_std', 'abs_price_wap_diff_max_sub_min', 'real_momentum_0.95_-1', 'real_vol_min_7', 'real_vol_min_4_2', 'real_vol_mean_decay_0.99_1_2', 'spread_2_sum', 'sum_bid_ask_mean_decay', 'real_vol_mean_decay_0.55_1', 'real_vol_mean_decay_0.55_1_2', 'real_vol_mean_decay_0.9_1_2', 'real_vol_mean_decay_0.65_1_2', 'real_vol_mean_decay_0.45_1_2', 'wap_2_std', 'price_wap_diff_max_sub_min', 'real_momentum_0.85_-1_2', 'real_momentum_0.45_-1', 'price_wap_diff_2_mean_decay_flip', 'size_norm_mean_decay_flip_95', 'price_wap_diff_mean_decay_flip', 'real_momentum_0.99_-1', 'sum_bid_ask_max_sub_min', 'real_vol_mean_decay_0.95_1_2', 'real_vol_min_4', 'real_vol_min_8', 'real_vol_min_2_2', 'real_momentum_0.99_-1_2', 'real_momentum_0.95_-1_2', 'log_return_mean_decay_flip', 'real_vol_mean_decay_0.85_1_2', 'price_wap_diff_2_std', 'log_return_realized_volatility', 'abs_log_return_2_momentum', 'log_return_2_mean_decay_flip', 'log_return_std', 'log_return_2_std', 'real_momentum_0.75_-1', 'real_vol_mean_decay_0.45_-1_2', 'abs_price_wap_diff_2_momentum', 'real_momentum_0.9_-1_2', 'real_vol_mean_decay_0.9_1', 'price_wap_diff_2_mean_decay', 'real_vol_min_9_2', 'real_vol_mean_decay_0.99_-1_2', 'wap_2_max_sub_min', 'real_momentum_0.9_-1', 'dummy3', 'abs_price_wap_diff_2_mean_decay_flip',
'abs_log_return_2_std'],
"encode_funcs": ['mean', 'std'],
"shake": False,
"shake_std": .3,
"prefix": '',
"rerun": True,
"neptune_run_name": '',
"lgb_params": {
# https://lightgbm.readthedocs.io/en/latest/index.html
"boosting_type": "dart",
"objective": "rmse",
"learning_rate": .05,
"num_leaves": 255,
"min_data_in_leaf": 255,
"feature_fraction": 0.8,
"bagging_fraction": .5, # Select bagging_fraction of rows every bagging_freq of iterations.
"bagging_freq": 1, # This speeds up training and underfits. Need both set to do anything.
"n_estimators": 3,
"early_stopping_rounds": 400,
"n_jobs": -1,
"seed": 42,
"verbose": -1,
},
}

with open('cfg.json', 'w') as f:
    json.dump(cfg, f)
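# `encode_cols` (from opt_utils) is called below with (df, cols, funcs, on, shake,
# shake_std). Judging from those calls, it presumably appends groupby aggregations
# of the listed columns, keyed by `on` (stock_id or time_id), with optional Gaussian
# "shake" noise to regularize the encodings. A hypothetical sketch, not the actual
# opt_utils implementation:
def _encode_cols_sketch(df, cols, funcs=('mean', 'std'), on='stock_id',
                        shake=False, shake_std=0.3):
    for col in cols:
        for func in funcs:
            enc = df.groupby(on)[col].transform(func)
            if shake:
                # Jitter the encoding to fight leakage/overfitting
                enc *= np.random.normal(1, shake_std, size=len(enc))
            df[f'{col}_{func}_{on}'] = enc
    return df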
def main():
    train = pd.read_pickle(cfg['path_features'])
    train['real_vol_ratio_5_10'] = train[[f'real_vol_min_{i}' for i in range(1, 6)]].sum(axis=1) / train[[f'real_vol_min_{i}' for i in range(6, 11)]].sum(axis=1)
    train = encode_cols(train, cfg["encode_time_cols"], funcs=cfg['encode_funcs'], on='time_id')

    # Saving encoded stock columns
    feat_file = 'enc_' + os.path.split(cfg['path_features'])[1]
    tmp = encode_cols(train, cfg["encode_stock_cols"], funcs=cfg['encode_funcs'], on='stock_id')
    tmp.to_pickle(os.path.join(cfg['path_models'], feat_file))
    del tmp

    drop_cols = [c for c in cfg['drop_cols'] if c in train.columns and c != 'stock_id']
    x = train.drop(drop_cols, axis=1)
    y = train['target']
    oof_predictions = np.zeros(x.shape[0])  # Create out of folds array
    scores = []  # Keep track of scores for each fold and all oof at the end
    best_iterations = []
    training_best_scores = []
    valid_best_scores = []  # Same as scores in this script, but would be different with nested cv
    best_score_diffs = []
    dict_eval_logs = []  # For experimentation tracking
    booster_summaries = []  # For experimentation tracking
    dumb_features = []
    top_features = []
    for fold in range(5):
        trn_ind = x.fold != fold
        val_ind = x.fold == fold
        print(f'Training fold {fold}')
        x_train, x_val = x[trn_ind].drop('fold', axis=1), x[val_ind].drop('fold', axis=1)
        y_train, y_val = y[trn_ind], y[val_ind]

        x_train = encode_cols(x_train,
                              cfg['encode_stock_cols'],
                              funcs=cfg['encode_funcs'],
                              shake=cfg['shake'],
                              shake_std=cfg['shake_std']).drop('stock_id', axis=1)
        n_train_cols = x_train.shape[1]

        x_val = encode_cols(x_val,
                            cfg['encode_stock_cols'],
                            funcs=cfg['encode_funcs']).drop('stock_id', axis=1)
        train_weights = 1 / np.square(y_train)  # Root mean squared percentage error weights
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight=train_weights, free_raw_data=False)
        val_dataset = lgb.Dataset(x_val, y_val, weight=val_weights, reference=train_dataset, free_raw_data=False)
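        # Why weight = 1 / y**2: with these sample weights, LightGBM's weighted rmse
        # objective minimizes sum_i w_i * (y_i - p_i)**2 = sum_i ((y_i - p_i) / y_i)**2,
        # which is (up to the mean and square root) exactly the RMSPE metric.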
        # Variables for callback functions
        model_file = os.path.join(cfg['path_models'], f'{cfg["prefix"]}lgb_fold_{fold}.txt')
        dart_dict = {'stopping_rounds': cfg['lgb_params']['early_stopping_rounds'],
                     'model_file': model_file}
        dict_eval_log = {}

        model = lgb.train(params=cfg['lgb_params'],
                          train_set=train_dataset,
                          valid_sets=[val_dataset, train_dataset],
                          valid_names=['valid', 'train'],
                          feval=feval_rmspe,
                          callbacks=[record_evaluation(dict_eval_log),
                                     early_stopping_dart(dart_dict)],
                          verbose_eval=1)
        model = lgb.Booster(model_file=model_file)  # Reload the best saved iteration
        y_pred = model.predict(x_val)
        oof_predictions[val_ind] = y_pred
        scores.append(round(rmspe(y_val, y_pred), 3))
        dumb_features.append(get_dumb_features(model))
        top_features.append(get_top_features(model))
        booster_summary = create_booster_summary(
            booster=model,
            log_importances=True,
            max_num_features=25,
            log_trees_as_dataframe=False,
            log_pickled_booster=True,
            y_true=y_val,
            y_pred=y_pred,
        )
        train_score = dart_dict['train_score']
        valid_score = dart_dict['valid_score']
        print(dart_dict['best_iteration'])
        best_iterations.append(dart_dict['best_iteration'])
        training_best_scores.append(round(train_score, 3))
        valid_best_scores.append(round(valid_score, 3))
        best_score_diffs.append(round(valid_score - train_score, 3))
        booster_summaries.append(booster_summary)
        dict_eval_logs.append(dict_eval_log)
        del booster_summary, dict_eval_log
    rmspe_score = round(rmspe(y, oof_predictions), 4)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    print(f'Our cv fold scores are {scores}')
    np.save('oof_predictions', oof_predictions)
    run = neptune.init(
        project=cfg['neptune_project'],
        api_token=NEPTUNE_API_TOKEN,
        name=cfg['neptune_run_name'],
        description=cfg['neptune_description'],
        tags=[cfg['path_features'], cfg['prefix']],
        source_files=['cfg.json'],
    )
    run['feat_id'] = feat_file
    run['cfg'] = cfg
    run['RMSPE'] = rmspe_score
    run['RMSPE_oof_scores'] = scores
    run['RMSPE_cv_std'] = np.std(scores)

    run['best_iterations'] = best_iterations
    best_iterations_mean = int(np.mean(best_iterations))
    run['best_iterations_mean'] = best_iterations_mean
    run['training_best_scores'] = training_best_scores
    run['valid_best_scores'] = valid_best_scores
    run['best_score_diffs'] = best_score_diffs
    run['best_score_diffs_mean'] = round(np.mean(best_score_diffs), 3)
    run['dumb_features'] = list(reduce(lambda a, b: set(a).intersection(set(b)), dumb_features))
    run['top_features'] = list(reduce(lambda a, b: set(a).intersection(set(b)), top_features))
    # Logs for each fold's model
    for fold in range(5):
        run[f'lgbm_summaries/fold_{fold}'] = booster_summaries[fold]
        run[f'lgbm_summaries/dumb_features_{fold}'] = list(dumb_features[fold])
        run[f'lgbm_summaries/top_features_{fold}'] = list(top_features[fold])
        dict_eval_log = dict_eval_logs[fold]
        for valid_set, odict in dict_eval_log.items():
            for metric, log in odict.items():
                for val in log:
                    run[f'eval_logs/{fold}_{valid_set}_{metric}'].log(val)

    run.stop()
    if cfg['rerun']:
        print(f'retraining model with all data for {best_iterations} iterations')
        params = cfg['lgb_params'].copy()
        params['early_stopping_rounds'] = 0  # No valid set to stop with

        x_train = x.drop(['fold'], axis=1)
        x_train = encode_cols(x_train,
                              cfg['encode_stock_cols'],
                              funcs=cfg['encode_funcs'],
                              shake=cfg['shake'],
                              shake_std=cfg['shake_std']).drop('stock_id', axis=1)
        y_train = y

        assert n_train_cols == x_train.shape[1]
        train_weights = 1 / np.square(y_train)  # Root mean squared percentage error weights
        train_dataset = lgb.Dataset(x_train, y_train, weight=train_weights)

        for fold, best_iter in enumerate(best_iterations):
            params['n_estimators'] = int(best_iter)  # lgbm needs an int here
            model = lgb.train(params=params,
                              train_set=train_dataset)
            model.save_model(os.path.join(cfg['path_models'], f'{cfg["prefix"]}rerun_lgb_{fold}.txt'))
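        # The five rerun boosters saved above would presumably be loaded at inference
        # time and their predictions averaged, e.g. (hypothetical sketch; `x_test` is
        # whatever feature frame the separate inference notebook builds):
        #   preds = np.mean([lgb.Booster(model_file=f'{cfg["prefix"]}rerun_lgb_{i}.txt').predict(x_test)
        #                    for i in range(5)], axis=0)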
if __name__ == '__main__':
    main()
Training fold 0
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
[1] train's rmse: 0.00114413 train's RMSPE: 0.52998 valid's rmse: 0.00115188 valid's RMSPE: 0.53124
/opt/conda/lib/python3.7/site-packages/lightgbm/callback.py:183: UserWarning: Early stopping is not available in dart mode
_log_warning('Early stopping is not available in dart mode')
[2] train's rmse: 0.00109769 train's RMSPE: 0.508467 valid's rmse: 0.00110573 valid's RMSPE: 0.509958
[3] train's rmse: 0.00105382 train's RMSPE: 0.488149 valid's rmse: 0.00106222 valid's RMSPE: 0.48989
2
Training fold 1
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
[1] train's rmse: 0.0011456 train's RMSPE: 0.52913 valid's rmse: 0.00114536 valid's RMSPE: 0.534346
/opt/conda/lib/python3.7/site-packages/lightgbm/callback.py:183: UserWarning: Early stopping is not available in dart mode
_log_warning('Early stopping is not available in dart mode')
[2] train's rmse: 0.00109882 train's RMSPE: 0.507522 valid's rmse: 0.00109936 valid's RMSPE: 0.512882
[3] train's rmse: 0.00105481 train's RMSPE: 0.487196 valid's rmse: 0.00105636 valid's RMSPE: 0.492825
2
Training fold 2
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
[1] train's rmse: 0.00114983 train's RMSPE: 0.531413 valid's rmse: 0.00112919 valid's RMSPE: 0.52551
/opt/conda/lib/python3.7/site-packages/lightgbm/callback.py:183: UserWarning: Early stopping is not available in dart mode
_log_warning('Early stopping is not available in dart mode')
[2] train's rmse: 0.00110306 train's RMSPE: 0.509799 valid's rmse: 0.0010835 valid's RMSPE: 0.504245
[3] train's rmse: 0.00105898 train's RMSPE: 0.489427 valid's rmse: 0.00104054 valid's RMSPE: 0.484253
2
Training fold 3
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
[1] train's rmse: 0.001147 train's RMSPE: 0.530741 valid's rmse: 0.00114036 valid's RMSPE: 0.528187
/opt/conda/lib/python3.7/site-packages/lightgbm/callback.py:183: UserWarning: Early stopping is not available in dart mode
_log_warning('Early stopping is not available in dart mode')
[2] train's rmse: 0.00110018 train's RMSPE: 0.509077 valid's rmse: 0.00109398 valid's RMSPE: 0.506709
[3] train's rmse: 0.00105622 train's RMSPE: 0.488736 valid's rmse: 0.00105007 valid's RMSPE: 0.48637
2
Training fold 4
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
[1] train's rmse: 0.0011412 train's RMSPE: 0.529605 valid's rmse: 0.00116322 valid's RMSPE: 0.532431
/opt/conda/lib/python3.7/site-packages/lightgbm/callback.py:183: UserWarning: Early stopping is not available in dart mode
_log_warning('Early stopping is not available in dart mode')
[2] train's rmse: 0.00109532 train's RMSPE: 0.508314 valid's rmse: 0.00111604 valid's RMSPE: 0.510835
[3] train's rmse: 0.00105181 train's RMSPE: 0.488118 valid's rmse: 0.00107131 valid's RMSPE: 0.490362
2
Our out of folds RMSPE is 0.4887
Our cv fold scores are [0.49, 0.493, 0.484, 0.486, 0.49]
https://app.neptune.ai/chrisrichardmiles/optiver/e/OP-125
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 142 operations to synchronize with Neptune. Do not kill this process.
All 142 operations synced, thanks for waiting!
retraining model with all data for [2, 2, 2, 2, 2] iterations
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
m = lgb.Booster(model_file='./lgb_fold_0.txt')