#all_no_test
from opt_utils import *
import numpy as np
import pandas as pd
from itertools import product  # pd and product are used below; opt_utils may also re-export them
import matplotlib.pyplot as plt
import scipy
import scipy.cluster.hierarchy as sch
import seaborn as sns
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['font.size'] = 18

DATA_RAW = '../input/optiver-realized-volatility-prediction'
stock_id = 0
train_or_test = 'train'
train = read_train_or_test(DATA_RAW, 'train')
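read_train_or_test comes from opt_utils and is not shown here; presumably it just loads the competition's train.csv, which has one row per (stock_id, time_id) with the target realized volatility. A minimal sketch under that assumption (not the actual opt_utils code):

# Hypothetical stand-in for opt_utils.read_train_or_test -- illustrative only.
def read_train_or_test(data_dir, train_or_test='train'):
    # columns: stock_id, time_id, target; a row_id-style index is assumed because a later cell joins on it
    df = pd.read_csv(f'{data_dir}/{train_or_test}.csv')
    df.index = df['stock_id'].astype(str) + '-' + df['time_id'].astype(str)
    return df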
target_mean = train['target'].mean()
target_median = train['target'].median()
ax = train['target'].hist(bins=1000)
plt.suptitle('target distribution showing a positive skew')
plt.axvline(x=target_mean, color='red')
plt.axvline(x=target_median, color='green')
plt.text(x=target_mean, y=-1, s='mean', color='red', rotation=-30)
plt.text(x=target_median - .005, y=-1, s='median', color='green', rotation=-30)
plt.show()
train.target.describe()
fig, axes = plt.subplots(4, 1, figsize=(15, 28))

train.groupby('time_id')['target'].mean().sort_values(
    ascending=False)[:15].plot(kind='barh', ax=axes[0])
axes[0].set_title('Top 15 most volatile time periods')
axes[0].set_xlabel('Volatility')

train.groupby('time_id')['target'].mean().sort_values()[:15]\
    .plot(kind='barh', ax=axes[1])
axes[1].set_title('15 least volatile time periods')
axes[1].set_xlabel('Volatility')

train.groupby('stock_id')['target'].mean().sort_values()[:15]\
    .plot(kind='barh', ax=axes[2])
axes[2].set_title('15 least volatile stock_ids')
axes[2].set_xlabel('Volatility')

train.groupby('stock_id')['target'].mean().sort_values(ascending=False)[:15]\
    .plot(kind='barh', ax=axes[3])
axes[3].set_title('15 most volatile stock_ids')
axes[3].set_xlabel('Volatility')

plt.show()
x = train.groupby('stock_id')['target'].mean()
x.sort_values(ascending=False)[:10].plot(kind='barh')
target_max = train['target'].max()
ax = train['target'].describe()[1:-1].plot(kind='bar')
plt.suptitle(f'statistics of target without max, which is {target_max}')
plt.show()
stats = train.groupby('stock_id')['target'].describe()
stats.head()
fig, axes = plt.subplots(2, 2)
ax = stats['mean'].hist(bins=1000)
plt.suptitle('mean distribution')
plt.show()
piv = train[['time_id', 'stock_id', 'target']].set_index(['time_id', 'stock_id']).unstack()
piv.style.background_gradient(cmap='viridis')\
    .set_properties(**{'font-size': '20px'})
corr = piv.corr()
# import scipy
# import scipy.cluster.hierarchy as sch
# import seaborn as sns
def cluster_corr(corr_array, inplace=False):
    """
    All credit to Wil Yegelwel for
    https://wil.yegelwel.com/cluster-correlation-matrix/#:~:text=Cluster%20a%20Correlation%20Matrix%20%28in%20python%29%20Below%20is,highly%20correlated%20variables%20are%20next%20to%20eachother%20
    Rearranges the correlation matrix, corr_array, so that groups of highly
    correlated variables are next to each other.

    Parameters
    ----------
    corr_array : pandas.DataFrame or numpy.ndarray
        a NxN correlation matrix

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        a NxN correlation matrix with the columns and rows rearranged
    """
    pairwise_distances = sch.distance.pdist(corr_array)
    linkage = sch.linkage(pairwise_distances, method='complete')
    cluster_distance_threshold = pairwise_distances.max() / 2
    idx_to_cluster_array = sch.fcluster(linkage, cluster_distance_threshold,
                                        criterion='distance')
    idx = np.argsort(idx_to_cluster_array)

    if not inplace:
        corr_array = corr_array.copy()

    if isinstance(corr_array, pd.DataFrame):
        return corr_array.iloc[idx, :].T.iloc[idx, :]
    return corr_array[idx, :][:, idx]
sns.heatmap(corr)
plt.show()
sns.heatmap(cluster_corr(corr))
plt.show()
cluster_corr(corr).head(30)
ax = corr.mean().hist(bins=100)
plt.suptitle("Distribution of each stock_id's mean correlation with all other stock_ids")
plt.show()
corr
corr.mean().plot(kind='bar')
corr.sort_values('mean_corr')
corr
Price features
df = load_bt(DATA_RAW, stock_id, train_or_test)
add_wap(df)
dff = df[df.time_id == 5]
dff.wap.describe()
print(dff.wap.values[-1], dff.wap.values[0])

def first(x): return x.values[0]
def last(x): return x.values[-1]

dfa = df.groupby('time_id').agg({'wap': [first, last, np.min, np.max]})
dfa.columns = ['_'.join(c) for c in dfa.columns]
dfa.columns
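add_wap comes from opt_utils and is not shown; it presumably computes the standard level-1 weighted average price used throughout this competition. A minimal sketch under that assumption (not the actual opt_utils code):

# Hypothetical stand-in for opt_utils.add_wap -- assumes the usual order-book WAP definition.
def add_wap(df):
    # weighted average price from the best bid/ask, weighted by the opposite side's size
    df['wap'] = (df.bid_price1 * df.ask_size1 + df.ask_price1 * df.bid_size1) \
                / (df.bid_size1 + df.ask_size1)
    return df   # returned so both `add_wap(df)` and `df = add_wap(df)` work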
"""Same as p4 except Im goin to use 10 minutes
instead of 5."""
= load_bt(DATA_RAW, stock_id, train_or_test)
df = add_wap(df)
df 'log_return'] = df.groupby(['time_id'])['wap'].apply(log_return)
df['abs_log_return'] = df['log_return'].abs()
df['is_pos_return'] = (df['log_return'] > 0).astype(int)
df['is_neg_return'] = (df['log_return'] < 0).astype(int)
df['spread_pct'] = (df.ask_price1 - df.bid_price1) / df.wap
df['spread_2_pct'] = (df.ask_price2 - df.bid_price2) / df.wap
df['spread'] = (df.ask_price1 - df.bid_price1)
df['spread_2'] = (df.ask_price2 - df.bid_price2)
df['sum_bid'] = (df.bid_size1 + df.bid_size2)
df['sum_ask'] = (df.ask_size1 + df.ask_size2)
df['bid_ask_ratio'] = df['sum_bid'] / df['sum_ask']
df['sum_bid_ask'] = df['sum_bid'] + df['sum_ask'] df[
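log_return and realized_volatility also come from opt_utils; the standard definitions in this competition are the diff of log prices and the square root of the sum of squared log returns. Minimal sketches under that assumption (not necessarily the exact opt_utils implementations):

# Hypothetical stand-ins for opt_utils.log_return / opt_utils.realized_volatility -- illustrative only.
def log_return(prices):
    # log price change between consecutive book updates within one time_id
    return np.log(prices).diff()

def realized_volatility(series_log_return):
    # sqrt of the sum of squared log returns over the window (NaNs from .diff() are skipped by Series.sum)
    return np.sqrt(np.sum(series_log_return ** 2))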
# This shows there is no missing data in the book or trade data
bookna = 0
tradena = 0
for stock_id in train.stock_id.unique():
    book = load_bt(DATA_RAW, stock_id, train_or_test, book_only=True)
    trade = load_bt(DATA_RAW, stock_id, train_or_test, trade_only=True)
    bookna += book.isna().sum().sum()
    tradena += trade.isna().sum().sum()
print('bookna', bookna, 'tradena', tradena)
# for stock_id in train.stock_id.unique():
stock_id = train.stock_id.unique()[0]
book = load_bt(DATA_RAW, stock_id, train_or_test, book_only=True, add_stock_id=True)
trade = load_bt(DATA_RAW, stock_id, train_or_test, trade_only=True, add_stock_id=True)
dfs = []
for stock_id in train.stock_id.unique():
    book = load_bt(DATA_RAW, stock_id, train_or_test, book_only=True, add_stock_id=True)
    trade = load_bt(DATA_RAW, stock_id, train_or_test, trade_only=True, add_stock_id=True)
    b = book.groupby(['stock_id', 'time_id'])['seconds_in_bucket'].agg(len).to_frame().rename(columns={'seconds_in_bucket': 'len_book'})
    t = trade.groupby(['stock_id', 'time_id'])['seconds_in_bucket'].agg(len).to_frame().rename(columns={'seconds_in_bucket': 'len_trade'})
    dfs.append(pd.concat([b, t], axis=1))
df_len = pd.concat(dfs)
df_len

dff = df_len.reset_index()
dff['row_id'] = dff['stock_id'].astype(str) + '-' + dff['time_id'].astype(str)
dff = dff[['row_id', 'len_book', 'len_trade']].set_index('row_id')
dff = dff.join(train).reset_index()
dff['diff_len_book_len_trade'] = dff['len_book'] - dff['len_trade']
dff.head()
dff[['len_book', 'len_trade', 'diff_len_book_len_trade']]\
    .corrwith(dff['target']).to_frame().rename(columns={0: 'target'})\
    .style.background_gradient(cmap='viridis')\
    .set_properties(**{'font-size': '20px'})
df_len.len_book.hist()
df_len.len_book.min()
df_len.len_book.max()
f = lambda x: np.isnan(x).sum()
df.groupby('time_id')['bid_size1'].agg(f)
dff = df.groupby('time_id').agg(len)
dff.bid_size1.hist()
agg_dict = {
    'log_return': [realized_volatility, 'count', np.std, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'is_pos_return': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'is_neg_return': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'abs_log_return': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'sum_bid': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'sum_ask': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'wap': [np.mean, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'spread': [np.mean, np.sum, np.std, get_mean_decay(.99, -1), get_mean_decay(.99, 1), get_mean_decay(.95, -1), get_mean_decay(.95, 1)],
    'bid_ask_ratio': [np.mean, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'sum_bid_ask': [np.mean, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'size': [np.mean, np.sum, np.std, get_mean_decay(.99, -1), get_mean_decay(.99, 1), get_mean_decay(.95, -1), get_mean_decay(.95, 1)],
    'spread_pct': [np.mean, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
}
df_agg = df.groupby(['time_id']).agg(agg_dict).rename(
    columns={'<lambda_0>': 'mean_decay',
             '<lambda_1>': 'mean_decay_flip',
             '<lambda_2>': 'mean_decay_95',
             '<lambda_3>': 'mean_decay_flip_95',
             }
)
df_agg.columns = ['_'.join(c) for c in df_agg.columns]
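get_mean_decay and mean_decay come from opt_utils and are not shown; they appear to compute an exponentially decayed (weighted) mean, with the step argument controlling whether the heavier weights sit at the start or the end of the window. A rough sketch of that idea (not the actual opt_utils code):

# Hypothetical stand-ins for opt_utils.mean_decay / opt_utils.get_mean_decay -- illustrative only.
def mean_decay(x, decay=0.99, step=1, axis=0):
    x = np.asarray(x, dtype=float)
    w = decay ** np.arange(x.shape[axis])[::step]   # step=-1 reverses the weights
    if axis == 1:                                    # row-wise, for the (time_id x minute) matrix used below
        return np.nansum(x * w, axis=1) / w.sum()
    return np.nansum(x * w) / w.sum()

def get_mean_decay(decay, step):
    # factory used inside .agg(); each lambda shows up as '<lambda_N>' and is renamed above
    return lambda s: mean_decay(s.values, decay, step)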
############ Realized volatility for each minute ############
for m in range(1, 11):
    mask = (df.seconds_in_bucket >= 60 * m - 60) & (df.seconds_in_bucket < 60 * m)
    df_agg[f'real_vol_min_{m}'] = df[mask].groupby('time_id')['log_return'].agg(realized_volatility)

######### Decayed mean of realized volatility per minute ########
cols = [f'real_vol_min_{minute}' for minute in range(1, 11)]
x = df_agg[cols].values
for decay, step in product((.99, .95, .9, .85, .75, .65, .55, .45), (1, -1)):
    df_agg[f'real_vol_mean_decay_{decay}_{step}'] = mean_decay(x, decay, step, axis=1)

# df_agg['end_beg_decay_ratio'] = df_agg['real_vol_mean_decay_0.85_-1'] / df_agg['real_vol_mean_decay_0.85_1']  # replaced by next code
# Ratio ('momentum') features: each decayed mean is compared with its reversed-weight counterpart in the adjacent column.
for c1, c2 in zip(df_agg.columns, df_agg.columns[1:]):
    if 'mean_decay_flip' in c2:
        pre, suf = c2.split('mean_decay_flip')
        df_agg[pre + 'momentum' + suf] = df_agg[c1] / df_agg[c2]
    if 'vol_mean_decay' in c2 and '-1' in c2:
        pre, suf = c2.split('vol_mean_decay')
        df_agg[pre + 'momentum' + suf] = df_agg[c2] / df_agg[c1]

df_agg = df_agg.astype('float32')
df_agg['no_book'] = (df_agg['log_return_count'] == 0).astype(int)
df_agg['no_book'] = df_agg['no_book'].astype('category')

################# Adding 'row_id' column ##################
df_agg.reset_index(inplace=True)
df_agg['time_id'] = df_agg.time_id.apply(lambda x: f"{stock_id}-{x}")
df_agg.rename({'time_id': 'row_id'}, axis=1, inplace=True)
return df_agg.set_index('row_id')   # the `return` suggests this cell is the tail of the per-stock feature function
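The pickle loaded below (p5_train.pkl) is presumably built by running the per-stock aggregation above for every stock_id and joining the result onto train by row_id. A hypothetical sketch of that assembly, assuming the feature cell above is wrapped as a function gen_stock_features(stock_id); both names here are illustrative, not the actual script:

# Hypothetical assembly of per-stock features into a single training frame -- illustrative only.
def build_train_features(train, gen_stock_features):
    feats = pd.concat([gen_stock_features(sid) for sid in train.stock_id.unique()])
    out = train.copy()
    out['row_id'] = out['stock_id'].astype(str) + '-' + out['time_id'].astype(str)
    return out.set_index('row_id').join(feats)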
Looking at the feature and target correlation
train = pd.read_pickle('../input/generate-train-features-script/p5_train.pkl')
top_50_corr_cols = train.corrwith(train.target).abs()\
    .sort_values(ascending=False)[:50].index
train[top_50_corr_cols].corr()
sns.heatmap(train[top_50_corr_cols].corr())
train[['log_return_realized_volatility']].corrwith(train.target)
train['time_id_mean_real_vol'] = train.groupby('time_id')['log_return_realized_volatility'].transform('mean')
train[['time_id_mean_real_vol']].corrwith(train.target)
cols = [c for c in train.columns if 'wap' in c]
train[cols].corrwith(train.target)
load_bt(stock_id)