#all_no_test
from opt_utils import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import scipy.cluster.hierarchy as sch
import seaborn as sns
from itertools import product
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['font.size'] = 18
DATA_RAW = '../input/optiver-realized-volatility-prediction'
stock_id=0
train_or_test='train'
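read_train_or_test is imported from opt_utils, which is not shown in this notebook. A minimal sketch of what it presumably does, assuming it simply reads the competition CSV from DATA_RAW (the _sketch suffix is only to avoid shadowing the real helper):
# Hypothetical sketch of opt_utils.read_train_or_test -- the real helper may differ.
def read_train_or_test_sketch(data_dir, train_or_test='train'):
    """Read train.csv or test.csv from the competition input directory."""
    return pd.read_csv(f'{data_dir}/{train_or_test}.csv')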
train = read_train_or_test(DATA_RAW, 'train')
target_mean = train['target'].mean()
target_median = train['target'].median()
ax = train['target'].hist(bins=1000)
plt.suptitle('target distribution showing a positive skew')
plt.axvline(x=target_mean, color='red')
plt.axvline(x=target_median, color='green')
plt.text(x=target_mean, y=-1, s='mean', color='red', rotation=-30)
plt.text(x=target_median - .005, y=-1, s='median', color='green', rotation=-30)
plt.show()
train.target.describe()
fig, axes = plt.subplots(4, 1, figsize=(15, 28))
train.groupby('time_id')['target'].mean().sort_values(
    ascending=False)[:15].plot(kind='barh', ax=axes[0])
axes[0].set_title('Top 15 most volatile time periods')
axes[0].set_xlabel('Volatility')
train.groupby('time_id')['target'].mean().sort_values()[:15]\
    .plot(kind='barh', ax=axes[1])
axes[1].set_title('15 least volatile time periods')
axes[1].set_xlabel('Volatility')
train.groupby('stock_id')['target'].mean().sort_values(
    ascending=False)[:15].plot(kind='barh', ax=axes[2])
axes[2].set_title('Top 15 most volatile stock_ids')
axes[2].set_xlabel('Volatility')
train.groupby('stock_id')['target'].mean().sort_values()[:15]\
    .plot(kind='barh', ax=axes[3])
axes[3].set_title('15 least volatile stock_ids')
axes[3].set_xlabel('Volatility')
plt.show()
x = train.groupby('stock_id')['target'].mean()
x.sort_values(ascending=False)[:10].plot(kind='barh')
target_max = train['target'].max()
ax = train['target'].describe()[1:-1].plot(kind='bar')
plt.suptitle(f'statistics of target without max, which is {target_max}')
plt.show()
stats = train.groupby('stock_id')['target'].describe()
stats.head()
fig, axes = plt.subplots(2, 2)
ax = stats['mean'].hist(bins=1000, ax=axes[0, 0])
plt.suptitle('Distribution of per-stock mean target')
plt.show()
piv = train[['time_id', 'stock_id', 'target']].set_index(['time_id', 'stock_id']).unstack()
piv.style.background_gradient(cmap='viridis')\
    .set_properties(**{'font-size': '20px'})
corr = piv.corr()  # stock-by-stock correlation of target across time_ids
# import scipy
# import scipy.cluster.hierarchy as sch
# import seaborn as sns
def cluster_corr(corr_array, inplace=False):
    """
    All credit to Wil Yegelwel for
    https://wil.yegelwel.com/cluster-correlation-matrix/#:~:text=Cluster%20a%20Correlation%20Matrix%20%28in%20python%29%20Below%20is,highly%20correlated%20variables%20are%20next%20to%20eachother%20
    Rearranges the correlation matrix, corr_array, so that groups of highly
    correlated variables are next to each other.
    Parameters
    ----------
    corr_array : pandas.DataFrame or numpy.ndarray
        a NxN correlation matrix
    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        a NxN correlation matrix with the columns and rows rearranged
    """
    pairwise_distances = sch.distance.pdist(corr_array)
    linkage = sch.linkage(pairwise_distances, method='complete')
    cluster_distance_threshold = pairwise_distances.max() / 2
    idx_to_cluster_array = sch.fcluster(linkage, cluster_distance_threshold,
                                        criterion='distance')
    idx = np.argsort(idx_to_cluster_array)
    if not inplace:
        corr_array = corr_array.copy()
    if isinstance(corr_array, pd.DataFrame):
        return corr_array.iloc[idx, :].T.iloc[idx, :]
    return corr_array[idx, :][:, idx]
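# Note: cluster_distance_threshold = half the maximum pairwise distance controls how
# coarse the clusters are; a lower threshold yields more, smaller clusters in the
# reordered heatmap below.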
sns.heatmap(corr)
plt.show()
sns.heatmap(cluster_corr(corr))
plt.show()
cluster_corr(corr).head(30)
ax = corr.mean().hist(bins=100)
plt.suptitle("Distribution of each stock_id's mean correlation with all other stock_ids")
plt.show()
corr
corr.mean().plot(kind='bar')
corr.mean(axis=1).sort_values()  # stock_ids ranked by their mean correlation with the others
Price features
df = load_bt(DATA_RAW, stock_id, train_or_test)
add_wap(df)
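Neither load_bt nor add_wap is defined in this notebook; both come from opt_utils. Rough sketches of what they presumably do, assuming the competition's per-stock parquet layout and the standard level-1 WAP (the _sketch suffix again avoids shadowing the real helpers):
# Hypothetical sketches of the opt_utils helpers -- the real implementations may differ.
def load_bt_sketch(data_dir, stock_id, train_or_test='train',
                   book_only=False, trade_only=False, add_stock_id=False):
    # Read the per-stock book/trade parquet partitions and (by default) merge them
    # on time_id / seconds_in_bucket so book and trade columns live in one frame.
    book = pd.read_parquet(f'{data_dir}/book_{train_or_test}.parquet/stock_id={stock_id}')
    trade = pd.read_parquet(f'{data_dir}/trade_{train_or_test}.parquet/stock_id={stock_id}')
    if book_only:
        out = book
    elif trade_only:
        out = trade
    else:
        out = book.merge(trade, on=['time_id', 'seconds_in_bucket'], how='left')
    if add_stock_id:
        out['stock_id'] = stock_id
    return out

def add_wap_sketch(df):
    # Level-1 weighted average price: bid/ask prices weighted by the opposite side's size.
    df['wap'] = (df.bid_price1 * df.ask_size1 + df.ask_price1 * df.bid_size1) \
                / (df.bid_size1 + df.ask_size1)
    return df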
dff = df[df.time_id == 5]
dff.wap.describe()
print(dff.wap.values[-1], dff.wap.values[0])
def first(x): return x.values[0]
def last(x): return x.values[-1]
dfa = df.groupby('time_id').agg({'wap': [first, last, np.min, np.max]})
dfa.columns = ['_'.join(c) for c in dfa.columns]
dfa.columns
"""Same as p4 except I'm going to use 10 minutes
instead of 5."""
df = load_bt(DATA_RAW, stock_id, train_or_test)
df = add_wap(df)
df['log_return'] = df.groupby(['time_id'])['wap'].apply(log_return)
df['abs_log_return'] = df['log_return'].abs()
df['is_pos_return'] = (df['log_return'] > 0).astype(int)
df['is_neg_return'] = (df['log_return'] < 0).astype(int)
df['spread_pct'] = (df.ask_price1 - df.bid_price1) / df.wap
df['spread_2_pct'] = (df.ask_price2 - df.bid_price2) / df.wap
df['spread'] = (df.ask_price1 - df.bid_price1)
df['spread_2'] = (df.ask_price2 - df.bid_price2)
df['sum_bid'] = (df.bid_size1 + df.bid_size2)
df['sum_ask'] = (df.ask_size1 + df.ask_size2)
df['bid_ask_ratio'] = df['sum_bid'] / df['sum_ask']
df['sum_bid_ask'] = df['sum_bid'] + df['sum_ask']
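log_return (used above) and realized_volatility (used in the aggregations below) also live in opt_utils; assuming they follow the competition's standard definitions, minimal sketches (again suffixed to avoid shadowing the real helpers):
# Hypothetical sketches -- assumed to match the competition's definitions.
def log_return_sketch(series):
    # Log return between consecutive WAP observations within a time_id.
    return np.log(series).diff()

def realized_volatility_sketch(series):
    # Square root of the sum of squared log returns (the definition of the target).
    return np.sqrt(np.sum(series ** 2))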
# This shows there is no missing data in the book or trade data
bookna = 0
tradena = 0
for stock_id in train.stock_id.unique():
    book = load_bt(DATA_RAW, stock_id, train_or_test, book_only=True)
    trade = load_bt(DATA_RAW, stock_id, train_or_test, trade_only=True)
    bookna += book.isna().sum().sum()
    tradena += trade.isna().sum().sum()
print('bookna', bookna, 'tradena', tradena)
# for stock_id in train.stock_id.unique():
stock_id = train.stock_id.unique()[0]
book = load_bt(DATA_RAW, stock_id, train_or_test, book_only=True, add_stock_id=True)
trade = load_bt(DATA_RAW, stock_id, train_or_test, trade_only=True, add_stock_id=True)
dfs = []
for stock_id in train.stock_id.unique():
    book = load_bt(DATA_RAW, stock_id, train_or_test, book_only=True, add_stock_id=True)
    trade = load_bt(DATA_RAW, stock_id, train_or_test, trade_only=True, add_stock_id=True)
    b = book.groupby(['stock_id', 'time_id'])['seconds_in_bucket'].agg(len).to_frame().rename(columns={'seconds_in_bucket': 'len_book'})
    t = trade.groupby(['stock_id', 'time_id'])['seconds_in_bucket'].agg(len).to_frame().rename(columns={'seconds_in_bucket': 'len_trade'})
    dfs.append(pd.concat([b, t], axis=1))
df_len = pd.concat(dfs)
df_len
dff = df_len.reset_index()
dff['row_id'] = dff['stock_id'].astype(str) + '-' + dff['time_id'].astype(str)
dff = dff[['row_id', 'len_book', 'len_trade']].set_index('row_id')
dff = dff.join(train).reset_index()
dff['diff_len_book_len_trade'] = dff['len_book'] - dff['len_trade']
dff.head()
dff[['len_book', 'len_trade', 'diff_len_book_len_trade']]\
    .corrwith(dff['target']).to_frame().rename(columns={0: 'target'})\
    .style.background_gradient(cmap='viridis')\
    .set_properties(**{'font-size': '20px'})
df_len.len_book.hist()
df_len.len_book.min()
df_len.len_book.max()
f = lambda x: np.isnan(x).sum()
df.groupby('time_id')['bid_size1'].agg(f)
dff = df.groupby('time_id').agg(len)
dff.bid_size1.hist()
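get_mean_decay and mean_decay are further opt_utils helpers. Judging from how they are used below (an aggregation callable built from a decay factor and a direction), they presumably compute an exponentially decayed weighted mean; a hedged sketch, with the usual _sketch suffix:
# Hypothetical sketches of the decay helpers -- signatures inferred from their usage below.
def mean_decay_sketch(x, decay, step, axis=0):
    # Weighted mean with geometrically decaying weights along `axis`;
    # step flips the direction of the weighting.
    n = np.asarray(x).shape[axis]
    weights = decay ** np.arange(n)[::step]
    return np.average(x, weights=weights, axis=axis)

def get_mean_decay_sketch(decay, step):
    # Wrap mean_decay as a lambda so it can sit inside a groupby().agg() dict;
    # pandas names the resulting columns '<lambda_0>', '<lambda_1>', ...,
    # which is why they are renamed to 'mean_decay', 'mean_decay_flip', etc. below.
    return lambda x: mean_decay_sketch(x, decay, step)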
agg_dict = {
    'log_return': [realized_volatility, 'count', np.std, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'is_pos_return': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'is_neg_return': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'abs_log_return': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'sum_bid': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'sum_ask': [np.sum, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'wap': [np.mean, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'spread': [np.mean, np.sum, np.std, get_mean_decay(.99, -1), get_mean_decay(.99, 1), get_mean_decay(.95, -1), get_mean_decay(.95, 1)],
    'bid_ask_ratio': [np.mean, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'sum_bid_ask': [np.mean, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
    'size': [np.mean, np.sum, np.std, get_mean_decay(.99, -1), get_mean_decay(.99, 1), get_mean_decay(.95, -1), get_mean_decay(.95, 1)],
    'spread_pct': [np.mean, get_mean_decay(.99, -1), get_mean_decay(.99, 1)],
}
df_agg = df.groupby(['time_id']).agg(agg_dict).rename(
    columns={'<lambda_0>': 'mean_decay',
             '<lambda_1>': 'mean_decay_flip',
             '<lambda_2>': 'mean_decay_95',
             '<lambda_3>': 'mean_decay_flip_95',
             }
)
df_agg.columns = ['_'.join(c) for c in df_agg.columns]
############ Realized volatility for each minute ############
for m in range(1, 11):
    mask = (df.seconds_in_bucket >= 60 * m - 60) & (df.seconds_in_bucket < 60 * m)
    df_agg[f'real_vol_min_{m}'] = df[mask].groupby('time_id')['log_return'].agg(realized_volatility)
######### Decay sum of realized volatility per minute ########
cols = [f'real_vol_min_{minute}' for minute in range(1, 11)]
x = df_agg[cols].values
for decay, step in product((.99, .95, .9, .85, .75, .65, .55, .45), (1, -1)):
    df_agg[f'real_vol_mean_decay_{decay}_{step}'] = mean_decay(x, decay, step, axis=1)
# df_agg['end_beg_decay_ratio'] = df_agg['real_vol_mean_decay_0.85_-1'] / df_agg['real_vol_mean_decay_0.85_1'] # replaced by next code
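# Momentum-style ratios: each mean_decay column is divided by its adjacent
# mean_decay_flip column (the same decay applied in the opposite direction),
# and similarly for the per-minute real_vol decay columns, giving simple
# momentum signals for how activity shifts within the bucket.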
for c1, c2 in zip(df_agg.columns, df_agg.columns[1:]):
    if 'mean_decay_flip' in c2:
        pre, suf = c2.split('mean_decay_flip')
        df_agg[pre + 'momentum' + suf] = df_agg[c1] / df_agg[c2]
    if 'vol_mean_decay' in c2 and '-1' in c2:
        pre, suf = c2.split('vol_mean_decay')
        df_agg[pre + 'momentum' + suf] = df_agg[c2] / df_agg[c1]
df_agg = df_agg.astype('float32')
df_agg['no_book'] = (df_agg['log_return_count'] == 0).astype(int)
df_agg['no_book'] = df_agg['no_book'].astype('category')
################# Adding 'row_id' column ##################
df_agg.reset_index(inplace=True)
df_agg['time_id'] = df_agg.time_id.apply(lambda x: f"{stock_id}-{x}")
df_agg.rename({'time_id': 'row_id'}, axis=1, inplace=True)
df_agg = df_agg.set_index('row_id')
Looking at the feature and target correlation
train = pd.read_pickle('../input/generate-train-features-script/p5_train.pkl')
top_50_corr_cols = train.corrwith(train.target).abs()\
    .sort_values(ascending=False)[:50].index
train[top_50_corr_cols].corr()
sns.heatmap(train[top_50_corr_cols].corr())
train[['log_return_realized_volatility']].corrwith(train.target)
train['time_id_mean_real_vol'] = train.groupby('time_id')['log_return_realized_volatility'].transform('mean')
train[['time_id_mean_real_vol']].corrwith(train.target)
cols = [c for c in train.columns if 'wap' in c]
train[cols].corrwith(train.target)