sns.set()
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 16
Feature engineering
PATH_DATA = 'data'
PATH_DATA_RAW = 'data/raw'
PATH_DATA_FEATURES = 'data/features'
os.listdir(PATH_DATA_RAW)
################## Load data ####################
chunks = pd.read_csv(os.path.join(PATH_DATA_RAW, 'sales_train_evaluation.csv'), chunksize=1000)
df_stv = pd.concat(list(chunks)) # Safe for low RAM situation
df_cal = pd.read_csv(os.path.join(PATH_DATA_RAW, 'calendar.csv'))
df_prices = pd.read_csv(os.path.join(PATH_DATA_RAW, 'sell_prices.csv'))
df_ss = pd.read_csv(os.path.join(PATH_DATA_RAW, 'sample_submission.csv'))
What you can get out of this notebook
- Know how to make lag features from the horizontal “rectangle” data representation, which is how the data starts.
- Knowledge of how to use numpy to do fast rolling window aggregations.
Making a grid to align all features
This section develops code for make_grid_df, which will yield: * A dataframe to align all features * A numpy array where each row is a time series. This data representation can be good for fast feature engineering.
Add prediction horizon
We will start by adding the prediction horizon to our original data so that our features will be generated for all training data and test data at the same time.
last_day = int(df_stv.columns[-1][2:])
pred_horizon = 28
for i in range(last_day + 1, last_day + 1 + pred_horizon):
    df_stv[f'd_{i}'] = np.nan
Make a tidy grid
We want our data in a tidy format, where we have a row for every product/sales_day combination. To do this, we start by reshaping our data to long format. I will call this our grid_df, on which we will build our features.
Using pandas
We can use the pandas DataFrame .melt method.
s = time.time()
start_time = time.time()
DROP_COLS = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
grid_df = df_stv.drop(DROP_COLS, axis=1).melt(id_vars='id', var_name='d', value_name='sales')
# print(f"Total time for melt: {(time.time() - start_time)/60} min")
print(f"Total time for melt: ", time_taken(start_time))

# Saving space
start_time = time.time()
grid_df['d'] = grid_df.d.str[2:].astype(np.int16)
print(f"Total time for day col change: ", time_taken(start_time))

start_time = time.time()
grid_df['id'] = grid_df.id.astype('category')
print(f"Total time for category: ", time_taken(start_time))
print(time_taken(s))
display(grid_df)
del s
Faster grid creation using numpy
d_cols = [col for col in df_stv.columns if col.startswith('d_')]
g = pd.DataFrame({'id': pd.Series(np.tile(df_stv.id, len(d_cols))).astype('category'),
                  'd': np.concatenate([[int(s[2:])] * df_stv.shape[0] for s in d_cols]).astype(np.int16),
                  'sales': df_stv[d_cols].values.T.reshape(-1,)})
print(f'Both grids are the same: {grid_df.equals(g)}')
Isolate numpy array in “rectangle” representation
I will take the sales values as they are to form my base “rectangle” of sales. I think I can take this rectangle and quickly reshape it so that it lines up with grid_df. If I am correct, we can use this to create features quickly.
Test: Reshape the basic rectangle so that it matches sales of grid_df
rec = df_stv[d_cols].values
test_sales = rec.T.reshape(-1)
print('test_sales matches sales?? ', (np.nan_to_num(test_sales) == grid_df['sales'].fillna(0)).all())
The competition guide states that leading zero sales should not be considered, therefore we need to convert these leading zeros to NaNs.
nan_leading_zeros
nan_leading_zeros (rec)
Leading zeros indicate an item was not yet for sale. We will mark them as np.nan to ensure they are not used for training.
print(rec[:10, :5])
nan_leading_zeros(rec[:10, :5])
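For intuition, here is a rough sketch of how the leading zeros could be masked with numpy; the name and the float32 cast are illustrative assumptions, not the exported implementation.

# Sketch only: everything before an item's first recorded sale becomes np.nan.
def nan_leading_zeros_sketch(rec):
    rec = rec.astype(np.float32).copy()
    seen_sale = np.maximum.accumulate(np.nan_to_num(rec) > 0, axis=1)  # True from first sale onward
    rec[~seen_sale] = np.nan
    return rec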
Main function
make_grid_df
make_grid_df (df:pandas.core.frame.DataFrame, pred_horizon=28)
Specific to the M5 competition data. Returns a grid_df to align all features and the sales data in a “rectangle” data representation, a 2D numpy array where every row is an item’s time series.
grid_df, rec = make_grid_df(os.path.join(PATH_DATA_RAW, 'sales_train_evaluation.csv'), pred_horizon=28)
grid_df
rec
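To tie the pieces above together, here is a minimal sketch of what make_grid_df could do internally. It assumes a df_stv-style DataFrame is passed (the exported function also appears to accept a file path), and the real implementation may handle dtypes and memory differently.

def make_grid_df_sketch(df, pred_horizon=28):
    # Sketch only: combines the steps developed above.
    df = df.copy()
    last_day = int(df.columns[-1][2:])
    for i in range(last_day + 1, last_day + 1 + pred_horizon):
        df[f'd_{i}'] = np.nan                              # append the prediction horizon
    d_cols = [c for c in df.columns if c.startswith('d_')]
    rec = nan_leading_zeros(df[d_cols].values)             # "rectangle": one row per item time series
    grid_df = pd.DataFrame({
        'id': pd.Series(np.tile(df.id, len(d_cols))).astype('category'),
        'd': np.concatenate([[int(c[2:])] * df.shape[0] for c in d_cols]).astype(np.int16),
        'sales': rec.T.reshape(-1)})
    return grid_df, rec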
Base features
Functions to create basic calendar and price features
We start with a grid_df so that all our features will be aligned on the same index, making it easy to add features for training. grid_df was created in the last few cells.
Base categorical variables given by hierarchical levels
First, we will add the grouping levels of the data as features. This is easy because the features are already included in df_stv columns. We just need to make a copy of these columns for every day of training and prediction.
add_base
add_base (grid_df, df_stv, rec)
Adds the basic categorical features to grid_df.
add_base(grid_df, df_stv, rec)
grid_df.info()
grid_df.head(3)
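As a sketch of the idea (not necessarily the exported add_base), the hierarchy columns can be tiled once per day, relying on grid_df being ordered day-major with one block of items per day.

# Sketch only: broadcast the hierarchy columns from df_stv to every day in grid_df.
def add_base_sketch(grid_df, df_stv):
    n_days = grid_df['d'].nunique()
    for col in ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']:
        grid_df[col] = pd.Series(np.tile(df_stv[col], n_days)).astype('category')
    return grid_df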
Price features
create_price_fe
create_price_fe (df_prices)
Adds price features onto price_df. This is the step we take before merging prices onto our grid_df.
df_prices = create_price_fe(df_prices)
df_prices.info()
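The exact columns created by create_price_fe are not shown here; the sketch below lists the kind of price features that are typical for this data (max, normalized price, weekly momentum), purely as an illustration with assumed column names.

# Illustrative only: common price features; column names are assumptions.
def create_price_fe_sketch(df_prices):
    g = df_prices.groupby(['store_id', 'item_id'])['sell_price']
    df_prices['price_max'] = g.transform('max')
    df_prices['price_min'] = g.transform('min')
    df_prices['price_norm'] = df_prices['sell_price'] / df_prices['price_max']
    df_prices['price_momentum'] = df_prices['sell_price'] / g.shift(1)   # week-over-week change
    return df_prices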
add_price_fe
add_price_fe (grid_df, df_prices, df_cal)
Adds on price features to grid_df.
grid_df = add_price_fe(grid_df, df_prices, df_cal)
grid_df.info()
Calendar features
grid_df = grid_df[['id', 'd', 'sales']]
add_cal_fe
add_cal_fe (grid_df, df_cal)
Adds calendar features onto grid_df.
grid_df = add_cal_fe(grid_df, df_cal)
grid_df.info()
Snap features
The columns with names like ‘snap_CA’ indicate whether SNAP (also known as EBT) benefits are accepted in California on each day. What days can people use their SNAP benefits?
df = df_cal.iloc[-180:, :].copy()
df['date'] = pd.to_datetime(df.date)
fig, ax = plt.subplots(3, 1)
df.set_index('date').snap_CA.plot(ax=ax[0], title='California')
df.set_index('date').snap_TX.plot(ax=ax[1], title='Texas')
df.set_index('date').snap_WI.plot(ax=ax[2], title='Wisconsin')

ax[2].xaxis.set_major_locator(mdates.MonthLocator(bymonthday=1))
ax[2].xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))
fig.suptitle('Days where people can use SNAP benefits')
fig.autofmt_xdate()
fig.tight_layout()
plt.show()
print()
df_cal['date'] = pd.to_datetime(df_cal['date'])
df_cal.groupby(df_cal.date.dt.day)[['snap_CA', 'snap_TX', 'snap_WI']].sum().plot(
    title='Are the snap days distributed across these days for the entire dataset?', kind='bar', figsize=(14, 6))
plt.show()
It seems the snap days are consistent for every month in the dataset, since each day of the month has the same number of snap occurrences.
############### Snap days of month ###########################
ca = grid_df[grid_df['snap_CA'] == 1].tm_d.unique()
tx = grid_df[grid_df['snap_TX'] == 1].tm_d.unique()
wi = grid_df[grid_df['snap_WI'] == 1].tm_d.unique()
print('For each state, what days of the month are snap days?')
print('CA:', ca)
print('TX:', tx)
print('WI:', wi)
Simple feature
Just map each snap day to 1 through 10 so the model will know which of the 10 snap days it is.
add_snap_transform_1
add_snap_transform_1 (grid_df)
Adds a column that shows which of the 10 snap days it is. The value is 0 if it is not a snap day.
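One way to get this numbering, shown here as a sketch for a single state column (the exported function likely picks the column matching each row's state):

# Sketch only: number the 10 SNAP days of each month 1..10, and leave 0 for non-snap days.
snap = df_cal['snap_CA']
snap_rank_ca = snap.groupby([df_cal['year'], df_cal['month']]).cumsum() * snap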
A more meaningful mapping?
I would like to transform the snap information in a way that might give more information about non snap days. In particular, I want the “gap” days, non-snap days right in between two snap days, to be considered different from the long stretch of non-snap days towards the end of the month. I’d also like the non-snap days leading into the first snap day to be the same, no matter what state we are considering, so that algorithms can use this feature without needing state information to decode meaning.
add_snap_transform_2
add_snap_transform_2 (grid_df)
This maps snap days and non-snap days in a way that may be more meaningful than snap_transform_1.
Any value above 40 indicates a snap day. Lower values are non-snap days, and the lowest values are “gap” days in between snap days. In this way I’m hoping the model can use this feature as non-categorical and efficiently sort out which days may have higher demand. Also, 16-21 will always be the days following the last snap day, and 27-31 will be the days leading up to the first snap day. My theory is these numbers will encode more meaning with less confusion caused by states having different snap days.
Special event features
How many days have events?
print(f'Type 1: {grid_df.event_type_1.count()/grid_df.shape[0] * 100:.2f} percent')
print(f'Type 2: {grid_df.event_type_2.count()/grid_df.shape[0] * 100:.2f} percent')
What special events do we have?
print('Unique types in event_type_1:')
display(grid_df.event_type_1.unique().tolist())
print('Unique types in event_type_2:')
display(grid_df.event_type_2.unique().tolist())
print('Unique names in event_name_1:')
display(grid_df.event_name_1.unique().tolist())
Do we ever have event_type_2 if there is not an event_type_1?
How often do we have 2 events on the same day?
grid_df[grid_df.event_name_2.notnull()].drop_duplicates('d')
mask = (grid_df['event_type_1'] == 'Religious') & (grid_df['event_type_2'] == 'Cultural')
grid_df[mask].drop_duplicates('d')
Since there are only a few days with 2 events, I will only consider event_type_1 for my new event features. In the cases where I think event_type_2 will be more relevant, I will move it to event_type_1. I think cultural event types may be more important than religious events. This basically amounts to making Easter cultural, and it also accounts for Cinco de Mayo.
add_event_features
add_event_features (grid_df, n_items=30490, n_days_in_data=1969)
Adds some features related to special events like holidays to grid_df. The columns added are: next_event_type_1, last_event_type_1, days_since_event, days_until_event.
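A rough sketch of how days_since_event and days_until_event could be computed from the calendar and merged onto grid_df; the real function may count the event day itself differently.

# Sketch only: distance in days to the previous / next event_name_1 on the calendar.
cal = df_cal[['d', 'event_name_1']].copy()
cal['d'] = cal['d'].str[2:].astype(np.int16)              # match grid_df's integer day column
has_event = cal['event_name_1'].notnull().to_numpy()
pos = np.arange(len(cal), dtype=float)
last_pos = pd.Series(np.where(has_event, pos, np.nan)).ffill().to_numpy()
next_pos = pd.Series(np.where(has_event, pos, np.nan)).bfill().to_numpy()
cal['days_since_event'] = pos - last_pos
cal['days_until_event'] = next_pos - pos
grid_df = grid_df.merge(cal[['d', 'days_since_event', 'days_until_event']], on='d', how='left')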
del df_cal, df_prices, df_ss
Main function
fe_base_features
fe_base_features (path_data_raw:str<pathtorawdatafolder>='data/raw', path_features:str<pathtofeaturefolder>='data/features', path_to_train_file:str<pathtotraindata>=None)
Creates the basic categorical, price, and calendar features using the functions add_base, create_price_fe, add_price_fe, add_cal_fe, add_snap_transform_1, add_snap_transform_2, and add_event_features.
fe_base_features(PATH_DATA_RAW, os.path.join(PATH_DATA, 'features'))
display(load_file(f'{os.path.join(PATH_DATA, "features")}/fe_base.csv').info())
display(load_file(f'{os.path.join(PATH_DATA, "features")}/fe_price.csv').info())
display(load_file(f'{os.path.join(PATH_DATA, "features")}/fe_cal.csv').info())
display(load_file(f'{os.path.join(PATH_DATA, "features")}/fe_snap_event.csv').info())
Encoding features with target statistics
encode_target
encode_target (df:pandas.core.frame.DataFrame, target:str, cols:Union[list,str], func:Union[str,<built-infunctioncallable>], verbose=True)
Uses pandas groupby(col)[target].transform(func) to encode each col in cols. The target col can be any numerical column.
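For example, a mean encoding of sales by store could be produced with the same pattern (assuming a store_id column is present in the frame being encoded; the output column name is just for illustration):

# Hypothetical usage of the groupby/transform pattern described above.
grid_df['enc_store_id_mean'] = (grid_df.groupby('store_id')['sales']
                                       .transform('mean')
                                       .astype(np.float16))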
Main function
Poor encoding
Below is the method I used in the competition. I found that the models performed much better during training, and much worse during validation, so I dropped them from training.
I should have done the encoding more carefully, using no future data. I will explore encodings more in the post competition experiments.
fe_encodings
fe_encodings (path_features:<pathtofeaturefolder>='data/features', path_out_features:str<pathtofeaturefolderforoutput>=None, start_test:int<Firstdaytostartnans>=1942)
Creates target encoding with mean and std for various columns, with sales after start_test set to np.nan.
fe_encodings()
display(load_file(f'{PATH_DATA_FEATURES}/fe_enc_mean.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/fe_enc_std.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/fe_enc_special.csv').info())
Lags and rolling features
These are the features created from the raw sales data directly.
Be careful with lag features. We must be mindful that we will not have the same information for all days in the forecast horizon. If we want a single model to predict all days, we can only use lagging features from 28 days and older. In order to create one set of features that we can use for all predictions we will do the following: * Create all lagging features as if we are building a model for 1 day into the future. So “lag_1” means sales one day before the first day of the prediction horizon. * When building a model for the nth day of the horizon, we need to shift the lag features n - 1 extra days. Since we have 30490 time series, we do this by shifting the index of the features by (n - 1) * 30490, so that the lag features for all training and testing data will be lagged by an extra (n - 1) days to keep information aligned properly.
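As a concrete sketch of that re-alignment (assuming lag columns have been built as described above and grid_df is ordered day-major with 30490 series per day):

# Sketch only: re-align day-1 lag features for a model predicting horizon day n.
n = 3                                  # e.g. the 3rd day of the horizon
num_series = 30490
lag_cols = [c for c in grid_df.columns if c.startswith('lag_')]
grid_df[lag_cols] = grid_df[lag_cols].shift((n - 1) * num_series)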
Basic lag features
Before reshaping the data to become a column, we need to shift our rectangle by prepending the data with np.nans to make up for the data we have cut off. Therefore, all the d_1 products in grid_df will have np.nan for lag_1. In fact, as we carry out this process for all lag days, rows with sales on d_x will have np.nan values for all lags lag_y where y >= x.
grid_df, rec = make_grid_df(os.path.join(PATH_DATA_RAW, 'sales_train_evaluation.csv'))
grid_df.shape
make_lag_col
make_lag_col (rec:<built-infunctionarray>, lag:int)
Transform the ‘rectangle’ of time series into a lag feature
rec[:2, :5]
make_lag_col(rec[:2, :5], 1)
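A minimal sketch of the approach described above, prepending np.nans and flattening day-major so the result lines up with grid_df['sales']; the exported make_lag_col may differ in details such as dtype.

# Sketch only: build one lag column from the "rectangle" of time series.
def make_lag_col_sketch(rec, lag):
    pad = np.full((rec.shape[0], lag), np.nan)
    shifted = np.concatenate([pad, rec[:, :-lag]], axis=1)   # drop the last `lag` days
    return shifted.T.reshape(-1).astype(np.float16)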
add_lags
add_lags (grid_df, rec, lags=range(1, 16))
add_lags(grid_df, rec)
grid_df.info()
Pandas shift
We can also just use pandas shift, which is easier to implement and not much slower. The only downside is that we must use num_series to shift by the correct increment. Here we will do it for g, which was the same as grid_df before adding lags. Our time was not wasted though, because we learned skills that we will need for making rolling windows.
rec.shape
g, rec_tmp = make_grid_df(os.path.join(PATH_DATA_RAW, 'sales_train_evaluation.csv'))
num_series = df_stv.shape[0]
for i in range(1, 16):
    g[f'lag_{i}'] = g['sales'].shift(num_series * i).astype(np.float16)
del g
gc.collect()
Main function
fe_lags
fe_lags (path_data_raw:str<pathtorawdatafolder>='data/raw', path_features:str<pathtofeaturefolder>='data/features', path_to_train_file:str<pathtotraindata>=None)
Creates lags and rolling window features using add_lags and add_rolling_cols.
# fe_lags(PATH_DATA_RAW, path_features='.')
fe_lags()
max_lag = 84
cols_per_file = 14
for lag in range(1, max_lag, cols_per_file):
    display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_lags_{lag}_{lag + cols_per_file - 1}.csv').info())
Rolling features
rolling window function
Please check out “Efficient rolling statistics with NumPy” by Erik Rigtorp. The article shows some cool numpy tricks to do really fast rolling window calculations by creating “rolling window views”.
Update to the article 2021-04-21: “NumPy now comes with a builtin function sliding_window_view that does exactly this. There’s also the Bottleneck library with optimized functions for rolling mean, standard deviation etc.”
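For reference, this is the stride-trick pattern from that article, along with the newer built-in; the rolling_window function documented below follows the same idea, though its exact output shape and padding may differ.

# Classic version from the referenced article:
def rolling_window_rigtorp(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

# NumPy >= 1.20 equivalent:
# np.lib.stride_tricks.sliding_window_view(a, window, axis=-1)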
rolling_window
rolling_window (a:<built-infunctionarray>, window:int)
A super fast way of getting a rolling window view with size window on a numpy array. Reference: https://rigtorp.se/2011/01/01/rolling-statistics-numpy.html
x = np.arange(10).reshape((2, 5))
x
rw = rolling_window(x, 3)
rw
np.mean(rw, axis=-1)
np.std(rw, axis=-1)
np.median(rw, axis=-1)
del x, rw
split_array
split_array (ary, sections, axis=0)
Works just like np.split, but sections must be a single integer. It will work, even when sections doesn’t evenly divide the length of ary.
This avoids errors that occur when using make_rolling_col with a high n_splits that does not divide the number of series evenly.
x = np.array(range(9))
x
split_array(x, 4)
make_rolling_col
make_rolling_col (rw, function, n_splits=10)
Returns a one-dimensional np.array after function has been applied to the rolling window view rw.
rw = rolling_window(rec, 3)
print(f'The shape {rw.shape}, represents (num_series, num_windows, window_size)')
function = np.mean
col = make_rolling_col(rw, function)
print('Make sure the shape of the resulting column matches grid_df')
print(grid_df.shape[0],'=', col.shape[0])
x = np.arange(10).reshape((2, 5))
x
rw = rolling_window(x, 3)
make_rolling_col(rw, function, n_splits=1)
del rw, function, col
Some more functions for rolling windows
These functions are designed to act on a rolling_window array created by the rolling_window function, similar to np.mean or np.std.
mean_decay
mean_decay (rolling_window, axis=-1)
Returns the mean_decay along an axis of a rolling window object, which is created by the rolling_window() function.
diff_nanmean
diff_nanmean (rolling_window, axis=-1)
For M5 purposes, used on an object generated by the rolling_window function. Returns the mean of the first difference of a window of sales.
diff_mean
diff_mean (rolling_window, axis=-1)
For M5 purposes, used on an object generated by the rolling_window function. Returns the mean of the first difference of a window of sales.
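As rough sketches of what these window functions could compute (the decay rate and exact NaN handling are assumptions, and the exported versions also take an axis argument):

def mean_decay_sketch(rw):
    # Exponentially down-weight older days in each window (newest day gets weight 1).
    w = 0.9 ** np.arange(rw.shape[-1])[::-1]
    return np.nansum(rw * w, axis=-1) / w.sum()

def diff_nanmean_sketch(rw):
    # Mean of the day-to-day differences within each window, ignoring NaNs.
    return np.nanmean(np.diff(rw, axis=-1), axis=-1)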
add_rolling_cols
add_rolling_cols (grid_df:pandas.core.frame.DataFrame, rec:<built- infunctionarray>, windows:list, functions:list, function_names:list=None, n_splits:list=10)
Adds rolling features to grid_df.
grid_df = add_rolling_cols(grid_df,
                           rec,
                           windows=[7, 14, 30, 60, 140],
                           functions=[np.mean, np.std],
                           function_names=['mean', 'std'])
grid_df.info()
Main function
fe_rw_stats
fe_rw_stats (path_data_raw:str<pathtorawdatafolder>='data/raw', path_features:str<pathtofeaturefolder>='data/features', path_to_train_file:str<pathtotraindata>=None)
Creates lags and rolling window features using add_lags and add_rolling_cols.
# fe_rw_stats(PATH_DATA_RAW, path_features='.')
fe_rw_stats()
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_rw_1.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_rw_2.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_rw_3.csv').info())
Average for each day of the week
Feature telling how long it has been since there was a sale
get_days_since_sale
get_days_since_sale (grid_df, num_series=30490)
Returns a column that shows how many days it has been since there was a sale.
add_dow_means
add_dow_means (grid_df, rec, n_weeks)
Adds features to grid_df for the mean of each day of the week over the past n_weeks.
For any row, the column ‘mean_{n_weeks}_dow_{i}’ represents the mean of the last n_weeks of sales for the day of the week that is i days behind the date of this row. So if today is Friday, n_weeks=4 and i=1, this column is equal to the mean sales of the last 4 Thursdays.
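A sketch of how these columns could be built from the lag machinery above; the column names and the choice of offsets 1-7 are assumptions, not the exported implementation.

# Sketch only: average the same weekday over the previous n_weeks weeks.
def add_dow_means_sketch(grid_df, rec, n_weeks=4):
    for i in range(1, 8):                                   # i days behind today's weekday
        lags = [make_lag_col(rec, i + 7 * w) for w in range(n_weeks)]
        grid_df[f'mean_{n_weeks}_dow_{i}'] = np.nanmean(np.stack(lags), axis=0).astype(np.float16)
    return grid_df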
Main function
fe_dow_means
fe_dow_means (path_data_raw:str<pathtorawdatafolder>='data/raw', path_features:str<pathtofeaturefolder>='data/features', path_to_train_file:str<pathtotraindata>=None)
Creates the features for day of week means using add_dow_means. We also use get_days_since_sale in this script, since there isn’t another feature group very similar to this one.
# fe_dow_means(PATH_DATA_RAW, '.')
fe_dow_means()
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_dow_means_and_days_since_sale.csv').info())
Shifted lag rolling features
Perhaps I also want to know the 7 day rolling mean, but from 7 days ago. This could go directly into a model, or we could create a weekly momentum feature: momentum_7_rolling_mean_7 = shift_1_rolling_mean_7 / shift_8_rolling_mean_7. We have already calculated these features; we just need to shift the columns by num_series * (shift_days - 1). We subtract 1 from shift_days because the column shift_1_rolling_mean_7 is already shifted 1 day.
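A sketch of that shift-and-divide step for a single column (add_shift_cols below generalizes this over several shifts and columns):

# Sketch only: derive a shifted copy and a momentum ratio from an existing feature.
num_series = 30490                       # rows per day in grid_df
shift_days = 8
col = 'shift_1_rolling_mean_7'
shifted = grid_df[col].shift(num_series * (shift_days - 1))
grid_df[f'shift_{shift_days}_rolling_mean_7'] = shifted.astype(np.float16)
grid_df['momentum_7_rolling_mean_7'] = (grid_df[col] / shifted).astype(np.float16)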
add_shift_cols
add_shift_cols (grid_df:pandas.core.frame.DataFrame, shifts:list, cols:list, num_series:int=30490, momentum:bool=True)
Adds shift_{shift} and momentum_{shift - 1} features for each int shift in shifts, for each column in cols. cols must be a list of columns that begin with ‘shift_1’ for this function to work.
############## Adding shifted rolling mean ###############
shifts = [8, 15, 22, 29]
cols = [f'shift_1_rolling_mean_{i}' for i in [7, 14]]
add_shift_cols(grid_df, shifts, cols, num_series=df_stv.shape[0])
grid_df.info()
Main function
fe_shifts_momentum
fe_shifts_momentum (path_features:str<pathtofeaturefolder>='data/features', path_out_features:str<pathtofeaturefolderforoutput>='', num_series:int<Numberofseriesforshifting>=30490)
Creates shifts and momentum features using add_shift_cols
Parameters
path_features: Param(‘path to feature folder’, str)=‘data/features’
path_out_features: Param(‘path to feature folder for output’, str)=‘data/features’. This is mainly to run on Kaggle, where path_features is set to an input dataset (because we need the rolling window stat features to be present) and path_out_features is set to the working directory for the output.
num_series: Param(‘Number of series for shifting’, int)=30490
# fe_shifts_momentum('.', num_series=df_stv.shape[0])
# time.sleep(1)
fe_shifts_momentum(num_series=df_stv.shape[0])
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_shifts_mom_1.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_shifts_mom_2.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_shifts_mom_3.csv').info())
Dimensionality reduction of lags
Since we have so many lags, I will try to use PCA to reduce the number of features I have. I can’t fit all the lags into memory, so I will create the PCA features iteratively: starting with lags 71 through 84, save the file, then keep the top 7 components to do PCA again with lags 57 through 70, and so on until I have 14 PCA components for lags 1 through 84. Then I can decide how many lag features I want to keep without reducing their dimension.
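The reduction itself can be done with scikit-learn's IncrementalPCA, which fits in batches; the sketch below assumes a hypothetical lags_df holding one block of 14 lag columns.

# Sketch only: reduce one block of lag columns to its top 7 components.
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=7, batch_size=500_000)
X = lags_df.fillna(0).to_numpy(dtype=np.float32)   # lags_df is a placeholder for 14 lag columns
components = ipca.fit_transform(X)                 # shape (n_rows, 7), carried into the next block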
Main function
fe_ipca_lags
fe_ipca_lags (path_data_raw:str<pathtorawdatafolder>='data/raw', path_features:str<Pathtofeaturefile>='data/features', path_to_train_file:str<pathtotraindata>=None, end:int<lastdaytostartlagsfrom>=1, restart:int<startifresumingwithlags_df.pklinrestart_dir>=None, target:str<Nameoftargetcolumn>='sales')
Creates ipca columns for 84 lag days, starting from the end and accumulating backward. With 16 GB of RAM, we can only fit 14 with ipca at a time, so for each iteration, we:
1) Create 14 new lag days, and use ipca to reduce them to the top 7 components.
2) Combine those with the top 7 components from the previous step.
3) Perform ipca on these 14 features, save the output, and keep the top 7 components for the next iteration.
In the end we will have files with the top 14 ipca components for each of these separate ranges: Days 1_84, Days 15_84, Days 29_84, Days 43_84, Days 57_84, Days 71_84.
# fe_ipca_lags(PATH_DATA_RAW, '.')
fe_ipca_lags()
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_ipca_1_84.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_ipca_15_84.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_ipca_29_84.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_ipca_43_84.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_ipca_57_84.csv').info())
display(load_file(f'{PATH_DATA_FEATURES}/shift_fe_ipca_71_84.csv').info())
Let’s see all the features we created
get_file_cols_dict(PATH_DATA_FEATURES)
!rm data/features/*csv
Make all features
fe
fe ()