def mkdirs_data(data_dir_name: str='data') -> None:
    """Initializes the data directory structure"""
    os.makedirs(f'{data_dir_name}/raw', exist_ok=True)
    os.makedirs(f'{data_dir_name}/interim', exist_ok=True)
    os.makedirs(f'{data_dir_name}/features', exist_ok=True)
    os.makedirs(f'{data_dir_name}/models', exist_ok=True)
@call_parse
def cli_mkdirs_data(data_dir_name: Param('Name of data folder', str)='data') -> None:
    mkdirs_data(data_dir_name)
Core
Make data folder with subfolders raw, interim, features, models
cli_mkdirs_data
cli_mkdirs_data (data_dir_name:str <Name of data folder>='data')
mkdirs_data
mkdirs_data (data_dir_name:str='data')
Initializes the data directory structure
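As a quick check, calling the function and listing the result should show the four subfolders. This is a minimal sketch; `tmp_data` is just a throwaway folder name used for illustration.
mkdirs_data('tmp_data')
print(sorted(os.listdir('tmp_data')))  # expected: ['features', 'interim', 'models', 'raw']
shutil.rmtree('tmp_data')              # clean up the throwaway folder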
Downloading kaggle data
cli_download_kaggle_data
cli_download_kaggle_data (comp_name:str <name of kaggle competition>=None)
download_kaggle_data
download_kaggle_data (comp_name:str=None)
Downloads competition data using the kaggle api
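The body of download_kaggle_data is not shown in this section. Below is a minimal sketch of the same idea using the official kaggle package; the helper name fetch_competition_data, the example competition name, and the target path are illustrative assumptions, not the actual implementation. It assumes the `kaggle` package is installed and credentials live in ~/.kaggle/kaggle.json.
# Illustrative sketch only; not the code inside download_kaggle_data.
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi

def fetch_competition_data(comp_name: str, path: str = 'data/raw') -> None:
    api = KaggleApi()
    api.authenticate()                                     # reads ~/.kaggle/kaggle.json
    api.competition_download_files(comp_name, path=path)   # saves <comp_name>.zip into path
    with zipfile.ZipFile(f'{path}/{comp_name}.zip') as z:  # unzip next to the archive
        z.extractall(path)

# Example (hypothetical competition name):
# fetch_competition_data('m5-forecasting-accuracy')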
Saving and loading files
Prevent saving over files
To make sure we don't accidentally save over a file with the same name, we have some functions that ensure certain strings and paths are unique.
def make_unique(name: str, names: "list or set or dict") -> str:
    """Returns name with (x)_ prefix if `name` already in `names`.
    This is useful when you want to make sure you don't save over
    existing data with the same key `name`.
    """
    if name in names:
        x = 1
        while f'({x})_' + name in names: x += 1
        name = f'({x})_' + name
    return name
make_unique
make_unique (name:str, names:list)
Returns name with (x)_ prefix if `name` already in `names`. This is useful when you want to make sure you don't save over existing data with the same key `name`.
make_unique('d', ['a', 'b', 'c'])
'd'
make_unique('a', ['a', 'b', 'c'])
'(1)_a'
assert make_unique('d', ['a', 'b', 'c']) == 'd'
assert make_unique('chris', ['chris', 'dan', 'bill']) == '(1)_chris'
assert make_unique('chris', ['chris', 'dan', 'bill', '(1)_chris']) == '(2)_chris'
assert make_unique('chris', set(['chris', 'dan', 'bill', '(1)_chris'])) == '(2)_chris'
assert make_unique('a', {'a': 1, 'b': 2, 'c': 3}) == '(1)_a'
assert make_unique('d', {'a': 1, 'b': 2, 'c': 3}) == 'd'
def make_unique_path(path):
    """Returns path with prefix '(n)_' before the last element in
    path if it is a duplicate.
    """
    pre_path, file_name = os.path.split(path)
    file_name = make_unique(file_name, os.listdir(pre_path or '.'))
    return os.path.join(pre_path, file_name)
make_unique_path
make_unique_path (path)
Returns path with prefix ’(n)_’ before the last element in path if it is a duplicate.
os.makedirs('tmp', exist_ok=True)
print(make_unique_path('tmp/tmp.csv'))
open('tmp/tmp.csv', 'w').close()
print(make_unique_path('tmp/tmp.csv'))
shutil.rmtree('tmp')
tmp/tmp.csv
tmp/(1)_tmp.csv
Saving and loading pandas dataframes
def save_file(df: pd.DataFrame,
              path: str,
              usecols: list=None,
              save_index: bool=False,
              save_dtypes: bool=True,
              pickle: bool=False) -> None:
    """Saves `df` to `path` with dtypes as the top row if `save_dtypes`
    is set to True. Load files saved in this structure with `load_file`.
    """
    if pickle:
        usecols = usecols if usecols else list(df)
        path_dir = os.path.split(path)[0] if path.endswith('.csv') else path  # For M5 project maintenance
        for col in list(df):
            df[[col]].to_pickle(os.path.join(path_dir, col + '.pkl'))
        return
    path = make_unique_path(path)
    if save_dtypes:
        df_tmp = df.iloc[[0], :]
        if usecols: df_tmp = df_tmp.loc[:, usecols]
        if save_index:
            df_tmp.reset_index(inplace=True)
        df_dtypes = df_tmp.dtypes.to_frame().T
        df_dtypes.to_csv(path, index=False)
        df.to_csv(path, mode='a', index=save_index, header=False,
                  columns=usecols)
    else:
        df.to_csv(path, index=save_index, columns=usecols)
def load_file(path: str, load_dtypes=True, usecols: list=None) -> pd.DataFrame:
    """Loads a file into a DataFrame from `path` with dtypes
    taken from the top row if `load_dtypes` is set to True.
    Loads files in the structure created with `save_file`.
    """
    if path.endswith('pkl'):
        df = pd.read_pickle(path)
        return df[usecols] if usecols else df
    if load_dtypes:
        dtypes = pd.read_csv(path, nrows=1).iloc[0].to_dict()
        return pd.read_csv(path, skiprows=[1], dtype=dtypes, usecols=usecols)
    else:
        return pd.read_csv(path, usecols=usecols)
load_file
load_file (path:str, load_dtypes=True, usecols:list=None)
Loads a file into a DataFrame from `path` with dtypes taken from the top row if `load_dtypes` is set to True. Loads files in the structure created with `save_file`.
save_file
save_file (df:pandas.core.frame.DataFrame, path:str, usecols:list=None, save_index:bool=False, save_dtypes:bool=True, pickle:bool=False)
Saves `df` to `path` with dtypes as the top row if `save_dtypes` is set to True. Load files saved in this structure with `load_file`.
# Example
df = pd.DataFrame({'a': [1, 2], 'b': ['foo', 'bar'], 'c': [1.2, 3.3]})
df = df.astype(dict(zip(['a', 'b', 'c'], ['int32', 'category', np.float16])))
print('Saving the dataframe to csv with the dtypes')
display(df.dtypes)
save_file(df, 'tmp.csv', pickle=False)
print('Now the csv has the datatypes as the top line when we read it in')
display(pd.read_csv('tmp.csv'))
print('We can use `load_file` to read in the csv with the right dtypes')
display(load_file('tmp.csv'))
display(load_file('tmp.csv').dtypes)
Saving the dataframe to csv with the dtypes
a int32
b category
c float16
dtype: object
Now the csv has the datatypes as the top line when we read it in
 | a | b | c |
---|---|---|---|
0 | int32 | category | float16 |
1 | 1 | foo | 1.2 |
2 | 2 | bar | 3.3 |
We can use `load_file` to read in the csv with the right dtypes
 | a | b | c |
---|---|---|---|
0 | 1 | foo | 1.200195 |
1 | 2 | bar | 3.300781 |
a int32
b category
c float16
dtype: object
save_file(df, '.', pickle=True)
display(pd.concat([load_file(x + '.pkl') for x in 'abc'], axis=1))
!rm a.pkl b.pkl c.pkl
 | a | b | c |
---|---|---|---|
0 | 1 | foo | 1.200195 |
1 | 2 | bar | 3.300781 |
save_file(df, 'tmp2.csv', usecols=['a', 'c'], save_index=True, pickle=False)
load_file('tmp2.csv')
 | index | a | c |
---|---|---|---|
0 | 0 | 1 | 1.200195 |
1 | 1 | 2 | 3.300781 |
!rm tmp*.csv
Making a dictionary and json with file names as keys and lists of column names as values.
for file in sorted(os.listdir('.')):
    print(file)
.devcontainer.json
.git
.gitattributes
.gitconfig
.github
.gitignore
.ipynb_checkpoints
.pypirc
00_core.ipynb
CONTRIBUTING.md
LICENSE
MANIFEST.in
Makefile
README.md
chrisrichardmiles
chrisrichardmiles.egg-info
data
docker-compose.yml
docs
index.ipynb
log.log
projects
settings.ini
setup.py
small_data
def get_file_cols_dict(path: str='.',
                       path_json: str='',
                       ignore_cols: list=['index']):
    """Explores `path` and returns a dictionary of file names and their columns
    for each file in `path`. Only file names that end with
    '.csv' and '.pkl' will be considered. Pickle file names
    will go in the 'pickles' key of the returned dictionary.
    Csv files will see their file name saved as a key with
    a list of their column names saved as the corresponding
    value.
    """
    d = {}
    for file in sorted(os.listdir(path)):
        if file.endswith('.csv'):
            cols = pd.read_csv(os.path.join(path, file), nrows=0).columns.tolist()
            d[file] = [c for c in cols if c not in ignore_cols]
        if file.endswith('.pkl'):
            d.setdefault('pickles', []).append(file)
    if path_json:
        with open(path_json, 'w') as path_json:
            json.dump(d, path_json, indent=0)
    return d
@call_parse
def fe_dict(path: Param('path to directory with files', str)='data/features',
            path_json: Param('path to json for saving dict', str)='fe_dict.json'):
    get_file_cols_dict(path, path_json)
fe_dict
fe_dict (path:str <path to directory with files>='data/features', path_json:str <path to json for saving dict>='fe_dict.json')
get_file_cols_dict
get_file_cols_dict (path:str='.', path_json:str='', ignore_cols:list=['index'])
Explores `path` and returns a dictionary of file names and their columns for each file in `path`. Only file names that end with '.csv' and '.pkl' will be considered. Pickle file names will go in the 'pickles' key of the returned dictionary. Csv files will see their file name saved as a key with a list of their column names saved as the corresponding value.
df1 = pd.DataFrame({'feat_1': [1,2,2,4], 'feat_2': [1,1,3,3], 'feat_3': [1,4,3,3]})
df2 = pd.DataFrame({'shift_feat_4': [1,9,2,4], 'shift_feat_5': [1,1,3,9], 'shift_feat_6': [1,9,3,3]})
df3 = pd.DataFrame({'feat_7': [1,7,2,4], 'feat_8': [7,1,3,3], 'feat_9': [1,7,3,3]})
df4 = pd.DataFrame({'feat_10': [1,7,2,4], 'feat_11': [7,1,3,3], 'feat_12': [1,7,3,3],
                    'feat_13': ['a', 'b', 'c', 'd']})
df4.feat_10 = df4.feat_10.astype('int8')
df4.feat_13 = df4.feat_13.astype('category')

save_file(df1, 'features_1.csv', pickle=False)
save_file(df2, 'shift_features_2.csv', pickle=False)
save_file(df3, 'features_3.csv', pickle=False)
save_file(df4, 'features_4.csv', save_index=True, pickle=False)
save_file(df3, 'features_3_less_cols.csv', usecols=['feat_7'], pickle=False)
get_file_cols_dict('.', path_json='tmp_features.json')
{'features_1.csv': ['feat_1', 'feat_2', 'feat_3'],
'features_3.csv': ['feat_7', 'feat_8', 'feat_9'],
'features_3_less_cols.csv': ['feat_7'],
'features_4.csv': ['feat_10', 'feat_11', 'feat_12', 'feat_13'],
'shift_features_2.csv': ['shift_feat_4', 'shift_feat_5', 'shift_feat_6']}
get_file_cols_dict('.')
{'features_1.csv': ['feat_1', 'feat_2', 'feat_3'],
'features_3.csv': ['feat_7', 'feat_8', 'feat_9'],
'features_3_less_cols.csv': ['feat_7'],
'features_4.csv': ['feat_10', 'feat_11', 'feat_12', 'feat_13'],
'shift_features_2.csv': ['shift_feat_4', 'shift_feat_5', 'shift_feat_6']}
Loading features
Now we can easily load in our features with the correct data types
load_features('.', 'tmp_features.json', pickle=False)
 | feat_1 | feat_2 | feat_3 | feat_7 | feat_8 | feat_9 | feat_7 | feat_10 | feat_11 | feat_12 | feat_13 | shift_feat_4 | shift_feat_5 | shift_feat_6 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | 7 | 1 | 1 | 1 | 7 | 1 | a | 1 | 1 | 1 |
1 | 2 | 1 | 4 | 7 | 1 | 7 | 7 | 7 | 1 | 7 | b | 9 | 1 | 9 |
2 | 2 | 3 | 3 | 2 | 3 | 3 | 2 | 2 | 3 | 3 | c | 2 | 3 | 3 |
3 | 4 | 3 | 3 | 4 | 3 | 3 | 4 | 4 | 3 | 3 | d | 4 | 9 | 3 |
df = load_features('.', 'tmp_features.json', features=['feat_3', 'feat_10', 'feat_13'], pickle=False)
display(df)
display(df.dtypes)
 | feat_3 | feat_10 | feat_13 |
---|---|---|---|
0 | 1 | 1 | a |
1 | 4 | 7 | b |
2 | 3 | 2 | c |
3 | 3 | 4 | d |
feat_3 int64
feat_10 int8
feat_13 category
dtype: object
Sometimes we need to shift the index so that our lag features are in the correct alignment.
load_features('.', 'tmp_features.json', shift_index=1, pickle=False)
 | feat_1 | feat_2 | feat_3 | feat_7 | feat_8 | feat_9 | feat_7 | feat_10 | feat_11 | feat_12 | feat_13 | shift_feat_4 | shift_feat_5 | shift_feat_6 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | 1.0 | 7.0 | 1.0 | 1.0 | 1.0 | 7.0 | 1.0 | a | NaN | NaN | NaN |
1 | 2.0 | 1.0 | 4.0 | 7.0 | 1.0 | 7.0 | 7.0 | 7.0 | 1.0 | 7.0 | b | 1.0 | 1.0 | 1.0 |
2 | 2.0 | 3.0 | 3.0 | 2.0 | 3.0 | 3.0 | 2.0 | 2.0 | 3.0 | 3.0 | c | 9.0 | 1.0 | 9.0 |
3 | 4.0 | 3.0 | 3.0 | 4.0 | 3.0 | 3.0 | 4.0 | 4.0 | 3.0 | 3.0 | d | 2.0 | 3.0 | 3.0 |
4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.0 | 9.0 | 3.0 |
Sometimes we are loading features for a subset of the data, so we only need to load the rows associated with certain indexes.
load_features('.', 'tmp_features.json', shift_index=1, reindex_with=[1, 3], pickle=False)
 | feat_1 | feat_2 | feat_3 | feat_7 | feat_8 | feat_9 | feat_7 | feat_10 | feat_11 | feat_12 | feat_13 | shift_feat_4 | shift_feat_5 | shift_feat_6 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 1 | 4 | 7 | 1 | 7 | 7 | 7 | 1 | 7 | b | 1 | 1 | 1 |
3 | 4 | 3 | 3 | 4 | 3 | 3 | 4 | 4 | 3 | 3 | d | 2 | 3 | 3 |
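load_features itself is not listed in this section. The sketch below is an assumption about how it composes the features json with `load_file`, reconstructed from the outputs above (the name load_features_sketch and the shift handling are illustrative; dtype handling in the real function may differ).
# Illustrative sketch only, not the actual load_features implementation.
import json, os
import pandas as pd

def load_features_sketch(path, path_json, features=None, shift_index=0,
                         reindex_with=None, pickle=False):
    with open(path_json) as f:
        fe_dict = json.load(f)                      # file name -> list of columns
    dfs = []
    for file, cols in fe_dict.items():
        if file == 'pickles':
            continue
        usecols = cols if features is None else [c for c in cols if c in features]
        if not usecols:
            continue
        df = load_file(os.path.join(path, file), usecols=usecols)
        if shift_index and file.startswith('shift_'):
            df.index = df.index + shift_index       # align lag features with the target rows
        dfs.append(df)
    out = pd.concat(dfs, axis=1)
    if reindex_with is not None:
        out = out.reindex(reindex_with)
    return out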
We can make a copy of the features json, delete the features we don't want, and use this copy to load features.
shutil.copyfile('tmp_features.json', 'tmp_features_1.json')
'tmp_features_1.json'
Open tmp_features_1.json and delete the features you don't want.
!cat tmp_features_1.json
{
"features_1.csv": [
"feat_1",
"feat_2",
"feat_3"
],
"features_3.csv": [
"feat_7",
"feat_8",
"feat_9"
],
"features_3_less_cols.csv": [
"feat_7"
],
"features_4.csv": [
"feat_10",
"feat_11",
"feat_12",
"feat_13"
],
"shift_features_2.csv": [
"shift_feat_4",
"shift_feat_5",
"shift_feat_6"
]
}
load_features('.', 'tmp_features_1.json', pickle=False)
 | feat_1 | feat_2 | feat_3 | feat_7 | feat_8 | feat_9 | feat_7 | feat_10 | feat_11 | feat_12 | feat_13 | shift_feat_4 | shift_feat_5 | shift_feat_6 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | 7 | 1 | 1 | 1 | 7 | 1 | a | 1 | 1 | 1 |
1 | 2 | 1 | 4 | 7 | 1 | 7 | 7 | 7 | 1 | 7 | b | 9 | 1 | 9 |
2 | 2 | 3 | 3 | 2 | 3 | 3 | 2 | 2 | 3 | 3 | c | 2 | 3 | 3 |
3 | 4 | 3 | 3 | 4 | 3 | 3 | 4 | 4 | 3 | 3 | d | 4 | 9 | 3 |
!rm *.csv
!rm tmp*.json
Speed and memory functions
Parallel runs
pool_func
pool_func (function, input_list:list, verbose=False, n_cpu=99)
Uses the Pool function from the package 'multiprocessing' to run `function` over the list `input_list`. The `function` should only take one argument.
def f(x): return x * 5
pool_func(f, list(range(20)), True)
#############################################
Pooling function:
f
16 of 16 cpus used
Number of function calls: 20
Time taken: 0.0 minutes
[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
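A minimal sketch of the idea behind pool_func, under the assumption that it is a thin wrapper around multiprocessing.Pool (the helper name pool_map is illustrative, and the verbose reporting shown above is omitted):
# Illustrative sketch, not pool_func's exact implementation.
from multiprocessing import Pool, cpu_count

def pool_map(function, input_list, n_cpu=99):
    n_cpu = min(n_cpu, cpu_count())    # never request more cpus than the machine has
    with Pool(n_cpu) as pool:
        return pool.map(function, input_list)

# pool_map(f, list(range(20)))  # same result as the pool_func call above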
Saving memory
reduce_mem_usage
reduce_mem_usage (df, verbose=True)
Converts numeric columns to smallest datatype that preserves information
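A rough sketch of the idea: downcast each numeric column to the smallest dtype that still holds its values. This is an illustration using pandas' downcasting, not necessarily the exact implementation of reduce_mem_usage, which may check value ranges itself and print a memory report when verbose=True.
# Illustrative sketch, not reduce_mem_usage itself.
import numpy as np
import pandas as pd

def downcast_numeric(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.select_dtypes(include='number').columns:
        if np.issubdtype(df[col].dtype, np.integer):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df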
merge_by_concat
merge_by_concat (df1, df2, merge_on)
sizeof_fmt
sizeof_fmt (num, suffix='B')
Reformats `num`, which is a number of bytes.
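A plausible byte formatter in this spirit (an assumption, not necessarily the code inside sizeof_fmt): step through binary unit prefixes until the number is small enough to print.
# Illustrative sketch, not sizeof_fmt itself.
def format_bytes(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return f'{num:3.1f}{unit}{suffix}'
        num /= 1024.0
    return f'{num:.1f}Yi{suffix}'

format_bytes(150 * 1024 ** 2)  # '150.0MiB'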
get_memory_usage
get_memory_usage ()
Returns RAM usage in gigabytes
get_memory_usage()
0.14
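One way such a helper can be written is with psutil; this is an assumption about the implementation, offered only as a sketch (the helper name ram_usage_gb is illustrative).
# Illustrative sketch, not get_memory_usage itself.
import os
import psutil

def ram_usage_gb() -> float:
    rss = psutil.Process(os.getpid()).memory_info().rss  # resident memory in bytes
    return round(rss / 1024 ** 3, 2)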
time_taken
time_taken (start_time:float=0, time_elapsed:float=None)
Returns a string with the time elapsed from `start_time` in a nice format. If `time_elapsed` is provided, we ignore the start time. `start_time` should come from calling the time module: start_time = time.time()
start_time = time.time()
time.sleep(2)
time_taken(start_time)
'Time taken: 2 seconds'
time_taken(time_elapsed=3666)
'Time taken: 1 hours 1 minutes 6 seconds'
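A minimal sketch that reproduces the outputs above, assuming time_taken simply breaks elapsed seconds into hours, minutes and seconds (the helper name format_time_taken is illustrative, not the actual implementation):
# Illustrative sketch, not time_taken itself.
import time

def format_time_taken(start_time: float = 0, time_elapsed: float = None) -> str:
    seconds = int(time_elapsed if time_elapsed is not None else time.time() - start_time)
    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)
    parts = []
    if hours: parts.append(f'{hours} hours')
    if minutes: parts.append(f'{minutes} minutes')
    parts.append(f'{seconds} seconds')
    return 'Time taken: ' + ' '.join(parts)

format_time_taken(time_elapsed=3666)  # 'Time taken: 1 hours 1 minutes 6 seconds'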