Core

Some basic functions used in the chrisrichardmiles library.

Make a data folder with subfolders raw, interim, features, and models

import os, json, shutil, time
import numpy as np
import pandas as pd
from fastcore.script import call_parse, Param

def mkdirs_data(data_dir_name: str='data') -> None: 
    """Initializes the data directory structure"""
    os.makedirs(f'{data_dir_name}/raw', exist_ok=True)
    os.makedirs(f'{data_dir_name}/interim', exist_ok=True)
    os.makedirs(f'{data_dir_name}/features', exist_ok=True)
    os.makedirs(f'{data_dir_name}/models', exist_ok=True)
    
@call_parse
def cli_mkdirs_data(data_dir_name: Param('Name of data folder', str)='data') -> None: 
    mkdirs_data(data_dir_name)

source

cli_mkdirs_data

 cli_mkdirs_data (data_dir_name:str='data')

Name of data folder

source

mkdirs_data

 mkdirs_data (data_dir_name:str='data')

Initializes the data directory structure

Downloading Kaggle data


source

cli_download_kaggle_data

 cli_download_kaggle_data (comp_name:str=None)

Name of kaggle competition

source

download_kaggle_data

 download_kaggle_data (comp_name:str=None)

Downloads competition data using the Kaggle API
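
The source for download_kaggle_data isn't rendered above. As a rough sketch, it presumably wraps the official kaggle CLI; the helper below is a hypothetical version (the output folder, the unzip step, and the _sketch name are assumptions, and it requires Kaggle credentials in ~/.kaggle/kaggle.json):

import os
import zipfile

def download_kaggle_data_sketch(comp_name: str, out_dir: str='data/raw') -> None: 
    """Hypothetical sketch: download and extract competition files 
    with the official kaggle CLI."""
    os.makedirs(out_dir, exist_ok=True)
    # `kaggle competitions download -c <comp> -p <path>` is the documented CLI call
    os.system(f'kaggle competitions download -c {comp_name} -p {out_dir}')
    # Kaggle saves the archive as <comp_name>.zip in the target folder
    with zipfile.ZipFile(os.path.join(out_dir, f'{comp_name}.zip')) as z: 
        z.extractall(out_dir)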

Saving and loading files

Prevent saving over files

In order to make sure we don’t accidentally save over a file with the same name, we have some functions that ensure certain strings and paths are unique.

def make_unique(name: str, names: "list or set or dict") -> str: 
    """Returns name with (x)_ prefix if `name` already in `names`. 
    This is useful when you want to make sure you don't save over
    existing data with the same key `name`.
    """
    if name in names:
        x = 1
        while f'({x})_' + name in names: x += 1
        name = f'({x})_' + name
    return name

source

make_unique

 make_unique (name:str, names:list)

Returns name with (x)_ prefix if name already in names. This is useful when you want to make sure you don’t save over existing data with the same key name.

make_unique('d', ['a', 'b', 'c'])
'd'
make_unique('a', ['a', 'b', 'c'])
'(1)_a'
assert make_unique('d', ['a', 'b', 'c']) == 'd'
assert make_unique('chris', ['chris', 'dan', 'bill']) == '(1)_chris'
assert make_unique('chris', ['chris', 'dan', 'bill', '(1)_chris']) == '(2)_chris'
assert make_unique('chris', set(['chris', 'dan', 'bill', '(1)_chris'])) == '(2)_chris'
assert make_unique('a', {'a': 1, 'b': 2, 'c': 3}) == '(1)_a'
assert make_unique('d', {'a': 1, 'b': 2, 'c': 3}) == 'd'
def make_unique_path(path):  
    """Returns path with prefix '(n)_' before the last element in 
    path if it is a duplicate. 
    """
    pre_path, file_name = os.path.split(path)
    file_name = make_unique(file_name, os.listdir(pre_path or '.'))
    return os.path.join(pre_path, file_name)

source

make_unique_path

 make_unique_path (path)

Returns path with prefix ’(n)_’ before the last element in path if it is a duplicate.

os.makedirs('tmp', exist_ok=True)
print(make_unique_path('tmp/tmp.csv'))
open('tmp/tmp.csv', 'w').close()
print(make_unique_path('tmp/tmp.csv'))
shutil.rmtree('tmp')
tmp/tmp.csv
tmp/(1)_tmp.csv

Saving and loading pandas dataframes

def save_file(df: pd.DataFrame, 
              path: str, 
              usecols: list=None,
              save_index: bool=False, 
              save_dtypes: bool=True, 
              pickle: bool=False) -> None:
    """Saves `df` to `path` with dtypes as top column if `save_dtypes` 
    is set to True. Load a files in this structure with `load_file`
    """
    if pickle: 
        usecols = usecols if usecols else list(df)
        path_dir = os.path.split(path)[0] if path.endswith('.csv') else path # For M5 project maintenence
        for col in list(df): 
            df[[col]].to_pickle(os.path.join(path_dir, col + '.pkl'))
        return 
    
    path = make_unique_path(path)
    if save_dtypes:
        df_tmp = df.iloc[[0], :]
        if usecols: df_tmp = df_tmp.loc[:, usecols]
        if save_index: 
            df_tmp.reset_index(inplace=True)
        df_dtypes = df_tmp.dtypes.to_frame().T
        df_dtypes.to_csv(path, index=False)
        df.to_csv(path, mode='a', index=save_index, header=False, 
                  columns=usecols)
    else: 
        df.to_csv(path, index=save_index, columns=usecols)
        
def load_file(path: str, load_dtypes=True, usecols: list=None) -> pd.DataFrame:
    """Loads a file into a DataFrame from `path` with dtypes 
    taken from the first row if `load_dtypes` is set to True. 
    Loads files in the structure created with `save_file`.
    """
    if path.endswith('pkl'): 
        df = pd.read_pickle(path)
        return df[usecols] if usecols else df
    
    if load_dtypes:
        dtypes = pd.read_csv(path, nrows=1).iloc[0].to_dict()
        return pd.read_csv(path, skiprows=[1], dtype=dtypes, usecols=usecols)
    else:
        return pd.read_csv(path, usecols=usecols)

source

load_file

 load_file (path:str, load_dtypes=True, usecols:list=None)

Loads a file into a DataFrame from path with dtypes taken from the first row if load_dtypes is set to True. Loads files in the structure created with save_file.


source

save_file

 save_file (df:pandas.core.frame.DataFrame, path:str, usecols:list=None,
            save_index:bool=False, save_dtypes:bool=True,
            pickle:bool=False)

Saves df to path with dtypes as the first row if save_dtypes is set to True. Load files saved in this structure with load_file

# Example
df = pd.DataFrame({'a': [1, 2], 'b': ['foo', 'bar'], 'c': [1.2, 3.3]})
df = df.astype(dict(zip(['a', 'b', 'c'], ['int32', 'category', np.float16])))
print('Saving the dataframe to csv with the dtypes')
display(df.dtypes)
save_file(df, 'tmp.csv', pickle=False)
print('Now the csv has the datatypes as the top line when we read it in')
display(pd.read_csv('tmp.csv'))

print('We can use `load_file` to read in the csv with the right dtypes')
display(load_file('tmp.csv'))
display(load_file('tmp.csv').dtypes)
Saving the dataframe to csv with the dtypes
a       int32
b    category
c     float16
dtype: object
Now the csv has the datatypes as the top line when we read it in
       a         b        c
0  int32  category  float16
1      1       foo      1.2
2      2       bar      3.3
We can use `load_file` to read in the csv with the right dtypes
   a    b         c
0  1  foo  1.200195
1  2  bar  3.300781
a       int32
b    category
c     float16
dtype: object
save_file(df, '.', pickle=True)
display(pd.concat([load_file(x + '.pkl') for x in 'abc'], axis=1))
!rm a.pkl b.pkl c.pkl
   a    b         c
0  1  foo  1.200195
1  2  bar  3.300781
save_file(df, 'tmp2.csv', usecols=['a', 'c'], save_index=True, pickle=False)
load_file('tmp2.csv')
   index  a         c
0      0  1  1.200195
1      1  2  3.300781
!rm tmp*.csv

Making a dictionary and JSON file with file names as keys and lists of column names as values.

for file in sorted(os.listdir('.')):
    print(file)
.devcontainer.json
.git
.gitattributes
.gitconfig
.github
.gitignore
.ipynb_checkpoints
.pypirc
00_core.ipynb
CONTRIBUTING.md
LICENSE
MANIFEST.in
Makefile
README.md
chrisrichardmiles
chrisrichardmiles.egg-info
data
docker-compose.yml
docs
index.ipynb
log.log
projects
settings.ini
setup.py
small_data
def get_file_cols_dict(path: str='.', 
                       path_json: str='', 
                       ignore_cols: list=['index']):
    """Explores `path` and returns a dictionary of file names and their columns
    for each file in `path`. Only file names that end with 
    '.csv' and '.pkl' will be considered. Pickle file names
    will go in the 'pickles' key of the returned dictionary.
    Csv files will see their file name saved as a key with 
    a list of their column names saved as the corresponding 
    value.
    """
    
    d = {}
    for file in sorted(os.listdir(path)): 
        if file.endswith('.csv'): 
            cols = pd.read_csv(os.path.join(path, file), nrows=0).columns.tolist()
            d[file] = [c for c in cols if c not in ignore_cols]
        if file.endswith('.pkl'): 
            d.setdefault('pickles', []).append(file)
    if path_json: 
        with open(path_json, 'w') as f: 
            json.dump(d, f, indent=0)
    return d

@call_parse
def fe_dict(path: Param('path to directory with files', str)='data/features', 
            path_json: Param('path to json for saving dict', str)='fe_dict.json'):
    get_file_cols_dict(path, path_json)

source

fe_dict

 fe_dict (path:str='data/features', path_json:str='fe_dict.json')

path: path to directory with files; path_json: path to json for saving dict

source

get_file_cols_dict

 get_file_cols_dict (path:str='.', path_json:str='',
                     ignore_cols:list=['index'])

Explores path and returns a dictionary of file names and their columns for each file in path. Only file names that end with ‘.csv’ and ‘.pkl’ will be considered. Pickle file names will go in the ‘pickles’ key of the returned dictionary. Each csv file’s name is saved as a key, with a list of its column names as the corresponding value.

df1 = pd.DataFrame({'feat_1': [1,2,2,4], 'feat_2': [1,1,3,3], 'feat_3': [1,4,3,3]})
df2 = pd.DataFrame({'shift_feat_4': [1,9,2,4], 'shift_feat_5': [1,1,3,9], 'shift_feat_6': [1,9,3,3]})
df3 = pd.DataFrame({'feat_7': [1,7,2,4], 'feat_8': [7,1,3,3], 'feat_9': [1,7,3,3]})
df4 = pd.DataFrame({'feat_10': [1,7,2,4], 'feat_11': [7,1,3,3], 'feat_12': [1,7,3,3], 
                    'feat_13': ['a', 'b', 'c', 'd']})
df4.feat_10 = df4.feat_10.astype('int8')
df4.feat_13 = df4.feat_13.astype('category')

save_file(df1, 'features_1.csv', pickle=False)
save_file(df2, 'shift_features_2.csv', pickle=False)
save_file(df3, 'features_3.csv', pickle=False)
save_file(df4, 'features_4.csv', save_index=True, pickle=False)
save_file(df3, 'features_3_less_cols.csv', usecols=['feat_7'], pickle=False)
get_file_cols_dict('.', path_json='tmp_features.json')
{'features_1.csv': ['feat_1', 'feat_2', 'feat_3'],
 'features_3.csv': ['feat_7', 'feat_8', 'feat_9'],
 'features_3_less_cols.csv': ['feat_7'],
 'features_4.csv': ['feat_10', 'feat_11', 'feat_12', 'feat_13'],
 'shift_features_2.csv': ['shift_feat_4', 'shift_feat_5', 'shift_feat_6']}
get_file_cols_dict('.')
{'features_1.csv': ['feat_1', 'feat_2', 'feat_3'],
 'features_3.csv': ['feat_7', 'feat_8', 'feat_9'],
 'features_3_less_cols.csv': ['feat_7'],
 'features_4.csv': ['feat_10', 'feat_11', 'feat_12', 'feat_13'],
 'shift_features_2.csv': ['shift_feat_4', 'shift_feat_5', 'shift_feat_6']}

Loading features

Now we can easily load our features with the correct data types.
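
The source for load_features isn't rendered in this section, so below is a minimal sketch of how it might work, reconstructed from the examples that follow. The signature, the treatment of file names prefixed with 'shift' as lag features, and the per-file reindexing are all assumptions, and the pickle branch is omitted:

import json
import os
import pandas as pd

def load_features_sketch(path, path_json, features=None, shift_index=0, 
                         reindex_with=None, pickle=False):
    """Hypothetical reconstruction: load every csv listed in `path_json` 
    with `load_file` (defined above) and concatenate the columns."""
    with open(path_json) as f: 
        file_cols = json.load(f)
    dfs = []
    for file, cols in file_cols.items(): 
        if file == 'pickles': continue                # pickle loading omitted here
        usecols = [c for c in cols if features is None or c in features]
        if not usecols: continue                      # no requested columns in this file
        df = load_file(os.path.join(path, file), usecols=usecols)
        if shift_index and file.startswith('shift'): 
            # Assumption: 'shift'-prefixed files hold lag features, so their 
            # index is pushed forward to line up with the target rows
            df.index = df.index + shift_index
        if reindex_with is not None: 
            df = df.reindex(reindex_with)             # keep only the requested rows
        dfs.append(df)
    return pd.concat(dfs, axis=1)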

load_features('.', 'tmp_features.json', pickle=False)
   feat_1  feat_2  feat_3  feat_7  feat_8  feat_9  feat_7  feat_10  feat_11  feat_12 feat_13  shift_feat_4  shift_feat_5  shift_feat_6
0       1       1       1       1       7       1       1        1        7        1       a             1             1             1
1       2       1       4       7       1       7       7        7        1        7       b             9             1             9
2       2       3       3       2       3       3       2        2        3        3       c             2             3             3
3       4       3       3       4       3       3       4        4        3        3       d             4             9             3
df = load_features('.', 'tmp_features.json', features=['feat_3', 'feat_10', 'feat_13'], pickle=False)
display(df)
display(df.dtypes)
   feat_3  feat_10 feat_13
0       1        1       a
1       4        7       b
2       3        2       c
3       3        4       d
feat_3        int64
feat_10        int8
feat_13    category
dtype: object

Sometimes we need to shift the index so that our lag features are in the correct alignment.

load_features('.', 'tmp_features.json', shift_index=1, pickle=False)
   feat_1  feat_2  feat_3  feat_7  feat_8  feat_9  feat_7  feat_10  feat_11  feat_12 feat_13  shift_feat_4  shift_feat_5  shift_feat_6
0     1.0     1.0     1.0     1.0     7.0     1.0     1.0      1.0      7.0      1.0       a           NaN           NaN           NaN
1     2.0     1.0     4.0     7.0     1.0     7.0     7.0      7.0      1.0      7.0       b           1.0           1.0           1.0
2     2.0     3.0     3.0     2.0     3.0     3.0     2.0      2.0      3.0      3.0       c           9.0           1.0           9.0
3     4.0     3.0     3.0     4.0     3.0     3.0     4.0      4.0      3.0      3.0       d           2.0           3.0           3.0
4     NaN     NaN     NaN     NaN     NaN     NaN     NaN      NaN      NaN      NaN     NaN           4.0           9.0           3.0

Sometimes we are loading features for a subset of the data, so we only need the rows associated with certain indexes.

load_features('.', 'tmp_features.json', shift_index=1, reindex_with=[1, 3], pickle=False)
   feat_1  feat_2  feat_3  feat_7  feat_8  feat_9  feat_7  feat_10  feat_11  feat_12 feat_13  shift_feat_4  shift_feat_5  shift_feat_6
1       2       1       4       7       1       7       7        7        1        7       b             1             1             1
3       4       3       3       4       3       3       4        4        3        3       d             2             3             3

We can make a copy of the features JSON, delete the features we don’t want, and use it to load features.

shutil.copyfile('tmp_features.json', 'tmp_features_1.json')
'tmp_features_1.json'

Open tmp_features_1.json and delete any unwanted features.

!cat tmp_features_1.json
{
"features_1.csv": [
"feat_1",
"feat_2",
"feat_3"
],
"features_3.csv": [
"feat_7",
"feat_8",
"feat_9"
],
"features_3_less_cols.csv": [
"feat_7"
],
"features_4.csv": [
"feat_10",
"feat_11",
"feat_12",
"feat_13"
],
"shift_features_2.csv": [
"shift_feat_4",
"shift_feat_5",
"shift_feat_6"
]
}
load_features('.', 'tmp_features_1.json', pickle=False)
   feat_1  feat_2  feat_3  feat_7  feat_8  feat_9  feat_7  feat_10  feat_11  feat_12 feat_13  shift_feat_4  shift_feat_5  shift_feat_6
0       1       1       1       1       7       1       1        1        7        1       a             1             1             1
1       2       1       4       7       1       7       7        7        1        7       b             9             1             9
2       2       3       3       2       3       3       2        2        3        3       c             2             3             3
3       4       3       3       4       3       3       4        4        3        3       d             4             9             3
!rm *.csv
!rm tmp*.json

Speed and memory functions

Parallel runs


source

pool_func

 pool_func (function, input_list:list, verbose=False, n_cpu=99)

Uses the Pool function from the package ‘multiprocessing’ to run function over the list input_list. The function should take only a single argument.
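
The implementation isn't rendered here; below is a minimal sketch of the idea, assuming Pool.map and a worker count capped at the machine's CPUs (the banner printed in the library's verbose output further down comes from its own implementation):

import time
from multiprocessing import Pool, cpu_count

def pool_func_sketch(function, input_list, verbose=False, n_cpu=99):
    """Hypothetical sketch: map `function` over `input_list` in parallel. 
    `function` must take one argument and be picklable (defined at module 
    top level)."""
    n_cpu = min(n_cpu, cpu_count())  # cap the requested workers at available cpus
    start_time = time.time()
    with Pool(n_cpu) as pool: 
        results = pool.map(function, input_list)
    if verbose: 
        print(f'{n_cpu} cpus used for {len(input_list)} calls, '
              f'{(time.time() - start_time) / 60:.1f} minutes')
    return results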

def f(x): return x * 5
pool_func(f, list(range(20)), True)
#############################################
Pooling function: 
f
16 of 16 cpus used
Number of function calls:  20
Time taken: 0.0 minutes
[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]

Saving memory


source

reduce_mem_usage

 reduce_mem_usage (df, verbose=True)

Converts numeric columns to smallest datatype that preserves information
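
The source isn't rendered here; a common way to implement this kind of downcasting is with pd.to_numeric's downcast option. The sketch below follows that pattern rather than the library's exact rules (non-numeric columns are left untouched):

import pandas as pd

def reduce_mem_usage_sketch(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame: 
    """Hypothetical sketch: downcast each numeric column to the smallest 
    dtype pandas offers for its values."""
    start_mem = df.memory_usage().sum() / 2**20
    for col in df.columns: 
        if pd.api.types.is_integer_dtype(df[col]): 
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]): 
            df[col] = pd.to_numeric(df[col], downcast='float')
    if verbose: 
        end_mem = df.memory_usage().sum() / 2**20
        print(f'Memory usage decreased from {start_mem:.2f} MB to {end_mem:.2f} MB')
    return df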


source

merge_by_concat

 merge_by_concat (df1, df2, merge_on)
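
No docstring is rendered for merge_by_concat. Helpers with this name in Kaggle-style code usually do a memory-friendly left merge: join df2 onto just the key columns of df1, then concatenate the new columns back on. A sketch under that assumption:

import pandas as pd

def merge_by_concat_sketch(df1, df2, merge_on): 
    """Hypothetical sketch: left-merge `df2` into `df1` on `merge_on` 
    without passing all of `df1`'s columns through the merge."""
    keys = merge_on if isinstance(merge_on, list) else [merge_on]
    merged = df1[keys].merge(df2, on=merge_on, how='left')
    new_cols = [c for c in merged.columns if c not in keys]
    # Assumes `df1` has a default RangeIndex so the column-wise concat aligns
    return pd.concat([df1, merged[new_cols]], axis=1)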

source

sizeof_fmt

 sizeof_fmt (num, suffix='B')

Reformats num, a number of bytes, as a human-readable string
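
This helper is commonly written as the classic divide-by-1024 loop; a sketch (the library's exact unit labels and rounding may differ):

def sizeof_fmt_sketch(num, suffix='B'): 
    """Hypothetical sketch: format a byte count as a human-readable string."""
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: 
        if abs(num) < 1024.0: 
            return f'{num:3.1f}{unit}{suffix}'
        num /= 1024.0  # move up to the next unit
    return f'{num:.1f}Yi{suffix}'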


source

get_memory_usage

 get_memory_usage ()

Returns RAM usage in gigabytes

get_memory_usage()
0.14
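
A sketch of how such a function is typically written, assuming the psutil package (the rounding to two decimals matches the output above):

import os
import psutil

def get_memory_usage_sketch() -> float: 
    """Hypothetical sketch: resident set size of this process, in gigabytes."""
    return round(psutil.Process(os.getpid()).memory_info().rss / 2**30, 2)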

source

time_taken

 time_taken (start_time:float=0, time_elapsed:float=None)

Returns a string with the time elapsed from start_time in a nice format. If time_elapsed is provided, we ignore the start time.

start_time should come from calling the time module: start_time = time.time()
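
A sketch that reproduces the output format shown in the examples below; the exact unit handling inside the library is an assumption:

import time

def time_taken_sketch(start_time: float=0, time_elapsed: float=None) -> str: 
    """Hypothetical sketch: format elapsed seconds as hours/minutes/seconds."""
    seconds = time_elapsed if time_elapsed is not None else time.time() - start_time
    hours, rem = divmod(int(round(seconds)), 3600)
    minutes, secs = divmod(rem, 60)
    # Only include the units that are nonzero, largest first
    parts = ([f'{hours} hours'] if hours else []) + ([f'{minutes} minutes'] if minutes else [])
    parts.append(f'{secs} seconds')
    return 'Time taken: ' + ' '.join(parts)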

start_time = time.time()
time.sleep(2)
time_taken(start_time)
'Time taken: 2 seconds'
time_taken(time_elapsed=3666)
'Time taken: 1 hours 1 minutes 6 seconds'