import pandas as pd
import numpy as np
import public
[docs]@public.add
def replace_nulls(df, replace, column_names):
return df.fillna({k: replace for k in column_names})
[docs]@public.add
# Normalize timestamp column values.
def normalize_ts_columns(df, column_names):
for column_name in column_names:
normalize_ts_column(df, column_name)
return df
# convert timestamp in HH:mm:ss to seconds -
# pandas timedelta takes the time format and converts them to seconds.
# divide by the result by the total number of seconds in a day.
# this normalizes the timestamp to a number between 0 and 1.
# round off the value to 5 decimal places.
[docs]@public.add
def normalize_ts_column(df, column_name):
df[column_name] = pd.to_timedelta(
df[column_name].dt.strftime("%H:%M:%S")
).dt.total_seconds()
df[column_name] = df[column_name].replace(np.nan, -1)
df[column_name] = df[column_name].apply(
lambda x: round(x / 86400, 5) if x >= 0 else x
)
return df