Source code for pyeasyeda.clean_up

import numpy as np
import pandas as pd
from scipy import stats

def clean_up(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows with missing values and report potential outliers.

    Every row containing at least one NaN is removed and the index is
    reset. The numeric columns of the cleaned frame are then inspected,
    and the unique values whose absolute z-score exceeds 3 are printed
    for each column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to be cleaned.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with all rows containing NaN removed and a fresh
        0..n-1 index. Outliers are only reported, never removed.

    Raises
    ------
    TypeError
        If ``df`` is not a ``pandas.DataFrame``.

    Examples
    --------
    >>> df_clean = clean_up(df)  # doctest: +SKIP
    **The following potential outliers were detected:**
    Variable X:
    [ 300.  301.  500. 1000.]
    Variable Y:
    [ 6.42  6.44 58.52 60.22]
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("the input df must be pd.DataFrame type")

    # Drop any row that contains a missing value and reset the index.
    df_clean = df.dropna(axis=0, how="any").reset_index(drop=True)

    # Only numerical variables are relevant for outlier detection.
    num_df = df_clean.select_dtypes(["number"])

    print("**The following potential outliers were detected:**")

    # Nothing to score when there are no rows or no numeric columns.
    if not num_df.empty:
        # Feed zscore a plain array so the result is always an ndarray, and
        # mask with ``where`` so the selection is cell-wise regardless of how
        # ``DataFrame.__getitem__`` interprets a 2-D boolean key.
        z_scores = stats.zscore(num_df.to_numpy())
        outlier_mask = np.abs(z_scores) > 3
        outlier_df = num_df.where(outlier_mask)

        # Print the unique outlier values for each variable that has any.
        for col in outlier_df:
            outliers = outlier_df[col].dropna()
            if not outliers.empty:
                print(f"Variable {col}: ")
                print(np.unique(outliers.values))

    # The cleaned frame is returned with outliers still included.
    return df_clean