Source code for pyeasyeda.clean_up

import numpy as np
import pandas as pd
from scipy import stats

def clean_up(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows with missing values and report potential outliers.

    Every row of ``df`` that contains at least one NaN is removed and the
    index of the result is reset. The numeric columns of the cleaned frame
    are then inspected, and for each of them the unique values lying more
    than 3 standard deviations from the column mean are printed to stdout.
    The input dataframe itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to be cleaned.

    Returns
    -------
    pandas.DataFrame
        A new dataframe holding the rows of ``df`` without any NaN, with a
        fresh ``0..n-1`` index. All columns (numeric or not) are kept, and
        the reported outliers are *not* removed.

    Raises
    ------
    TypeError
        If ``df`` is not a ``pandas.DataFrame``.

    Examples
    --------
    >>> df_clean = clean_up(df)

    '**The following potenital outliers were detected:**
    Variable X:
    [ 300, 301, 500, 1000 ]
    Variable Y:
    [ 6.42, 6.44, 58.52, 60.22 ]'
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("the input df must be pd.DataFrame type")

    # Drop any row that contains a missing value and reset the index.
    df_clean = df.dropna(axis=0, how='any').reset_index(drop=True)

    # Only numerical variables are relevant for outlier detection.
    num_df = df_clean.select_dtypes(['number'])

    if num_df.empty:
        # No numeric columns, or no rows left after dropping NaN: there is
        # nothing to score, so skip the z-score computation entirely.
        outlier_df = num_df
    else:
        # Compute the z-scores on a plain NumPy array so the result type does
        # not depend on how the installed SciPy treats DataFrame input.
        # Columns with zero variance have undefined z-scores; silence the
        # resulting floating-point warnings, since NaN never exceeds the
        # threshold anyway.
        with np.errstate(divide='ignore', invalid='ignore'):
            z_scores = stats.zscore(num_df.to_numpy(dtype=float), axis=0)

        # Element-wise mask: keep the outlying values, turn the rest into NaN.
        # `where` is used instead of `num_df[mask]` because indexing a
        # DataFrame with a 2-D boolean ndarray is not guaranteed to be
        # element-wise.
        outlier_df = num_df.where(np.abs(z_scores) > 3)

    # Print the unique outlier values for each variable.
    # NOTE: the message text (including its spelling) is kept unchanged for
    # backward compatibility with callers that match on the output.
    print("**The following potenital outliers were detected:**")
    for col in outlier_df:
        outliers = outlier_df[col].dropna()
        if len(outliers) != 0:
            print(f"Variable {col}: ")
            print(np.unique(outliers.values))

    # Return the cleaned dataframe with the NaN rows dropped.
    return df_clean