Source code for pyeasyeda.summary_suggestions

import pandas as pd
import numpy as np

def _describe_or_empty(frame):
    """Return ``frame.describe()``, or an empty DataFrame if ``frame`` has no columns."""
    # DataFrame.describe() raises ValueError when there is nothing to describe,
    # which would otherwise crash on inputs lacking one kind of column.
    if frame.columns.empty:
        return pd.DataFrame()
    return frame.describe()


def summary_suggestions(df, threshold=0.8):
    """Summarise a dataframe and flag categorical columns with many unique values.

    Builds summary statistics for the numeric and the categorical
    (object-dtype) columns of ``df``, computes for every categorical column
    the number of unique values divided by the number of rows, and lists the
    categorical columns whose proportion is strictly greater than
    ``threshold`` (candidates for dropping).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to be examined.
    threshold : float, default 0.8
        Proportion of unique values, strictly between 0 and 1, above which a
        categorical column is reported.

    Returns
    -------
    results : list
        Four items, in this order:

        0. ``describe()`` output for the numeric columns,
        1. ``describe()`` output for the object-dtype columns,
        2. one-row dataframe (index ``'unique'``) holding the proportion of
           unique values of each object-dtype column,
        3. list of object-dtype column names whose proportion exceeds
           ``threshold``.

        When ``df`` has no columns of a given kind, the corresponding
        dataframes are empty and the list of names is empty.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas dataframe, or if ``threshold`` is not a
        real number strictly between 0 and 1 (booleans are rejected).

    Examples
    --------
    >>> summary_suggestions(df)
    [
        (summary statistics for numeric variables),
        (summary statistics for categorical variables),
        (percentage of unique values for categorical variables),
        [list of variables with percentage of unique values higher than the threshold]
    ]
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input df must be a pandas dataframe object")

    # bool is a subclass of int, so it has to be excluded explicitly;
    # isinstance (unlike an exact type comparison) accepts float subclasses
    # such as numpy.float64.
    if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
        raise TypeError("Input threshold must be a float value between 0 and 1")

    # TypeError (rather than ValueError) is kept for backward compatibility
    # with callers that already catch it.
    if not (0 < threshold < 1):
        raise TypeError("Input threshold must be a float value between 0 and 1")

    numeric_summary_df = _describe_or_empty(df.select_dtypes(include=np.number))
    categorical_summary_df = _describe_or_empty(
        df.select_dtypes(include=np.object_)
    )

    if categorical_summary_df.empty:
        # No categorical columns: nothing to measure and nothing to suggest.
        unique_val_df = pd.DataFrame()
        unique_val_vars = []
    else:
        # Clamp the divisor so that a zero-row frame yields a proportion of 0
        # instead of dividing by zero.
        n_rows = max(len(df), 1)
        unique_val_df = categorical_summary_df.loc[["unique"]] / n_rows
        unique_val_vars = [
            column
            for column, proportion in unique_val_df.loc["unique"].items()
            if proportion > threshold
        ]

    return [
        numeric_summary_df,
        categorical_summary_df,
        unique_val_df,
        unique_val_vars,
    ]