Generate a clustering report given these two arguments: `predicted` - a dataframe with the predicted cluster column; `fill_missing_report` - a dictionary of rules describing how to fill in missing values for the final generated report (not included in modelling). Example: >>> predicted = pd.DataFrame()
(
predicted: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
)
| 216 | |
| 217 | |
| 218 | def report_generator( |
| 219 | predicted: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None |
| 220 | ) -> pd.DataFrame: |
| 221 | """ |
| 222 | Generate a clustering report given these two arguments: |
| 223 | predicted - dataframe with predicted cluster column |
| 224 | fill_missing_report - dictionary of rules on how we are going to fill in missing |
| 225 | values for final generated report (not included in modelling); |
| 226 | >>> predicted = pd.DataFrame() |
| 227 | >>> predicted['numbers'] = [1, 2, 3] |
| 228 | >>> predicted['col1'] = [0.5, 2.5, 4.5] |
| 229 | >>> predicted['col2'] = [100, 200, 300] |
| 230 | >>> predicted['col3'] = [10, 20, 30] |
| 231 | >>> predicted['Cluster'] = [1, 1, 2] |
| 232 | >>> report_generator(predicted, ['col1', 'col2'], 0) |
| 233 | Features Type Mark 1 2 |
| 234 | 0 # of Customers ClusterSize False 2.000000 1.000000 |
| 235 | 1 % of Customers ClusterProportion False 0.666667 0.333333 |
| 236 | 2 col1 mean_with_zeros True 1.500000 4.500000 |
| 237 | 3 col2 mean_with_zeros True 150.000000 300.000000 |
| 238 | 4 numbers mean_with_zeros False 1.500000 3.000000 |
| 239 | .. ... ... ... ... ... |
| 240 | 99 dummy 5% False 1.000000 1.000000 |
| 241 | 100 dummy 95% False 1.000000 1.000000 |
| 242 | 101 dummy stdev False 0.000000 NaN |
| 243 | 102 dummy mode False 1.000000 1.000000 |
| 244 | 103 dummy median False 1.000000 1.000000 |
| 245 | <BLANKLINE> |
| 246 | [104 rows x 5 columns] |
| 247 | """ |
| 248 | # Fill missing values with given rules |
| 249 | if fill_missing_report: |
| 250 | predicted = predicted.fillna(value=fill_missing_report) |
| 251 | predicted["dummy"] = 1 |
| 252 | numeric_cols = predicted.select_dtypes(np.number).columns |
| 253 | report = ( |
| 254 | predicted.groupby(["Cluster"])[ # construct report dataframe |
| 255 | numeric_cols |
| 256 | ] # group by cluster number |
| 257 | .agg( |
| 258 | [ |
| 259 | ("sum", "sum"), |
| 260 | ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))), |
| 261 | ("mean_without_zeros", lambda x: x.replace(0, np.nan).mean()), |
| 262 | ( |
| 263 | "mean_25-75", |
| 264 | lambda x: np.mean( |
| 265 | np.nan_to_num( |
| 266 | sorted(x)[ |
| 267 | round(len(x) * 25 / 100) : round(len(x) * 75 / 100) |
| 268 | ] |
| 269 | ) |
| 270 | ), |
| 271 | ), |
| 272 | ("mean_with_na", "mean"), |
| 273 | ("min", lambda x: x.min()), |
| 274 | ("5%", lambda x: x.quantile(0.05)), |
| 275 | ("25%", lambda x: x.quantile(0.25)), |