ZScore Normalization
| 226 | |
| 227 | |
class ZScoreNorm(Processor):
    """Z-score normalization: ``(x - mean) / std`` per column.

    The per-column mean and standard deviation are estimated once in
    :meth:`fit` from the rows inside ``[fit_start_time, fit_end_time]`` and
    are then reused unchanged every time the processor is called.
    """

    def __init__(self, fit_start_time, fit_end_time, fields_group=None):
        """Store the fitting window and the column-group selector.

        NOTE: choosing ``fit_start_time`` / ``fit_end_time`` correctly is
        critical. The window ending at ``fit_end_time`` **must not** contain
        any information from the test data.
        """
        self.fit_start_time = fit_start_time
        self.fit_end_time = fit_end_time
        self.fields_group = fields_group

    def fit(self, df: pd.DataFrame = None):
        """Estimate per-column mean and std on the fitting window.

        Sets ``mean_train``, ``std_train``, ``ignore`` (mask of columns whose
        std is exactly zero) and ``cols`` (the columns to normalize).
        """
        fit_window = slice(self.fit_start_time, self.fit_end_time)
        train_df = fetch_df_by_index(df, fit_window, level="datetime")
        columns = get_group_columns(train_df, self.fields_group)

        # NaN-aware statistics, reduced over rows (one value per column).
        train_values = train_df[columns].values
        self.mean_train = np.nanmean(train_values, axis=0)
        self.std_train = np.nanstd(train_values, axis=0)

        # Zero-variance columns cannot be scaled. Rather than special-casing
        # them at call time, give them mean 0 and std 1 so that the uniform
        # formula `(x - mean_train) / std_train` reduces to `(x - 0) / 1`
        # and leaves those columns untouched.
        self.ignore = self.std_train == 0
        self.std_train[self.ignore] = 1
        self.mean_train[self.ignore] = 0

        self.cols = columns

    def __call__(self, df):
        """Normalize the fitted columns of ``df`` in place and return ``df``."""
        center = self.mean_train
        scale = self.std_train
        standardized = (df[self.cols].values - center) / scale
        df.loc(axis=1)[self.cols] = standardized
        return df
| 260 | |
| 261 | |
| 262 | class RobustZScoreNorm(Processor): |
no outgoing calls