MCPcopy
hub / github.com/microsoft/qlib / ZScoreNorm

Class ZScoreNorm

qlib/data/dataset/processor.py:228–259  ·  view source on GitHub ↗

ZScore Normalization

Source from the content-addressed store, hash-verified

226
227
class ZScoreNorm(Processor):
    """Z-score normalization of a group of columns.

    Per-column statistics are estimated in :meth:`fit` on the rows selected by
    ``slice(fit_start_time, fit_end_time)`` on the ``datetime`` index level,
    and are applied to incoming frames by calling the processor.
    """

    def __init__(self, fit_start_time, fit_end_time, fields_group=None):
        # NOTE: choosing `fit_start_time` and `fit_end_time` correctly is very important!
        # The fit window **must not** include any information from the test data.
        self.fields_group = fields_group
        self.fit_end_time = fit_end_time
        self.fit_start_time = fit_start_time

    def fit(self, df: pd.DataFrame = None):
        """Estimate the NaN-aware mean and standard deviation of each selected column.

        Stores ``mean_train``, ``std_train``, ``ignore`` (mask of columns whose
        standard deviation is exactly zero) and ``cols`` on the instance.
        """
        fit_window = slice(self.fit_start_time, self.fit_end_time)
        fit_df = fetch_df_by_index(df, fit_window, level="datetime")
        selected = get_group_columns(fit_df, self.fields_group)

        sample = fit_df[selected].values
        self.mean_train = np.nanmean(sample, axis=0)
        self.std_train = np.nanstd(sample, axis=0)

        # Zero-variance columns must pass through untouched. Rather than
        # special-casing them at transform time, give them a neutral mean of 0
        # and a std of 1 so the uniform `(x - mean_train) / std_train` formula
        # leaves their values unchanged.
        zero_variance = self.std_train == 0
        self.ignore = zero_variance
        self.std_train[zero_variance] = 1
        self.mean_train[zero_variance] = 0

        self.cols = selected

    def __call__(self, df):
        """Standardize the fitted columns of ``df`` (written back into ``df``) and return it."""
        raw = df[self.cols].values
        df.loc(axis=1)[self.cols] = (raw - self.mean_train) / self.std_train
        return df
260
261
262class RobustZScoreNorm(Processor):

Callers 1

test_ZScoreNormMethod · 0.90

Calls

no outgoing calls

Tested by 1

test_ZScoreNormMethod · 0.72