MCPcopy
hub / github.com/microsoft/qlib / setup_data

Method setup_data

qlib/contrib/data/dataset.py:164–210  ·  view source on GitHub ↗
(self, handler_kwargs: dict = None, **kwargs)

Source from the content-addressed store, hash-verified

162 super().__init__(handler, segments, **kwargs)
163
def setup_data(self, handler_kwargs: dict = None, **kwargs):
    """Load the handler's data and build the numpy caches used for sampling.

    The handler's frame is copied, its two index levels are swapped and
    sorted in place, and the ``"feature"`` / ``"label"`` column groups are
    converted to float32 arrays. Per-sample slices, per-date slice groups,
    the memory buffer and a zero padding array are then created.

    Args:
        handler_kwargs: Optional keyword arguments forwarded to
            ``self.handler.setup_data``. The handler is not re-set-up when
            this is ``None``.
        **kwargs: Forwarded unchanged to the parent class's ``setup_data``.

    Raises:
        ValueError: If ``self.input_size`` is set and the feature width is
            not a multiple of it, or if ``self.memory_mode`` is neither
            ``"sample"`` nor ``"daily"``.
    """
    super().setup_data(**kwargs)

    if handler_kwargs is not None:
        self.handler.setup_data(**handler_kwargs)

    # pre-fetch data and change index to <code, date>
    # NOTE: we will use inplace sort to reduce memory use
    try:
        df = self.handler._learn.copy()  # use copy otherwise recorder will fail
        # FIXME: currently we cannot support switching from `_learn` to `_infer` for inference
    except Exception:
        # Deliberate best-effort fallback: any failure to read `_learn`
        # falls back to the handler's raw `_data` frame.
        warnings.warn("cannot access `_learn`, will load raw data")
        df = self.handler._data.copy()
    df.index = df.index.swaplevel()
    df.sort_index(inplace=True)

    # convert to numpy
    self._data = df["feature"].values.astype("float32")
    np.nan_to_num(self._data, copy=False)  # NOTE: fillna in case users forget using the fillna processor
    self._label = df["label"].squeeze().values.astype("float32")
    self._index = df.index

    if self.input_size is not None and self.input_size != self._data.shape[1]:
        warnings.warn("the data has different shape from input_size and the data will be reshaped")
        # Raise explicitly instead of `assert` so the check survives `python -O`.
        if self._data.shape[1] % self.input_size != 0:
            raise ValueError("data mismatch, please check `input_size`")

    # create batch slices
    self._batch_slices = _create_ts_slices(self._index, self.seq_len)

    # create daily slices
    daily_slices = {date: [] for date in sorted(self._index.unique(level=1))}  # sorted by date
    # Only the second index level (date) is needed to group the slices, so
    # iterate that level directly rather than unpacking every index tuple.
    for i, date in enumerate(self._index.get_level_values(1)):
        daily_slices[date].append(self._batch_slices[i])
    self._daily_slices = np.array(list(daily_slices.values()), dtype="object")
    self._daily_index = pd.Series(list(daily_slices.keys()))  # index is the original date index

    # add memory (sample wise and daily)
    if self.memory_mode == "sample":
        self._memory = np.zeros((len(self._data), self.num_states), dtype=np.float32)
    elif self.memory_mode == "daily":
        self._memory = np.zeros((len(self._daily_index), self.num_states), dtype=np.float32)
    else:
        raise ValueError(f"invalid memory_mode `{self.memory_mode}`")

    # padding tensor
    self._zeros = np.zeros((self.seq_len, max(self.num_states, self._data.shape[1])), dtype=np.float32)
211
212 def _prepare_seg(self, slc, **kwargs):
213 fn = _get_date_parse_fn(self._index[0][1])

Callers 1

generate_datasetMethod · 0.45

Calls 4

copyMethod · 0.80
valuesMethod · 0.80
_create_ts_slicesFunction · 0.70
sort_indexMethod · 0.45

Tested by

no test coverage detected