MCPcopy Index your code
hub / github.com/microsoft/qlib / _dump_pit

Method _dump_pit

scripts/dump_pit.py:150–264  ·  view source on GitHub ↗

Dump data in the following format: `/path/to/<field>.data` [date, period, value, _next] [date, period, value, _next] [...] `/path/to/<field>.index` [first_year, index, index, ...]

(
        self,
        file_path: str,
        interval: str = "quarterly",
        overwrite: bool = False,
    )

Source from the content-addressed store, hash-verified

148 )
149
150 def _dump_pit(
151 self,
152 file_path: str,
153 interval: str = "quarterly",
154 overwrite: bool = False,
155 ):
156 """
157 dump data as the following format:
158 `/path/to/<field>.data`
159 [date, period, value, _next]
160 [date, period, value, _next]
161 [...]
162 `/path/to/<field>.index`
163 [first_year, index, index, ...]
164
165 `<field.data>` contains the data as the point-in-time (PIT) order: `value` of `period`
166 is published at `date`, and its successive revised value can be found at `_next` (linked list).
167
168 `<field>.index` contains the index of value for each period (quarter or year). To save
169 disk space, we only store the `first_year` as its followings periods can be easily infered.
170
171 Parameters
172 ----------
173 symbol: str
174 stock symbol
175 interval: str
176 data interval
177 overwrite: bool
178 whether overwrite existing data or update only
179 """
180 symbol = self.get_symbol_from_file(file_path)
181 df = self.get_source_data(file_path)
182 if df.empty:
183 logger.warning(f"{symbol} file is empty")
184 return
185 for field in self.get_dump_fields(df):
186 df_sub = df.query(f'{self.field_column_name}=="{field}"').sort_values(self.date_column_name)
187 if df_sub.empty:
188 logger.warning(f"field {field} of {symbol} is empty")
189 continue
190 data_file, index_file = self.get_filenames(symbol, field, interval)
191
192 ## calculate first & last period
193 start_year = df_sub[self.period_column_name].min()
194 end_year = df_sub[self.period_column_name].max()
195 if interval == self.INTERVAL_quarterly:
196 start_year //= 100
197 end_year //= 100
198
199 # adjust `first_year` if existing data found
200 if not overwrite and index_file.exists():
201 with open(index_file, "rb") as fi:
202 (first_year,) = struct.unpack(self.PERIOD_DTYPE, fi.read(self.PERIOD_DTYPE_SIZE))
203 n_years = len(fi.read()) // self.INDEX_DTYPE_SIZE
204 if interval == self.INTERVAL_quarterly:
205 n_years //= 4
206 start_year = first_year + n_years
207 else:

Callers

nothing calls this directly

Calls 9

get_symbol_from_fileMethod · 0.95
get_source_dataMethod · 0.95
get_dump_fieldsMethod · 0.95
get_filenamesMethod · 0.95
get_period_offsetFunction · 0.90
queryMethod · 0.80
maxMethod · 0.80
existsMethod · 0.45
writeMethod · 0.45

Tested by

no test coverage detected