dump data as the following format: `/path/to/<field>.data` [date, period, value, _next] [date, period, value, _next] [...] `/path/to/<field>.index` [first_year, index, index, ...]
(
self,
file_path: str,
interval: str = "quarterly",
overwrite: bool = False,
)
| 148 | ) |
| 149 | |
| 150 | def _dump_pit( |
| 151 | self, |
| 152 | file_path: str, |
| 153 | interval: str = "quarterly", |
| 154 | overwrite: bool = False, |
| 155 | ): |
| 156 | """ |
| 157 | dump data as the following format: |
| 158 | `/path/to/<field>.data` |
| 159 | [date, period, value, _next] |
| 160 | [date, period, value, _next] |
| 161 | [...] |
| 162 | `/path/to/<field>.index` |
| 163 | [first_year, index, index, ...] |
| 164 | |
| 165 | `<field>.data` contains the data as the point-in-time (PIT) order: `value` of `period` |
| 166 | is published at `date`, and its successive revised value can be found at `_next` (linked list). |
| 167 | |
| 168 | `<field>.index` contains the index of value for each period (quarter or year). To save |
| 169 | disk space, we only store the `first_year` as its following periods can be easily inferred. |
| 170 | |
| 171 | Parameters |
| 172 | ---------- |
| 173 | file_path: str |
| 174 | path of the source data file; the stock symbol is derived from it |
| 175 | interval: str |
| 176 | data interval |
| 177 | overwrite: bool |
| 178 | whether overwrite existing data or update only |
| 179 | """ |
| 180 | symbol = self.get_symbol_from_file(file_path) |
| 181 | df = self.get_source_data(file_path) |
| 182 | if df.empty: |
| 183 | logger.warning(f"{symbol} file is empty") |
| 184 | return |
| 185 | for field in self.get_dump_fields(df): |
| 186 | df_sub = df.query(f'{self.field_column_name}=="{field}"').sort_values(self.date_column_name) |
| 187 | if df_sub.empty: |
| 188 | logger.warning(f"field {field} of {symbol} is empty") |
| 189 | continue |
| 190 | data_file, index_file = self.get_filenames(symbol, field, interval) |
| 191 | |
| 192 | ## calculate first & last period |
| 193 | start_year = df_sub[self.period_column_name].min() |
| 194 | end_year = df_sub[self.period_column_name].max() |
| 195 | if interval == self.INTERVAL_quarterly: |
| 196 | start_year //= 100 |
| 197 | end_year //= 100 |
| 198 | |
| 199 | # adjust `first_year` if existing data found |
| 200 | if not overwrite and index_file.exists(): |
| 201 | with open(index_file, "rb") as fi: |
| 202 | (first_year,) = struct.unpack(self.PERIOD_DTYPE, fi.read(self.PERIOD_DTYPE_SIZE)) |
| 203 | n_years = len(fi.read()) // self.INDEX_DTYPE_SIZE |
| 204 | if interval == self.INTERVAL_quarterly: |
| 205 | n_years //= 4 |
| 206 | start_year = first_year + n_years |
| 207 | else: |
nothing calls this directly
no test coverage detected