| 131 | self._filter_index = self.s.extractor_info_index() |
| 132 | |
def classify(self, gz_path: str) -> Decision:
    """Decide how a single gzipped input file should be handled.

    The checks below run in a fixed order and the first one that applies
    determines the result:

    1. Size gate (only when ``small_only`` or ``large_only`` is set):
       ``SizeSkip`` for a file on the wrong side of ``size_threshold``.
    2. Symlink whose resolved target maps to a different source than the
       link itself: ``Symlink``.
    3. Overwrite run with filter specs, for a source that has an entry in
       the filter index: ``Work`` if the stored extractor/metadata match
       the filter, ``FilterSkip`` otherwise.
    4. Non-overwrite run where the store already has the source:
       ``AlreadyStored``.
    5. Content dedup: ``ContentDup`` if the dedup key was seen earlier,
       otherwise the key is recorded and ``Work`` is returned.

    Args:
        gz_path: Filesystem path of the input file.

    Returns:
        The decision object produced by the first applicable check.
    """
    source = config.source_from_path(gz_path)

    # Optional size gate: the file size is only read when one of the
    # size-restricted modes is enabled.
    if self.small_only or self.large_only:
        nbytes = os.path.getsize(gz_path)
        limit = self.size_threshold
        if self.small_only and nbytes > limit:
            return SizeSkip(gz_path, source, nbytes, limit, "small")
        if self.large_only and nbytes <= limit:
            return SizeSkip(gz_path, source, nbytes, limit, "large")

    # A symlink is reported only when its resolved target maps to a
    # different source than the link path; otherwise fall through.
    if os.path.islink(gz_path):
        target = os.path.realpath(gz_path)
        target_source = config.source_from_path(target)
        if target_source != source:
            link_is_stored = self.s.has_manpage_source(source)
            target_is_input = target in self.normalized_inputs
            return Symlink(
                gz_path=gz_path,
                short_path=source,
                canonical_source=target_source,
                stale_in_db=link_is_stored,
                canonical_in_inputs=target_is_input,
            )

    if self.overwrite and self.filter_specs:
        row = self._filter_index.get(source)
        if row is not None:
            extractor, meta = row
            selected = _matches_filter(self.filter_specs, extractor, meta)
            # Both outcomes return before the dedup step on purpose and
            # never register anything in _hash_to_canonical: a sibling
            # with the same hash must not silently alias onto this row's
            # stale parsed_manpages.
            if not selected:
                # The stored row is outside the filter: keep its data.
                return FilterSkip(
                    gz_path=gz_path,
                    short_path=source,
                    stored_extractor=extractor,
                    stored_model=meta.model,
                )
            # The stored row is inside the filter: queue re-extraction.
            return Work(gz_path, source)

    if not self.overwrite and self.s.has_manpage_source(source):
        return AlreadyStored(gz_path, source)

    # Content dedup: the first source seen for a key becomes canonical;
    # later sources with the same key are reported as duplicates of it.
    digest = common.gz_sha256(gz_path)
    dedup_key = _dedup_key(digest, source)
    first_seen = self._hash_to_canonical.get(dedup_key)
    if first_seen is None:
        self._hash_to_canonical[dedup_key] = source
        return Work(gz_path, source)
    return ContentDup(gz_path, source, first_seen)
| 188 | |
| 189 | |
| 190 | # --------------------------------------------------------------------------- |