(
stream: BinaryIO,
feed_url: str,
max_bytes: int = MAX_FEED_BYTES,
)
| 191 | |
| 192 | |
| 193 | def parse_feed_metadata_stream( |
| 194 | stream: BinaryIO, |
| 195 | feed_url: str, |
| 196 | max_bytes: int = MAX_FEED_BYTES, |
| 197 | ) -> FeedMetadata: |
| 198 | parser = ET.XMLPullParser(events=("start", "end")) |
| 199 | path: List[str] = [] |
| 200 | root_name = "" |
| 201 | title = "" |
| 202 | html_url = "" |
| 203 | atom_fallback_html_url = "" |
| 204 | bytes_read = 0 |
| 205 | |
| 206 | while True: |
| 207 | chunk = stream.read(READ_CHUNK_SIZE) |
| 208 | if not chunk: |
| 209 | break |
| 210 | bytes_read += len(chunk) |
| 211 | if bytes_read > max_bytes: |
| 212 | raise ValueError(f"Feed payload is too large (> {max_bytes} bytes)") |
| 213 | |
| 214 | try: |
| 215 | parser.feed(chunk) |
| 216 | except ET.ParseError as exc: |
| 217 | raise ValueError(f"RSS/Atom XML parse failed: {exc}") from exc |
| 218 | |
| 219 | for event, elem in parser.read_events(): |
| 220 | local_name = strip_namespace(elem.tag) |
| 221 | |
| 222 | if event == "start": |
| 223 | path.append(local_name) |
| 224 | |
| 225 | if len(path) == 1: |
| 226 | root_name = local_name |
| 227 | if root_name not in {"rss", "feed", "rdf"}: |
| 228 | raise ValueError(f"Unsupported feed root tag: {root_name}") |
| 229 | elif root_name == "feed" and len(path) == 2: |
| 230 | if local_name == "entry": |
| 231 | return _build_feed_metadata( |
| 232 | root_name, |
| 233 | title, |
| 234 | html_url or atom_fallback_html_url, |
| 235 | feed_url, |
| 236 | ) |
| 237 | if local_name == "link": |
| 238 | href = normalize_url(elem.attrib.get("href", "")) |
| 239 | if href: |
| 240 | rel = (elem.attrib.get("rel", "alternate") or "alternate").strip().lower() |
| 241 | if rel in {"alternate", ""}: |
| 242 | html_url = href |
| 243 | elif not atom_fallback_html_url: |
| 244 | atom_fallback_html_url = href |
| 245 | if title and html_url: |
| 246 | return _build_feed_metadata(root_name, title, html_url, feed_url) |
| 247 | continue |
| 248 | |
| 249 | if root_name in {"rss", "rdf"}: |
| 250 | if len(path) == 3 and path[1] == "channel": |
no test coverage detected