(page: Page | Frame)
| 168 | * Uses clone + remove approach: clones body, removes marked elements, returns innerText. |
| 169 | */ |
export async function getCleanTextWithStripping(page: Page | Frame): Promise<string> {
  // The callback is executed inside the page context, so it must stay
  // self-contained: everything it needs is declared within it.
  const pageText = await page.evaluate(() => {
    const root = document.body;
    if (!root) {
      // Document without a <body>: nothing to extract.
      return '';
    }

    // Work on a deep copy so the live DOM is never mutated.
    const copy = root.cloneNode(true) as HTMLElement;

    // First pass drops standard noise elements, second pass drops
    // everything previously marked with data-gstack-hidden.
    const removalPasses = ['script, style, noscript, svg', '[data-gstack-hidden]'];
    for (const selector of removalPasses) {
      copy.querySelectorAll(selector).forEach(node => {
        node.remove();
      });
    }

    // Trim every line and keep only the ones that still have content.
    const keptLines: string[] = [];
    for (const rawLine of copy.innerText.split('\n')) {
      const trimmed = rawLine.trim();
      if (trimmed.length > 0) {
        keptLines.push(trimmed);
      }
    }
    return keptLines.join('\n');
  });

  return stripLoneSurrogates(pageText);
}
| 187 | |
| 188 | /** |
| 189 | * Clean up data-gstack-hidden attributes from the page. |
no test coverage detected