(self)
| 308 | self._group = group |
| 309 | |
def get_items(self):
    """Yield the items of the Facebook group ``self._group``, page by page.

    The group page is fetched first and the posts embedded in it are
    yielded; further pages are then requested from the
    ``GroupEntstreamPagelet`` AJAX endpoint until it returns an empty
    payload or no pagination data can be found in the last response.

    Yields:
        Whatever ``self._soup_to_items`` produces for each parsed page.

    Raises:
        snscrape.base.ScraperException: On an unexpected HTTP status code,
            or when the expected code container markup is missing.

    If the group page returns 404, a warning is logged and nothing is
    yielded.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}

    pageletDataPattern = re.compile(r'"GroupEntstreamPagelet",\{.*?\}(?=,\{)')
    pageletDataPrefixLength = len('"GroupEntstreamPagelet",')
    # AJAX responses are prefixed with 'for (;;);', which must be removed before JSON decoding.
    spuriousForLoopPattern = re.compile(r'^for \(;;\);')

    baseUrl = f'https://upload.facebook.com/groups/{self._group}/?sorting_setting=CHRONOLOGICAL'
    r = self._get(baseUrl, headers = headers)
    if r.status_code == 404:
        _logger.warning('Group does not exist')
        return
    elif r.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

    if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
        raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')

    soup = bs4.BeautifulSoup(r.text, 'lxml')

    # Posts are inside an HTML comment in two code tags with IDs listed in JS...
    for codeContainerIdStart in ('content:{pagelet_group_mall:{container_id:"', 'content:{group_mall_after_tti:{container_id:"'):
        markerPos = r.text.find(codeContainerIdStart)
        if markerPos == -1:
            # Only the first marker is checked above; report a missing second
            # marker as a scraper error rather than a bare ValueError.
            raise snscrape.base.ScraperException('Code container ID marker not found')
        codeContainerIdPos = markerPos + len(codeContainerIdStart)
        codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
        codeContainer = soup.find('code', id = codeContainerId)
        if codeContainer is None:
            raise snscrape.base.ScraperException('Code container not found')
        if not isinstance(codeContainer.string, bs4.element.Comment):
            raise snscrape.base.ScraperException('Code container does not contain a comment')
        codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
        yield from self._soup_to_items(codeSoup, baseUrl, 'group')

    # Pagination
    while True:
        # The data for the next request is taken from the most recent response.
        # search() returns None when it is absent, which is treated as the end
        # of pagination instead of failing on None.group().
        match = pageletDataPattern.search(r.text)
        if match is None:
            break
        data = match.group(0)[pageletDataPrefixLength:]
        if not data:
            break
        # As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
        r = self._get(
            'https://upload.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
            params = {'data': data, '__a': 1},
            headers = headers,
        )
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        obj = json.loads(spuriousForLoopPattern.sub('', r.text))
        if obj['payload'] == '':
            # End of pagination
            break
        soup = bs4.BeautifulSoup(obj['payload'], 'lxml')
        yield from self._soup_to_items(soup, baseUrl, 'group')
| 358 | |
| 359 | @classmethod |
| 360 | def _cli_setup_parser(cls, subparser): |
nothing calls this directly
no test coverage detected