MCPcopy
hub / github.com/JustAnotherArchivist/snscrape / get_items

Method get_items

snscrape/modules/facebook.py:310–357  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

308 self._group = group
309
def get_items(self):
    """Yield the posts of the Facebook group ``self._group``, page by page.

    The group page is requested with ``sorting_setting=CHRONOLOGICAL``. Its
    posts live inside HTML comments in ``<code>`` elements whose IDs are
    listed in inline JS; each comment is parsed and passed to
    ``self._soup_to_items``. Further pages are then requested from the
    ``GroupEntstreamPagelet`` AJAX endpoint, using the pagelet data found in
    the previous response, until the endpoint returns an empty payload or no
    further pagelet data can be found.

    Yields:
        Whatever ``self._soup_to_items`` yields for each parsed page.

    Raises:
        snscrape.base.ScraperException: If a request returns a status code
            other than 200 (a 404 on the initial request only logs a warning
            and ends the generator), or if an expected code container
            marker, element, or comment is missing.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    pageletDataPattern = re.compile(r'"GroupEntstreamPagelet",\{.*?\}(?=,\{)')
    pageletDataPrefixLength = len('"GroupEntstreamPagelet",')
    spuriousForLoopPattern = re.compile(r'^for \(;;\);')
    codeContainerIdStarts = (
        'content:{pagelet_group_mall:{container_id:"',
        'content:{group_mall_after_tti:{container_id:"',
    )

    baseUrl = f'https://upload.facebook.com/groups/{self._group}/?sorting_setting=CHRONOLOGICAL'
    r = self._get(baseUrl, headers=headers)
    if r.status_code == 404:
        _logger.warning('Group does not exist')
        return
    if r.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

    # Cheap sanity check before parsing the whole document.
    if codeContainerIdStarts[0] not in r.text:
        raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')

    soup = bs4.BeautifulSoup(r.text, 'lxml')

    # Posts are inside an HTML comment in two code tags with IDs listed in JS...
    for codeContainerIdStart in codeContainerIdStarts:
        # str.find rather than str.index so that a missing marker or an
        # unterminated ID surfaces as a ScraperException, not a bare ValueError.
        markerPos = r.text.find(codeContainerIdStart)
        if markerPos == -1:
            raise snscrape.base.ScraperException('Code container ID marker not found')
        codeContainerIdPos = markerPos + len(codeContainerIdStart)
        codeContainerIdEnd = r.text.find('"', codeContainerIdPos)
        if codeContainerIdEnd == -1:
            raise snscrape.base.ScraperException('Code container ID is not terminated')
        codeContainerId = r.text[codeContainerIdPos:codeContainerIdEnd]
        codeContainer = soup.find('code', id=codeContainerId)
        if not codeContainer:
            raise snscrape.base.ScraperException('Code container not found')
        if not isinstance(codeContainer.string, bs4.element.Comment):
            raise snscrape.base.ScraperException('Code container does not contain a comment')
        codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
        yield from self._soup_to_items(codeSoup, baseUrl, 'group')

    # Pagination
    while True:
        # The data for the next request is taken from the most recent response.
        pageletDataMatch = pageletDataPattern.search(r.text)
        if pageletDataMatch is None:
            # Without pagelet data there is nothing to request next; stop
            # instead of failing with an AttributeError on None.
            _logger.warning('Pagination data not found, stopping')
            return
        data = pageletDataMatch.group(0)[pageletDataPrefixLength:]
        # As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
        r = self._get(
            'https://upload.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
            params={'data': data, '__a': 1},
            headers=headers,
        )
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        # The response body is JSON prefixed with a 'for (;;);' guard that must be stripped first.
        obj = json.loads(spuriousForLoopPattern.sub('', r.text))
        if obj['payload'] == '':
            # End of pagination
            break
        soup = bs4.BeautifulSoup(obj['payload'], 'lxml')
        yield from self._soup_to_items(soup, baseUrl, 'group')
358
359 @classmethod
360 def _cli_setup_parser(cls, subparser):

Callers

nothing calls this directly

Calls 3

_get (Method) · 0.80
warning (Method) · 0.80
_soup_to_items (Method) · 0.45

Tested by

no test coverage detected