hub / github.com/JustAnotherArchivist/snscrape / get_items

Method get_items

snscrape/modules/facebook.py:310–357 · view source on GitHub ↗

(self)

Source from the content-addressed store, hash-verified

308	self._group = group
309
def get_items(self):
    '''Yield the items scraped from the group page and its follow-up pagelet requests.

    The group page for self._group is fetched first. Its posts sit inside HTML
    comments within two <code> elements whose IDs are named in inline JavaScript.
    Further chunks are then requested from the GroupEntstreamPagelet endpoint,
    using the pagination data embedded in the previous response, until the
    endpoint returns an empty payload or no pagination data can be found.

    Yields:
        Whatever self._soup_to_items produces for each parsed chunk of HTML.

    Raises:
        snscrape.base.ScraperException: On a status code other than 200 (a 404 on
            the initial request only logs a warning and ends the generator), when
            the first container ID marker is absent, or when a code container is
            missing or does not hold a single comment.
        ValueError: From str.index if the second container ID marker, or the
            closing quote of a container ID, is absent from the page.
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}

    # Matches the pagelet name plus the first {...} object that is followed by ',{'.
    pageletDataPattern = re.compile(r'"GroupEntstreamPagelet",\{.*?\}(?=,\{)')
    pageletDataPrefixLength = len('"GroupEntstreamPagelet",')
    # The AJAX responses start with a 'for (;;);' guard that has to be removed before JSON parsing.
    spuriousForLoopPattern = re.compile(r'^for \(;;\);')

    baseUrl = f'https://upload.facebook.com/groups/{self._group}/?sorting_setting=CHRONOLOGICAL'
    r = self._get(baseUrl, headers = headers)
    if r.status_code == 404:
        _logger.warning('Group does not exist')
        return
    elif r.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

    if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
        raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')

    soup = bs4.BeautifulSoup(r.text, 'lxml')

    # Posts are inside an HTML comment in two code tags with IDs listed in JS...
    for codeContainerIdStart in ('content:{pagelet_group_mall:{container_id:"', 'content:{group_mall_after_tti:{container_id:"'):
        # The ID is the text between the end of the marker and the next double quote.
        codeContainerIdPos = r.text.index(codeContainerIdStart) + len(codeContainerIdStart)
        codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
        codeContainer = soup.find('code', id = codeContainerId)
        if not codeContainer:
            raise snscrape.base.ScraperException('Code container not found')
        if not isinstance(codeContainer.string, bs4.element.Comment):
            raise snscrape.base.ScraperException('Code container does not contain a comment')
        # The comment text is itself HTML, so it gets its own parse.
        codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
        yield from self._soup_to_items(codeSoup, baseUrl, 'group')

    # Pagination
    while True:
        # The data for the next request is embedded in the previous response.
        # search() returns None when it is absent; stop cleanly in that case
        # rather than failing with an AttributeError on the None match.
        paginationMatch = pageletDataPattern.search(r.text)
        if paginationMatch is None:
            _logger.warning('Pagination data not found, stopping')
            break
        data = paginationMatch.group(0)[pageletDataPrefixLength:]

        # As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
        r = self._get(
            'https://upload.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
            params = {'data': data, '__a': 1},
            headers = headers,
        )
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        obj = json.loads(spuriousForLoopPattern.sub('', r.text))
        if obj['payload'] == '':
            # End of pagination
            break
        soup = bs4.BeautifulSoup(obj['payload'], 'lxml')
        yield from self._soup_to_items(soup, baseUrl, 'group')
358
359	@classmethod
360	def _cli_setup_parser(cls, subparser):

Callers

nothing calls this directly

Calls 3

_get (Method) · 0.80

warning (Method) · 0.80

_soup_to_items (Method) · 0.45

Tested by

no test coverage detected