Method is_valid_body

newspaper/article.py:261–298 · view source on GitHub ↗

If the article's body text is long enough to meet standard article requirements, keep the article

(self)

Source from the content-addressed store, hash-verified

259	return urls.valid_url(self.url)
260
def is_valid_body(self):
    """Return whether the parsed article's body looks like a real article.

    The checks run in order and the first one that matches decides:

    1. Meta type is ``'article'`` and the text has more than
       ``config.MIN_WORD_COUNT`` words -> ``True``.
    2. ``is_media_news()`` is false and the text is empty -> ``False``.
    3. The title is ``None`` or has fewer than two words -> ``False``.
    4. The text has fewer than ``config.MIN_WORD_COUNT`` words -> ``False``.
    5. The text has fewer than ``config.MIN_SENT_COUNT`` '.'-separated
       pieces -> ``False``.
    6. ``html`` is ``None`` or empty -> ``False``.

    Anything that survives every check -> ``True``.

    Raises:
        ArticleException: if ``is_parsed`` is false.
    """
    if not self.is_parsed:
        # Adjacent literals rather than a backslash continuation inside
        # the string: the continuation form embeds the source
        # indentation of the second line into the runtime message.
        raise ArticleException(
            'must parse article before checking '
            'if its body is valid!')

    meta_type = self.extractor.get_meta_type(self.clean_doc)

    # split() with no argument breaks on any run of whitespace, so words
    # separated by newlines/tabs are counted individually and repeated
    # spaces (or empty text) do not produce phantom empty "words".
    word_count = len(self.text.split())
    # Rough heuristic kept as-is: the number of '.'-separated pieces,
    # i.e. (number of periods + 1), not a true sentence count.
    sent_count = len(self.text.split('.'))

    if meta_type == 'article' and word_count > self.config.MIN_WORD_COUNT:
        log.debug('%s verified for article and wc', self.url)
        return True

    if not self.is_media_news() and not self.text:
        log.debug('%s caught for no media no text', self.url)
        return False

    if self.title is None or len(self.title.split()) < 2:
        log.debug('%s caught for bad title', self.url)
        return False

    if word_count < self.config.MIN_WORD_COUNT:
        log.debug('%s caught for word cnt', self.url)
        return False

    if sent_count < self.config.MIN_SENT_COUNT:
        log.debug('%s caught for sent cnt', self.url)
        return False

    if not self.html:
        log.debug('%s caught for no html', self.url)
        return False

    log.debug('%s verified for default true', self.url)
    return True
299
300	def is_media_news(self):
301	"""If the article is related heavily to media:

purge_articlesMethod · 0.80

is_media_newsMethod · 0.95

ArticleExceptionClass · 0.85

get_meta_typeMethod · 0.80

splitMethod · 0.80

no test coverage detected