MCPcopy
hub / github.com/codelucas/newspaper / is_valid_body

Method is_valid_body

newspaper/article.py:261–298  ·  view source on GitHub ↗

If the article's body text is long enough to meet standard article requirements, keep the article

(self)

Source from the content-addressed store, hash-verified

259 return urls.valid_url(self.url)
260
def is_valid_body(self):
    """Decide whether the parsed article's body qualifies as an article.

    The article is accepted immediately when its meta type is
    ``'article'`` and its word count exceeds ``config.MIN_WORD_COUNT``.
    Otherwise it is rejected as soon as one of these checks fails, in
    order:

    1. it is not media news and has no text,
    2. the title is missing or has fewer than two space-separated words,
    3. the word count is below ``config.MIN_WORD_COUNT``,
    4. the sentence count is below ``config.MIN_SENT_COUNT``,
    5. the html is missing or empty.

    If no check fails the article is accepted.

    Returns:
        bool: True if the article should be kept, False otherwise.

    Raises:
        ArticleException: if the article has not been parsed yet.
    """
    if not self.is_parsed:
        # Single-line literal: a backslash continuation inside the string
        # would embed the next line's indentation in the message.
        raise ArticleException(
            'must parse article before checking if it\'s body is valid!')

    meta_type = self.extractor.get_meta_type(self.clean_doc)

    # Treat a missing text like an empty one instead of crashing on
    # ``None.split`` (title and html are already allowed to be None).
    text = self.text or ''

    # split() with no argument splits on any run of whitespace, so words
    # separated by newlines/tabs are counted, repeated spaces do not
    # produce empty "words", and an empty text yields a count of 0.
    word_count = len(text.split())
    # Number of '.'-separated pieces (i.e. period count + 1); kept as in
    # the original so MIN_SENT_COUNT keeps its existing meaning.
    sent_count = len(text.split('.'))

    min_words = self.config.MIN_WORD_COUNT
    min_sents = self.config.MIN_SENT_COUNT

    if meta_type == 'article' and word_count > min_words:
        log.debug('%s verified for article and wc', self.url)
        return True

    if not self.is_media_news() and not text:
        log.debug('%s caught for no media no text', self.url)
        return False

    if self.title is None or len(self.title.split(' ')) < 2:
        log.debug('%s caught for bad title', self.url)
        return False

    if word_count < min_words:
        log.debug('%s caught for word cnt', self.url)
        return False

    if sent_count < min_sents:
        log.debug('%s caught for sent cnt', self.url)
        return False

    if not self.html:
        log.debug('%s caught for no html', self.url)
        return False

    log.debug('%s verified for default true', self.url)
    return True
299
300 def is_media_news(self):
301 """If the article is related heavily to media:

Callers 1

purge_articlesMethod · 0.80

Calls 4

is_media_newsMethod · 0.95
ArticleExceptionClass · 0.85
get_meta_typeMethod · 0.80
splitMethod · 0.80

Tested by

no test coverage detected