If the article's body text is long enough to meet standard article requirements, keep the article
(self)
| 259 | return urls.valid_url(self.url) |
| 260 | |
def is_valid_body(self):
    """Return True if the parsed article passes the body-quality checks.

    The article is accepted immediately when its meta type is
    ``'article'`` and its word count exceeds ``config.MIN_WORD_COUNT``
    (this shortcut runs before the title and HTML checks). Otherwise it
    is rejected if any of the following holds, checked in this order:

    * it has no text and ``is_media_news()`` is falsy,
    * the title is missing or has fewer than two words,
    * the text has fewer than ``config.MIN_WORD_COUNT`` words,
    * splitting the text on ``'.'`` yields fewer than
      ``config.MIN_SENT_COUNT`` pieces,
    * the HTML is missing or empty.

    If none of the rejections apply the article is accepted.

    Raises:
        ArticleException: if the article has not been parsed yet.
    """
    if not self.is_parsed:
        # Single literal: a backslash continuation inside the string
        # would embed the next line's indentation in the message.
        raise ArticleException(
            'must parse article before checking if its body is valid!')

    meta_type = self.extractor.get_meta_type(self.clean_doc)

    # split() with no separator splits on any whitespace run and drops
    # empty tokens, so newlines between paragraphs and repeated spaces
    # do not skew the count, and empty text counts as zero words.
    word_count = len(self.text.split())
    # Rough estimate only: counts '.'-separated pieces, so abbreviations
    # and a trailing period each add one.
    sent_count = len(self.text.split('.'))

    if meta_type == 'article' and word_count > self.config.MIN_WORD_COUNT:
        log.debug('%s verified for article and wc' % self.url)
        return True

    if not self.is_media_news() and not self.text:
        log.debug('%s caught for no media no text' % self.url)
        return False

    if self.title is None or len(self.title.split()) < 2:
        log.debug('%s caught for bad title' % self.url)
        return False

    if word_count < self.config.MIN_WORD_COUNT:
        log.debug('%s caught for word cnt' % self.url)
        return False

    if sent_count < self.config.MIN_SENT_COUNT:
        log.debug('%s caught for sent cnt' % self.url)
        return False

    if not self.html:
        log.debug('%s caught for no html' % self.url)
        return False

    log.debug('%s verified for default true' % self.url)
    return True
| 299 | |
| 300 | def is_media_news(self): |
| 301 | """If the article is related heavily to media: |
no test coverage detected