When we parse the `<a>` links in an `<html>` page, on the second and later runs we check them against the links saved from previous runs. If a link matches, it must not be an article, because article URLs change as time passes. This method also de-duplicates articles.
(source, articles)
| 256 | |
| 257 | |
def memoize_articles(source, articles):
    """Drop articles whose url was already seen on a previous run.

    When we parse the <a> links in an <html> page, on the 2nd run
    and later, check the <a> links of previous runs. If they match,
    it means the link must not be an article, because article urls
    change as time passes. This method also uniquifies articles.

    The urls seen so far are persisted, one per line, in a memo file
    under ``settings.MEMO_DIR`` named after ``source.domain``.

    :param source: object exposing ``domain`` and ``config``; the config
        must expose ``MAX_FILE_MEMO``.
    :param articles: sequence of objects exposing a ``url`` attribute.
    :return: list of the articles whose url is not in the memo file,
        with at most one article per url.
    """
    if not articles:
        return []

    config = source.config

    # Keyed by url: duplicate urls inside this batch collapse to the
    # last article carrying that url.
    cur_articles = {article.url: article for article in articles}
    d_pth = os.path.join(settings.MEMO_DIR, domain_to_filename(source.domain))

    # Stays empty on the first run, when no memo file exists yet and
    # every url of the current batch is saved as valid.
    memo = {}
    if os.path.exists(d_pth):
        with codecs.open(d_pth, 'r', 'utf8') as f:
            lines = f.readlines()
        # Skip blank lines so an empty string is never memoized as a url.
        stripped = (line.strip() for line in lines)
        memo = {url: True for url in stripped if url}

        cur_articles = {
            url: article
            for url, article in cur_articles.items()
            if url not in memo
        }

    # Previously memoized urls first, then the new ones from this batch.
    valid_urls = list(memo) + list(cur_articles)
    memo_text = '\r\n'.join(href.strip() for href in valid_urls)

    # The limit is checked against the number of urls loaded from disk,
    # not against the merged list that is about to be written.
    if len(memo) > config.MAX_FILE_MEMO:
        # We still keep current batch of articles though!
        log.critical('memo overflow, dumping')
        memo_text = ''

    # TODO if source: source.write_upload_times(prev_length, new_length)
    # codecs.open does no newline translation, so the '\r\n' separators
    # are written exactly as joined above.
    with codecs.open(d_pth, 'w', 'utf-8') as ff:
        ff.write(memo_text)

    return list(cur_articles.values())
| 306 | |
| 307 | |
| 308 | def get_useragent(): |
nothing calls this directly
no test coverage detected
searching dependent graphs…