| 23 | |
| 24 | |
def _decode_escapes(text):
    """Decode backslash escape sequences in *text* without damaging other characters."""
    # Encoding with UTF-8 first would turn each non-ASCII character into
    # several bytes that the unicode_escape codec then reads back as Latin-1
    # (mojibake). Latin-1 + backslashreplace keeps U+0080..U+00FF as single
    # bytes and rewrites anything larger as an escape, so both round-trip.
    # For pure-ASCII input the result is identical to the UTF-8 variant.
    return text.encode('latin-1', 'backslashreplace').decode('unicode_escape', 'ignore')


def parse_doc(content):
    """Build the document text from the JSON pages linked inside *content*.

    Every URL in *content* that matches the page-link pattern is collected,
    all but the last five are fetched with ``fetch_url``, and each
    ``"c"``/``"y"`` pair found in a fetched page contributes its ``"c"``
    value (escape sequences decoded) to the output.

    Args:
        content: Text (str) that embeds the escaped page URLs.

    Returns:
        The concatenated text. A newline is emitted before a fragment
        whenever its ``"y"`` value differs from the previous fragment's on
        the same page, including before the first fragment of every page.
        Returns an empty string when five or fewer links are found.
    """
    page_urls = re.findall('(https.*?0.json.*?)\\\\x22}', content)
    # The links are embedded with escaped slashes; restore plain "/".
    page_urls = [addr.replace("\\\\\\/", "/") for addr in page_urls]
    fragment_re = re.compile('"c":"(.*?)".*?"y":(.*?),')

    pieces = []
    # NOTE(review): the last five matches are always dropped - presumably
    # they are not text pages; confirm against the page format.
    for url in page_urls[:-5]:
        page = fetch_url(url)
        # None never equals a captured string, so the first fragment of each
        # page always starts a new line (the original used 0 the same way).
        last_y = None
        for text, y in fragment_re.findall(page):
            # presumably "y" is the vertical position of the fragment, so a
            # change means a new line of text - TODO confirm.
            if y != last_y:
                last_y = y
                pieces.append('\n')
            pieces.append(_decode_escapes(text))
    return ''.join(pieces)
| 42 | |
| 43 | |
| 44 | def parse_txt(doc_id): |