| 42 | |
| 43 | |
def parse_txt(doc_id):
    """Fetch a text document by its id and return its content as one string.

    Two requests are made through ``fetch_url``: the first retrieves the
    document info, from which the ``md5sum``, ``totalPageNum`` and ``rsign``
    fields are extracted; the second retrieves the text payload, which is
    decoded as JSON.

    Args:
        doc_id: Document identifier (a string; it is concatenated into
            both request URLs).

    Returns:
        The concatenation of the ``'c'`` value of every entry in every
        item's ``'parags'`` list, with literal ``\\r`` / ``\\n`` escape
        sequences turned into real carriage returns / newlines.

    Raises:
        IndexError: If one of the required fields is absent from the
            document info response.
    """
    info_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=' + doc_id
    info = fetch_url(info_url)

    # The info response is requested with callback=cb, so the fields are
    # extracted with regular expressions rather than parsed as JSON.
    fields = {}
    for key in ('md5sum', 'totalPageNum', 'rsign'):
        match = re.search('"' + key + '":"(.*?)"', info)
        if match is None:
            # IndexError is kept (it is what an empty findall()[0] raised)
            # so existing callers still catch it, now with a useful message.
            raise IndexError(
                'field "%s" not found in document info for doc_id %s' % (key, doc_id)
            )
        fields[key] = match.group(1)

    # NOTE(review): md5sum is appended directly after 'type=txt' without a
    # '&name=' separator; presumably the value returned by the API already
    # carries its own query-string prefix - confirm against a live response.
    text_url = (
        'https://wkretype.bdimg.com/retype/text/' + doc_id
        + '?rn=' + fields['totalPageNum']
        + '&type=txt' + fields['md5sum']
        + '&rsign=' + fields['rsign']
    )
    pages = json.loads(fetch_url(text_url))

    # Collect the pieces and join once instead of growing a string with +=.
    pieces = []
    for page in pages:
        for parag in page['parags']:
            pieces.append(parag['c'].replace('\\r', '\r').replace('\\n', '\n'))
    return ''.join(pieces)
| 57 | |
| 58 | |
| 59 | def parse_other(doc_id): |