(data_path, out_path)
| 38 | |
| 39 | |
def split_doc2(data_path, out_path):
    """Turn comma-separated text records into numbered ``.story`` files.

    Every record returned by ``load_txt_data(data_path)`` is split on ``','``.
    The first field is used as the document body and the second as its
    abstract. A record is skipped when its first field is shorter than 100
    characters or when it has no second field.

    For each kept record, double quotes and spaces are removed from both
    fields and every remaining character is separated by a single space. The
    body is additionally split on ``'。'``. The list handed to
    ``save_txt_file`` is: one newline-terminated entry per body piece, then
    ``'@highlight\\n'``, then the abstract (without a trailing newline).

    Args:
        data_path: Passed unchanged to ``load_txt_data``; presumably the path
            of a text file with one record per line -- confirm against that
            helper.
        out_path: Prefix of the output names. The running index of written
            documents and ``'.story'`` are appended to it by plain string
            concatenation, so any directory separator must already be
            included.
    """
    import re

    strip_pattern = "[\" ]"
    records = load_txt_data(data_path)
    written = 0

    for record in tqdm(records):
        fields = record.split(',')
        body = fields[0]

        # Skip bodies that are too short and records lacking an abstract field.
        if len(body) < 100 or len(fields) < 2:
            continue

        # Joining a string with ' ' puts a space between every character.
        summary = ' '.join(re.sub(strip_pattern, "", fields[1]))

        # str.split keeps empty pieces, so a body ending in '。' contributes a
        # bare '\n' entry as its last piece.
        pieces = re.sub(strip_pattern, "", body).split('。')
        story = [' '.join(piece) + '\n' for piece in pieces]
        story += ['@highlight\n', summary]

        save_txt_file(story, out_path + str(written) + '.story')
        written += 1
| 65 | |
| 66 | |
| 67 | def delete_data(path, b_range=None, e_range=None): |
nothing calls this directly
no test coverage detected