(data_path, out_path)
| 18 | |
| 19 | |
def split_doc(data_path, out_path):
    """Split each record of a text data file into its own ``.story`` file.

    Every record returned by ``load_txt_data(data_path)`` is split on ASCII
    commas. Field 0 is used as the abstract and field 1 as the document
    body; any further fields are ignored. The body is cut into segments at
    the punctuation marks listed below, and the characters of every segment
    (and of the abstract) are separated by single spaces.

    The lines handed to ``save_txt_file`` for one record are: one
    newline-terminated line per segment, then the ``@highlight`` marker
    line, then the abstract (without a trailing newline).

    Args:
        data_path: Path passed unchanged to ``load_txt_data``.
        out_path: Prefix of every output file name. It is concatenated
            directly with the zero-padded record index and ``'.story'``,
            so it presumably needs to end with a path separator -- verify
            against the caller.

    Raises:
        IndexError: If a record contains no comma (there is no field 1).
    """
    # Import and build the splitter once; both are loop-invariant.
    from pyparsing import oneOf
    punc = oneOf(list("。,;;!?"))

    data = load_txt_data(data_path)
    for doc_index, record in enumerate(tqdm(data, desc='split_doc')):
        fields = record.split(',')
        # Joining a string with ' ' puts a space between every character.
        abstract = " ".join(fields[0])
        # ParserElement.split yields the text between punctuation matches.
        document = [' '.join(segment) + '\n'
                    for segment in punc.split(fields[1])]
        new_doc = document + ['@highlight\n'] + [abstract]
        # File names carry the record index left-padded with zeros to at
        # least nine digits, e.g. '000000000.story'.
        file_name = str(doc_index).zfill(9) + '.story'
        save_txt_file(new_doc, out_path + file_name)
| 38 | |
| 39 | |
| 40 | def split_doc2(data_path, out_path): |
nothing calls this directly
no test coverage detected