(raw_corpus_file_name, result_file_name)
| 26 | result_file.close() |
| 27 | |
def prepocess_v2(raw_corpus_file_name, result_file_name):
    """Convert a question/answer CSV corpus into a tab-separated text file.

    The input is read as CSV with a header row; the ``question`` and
    ``answer`` columns of each row are used. Rows in which either field
    contains the placeholder "沒有資料" ("no data") are dropped. Both fields
    of every kept row are passed through ``tradition2simple`` (defined
    elsewhere; presumably Traditional -> Simplified Chinese conversion --
    confirm against its definition) and written as one
    ``question<TAB>answer`` line.

    Args:
        raw_corpus_file_name: Path of the CSV corpus to read. It is decoded
            with ``Config.encoding``; undecodable bytes are replaced rather
            than raising.
        result_file_name: Path of the output file. It is opened in "w" mode
            (existing content is overwritten) and encoded with
            ``Config.encoding``.

    Raises:
        KeyError: If the CSV header has no ``question`` or ``answer`` column.
    """
    # Context managers guarantee both files are closed (and the output is
    # flushed) even if reading, conversion or writing raises midway.
    with codecs.open(
        raw_corpus_file_name, encoding=Config.encoding, errors="replace"
    ) as raw_corpus_file:
        with codecs.open(
            result_file_name, "w", encoding=Config.encoding
        ) as result_file:
            csv_raw_corpus = csv.DictReader(raw_corpus_file)
            for index, line in enumerate(csv_raw_corpus):
                # Progress report every 100000 rows.
                if index % 100000 == 0:
                    print(raw_corpus_file_name, index)

                question = line["question"]
                answer = line["answer"]

                # DictReader fills the missing fields of a short row with
                # None; such a row has no usable pair, and the substring
                # test below would raise TypeError on it.
                if question is None or answer is None:
                    continue

                # Drop pairs that carry the "no data" placeholder.
                if "沒有資料" in question or "沒有資料" in answer:
                    continue

                question = tradition2simple(question)
                answer = tradition2simple(answer)

                result_file.write("\t".join([question, answer]) + "\n")
| 47 | |
| 48 | |
| 49 | def ptt_process_pipeline(): |
no test coverage detected