Tokenize data file and turn into token-ids using given vocabulary file. This function loads data line-by-line from data_path, calls the above sentence_to_token_ids, and saves the result to target_path. See comment for sentence_to_token_ids on the details of token-ids format. Parame
(
data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True, UNK_ID=3,
_DIGIT_RE=re.compile(br"\d")
)
| 1050 | |
| 1051 | |
def data_to_token_ids(
    data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True, UNK_ID=3,
    _DIGIT_RE=re.compile(br"\d")
):
    """Convert a one-sentence-per-line data file into a file of token-ids.

    Each line of ``data_path`` is read as bytes, passed to
    ``sentence_to_token_ids`` together with the vocabulary loaded from
    ``vocabulary_path``, and the resulting ids are written to ``target_path``
    as one space-separated line per input line. If ``target_path`` already
    exists nothing is tokenized; only a log message is emitted. See the
    documentation of ``sentence_to_token_ids`` for the token-ids format.

    Parameters
    -----------
    data_path : str
        Path to the data file in one-sentence-per-line format.
    target_path : str
        Path where the file with token-ids will be created.
    vocabulary_path : str
        Path to the vocabulary file.
    tokenizer : function
        A function to use to tokenize each sentence. If None, ``basic_tokenizer`` will be used.
    normalize_digits : boolean
        If true, all digits are replaced by 0.
    UNK_ID : int
        Forwarded unchanged to ``sentence_to_token_ids`` (default 3).
    _DIGIT_RE : compiled bytes regular expression
        Forwarded unchanged to ``sentence_to_token_ids``; the default matches a single digit.

    References
    ----------
    - Code from ``/tensorflow/models/rnn/translation/data_utils.py``

    """
    # Guard clause: an existing target file is never regenerated.
    if gfile.Exists(target_path):
        tl.logging.info("Target path %s exists" % target_path)
        return

    tl.logging.info("Tokenizing data in %s" % data_path)
    vocab_map, _unused = initialize_vocabulary(vocabulary_path)

    # The input is opened in binary mode, so every sentence handed to the tokenizer is bytes.
    with gfile.GFile(data_path, mode="rb") as source, gfile.GFile(target_path, mode="w") as sink:
        for line_number, sentence in enumerate(source, start=1):
            # Periodic progress report every 100000 lines.
            if line_number % 100000 == 0:
                tl.logging.info(" tokenizing line %d" % line_number)
            ids = sentence_to_token_ids(
                sentence, vocab_map, tokenizer, normalize_digits, UNK_ID=UNK_ID, _DIGIT_RE=_DIGIT_RE
            )
            sink.write(" ".join(map(str, ids)) + "\n")
| 1096 | |
| 1097 | |
| 1098 | def moses_multi_bleu(hypotheses, references, lowercase=False): |
nothing calls this directly
no test coverage detected
searching dependent graphs…