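r"""Create vocabulary file (if it does not exist yet) from data file. Data file is assumed to contain one sentence per line. Each sentence is tokenized and digits are normalized (if normalize_digits is set). Vocabulary contains the most-frequent tokens up to max_vocabulary_size. We write it to vocabulary_path in a one-token-per-line format, so that the token on the first line gets id=0, the token on the second line gets id=1, and so on.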
(
vocabulary_path, data_path, max_vocabulary_size, tokenizer=None, normalize_digits=True,
_DIGIT_RE=re.compile(br"\d"), _START_VOCAB=None
)
| 906 | |
| 907 | |
def create_vocabulary(
    vocabulary_path, data_path, max_vocabulary_size, tokenizer=None, normalize_digits=True,
    _DIGIT_RE=re.compile(br"\d"), _START_VOCAB=None
):
    r"""Create vocabulary file (if it does not exist yet) from data file.

    Data file is assumed to contain one sentence per line. Each sentence is
    tokenized and digits are normalized (if normalize_digits is set).
    Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
    We write it to vocabulary_path in a one-token-per-line format, so that the
    token on the first line gets id=0, the second line gets id=1, and so on.

    If ``vocabulary_path`` already exists, nothing is read or written.

    Parameters
    ----------
    vocabulary_path : str
        Path where the vocabulary will be created.
    data_path : str
        Data file that will be used to create vocabulary. It is opened in
        binary mode, so each line is handed to the tokenizer as bytes.
    max_vocabulary_size : int
        Limit on the size of the created vocabulary. The start tokens are
        counted towards this limit.
    tokenizer : function
        A function to use to tokenize each data sentence. If None, basic_tokenizer will be used.
    normalize_digits : boolean
        If true, all digits are replaced by `0`.
    _DIGIT_RE : regular expression function
        Default is ``re.compile(br"\d")``.
    _START_VOCAB : list of bytes
        The pad, go, eos and unk token, default is ``[b"_PAD", b"_GO", b"_EOS", b"_UNK"]``.
        These are written first, ahead of the tokens found in the data.

    References
    ----------
    - Code from ``/tensorflow/models/rnn/translation/data_utils.py``

    """
    # Function-scope import so the block does not depend on the file's import header.
    from collections import Counter

    if _START_VOCAB is None:
        _START_VOCAB = [b"_PAD", b"_GO", b"_EOS", b"_UNK"]
    if not gfile.Exists(vocabulary_path):
        tl.logging.info("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
        # Pick the tokenizer once instead of re-deciding on every line.
        tokenize = tokenizer if tokenizer else basic_tokenizer
        counts = Counter()
        with gfile.GFile(data_path, mode="rb") as f:
            for line_no, line in enumerate(f, start=1):
                if line_no % 100000 == 0:
                    tl.logging.info(" processing line %d" % line_no)
                tokens = tokenize(line)
                if normalize_digits:
                    # re.sub (rather than _DIGIT_RE.sub) keeps accepting either a
                    # compiled pattern or a raw bytes pattern from the caller.
                    tokens = (re.sub(_DIGIT_RE, b"0", w) for w in tokens)
                counts.update(tokens)
        # Most frequent tokens first, after the reserved start tokens.
        vocab_list = _START_VOCAB + sorted(counts, key=counts.get, reverse=True)
        # Slicing is a no-op when the list is already within the limit.
        vocab_list = vocab_list[:max_vocabulary_size]
        with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
            for w in vocab_list:
                vocab_file.write(w + b"\n")
nothing calls this directly
no test coverage detected
searching dependent graphs…