Convert a list of strings (words) to IDs. Parameters: data (list of str or bytes) - the context in list format; word_to_id (dict) - a dictionary that maps each word to its ID; unk_key (str) - the key that represents unknown words. Returns: list of int - the IDs that represent the context.
(data=None, word_to_id=None, unk_key='UNK')
| 740 | |
| 741 | |
def words_to_word_ids(data=None, word_to_id=None, unk_key='UNK'):
    """Convert a list of strings (words) to IDs.

    Words that are missing from ``word_to_id`` (or that map to ``None``)
    are replaced by the ID stored under ``unk_key``.

    Parameters
    ----------
    data : list of str or bytes
        The context in list format.
    word_to_id : dict
        A dictionary that maps each word to its ID.
    unk_key : str
        The key in ``word_to_id`` that represents unknown words.

    Returns
    -------
    list of int
        A list of IDs to represent the context.

    Raises
    ------
    Exception
        If ``data`` or ``word_to_id`` is None.
    KeyError
        If an unknown word is encountered and ``unk_key`` is not a key of
        ``word_to_id``.

    Examples
    --------
    >>> words = tl.files.load_matt_mahoney_text8_dataset()
    >>> vocabulary_size = 50000
    >>> data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size, True)
    >>> context = [b'hello', b'how', b'are', b'you']
    >>> ids = tl.nlp.words_to_word_ids(context, dictionary)
    >>> context = tl.nlp.word_ids_to_words(ids, reverse_dictionary)
    >>> print(ids)
    [6434, 311, 26, 207]
    >>> print(context)
    [b'hello', b'how', b'are', b'you']

    References
    ----------
    - `tensorflow.models.rnn.ptb.reader <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/models/rnn/ptb>`__

    """
    if data is None:
        raise Exception("data : list of string or byte")
    if word_to_id is None:
        raise Exception("word_to_id : a dictionary")

    word_ids = []
    for word in data:
        # Single lookup per word; ``.get`` returns None for unseen words.
        word_id = word_to_id.get(word)
        if word_id is None:
            # The unknown-word ID is resolved lazily, so a vocabulary without
            # ``unk_key`` still works as long as every word is known.
            word_id = word_to_id[unk_key]
        word_ids.append(word_id)
    return word_ids
| 797 | |
| 798 | # if isinstance(data[0], str): |
| 799 | # # tl.logging.info('is a string object') |
nothing calls this directly
no test coverage detected
searching dependent graphs…