Very basic tokenizer: split the sentence into a list of tokens. Parameters ----------- sentence : tensorflow.python.platform.gfile.GFile Object _WORD_SPLIT : regular expression for word splitting. Examples -------- >>> see create_vocabulary >>> from tensorflow.pytho
(sentence, _WORD_SPLIT=re.compile(b"([.,!?\"':;)(])"))
| 869 | |
| 870 | |
def basic_tokenizer(sentence, _WORD_SPLIT=re.compile(b"([.,!?\"':;)(])")):
    """Very basic tokenizer: split the sentence into a list of tokens.

    The input is first passed through ``as_bytes``, stripped, and split on
    whitespace. Each whitespace-separated fragment is then split again with
    ``_WORD_SPLIT``; because the default pattern uses a capturing group, the
    matched punctuation characters are kept as tokens of their own. Empty
    pieces produced by the split are discarded.

    Parameters
    -----------
    sentence : tensorflow.python.platform.gfile.GFile Object
        The text to tokenize. In the example below a single line read from
        a GFile is passed in; whatever ``as_bytes`` accepts should work
        (NOTE(review): presumably ``str`` or ``bytes`` - confirm).
    _WORD_SPLIT : regular expression for word splitting.
        Defaults to a bytes pattern matching one of ``. , ! ? " ' : ; ) (``.

    Returns
    -------
    list
        The non-empty tokens, in order of appearance.

    Examples
    --------
    See also ``create_vocabulary``.

    >>> from tensorflow.python.platform import gfile
    >>> train_path = "wmt/giga-fren.release2"
    >>> with gfile.GFile(train_path + ".en", mode="rb") as f:
    >>>    for line in f:
    >>>       tokens = tl.nlp.basic_tokenizer(line)
    >>>       tl.logging.info(tokens)
    >>>       exit()
    [b'Changing', b'Lives', b'|', b'Changing', b'Society', b'|', b'How',
      b'It', b'Works', b'|', b'Technology', b'Drives', b'Change', b'Home',
      b'|', b'Concepts', b'|', b'Teachers', b'|', b'Search', b'|', b'Overview',
      b'|', b'Credits', b'|', b'HHCC', b'Web', b'|', b'Reference', b'|',
      b'Feedback', b'Virtual', b'Museum', b'of', b'Canada', b'Home', b'Page']

    References
    ----------
    - Code from ``/tensorflow/models/rnn/translation/data_utils.py``

    """
    raw = as_bytes(sentence)
    # Flatten: whitespace fragments -> punctuation-delimited pieces, skipping
    # the empty pieces that re.split yields around adjacent delimiters.
    return [
        piece
        for fragment in raw.strip().split()
        for piece in re.split(_WORD_SPLIT, fragment)
        if piece
    ]
| 906 | |
| 907 | |
| 908 | def create_vocabulary( |
no test coverage detected
searching dependent graphs…