| 38 | |
| 39 | |
| 40 | def read_data(): |
| 41 | # Vectorize the data. |
| 42 | input_texts = [] |
| 43 | target_texts = [] |
| 44 | input_characters = set() |
| 45 | target_characters = set() |
| 46 | lines = io.open(FLAGS.data_path, 'r', encoding='utf-8').read().split('\n') |
| 47 | for line in lines[: min(FLAGS.num_samples, len(lines) - 1)]: |
| 48 | input_text, target_text, *_ = line.split('\t') |
| 49 | # We use "tab" as the "start sequence" character for the targets, and "\n" |
| 50 | # as "end sequence" character. |
| 51 | target_text = '\t' + target_text + '\n' |
| 52 | input_texts.append(input_text) |
| 53 | target_texts.append(target_text) |
| 54 | for char in input_text: |
| 55 | if char not in input_characters: |
| 56 | input_characters.add(char) |
| 57 | for char in target_text: |
| 58 | if char not in target_characters: |
| 59 | target_characters.add(char) |
| 60 | |
| 61 | input_characters = sorted(list(input_characters)) |
| 62 | target_characters = sorted(list(target_characters)) |
| 63 | num_encoder_tokens = len(input_characters) |
| 64 | num_decoder_tokens = len(target_characters) |
| 65 | max_encoder_seq_length = max([len(txt) for txt in input_texts]) |
| 66 | max_decoder_seq_length = max([len(txt) for txt in target_texts]) |
| 67 | |
| 68 | print('Number of samples:', len(input_texts)) |
| 69 | print('Number of unique input tokens:', num_encoder_tokens) |
| 70 | print('Number of unique output tokens:', num_decoder_tokens) |
| 71 | print('Max sequence length for inputs:', max_encoder_seq_length) |
| 72 | print('Max sequence length for outputs:', max_decoder_seq_length) |
| 73 | |
| 74 | input_token_index = dict( |
| 75 | [(char, i) for i, char in enumerate(input_characters)]) |
| 76 | target_token_index = dict( |
| 77 | [(char, i) for i, char in enumerate(target_characters)]) |
| 78 | |
| 79 | # Save the token indices to file. |
| 80 | metadata_json_path = os.path.join( |
| 81 | FLAGS.artifacts_dir, 'metadata.json') |
| 82 | if not os.path.isdir(os.path.dirname(metadata_json_path)): |
| 83 | os.makedirs(os.path.dirname(metadata_json_path)) |
| 84 | with io.open(metadata_json_path, 'w', encoding='utf-8') as f: |
| 85 | metadata = { |
| 86 | 'input_token_index': input_token_index, |
| 87 | 'target_token_index': target_token_index, |
| 88 | 'max_encoder_seq_length': max_encoder_seq_length, |
| 89 | 'max_decoder_seq_length': max_decoder_seq_length |
| 90 | } |
| 91 | f.write(json.dumps(metadata, ensure_ascii=False)) |
| 92 | print('Saved metadata at: %s' % metadata_json_path) |
| 93 | |
| 94 | encoder_input_data = np.zeros( |
| 95 | (len(input_texts), max_encoder_seq_length, num_encoder_tokens), |
| 96 | dtype='float32') |
| 97 | decoder_input_data = np.zeros( |