| 113 | |
| 114 | |
def _init_reader(self):
    """Load the parallel input/target files into ``self.data``.

    ``self.input_file`` and ``self.target_file`` are read line by line in
    lockstep. Every line is UTF-8 decoded, split on single spaces (empty
    tokens are discarded) and truncated to at most ``self.max_len - 1``
    words. Input sequences get ``self.end_token`` appended; target
    sequences are wrapped as ``'<s>' ... self.end_token``. Both word lists
    are then converted with ``encode_text(words, self.vocab_indices)``.

    Side effects:
        ``self.data`` is replaced by a list with one dict per input line,
        holding ``in_seq``, ``in_seq_len``, ``target_seq`` and
        ``target_seq_len``. ``self.data_pos`` is set to ``len(self.data)``.
    """
    def read_words(raw_line):
        # Strip only the line terminator. Blindly cutting the last
        # character would eat a real character from a final line that has
        # no trailing newline, and would leave '\r' glued to the last word
        # of CRLF-terminated files.
        text = raw_line.decode('utf-8').rstrip('\r\n')
        words = [word for word in text.split(' ') if word != '']
        if len(words) >= self.max_len:
            # Leave room for the end token appended by the caller.
            words = words[:self.max_len - 1]
        return words

    self.data = []
    # The context manager closes both files even if decoding or encoding
    # raises part-way through.
    with open(self.input_file, 'rb') as input_f, \
            open(self.target_file, 'rb') as target_f:
        for input_line in input_f:
            input_words = read_words(input_line)
            input_words.append(self.end_token)

            # readline() returns b'' once the target file is exhausted, so
            # a target file shorter than the input file yields targets
            # that contain only the start and end markers.
            target_words = ['<s>'] + read_words(target_f.readline())
            target_words.append(self.end_token)

            in_seq = encode_text(input_words, self.vocab_indices)
            target_seq = encode_text(target_words, self.vocab_indices)
            self.data.append({
                'in_seq': in_seq,
                'in_seq_len': len(in_seq),
                'target_seq': target_seq,
                # NOTE(review): one less than the encoded length --
                # presumably because the consumer pairs target_seq[:-1]
                # with target_seq[1:]; confirm against the caller.
                'target_seq_len': len(target_seq) - 1,
            })
    # NOTE(review): the cursor starts at the end of the data -- presumably
    # so the first read triggers a reset/shuffle; confirm against the
    # method that consumes data_pos.
    self.data_pos = len(self.data)