Another way to do this is to keep the file handle open, but an error will occur when the DataLoader's num_workers is greater than 1. Args: config:
(self, config, json_files, generate_dict=False,
mode=ModeType.EVAL)
# One billion, written as a power of ten.
BIG_VALUE = 10 ** 9
| 56 | |
def __init__(self, config, json_files, generate_dict=False,
             mode=ModeType.EVAL):
    """Index every line of ``json_files`` and optionally build the dicts.

    Only the start offset of each line is recorded (as
    ``[file_index, offset]`` in ``self.sample_index``); no file handle is
    kept. Another way to do this is to keep the file handle open, but an
    error will occur when the DataLoader's num_worker is bigger than 1.

    Args:
        config: Configuration object. It is passed to ``Logger`` and its
            ``data`` section is read for ``train_json_files``,
            ``validate_json_files``, ``test_json_files`` and the
            ``generate_dict_using_*`` flags.
        json_files: Paths of the files to index; every line counts as one
            sample.
        generate_dict: When true, build the dicts from the train json
            files and/or the pretrained embedding, as selected by the
            ``generate_dict_using_*`` flags in ``config.data``.
        mode: Stored unchanged as ``self.model_mode``.
    """
    self.config = config
    self.logger = Logger(config)
    self._init_dict()
    self.sample_index = []
    self.sample_size = 0
    self.model_mode = mode
    self.hierarchy_classes = []

    self.files = json_files
    for file_idx, json_file in enumerate(json_files):
        with open(json_file) as fin:
            # readline() (not iteration) is used so that tell() stays
            # usable; each entry is the offset where a line starts.
            line_start = 0
            while fin.readline():
                self.sample_index.append([file_idx, line_start])
                self.sample_size += 1
                line_start = fin.tell()

    def _insert_vocab(files, _mode=InsertVocabMode.ALL):
        """Feed every json line of ``files`` to ``self._insert_vocab``."""
        for _json_file in files:
            with open(_json_file) as _fin:
                for _json_str in _fin:
                    try:
                        # Use the helper's own ``_mode``; the enclosing
                        # ``mode`` variable would ignore the mode the
                        # caller asked for.
                        self._insert_vocab(json.loads(_json_str), _mode)
                    except Exception:
                        # Best effort: show the offending line and go on.
                        print(_json_str)

    # Dict can be generated using:
    # json files or/and pretrained embedding
    if generate_dict:
        # Use train json files to generate dict
        # If generate_dict_using_json_files is true, then all vocab in train
        # will be used, else only part vocab will be used. e.g. label
        vocab_json_files = config.data.train_json_files
        vocab_mode = InsertVocabMode.LABEL
        if self.config.data.generate_dict_using_json_files:
            vocab_mode = InsertVocabMode.ALL
            self.logger.info("Use dataset to generate dict.")
        _insert_vocab(vocab_json_files, vocab_mode)

        if self.config.data.generate_dict_using_all_json_files:
            # Build a new list instead of ``+=`` so that the train file
            # list held by the config is not extended in place.
            all_json_files = (list(vocab_json_files)
                              + list(self.config.data.validate_json_files)
                              + list(self.config.data.test_json_files))
            _insert_vocab(all_json_files, InsertVocabMode.OTHER)

        if self.config.data.generate_dict_using_pretrained_embedding:
            self.logger.info("Use pretrained embedding to generate dict.")
            self._load_pretrained_dict()
nothing calls this directly
no test coverage detected