MCPcopy
hub / github.com/Tencent/NeuralNLP-NeuralClassifier / __init__

Method __init__

dataset/dataset.py:57–124  ·  view source on GitHub ↗

Another way to do this is to keep the file handle open, but an error occurs when the DataLoader's num_workers is greater than 1. Args: config:

(self, config, json_files, generate_dict=False,
                 mode=ModeType.EVAL)

Source from the content-addressed store, hash-verified

55 BIG_VALUE = 1000 * 1000 * 1000
56
57 def __init__(self, config, json_files, generate_dict=False,
58 mode=ModeType.EVAL):
59 """
60 Another way to do this is keep the file handler. But when DataLoader's
61 num_worker bigger than 1, error will occur.
62 Args:
63 config:
64 """
65 self.config = config
66 self.logger = Logger(config)
67 self._init_dict()
68 self.sample_index = []
69 self.sample_size = 0
70 self.model_mode = mode
71 self.hierarchy_classes = []
72
73 self.files = json_files
74 for i, json_file in enumerate(json_files):
75 with open(json_file) as fin:
76 self.sample_index.append([i, 0])
77 while True:
78 json_str = fin.readline()
79 if not json_str:
80 self.sample_index.pop()
81 break
82 self.sample_size += 1
83 self.sample_index.append([i, fin.tell()])
84
85 def _insert_vocab(files, _mode=InsertVocabMode.ALL):
86 for _i, _json_file in enumerate(files):
87 with open(_json_file) as _fin:
88 for _json_str in _fin:
89 try:
90 self._insert_vocab(json.loads(_json_str), mode)
91 except:
92 print(_json_str)
93
94 # Dict can be generated using:
95 # json files or/and pretrained embedding
96 if generate_dict:
97 # Use train json files to generate dict
98 # If generate_dict_using_json_files is true, then all vocab in train
99 # will be used, else only part vocab will be used. e.g. label
100 vocab_json_files = config.data.train_json_files
101 mode = InsertVocabMode.LABEL
102 if self.config.data.generate_dict_using_json_files:
103 mode = InsertVocabMode.ALL
104 self.logger.info("Use dataset to generate dict.")
105 _insert_vocab(vocab_json_files, mode)
106
107 if self.config.data.generate_dict_using_all_json_files:
108 vocab_json_files += self.config.data.validate_json_files + \
109 self.config.data.test_json_files
110 _insert_vocab(vocab_json_files, InsertVocabMode.OTHER)
111
112 if self.config.data.generate_dict_using_pretrained_embedding:
113 self.logger.info("Use pretrained embedding to generate dict.")
114 self._load_pretrained_dict()

Callers

nothing calls this directly

Calls 10

_init_dict Method · 0.95
_load_pretrained_dict Method · 0.95
_print_dict_info Method · 0.95
_shrink_dict Method · 0.95
_save_dict Method · 0.95
_clear_dict Method · 0.95
_load_dict Method · 0.95
Logger Class · 0.90
info Method · 0.80

Tested by

no test coverage detected