MCPcopy Index your code
hub / github.com/deepspeedai/DeepSpeedExamples / lazy_array_loader

Class lazy_array_loader

Megatron-LM/data_utils/lazy_loader.py:92–194  ·  view source on GitHub ↗

Arguments: path: path to the directory where array entries are concatenated into one big string file and where the .len file is located. data_type (str): Some datasets have multiple fields that are stored in different paths. `data_type` specifies which of these fields to load in this class.

Source from the content-addressed store, hash-verified

90 return string
91
92class lazy_array_loader(object):
93 """
94 Arguments:
95 path: path to directory where array entries are concatenated into one big string file
96 and the .len file are located
97 data_type (str): Some datsets have multiple fields that are stored in different paths.
98 `data_type` specifies which of these fields to load in this class
99 mem_map (boolean): Specifies whether to memory map file `path`
100 map_fn (callable): Fetched strings are passed through map_fn before being returned.
101
102 Example of lazy loader directory structure:
103 file.json
104 file.lazy/
105 data_type1
106 data_type1.len.pkl
107 data_type2
108 data_type2.len.pkl
109 """
def __init__(self, path, data_type='data', mem_map=False, map_fn=None):
    """
    Open the concatenated data file for `data_type` and load its entry lengths.

    Arguments:
        path: path passed to `get_lazy_path` to locate the lazy directory
        data_type (str): name of the data file inside the lazy directory; the
            lengths are read from `<data_type>.len.pkl` in the same directory
        mem_map (boolean): when true, `self.file` is a read-only memory map of
            the data file instead of the plain file object
        map_fn (callable): stored as both `self.process_fn` and `self.map_fn`
    """
    lazypath = get_lazy_path(path)
    datapath = os.path.join(lazypath, data_type)
    # get file where array entries are concatenated into one big string;
    # the raw handle is kept on the instance as `_file`
    self._file = open(datapath, 'rb')
    self.file = self._file
    # memory map file if necessary
    self.mem_map = mem_map
    if self.mem_map:
        self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ)
    lenpath = os.path.join(lazypath, data_type + '.len.pkl')
    # Close the lengths file as soon as it has been read instead of leaving
    # the handle for the garbage collector.
    # NOTE(review): unpickling executes arbitrary code from the file; only
    # point this loader at lazy directories you trust.
    with open(lenpath, 'rb') as len_file:
        self.lens = pkl.load(len_file)
    # cumulative end offset of every entry inside the concatenated file
    self.ends = list(accumulate(self.lens))
    # independent copy of the end offsets
    self.dumb_ends = list(self.ends)
    self.read_lock = Lock()
    self.process_fn = map_fn
    self.map_fn = map_fn
    self._tokenizer = None
128
def SetTokenizer(self, tokenizer):
    """
    Install `tokenizer`, or disable tokenization when `tokenizer` is None.

    The stored `_tokenizer` is overwritten only when a real tokenizer is
    given, or when the attribute does not exist yet; passing None otherwise
    leaves the previously stored tokenizer in place. In every case `map_fn`
    is rebuilt as a `ProcessorTokenizer` combining `tokenizer` with
    `process_fn`.
    """
    if tokenizer is not None or not hasattr(self, '_tokenizer'):
        self._tokenizer = tokenizer
    self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn)
140
def GetTokenizer(self):
    """Return the value currently stored in `_tokenizer` (None until one is set)."""
    stored_tokenizer = self._tokenizer
    return stored_tokenizer
143
144 def __getitem__(self, index):
145 """
146 read file and splice strings based on string ending array `self.ends`
147 """
148 if not isinstance(index, slice):
149 if index == 0:

Callers 1

get_dataset_from_pathFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected