Arguments: path: path to directory where array entries are concatenated into one big string file and the .len file is located. data_type (str): Some datasets have multiple fields that are stored in different paths. `data_type` specifies which of these fields to load in this class.
| 90 | return string |
| 91 | |
| 92 | class lazy_array_loader(object): |
| 93 | """ |
| 94 | Arguments: |
| 95 | path: path to directory where array entries are concatenated into one big string file |
| 96 | and the .len file is located |
| 97 | data_type (str): Some datasets have multiple fields that are stored in different paths. |
| 98 | `data_type` specifies which of these fields to load in this class |
| 99 | mem_map (boolean): Specifies whether to memory map file `path` |
| 100 | map_fn (callable): Fetched strings are passed through map_fn before being returned. |
| 101 | |
| 102 | Example of lazy loader directory structure: |
| 103 | file.json |
| 104 | file.lazy/ |
| 105 | data_type1 |
| 106 | data_type1.len.pkl |
| 107 | data_type2 |
| 108 | data_type2.len.pkl |
| 109 | """ |
def __init__(self, path, data_type='data', mem_map=False, map_fn=None):
    """
    Open the concatenated data file for `data_type` and load its length index.

    Arguments:
        path: handed to `get_lazy_path` to locate the lazy directory.
        data_type (str): name of the data file inside the lazy directory; the
            entry lengths are read from `<data_type>.len.pkl` next to it.
        mem_map (boolean): if truthy, `self.file` becomes a read-only memory
            map of the data file instead of the plain file object.
        map_fn (callable): stored as both `self.process_fn` and `self.map_fn`.
    """
    lazypath = get_lazy_path(path)
    datapath = os.path.join(lazypath, data_type)

    # File where array entries are concatenated into one big string. The
    # handle is stored on the instance and is not closed here, since it has
    # to outlive the constructor.
    self._file = open(datapath, 'rb')
    self.file = self._file

    # Memory map the file if requested; `self._file` keeps the underlying
    # file object so it stays reachable next to the map.
    # NOTE(review): the `prot=` keyword is only accepted by the Unix version
    # of `mmap.mmap` -- confirm no Windows use is intended.
    self.mem_map = mem_map
    if self.mem_map:
        self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ)

    # Load the per-entry lengths. The handle is closed as soon as the pickle
    # has been read instead of being left for the garbage collector.
    # NOTE: unpickling can execute arbitrary code; only point this loader at
    # lazy directories from a trusted source.
    lenpath = os.path.join(lazypath, data_type+'.len.pkl')
    with open(lenpath, 'rb') as len_file:
        self.lens = pkl.load(len_file)

    # Running sum of the lengths: ends[i] is the end offset of entry i.
    self.ends = list(accumulate(self.lens))
    # Independent copy of the end offsets.
    self.dumb_ends = list(self.ends)
    self.read_lock = Lock()
    # `process_fn` keeps the caller-supplied map_fn; `map_fn` starts out as
    # the same callable.
    self.process_fn = map_fn
    self.map_fn = map_fn
    self._tokenizer = None
| 128 | |
def SetTokenizer(self, tokenizer):
    """
    Install `tokenizer` and rebuild the combined preprocessing/tokenization
    callable.

    A non-None tokenizer is always stored in `self._tokenizer`. Passing None
    only stores None when the instance has no `_tokenizer` attribute yet; an
    already stored tokenizer is left in place. In every case `self.map_fn`
    is rebuilt from the argument exactly as given (including None) together
    with `self.process_fn`.
    """
    # Store the argument unless it is None and a `_tokenizer` attribute
    # already exists.
    if tokenizer is not None or not hasattr(self, '_tokenizer'):
        self._tokenizer = tokenizer
    # Preprocessing and tokenization are combined into a single callable.
    self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn)
| 140 | |
def GetTokenizer(self):
    """Return the object currently held in `self._tokenizer` (may be None)."""
    current = self._tokenizer
    return current
| 143 | |
| 144 | def __getitem__(self, index): |
| 145 | """ |
| 146 | read file and splice strings based on string ending array `self.ends` |
| 147 | """ |
| 148 | if not isinstance(index, slice): |
| 149 | if index == 0: |
no outgoing calls
no test coverage detected