| 366 | return table |
| 367 | |
| 368 | def _encode_sparse(self, examples): |
| 369 | N = len(examples) |
| 370 | idxs, data = [], [] |
| 371 | |
| 372 | for row, feat_dict in enumerate(examples): |
| 373 | for f_id, val in feat_dict.items(): |
| 374 | if isinstance(f_id, str): |
| 375 | f_id = f_id.encode("utf-8") |
| 376 | |
| 377 | # use json module to convert the feature id into a unique |
| 378 | # string compatible with the buffer API (required by hashlib) |
| 379 | if isinstance(f_id, (tuple, dict, list)): |
| 380 | f_id = json.dumps(f_id, sort_keys=True).encode("utf-8") |
| 381 | |
| 382 | h = int(self.hash(f_id).hexdigest(), base=16) |
| 383 | col = h % self.n_dim |
| 384 | idxs.append((row, col)) |
| 385 | data.append(np.sign(h) * val) |
| 386 | |
| 387 | table = csr_matrix((data, zip(*idxs)), shape=(N, self.n_dim)) |
| 388 | return table |