Computes approximate mode of multivariate hypergeometric. This is an approximation to the mode of the multivariate hypergeometric given by class_counts and n_draws. It shouldn't be off by more than one. It is the most likely outcome of drawing n_draws many samples from the population given by class_counts.
(class_counts, n_draws, rng)
| 1414 | |
| 1415 | |
| 1416 | def _approximate_mode(class_counts, n_draws, rng): |
| 1417 | """Computes approximate mode of multivariate hypergeometric. |
| 1418 | |
| 1419 | This is an approximation to the mode of the multivariate |
| 1420 | hypergeometric given by class_counts and n_draws. |
| 1421 | It shouldn't be off by more than one. |
| 1422 | |
| 1423 | It is the mostly likely outcome of drawing n_draws many |
| 1424 | samples from the population given by class_counts. |
| 1425 | |
| 1426 | Parameters |
| 1427 | ---------- |
| 1428 | class_counts : ndarray of int |
| 1429 | Population per class. |
| 1430 | n_draws : int |
| 1431 | Number of draws (samples to draw) from the overall population. |
| 1432 | rng : random state |
| 1433 | Used to break ties. |
| 1434 | |
| 1435 | Returns |
| 1436 | ------- |
| 1437 | sampled_classes : ndarray of int |
| 1438 | Number of samples drawn from each class. |
| 1439 | np.sum(sampled_classes) == n_draws |
| 1440 | |
| 1441 | Examples |
| 1442 | -------- |
| 1443 | >>> import numpy as np |
| 1444 | >>> from sklearn.utils.extmath import _approximate_mode |
| 1445 | >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0) |
| 1446 | array([2, 1]) |
| 1447 | >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0) |
| 1448 | array([3, 1]) |
| 1449 | >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), |
| 1450 | ... n_draws=2, rng=0) |
| 1451 | array([0, 1, 1, 0]) |
| 1452 | >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), |
| 1453 | ... n_draws=2, rng=42) |
| 1454 | array([1, 1, 0, 0]) |
| 1455 | """ |
| 1456 | rng = check_random_state(rng) |
| 1457 | # this computes a bad approximation to the mode of the |
| 1458 | # multivariate hypergeometric given by class_counts and n_draws |
| 1459 | continuous = class_counts / class_counts.sum() * n_draws |
| 1460 | # floored means we don't overshoot n_samples, but probably undershoot |
| 1461 | floored = np.floor(continuous) |
| 1462 | # we add samples according to how much "left over" probability |
| 1463 | # they had, until we arrive at n_samples |
| 1464 | need_to_add = int(n_draws - floored.sum()) |
| 1465 | if need_to_add > 0: |
| 1466 | remainder = continuous - floored |
| 1467 | values = np.sort(np.unique(remainder))[::-1] |
| 1468 | # add according to remainder, but break ties |
| 1469 | # randomly to avoid biases |
| 1470 | for value in values: |
| 1471 | (inds,) = np.where(remainder == value) |
| 1472 | # if we need_to_add less than what's in inds |
| 1473 | # we draw randomly from them. |
searching dependent graphs…