| 104 | * each column. |
| 105 | */ |
| 106 | export async function computeDatasetStatistics( |
| 107 | dataset: Dataset<TabularRecord>, sampleSize?: number, |
| 108 | shuffleWindowSize?: number): Promise<DatasetStatistics> { |
| 109 | let sampleDataset = dataset; |
| 110 | // TODO(soergel): allow for deep shuffle where possible. |
| 111 | if (shuffleWindowSize != null) { |
| 112 | sampleDataset = sampleDataset.shuffle(shuffleWindowSize); |
| 113 | } |
| 114 | if (sampleSize != null) { |
| 115 | sampleDataset = sampleDataset.take(sampleSize); |
| 116 | } |
| 117 | |
| 118 | // TODO(soergel): prepare the column objects based on a schema. |
| 119 | const result: DatasetStatistics = {}; |
| 120 | |
| 121 | await sampleDataset.forEachAsync(e => { |
| 122 | for (const key of Object.keys(e)) { |
| 123 | const value = e[key]; |
| 124 | if (typeof (value) === 'string') { |
| 125 | // No statistics for string element. |
| 126 | } else { |
| 127 | let previousMean = 0; |
| 128 | let previousLength = 0; |
| 129 | let previousVariance = 0; |
| 130 | let columnStats: NumericColumnStatistics = result[key]; |
| 131 | if (columnStats == null) { |
| 132 | columnStats = { |
| 133 | min: Number.POSITIVE_INFINITY, |
| 134 | max: Number.NEGATIVE_INFINITY, |
| 135 | mean: 0, |
| 136 | variance: 0, |
| 137 | stddev: 0, |
| 138 | length: 0 |
| 139 | }; |
| 140 | result[key] = columnStats; |
| 141 | } else { |
| 142 | previousMean = columnStats.mean; |
| 143 | previousLength = columnStats.length; |
| 144 | previousVariance = columnStats.variance; |
| 145 | } |
| 146 | let recordMin: number; |
| 147 | let recordMax: number; |
| 148 | |
| 149 | // Calculate accumulated mean and variance following tf.Transform |
| 150 | // implementation |
| 151 | let valueLength = 0; |
| 152 | let valueMean = 0; |
| 153 | let valueVariance = 0; |
| 154 | let combinedLength = 0; |
| 155 | let combinedMean = 0; |
| 156 | let combinedVariance = 0; |
| 157 | |
| 158 | if (value instanceof tf.Tensor) { |
| 159 | recordMin = min(value).dataSync()[0]; |
| 160 | recordMax = max(value).dataSync()[0]; |
| 161 | const valueMoment = tf.moments(value); |
| 162 | valueMean = valueMoment.mean.dataSync()[0]; |
| 163 | valueVariance = valueMoment.variance.dataSync()[0]; |