(*xs)
# Pair of dtypes: NumPy-style ``object`` and the Python-storage pandas string
# dtype.
# NOTE(review): presumably the dtypes whose cells hold Python object
# references and therefore need sampling-based size estimation -- confirm
# against the callers of this constant.
OBJECT_DTYPES = (
    object,
    pd.StringDtype("python"),
)
| 149 | |
def object_size(*xs):
    """Estimate the combined size, in bytes, of the objects referenced by ``xs``.

    Every non-empty input is sampled 100 times with replacement. The sampled
    references are deduplicated by object identity, measured with ``sizeof``,
    and the measured total is then either extrapolated to all cells or
    returned as is, depending on how many of the samples were duplicates.

    Parameters
    ----------
    *xs
        Sized, one-dimensional collections of object references that
        ``np.random.choice`` can sample from. Empty inputs are allowed and
        are ignored when sampling.

    Returns
    -------
    int
        ``0`` when no inputs are given or all of them are empty; otherwise
        the estimated number of bytes.

    Notes
    -----
    The estimate is randomized: samples are drawn through ``np.random.choice``
    and therefore consume NumPy's global random state.
    """
    if not xs:
        return 0
    ncells = sum(len(x) for x in xs)
    if not ncells:
        return 0

    samples_per_input = 100

    # np.random.choice raises ValueError when asked to draw from an empty
    # input, so only inputs holding at least one reference are sampled.
    # ncells > 0 guarantees that at least one such input exists.
    non_empty = [x for x in xs if len(x)]

    # Deduplicate Series of references to the same objects,
    # e.g. as produced by read_parquet
    unique_samples = {}
    for x in non_empty:
        sample = np.random.choice(x, size=samples_per_input, replace=True)
        for i in sample.tolist():
            unique_samples[id(i)] = i

    nsamples = samples_per_input * len(non_empty)
    sample_nbytes = sum(sizeof(i) for i in unique_samples.values())
    if len(unique_samples) / nsamples > 0.5:
        # Less than half of the references are duplicated.
        # Assume that, if we were to analyze twice the amount of random
        # references, we would get twice the amount of unique objects too.
        return int(sample_nbytes * ncells / nsamples)
    else:
        # Assume we've already found all unique objects and that all
        # references that we have not yet analyzed are going to point to the
        # same data.
        return sample_nbytes
| 176 | |
| 177 | @sizeof.register(pd.DataFrame) |
| 178 | def sizeof_pandas_dataframe(df): |
no test coverage detected
searching dependent graphs…