Convert this dataset into a dask.dataframe.DataFrame. The dimensions, coordinates and data variables in this dataset form the columns of the DataFrame. Parameters ---------- dim_order : list, optional Hierarchical dimension order for the resulting dataframe.
(
self, dim_order: Sequence[Hashable] | None = None, set_index: bool = False
)
| 7485 | return obj[dataframe.columns] if len(dataframe.columns) else obj |
| 7486 | |
| 7487 | def to_dask_dataframe( |
| 7488 | self, dim_order: Sequence[Hashable] | None = None, set_index: bool = False |
| 7489 | ) -> DaskDataFrame: |
| 7490 | """ |
| 7491 | Convert this dataset into a dask.dataframe.DataFrame. |
| 7492 | |
| 7493 | The dimensions, coordinates and data variables in this dataset form |
| 7494 | the columns of the DataFrame. |
| 7495 | |
| 7496 | Parameters |
| 7497 | ---------- |
| 7498 | dim_order : list, optional |
| 7499 | Hierarchical dimension order for the resulting dataframe. All |
| 7500 | arrays are transposed to this order and then written out as flat |
| 7501 | vectors in contiguous order, so the last dimension in this list |
| 7502 | will be contiguous in the resulting DataFrame. This has a major |
| 7503 | influence on which operations are efficient on the resulting dask |
| 7504 | dataframe. |
| 7505 | |
| 7506 | If provided, must include all dimensions of this dataset. By |
| 7507 | default, dimensions are sorted alphabetically. |
| 7508 | set_index : bool, default: False |
| 7509 | If set_index=True, the dask DataFrame is indexed by this dataset's |
| 7510 | coordinate. Since dask DataFrames do not support multi-indexes, |
| 7511 | set_index only works if the dataset only contains one dimension. |
| 7512 | |
| 7513 | Returns |
| 7514 | ------- |
| 7515 | dask.dataframe.DataFrame |
| 7516 | """ |
| 7517 | |
| 7518 | import dask.array as da |
| 7519 | import dask.dataframe as dd |
| 7520 | |
| 7521 | ordered_dims = self._normalize_dim_order(dim_order=dim_order) |
| 7522 | |
| 7523 | columns = list(ordered_dims) |
| 7524 | columns.extend(k for k in self.coords if k not in self.dims) |
| 7525 | columns.extend(self.data_vars) |
| 7526 | |
| 7527 | ds_chunks = self.chunks |
| 7528 | |
| 7529 | series_list = [] |
| 7530 | df_meta = pd.DataFrame() |
| 7531 | for name in columns: |
| 7532 | try: |
| 7533 | var = self.variables[name] |
| 7534 | except KeyError: |
| 7535 | # dimension without a matching coordinate |
| 7536 | size = self.sizes[name] |
| 7537 | data = da.arange(size, chunks=size, dtype=np.int64) |
| 7538 | var = Variable((name,), data) |
| 7539 | |
| 7540 | # IndexVariable objects have a dummy .chunk() method |
| 7541 | if isinstance(var, IndexVariable): |
| 7542 | var = var.to_base_variable() |
| 7543 | |
| 7544 | # Make sure var is a dask array, otherwise the array can become too large |