MCPcopy
hub / github.com/huggingface/datasets / encode_example

Method encode_example

src/datasets/features/pdf.py:78–111  ·  view source on GitHub ↗

Encode an example into a format for Arrow. Args: value (`str`, `pathlib.Path`, `bytes`, `bytearray`, `pdfplumber.pdf.PDF` or `dict`): data passed as input to the Pdf feature. Returns: a `dict` with "path" and "bytes" fields.

(self, value: Union[str, bytes, bytearray, dict, "pdfplumber.pdf.PDF"])

Source from the content-addressed store, hash-verified

76 return self.pa_type
77
def encode_example(self, value: Union[str, Path, bytes, bytearray, dict, "pdfplumber.pdf.PDF"]) -> dict:
    """Encode example into a format for Arrow.

    Args:
        value (`str`, `pathlib.Path`, `bytes`, `bytearray`, `pdfplumber.pdf.PDF` or `dict`):
            Data passed as input to Pdf feature. A `dict` is expected to
            provide a "path" and/or a "bytes" entry.

    Returns:
        `dict` with "path" and "bytes" fields

    Raises:
        TypeError: If `value` is none of the supported input types.
        ValueError: If `value` is dict-like but both its "path" and "bytes"
            entries are missing or None.
    """
    if config.PDFPLUMBER_AVAILABLE:
        import pdfplumber
    else:
        # pdfplumber is optional; without it the PDF-object branch below is skipped.
        pdfplumber = None

    if isinstance(value, str):
        return {"path": value, "bytes": None}
    if isinstance(value, Path):
        return {"path": str(value.absolute()), "bytes": None}
    if isinstance(value, (bytes, bytearray)):
        return {"path": None, "bytes": value}
    if pdfplumber is not None and isinstance(value, pdfplumber.pdf.PDF):
        # convert the pdfplumber.pdf.PDF to bytes
        return encode_pdfplumber_pdf(value)

    # Everything left must be dict-like. Fail with an explicit message instead of
    # an opaque AttributeError when an unsupported object reaches this point.
    if not hasattr(value, "get"):
        raise TypeError(
            "A pdf sample should be a str, Path, bytes, bytearray, pdfplumber.pdf.PDF or dict, "
            f"but got {type(value).__name__}."
        )

    path = value.get("path")
    if path is not None and os.path.isfile(path):
        # we set "bytes": None to not duplicate the data if they're already available locally
        return {"bytes": None, "path": path}

    pdf_bytes = value.get("bytes")
    if pdf_bytes is not None or path is not None:
        # store the pdf bytes, and path is used to infer the pdf format using the file extension
        return {"bytes": pdf_bytes, "path": path}

    raise ValueError(
        f"A pdf sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
    )
112
113 def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf.PDF":
114 """Decode example pdf file into pdf data.

Calls 1

encode_pdfplumber_pdfFunction · 0.85