Encode example into a format for Arrow. Args: value (`str`, `pathlib.Path`, `bytes`, `bytearray`, `pdfplumber.pdf.PDF` or `dict`): Data passed as input to Pdf feature. Returns: `dict` with "path" and "bytes" fields
(self, value: Union[str, bytes, bytearray, dict, "pdfplumber.pdf.PDF"])
| 76 | return self.pa_type |
| 77 | |
def encode_example(self, value: Union[str, bytes, bytearray, dict, Path, "pdfplumber.pdf.PDF"]) -> dict:
    """Encode example into a format for Arrow.

    Args:
        value (`str`, `pathlib.Path`, `bytes`, `bytearray`, `pdfplumber.pdf.PDF` or `dict`):
            Data passed as input to Pdf feature. A `dict` is expected to
            carry a "path" and/or a "bytes" entry.

    Returns:
        `dict` with "path" and "bytes" fields

    Raises:
        TypeError: If `value` is none of the supported input types.
        ValueError: If `value` is a `dict` whose "path" and "bytes" entries
            are both missing or `None`.
    """
    # pdfplumber is only imported when the config flag reports it as available;
    # otherwise the PDF-object branch below is skipped entirely.
    if config.PDFPLUMBER_AVAILABLE:
        import pdfplumber
    else:
        pdfplumber = None

    if isinstance(value, str):
        return {"path": value, "bytes": None}
    if isinstance(value, Path):
        return {"path": str(value.absolute()), "bytes": None}
    if isinstance(value, (bytes, bytearray)):
        return {"path": None, "bytes": value}
    if pdfplumber is not None and isinstance(value, pdfplumber.pdf.PDF):
        # convert the pdfplumber.pdf.PDF to bytes
        return encode_pdfplumber_pdf(value)

    # Everything left must be dict-like. Fail with a clear message instead of
    # an AttributeError on `.get` when an unsupported object (e.g. None) is given.
    if not callable(getattr(value, "get", None)):
        raise TypeError(
            "A pdf sample should be a str, Path, bytes, bytearray, pdfplumber.pdf.PDF "
            f"or a dict with 'path' and/or 'bytes', but got {type(value).__name__}."
        )

    path = value.get("path")
    if path is not None and os.path.isfile(path):
        # we set "bytes": None to not duplicate the data if they're already available locally
        return {"bytes": None, "path": path}

    pdf_bytes = value.get("bytes")
    if pdf_bytes is not None or path is not None:
        # store the pdf bytes, and path is used to infer the pdf format using the file extension
        return {"bytes": pdf_bytes, "path": path}

    raise ValueError(
        f"A pdf sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
    )
| 112 | |
| 113 | def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf.PDF": |
| 114 | """Decode example pdf file into pdf data. |