(self, data, format=None)
| 3332 | return self.content |
| 3333 | |
def _parse(self, data, format=None):
    """Extract the text content of a PDF and return it as a string.

    The output will be ugly: it may be useful for mining but probably not
    for displaying. Pass format="html" to use the HTML converter instead of
    the plain text converter, which preserves some layout information.

    data   -- either the path of an existing PDF file, or the raw PDF data.
    format -- "html" selects HTMLConverter; anything else TextConverter.

    Returns the converted text, stripped, with words hyphenated across a
    line break joined, single newlines replaced by spaces and blank-line
    paragraph breaks kept as "\\n\\n". The result is passed through the
    decode_utf8() and collapse_spaces() helpers defined elsewhere.

    Raises PDFParseError (carrying the original message) if the conversion
    fails for any reason.
    """
    from pdf.pdfinterp import PDFResourceManager, process_pdf
    from pdf.converter import TextConverter, HTMLConverter
    from pdf.layout import LAParams
    manager = PDFResourceManager()
    try:
        is_path = os.path.exists(data)
    except (TypeError, ValueError):
        # Raw PDF data (e.g., with embedded NULL bytes) is not a valid path.
        is_path = False
    if is_path:
        # Given data is a PDF file path.
        # PDF is a binary format: text mode can corrupt it on some platforms.
        source = open(data, "rb")
    else:
        # Given data is a PDF string.
        source = StringIO.StringIO(data)
    try:
        stream = StringIO.StringIO()
        converter = HTMLConverter if format == "html" else TextConverter
        converter = converter(manager, stream, codec="utf-8", laparams=LAParams())
        process_pdf(manager, converter, source, set(), maxpages=0, password="")
    except Exception as e:
        raise PDFParseError(str(e))
    finally:
        # Always release the file handle (or in-memory buffer).
        source.close()
    s = stream.getvalue()
    s = decode_utf8(s)
    s = s.strip()
    s = re.sub(r"([a-z])\-\n", "\\1", s) # Join hyphenated words.
    s = s.replace("\n\n", "<!-- paragraph -->") # Preserve paragraph spacing.
    s = s.replace("\n", " ")
    s = s.replace("<!-- paragraph -->", "\n\n")
    s = collapse_spaces(s)
    return s
no test coverage detected