MCPcopy Index your code
hub / github.com/clips/pattern / _parse

Method _parse

pattern/web/__init__.py:3334–3364  ·  view source on GitHub ↗
(self, data, format=None)

Source from the content-addressed store, hash-verified

3332 return self.content
3333
def _parse(self, data, format=None):
    """ Returns the text extracted from the given PDF as a string.
        The given data is either the path of an existing PDF file or a string with the PDF content.
        With format="html" the HTMLConverter is used instead of the TextConverter.
        Raises PDFParseError (with the original error message) if the conversion fails.
    """
    # The output will be ugly: it may be useful for mining but probably not for displaying.
    # You can also try PDF(data, format="html") to preserve some layout information.
    from pdf.pdfinterp import PDFResourceManager, process_pdf
    from pdf.converter import TextConverter, HTMLConverter
    from pdf.layout import LAParams
    m = PDFResourceManager()
    try:
        if os.path.exists(data):
            # Given data is a PDF file path.
            # PDF is a binary format: open in binary mode so the bytes are not
            # altered by newline translation.
            data = open(data, "rb")
        else:
            # Given data is a PDF string.
            data = StringIO.StringIO(data)
    except TypeError:
        # os.path.exists() rejected the value as a path (the original code
        # relies on this for raw PDF content), so treat it as a PDF string.
        data = StringIO.StringIO(data)
    try:
        stream = StringIO.StringIO()
        if format == "html":
            converter = HTMLConverter
        else:
            converter = TextConverter
        parser = converter(m, stream, codec="utf-8", laparams=LAParams())
        process_pdf(m, parser, data, set(), maxpages=0, password="")
    except Exception as e:
        # Any failure in the PDF library is reported as a PDFParseError.
        raise PDFParseError(str(e))
    finally:
        # Release the file handle (or in-memory buffer) whether or not parsing succeeded.
        data.close()
    s = stream.getvalue()
    s = decode_utf8(s)
    s = s.strip()
    s = re.sub(r"([a-z])\-\n", "\\1", s) # Join hyphenated words.
    # Protect blank lines with a placeholder so that single line breaks can be
    # turned into spaces without losing the paragraph boundaries.
    s = s.replace("\n\n", "<!-- paragraph -->") # Preserve paragraph spacing.
    s = s.replace("\n", " ")
    s = s.replace("<!-- paragraph -->", "\n\n")
    s = collapse_spaces(s)
    return s

Callers 1

__init__Method · 0.95

Calls 8

PDFResourceManagerClass · 0.90
LAParamsClass · 0.90
process_pdfFunction · 0.90
strFunction · 0.85
decode_utf8Function · 0.85
collapse_spacesFunction · 0.85
existsMethod · 0.80
stripMethod · 0.80

Tested by

no test coverage detected