Method _parse

pattern/web/__init__.py:3334–3364 · view source on GitHub ↗

(self, data, format=None)

Source from the content-addressed store, hash-verified

3332	return self.content
3333
def _parse(self, data, format=None):
    """ Extracts the text from the given PDF and returns it as a string.
        The given data is either the path of an existing PDF file
        or a string with the raw contents of a PDF file.
        With format="html", HTMLConverter is used instead of TextConverter.
        Raises PDFParseError if the conversion fails.
    """
    # The output will be ugly: it may be useful for mining but probably not for displaying.
    # You can also try PDF(data, format="html") to preserve some layout information.
    from pdf.pdfinterp import PDFResourceManager, process_pdf
    from pdf.converter import TextConverter, HTMLConverter
    from pdf.layout import LAParams
    m = PDFResourceManager()
    try:
        # Given data is a PDF file path?
        is_path = os.path.exists(data)
    except (TypeError, ValueError):
        # Given data is a PDF string (e.g., it contains NUL bytes,
        # which os.path.exists() rejects with an exception).
        is_path = False
    if is_path:
        # PDF is a binary format: read it in binary mode.
        source = open(data, "rb")
    else:
        source = StringIO.StringIO(data)
    try:
        stream = StringIO.StringIO()
        parser = HTMLConverter if format == "html" else TextConverter
        parser = parser(m, stream, codec="utf-8", laparams=LAParams())
        process_pdf(m, parser, source, set(), maxpages=0, password="")
    except Exception as e:
        raise PDFParseError(str(e))
    finally:
        # Always release the file handle (or in-memory buffer), even on failure.
        source.close()
    s = stream.getvalue()
    s = decode_utf8(s)
    s = s.strip()
    s = re.sub(r"([a-z])\-\n", "\\1", s) # Join hyphenated words.
    s = s.replace("\n\n", "<!-- paragraph -->") # Preserve paragraph spacing.
    s = s.replace("\n", " ")
    s = s.replace("<!-- paragraph -->", "\n\n")
    s = collapse_spaces(s)
    return s

__init__Method · 0.95

PDFResourceManagerClass · 0.90

LAParamsClass · 0.90

process_pdfFunction · 0.90

strFunction · 0.85

decode_utf8Function · 0.85

collapse_spacesFunction · 0.85

existsMethod · 0.80

stripMethod · 0.80

no test coverage detected