Function pdf_to_text

(path, start_page=1, end_page=None)

Source from the content-addressed store, hash-verified

28
29
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the preprocessed text of a page range from a PDF.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.
        start_page: 1-based number of the first page to read. Values
            below 1 are treated as 1.
        end_page: 1-based number of the last page to read (inclusive).
            ``None``, or a value past the end of the document, means
            "through the last page".

    Returns:
        A list with one entry per page read, in page order, each being
        the result of ``preprocess`` applied to that page's plain text.
        The list is empty when the requested range contains no pages.
    """
    doc = fitz.open(path)
    try:
        total_pages = doc.page_count

        if end_page is None or end_page > total_pages:
            end_page = total_pages

        # load_page() interprets negative indices as counting back from
        # the end of the document, so an unclamped start_page < 1 would
        # silently return trailing pages instead of starting at page 1.
        first_index = max(start_page, 1) - 1

        return [
            preprocess(doc.load_page(index).get_text("text"))
            for index in range(first_index, end_page)
        ]
    finally:
        # Release the document even if page loading or preprocessing fails.
        doc.close()
46
47
48	def text_to_chunks(texts, word_length=150, start_page=1):

load_recommender · Function · 0.85

preprocess · Function · 0.85

no test coverage detected