const logger = createLogger('PdfParser')

/**
 * File parser that extracts plain text from PDF documents via the `unpdf` library.
 */
export class PdfParser implements FileParser {
  /**
   * Reads the file at `filePath` and parses its bytes with {@link PdfParser.parseBuffer}.
   *
   * @param filePath - Path of the PDF file to read.
   * @returns The result produced by `parseBuffer` for the file's contents.
   * @throws Error with message `No file path provided` when `filePath` is falsy;
   *   otherwise rethrows whatever `readFile` or `parseBuffer` fails with.
   */
  async parseFile(filePath: string): Promise<FileParseResult> {
    try {
      logger.info('Starting to parse file:', filePath)

      if (!filePath) {
        throw new Error('No file path provided')
      }

      logger.info('Reading file...')
      const dataBuffer = await readFile(filePath)
      logger.info('File read successfully, size:', dataBuffer.length)

      // Returned without `await`, so a rejection from parseBuffer is not caught
      // (or logged a second time) by the handler below.
      return this.parseBuffer(dataBuffer)
    } catch (error) {
      logger.error('Error reading file:', error)
      throw error
    }
  }

  /**
   * Extracts the text of a PDF held in memory.
   *
   * `unpdf` is loaded with a dynamic import on each call, the document is opened,
   * its text is extracted with `mergePages: true`, and NUL (U+0000) characters are
   * stripped from the result. The document proxy is destroyed before returning,
   * whether extraction succeeded or failed.
   *
   * @param dataBuffer - Raw bytes of the PDF document.
   * @returns `content` (extracted text without NUL characters) and `metadata`
   *   holding `pageCount` and the fixed `source` value `'unpdf'`.
   * @throws Rethrows any error raised while loading `unpdf`, opening the document,
   *   or extracting its text, after logging it.
   */
  async parseBuffer(dataBuffer: Buffer): Promise<FileParseResult> {
    try {
      logger.info('Starting to parse buffer, size:', dataBuffer.length)

      const { extractText, getDocumentProxy } = await import('unpdf')

      // Constructing a Uint8Array from the Buffer copies the bytes, so the
      // caller's buffer is not shared with the PDF library.
      const pdfBytes = new Uint8Array(dataBuffer)

      const pdf = await getDocumentProxy(pdfBytes)

      try {
        const { totalPages, text } = await extractText(pdf, { mergePages: true })

        logger.info('PDF parsed successfully, pages:', totalPages, 'text length:', text.length)

        // Drop NUL characters from the extracted text.
        // NOTE(review): presumably because downstream storage rejects them — confirm.
        const cleanContent = text.replace(/\u0000/g, '')

        return {
          content: cleanContent,
          metadata: {
            pageCount: totalPages,
            source: 'unpdf',
          },
        }
      } finally {
        // Release the document proxy once the text has been extracted; without
        // this every parsed PDF stays alive for the lifetime of the process.
        // A cleanup failure is logged instead of thrown so it can neither fail
        // a successful parse nor mask the original extraction error.
        await pdf.destroy().catch((cleanupError: unknown) => {
          logger.error('Error releasing PDF document:', cleanupError)
        })
      }
    } catch (error) {
      logger.error('Error parsing buffer:', error)
      throw error
    }
  }
}
nothing calls this directly
no outgoing calls
no test coverage detected