( fileUrl: string, filename: string, mimeType: string, userId?: string, workspaceId?: string | null )
| 275 | } |
| 276 | |
/**
 * Picks a parsing backend for a document and delegates to it.
 *
 * Selection order:
 *  1. PDF and all three `OCR_AZURE_*` env values are set -> Azure Mistral OCR.
 *  2. PDF and `getMistralApiKey(workspaceId)` yields a key -> Mistral OCR.
 *  3. Anything else (non-PDF, or no OCR backend available) -> file parser.
 *
 * @param fileUrl - Location of the file; forwarded unchanged to the chosen parser.
 * @param filename - Used in the log line and forwarded to the chosen parser.
 * @param mimeType - Only `'application/pdf'` is eligible for the OCR backends.
 * @param userId - Forwarded to the chosen parser.
 * @param workspaceId - Used to resolve the Mistral API key and forwarded to the
 *   Mistral OCR parser.
 * @returns Whatever the selected parser resolves to.
 */
async function parseDocument(
  fileUrl: string,
  filename: string,
  mimeType: string,
  userId?: string,
  workspaceId?: string | null
): Promise<{
  content: string
  processingMethod: 'file-parser' | 'mistral-ocr'
  cloudUrl?: string
  metadata?: FileParseMetadata
}> {
  const isPDF = mimeType === 'application/pdf'

  if (isPDF) {
    // Azure is only usable when key, endpoint and model name are all present.
    const hasAzureMistralOCR = Boolean(
      env.OCR_AZURE_API_KEY && env.OCR_AZURE_ENDPOINT && env.OCR_AZURE_MODEL_NAME
    )

    // Azure takes precedence over the direct Mistral integration.
    if (hasAzureMistralOCR) {
      logger.info(`Using Azure Mistral OCR: ${filename}`)
      return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId)
    }

    // The key lookup is awaited only on this path: its result cannot change the
    // outcome for non-PDF files or when Azure OCR has already been selected.
    const mistralApiKey = await getMistralApiKey(workspaceId)
    if (mistralApiKey) {
      logger.info(`Using Mistral OCR: ${filename}`)
      return parseWithMistralOCR(fileUrl, filename, mimeType, userId, workspaceId, mistralApiKey)
    }
  }

  // Fallback for non-PDF documents and for PDFs with no OCR backend configured.
  logger.info(`Using file parser: ${filename}`)
  return parseWithFileParser(fileUrl, filename, mimeType, userId)
}
| 311 | |
| 312 | async function handleFileForOCR( |
| 313 | fileUrl: string, |
no test coverage detected