( fileUrl: string, filename: string, mimeType: string, chunkSize = 1024, chunkOverlap = 200, minCharactersPerChunk = 100, userId?: string, workspaceId?: string | null, strategy?: ChunkingStrategy, strategyOptions?: StrategyOptions )
| 168 | } |
| 169 | |
| 170 | export async function processDocument( |
| 171 | fileUrl: string, |
| 172 | filename: string, |
| 173 | mimeType: string, |
| 174 | chunkSize = 1024, |
| 175 | chunkOverlap = 200, |
| 176 | minCharactersPerChunk = 100, |
| 177 | userId?: string, |
| 178 | workspaceId?: string | null, |
| 179 | strategy?: ChunkingStrategy, |
| 180 | strategyOptions?: StrategyOptions |
| 181 | ): Promise<{ |
| 182 | chunks: Chunk[] |
| 183 | metadata: { |
| 184 | filename: string |
| 185 | fileSize: number |
| 186 | mimeType: string |
| 187 | chunkCount: number |
| 188 | tokenCount: number |
| 189 | characterCount: number |
| 190 | processingMethod: 'file-parser' | 'mistral-ocr' |
| 191 | cloudUrl?: string |
| 192 | } |
| 193 | }> { |
| 194 | logger.info(`Processing document: ${filename}`) |
| 195 | |
| 196 | try { |
| 197 | const parseResult = await parseDocument(fileUrl, filename, mimeType, userId, workspaceId) |
| 198 | const { content, processingMethod } = parseResult |
| 199 | const cloudUrl = 'cloudUrl' in parseResult ? parseResult.cloudUrl : undefined |
| 200 | |
| 201 | let chunks: Chunk[] |
| 202 | const metadata: FileParseMetadata = parseResult.metadata ?? {} |
| 203 | |
| 204 | if (strategy && strategy !== 'auto') { |
| 205 | logger.info(`Using explicit chunking strategy: ${strategy}`) |
| 206 | chunks = await applyStrategy( |
| 207 | strategy, |
| 208 | content, |
| 209 | chunkSize, |
| 210 | chunkOverlap, |
| 211 | minCharactersPerChunk, |
| 212 | strategyOptions |
| 213 | ) |
| 214 | } else { |
| 215 | const isJsonYaml = |
| 216 | metadata.type === 'json' || |
| 217 | metadata.type === 'yaml' || |
| 218 | mimeType.includes('json') || |
| 219 | mimeType.includes('yaml') |
| 220 | |
| 221 | if (isJsonYaml && JsonYamlChunker.isStructuredData(content)) { |
| 222 | logger.info('Using JSON/YAML chunker for structured data') |
| 223 | chunks = await JsonYamlChunker.chunkJsonYaml(content, { |
| 224 | chunkSize, |
| 225 | minCharactersPerChunk, |
| 226 | }) |
| 227 | } else if (StructuredDataChunker.isStructuredData(content, mimeType)) { |
No test coverage detected.