({
key,
context,
signal,
}: CsvPreviewSliceArgs)
| 47 | * read (one past the cap, to detect truncation), so a multi-GB file costs O(rows) of memory. |
| 48 | */ |
| 49 | export async function getCsvPreviewSlice({ |
| 50 | key, |
| 51 | context, |
| 52 | signal, |
| 53 | }: CsvPreviewSliceArgs): Promise<CsvPreviewSlice> { |
| 54 | const source = await downloadFileStream({ key, context }) |
| 55 | const onAbort = () => source.destroy() |
| 56 | signal?.addEventListener('abort', onAbort, { once: true }) |
| 57 | |
| 58 | const reader = source[Symbol.asyncIterator]() |
| 59 | |
| 60 | try { |
| 61 | // Pull chunks until the first newline (or until DELIMITER_SNIFF_MAX_BYTES have been read) so the delimiter can be sniffed before parsing. |
| 62 | // Accumulate the header line incrementally — appending each chunk's decoded text rather than |
| 63 | // re-concatenating the whole buffer each iteration (which would be O(n²) for a header split |
| 64 | // across many small chunks). The delimiter chars (`,` `\t` `;`) and `\n` are ASCII, so a multi-byte |
| 65 | // character split at a chunk boundary can't introduce a false delimiter or a false newline into the decoded text. |
| 66 | const sniffed: Buffer[] = [] |
| 67 | let firstLine = '' |
| 68 | let sniffedBytes = 0 |
| 69 | while (true) { |
| 70 | const { value, done } = await reader.next() |
| 71 | if (done) break |
| 72 | const chunk = Buffer.isBuffer(value) ? value : Buffer.from(value) |
| 73 | sniffed.push(chunk) |
| 74 | sniffedBytes += chunk.length |
| 75 | const text = chunk.toString('utf-8') |
| 76 | const nl = text.indexOf('\n') |
| 77 | if (nl !== -1) { |
| 78 | firstLine += text.slice(0, nl) |
| 79 | break |
| 80 | } |
| 81 | firstLine += text |
| 82 | if (sniffedBytes >= DELIMITER_SNIFF_MAX_BYTES) break |
| 83 | } |
| 84 | |
| 85 | if (sniffed.length === 0) { |
| 86 | return { headers: [], rows: [], truncated: false } |
| 87 | } |
| 88 | |
| 89 | const delimiter = detectDelimiter(firstLine) |
| 90 | const parser = parseCsvStream({ |
| 91 | columns: false, |
| 92 | skip_empty_lines: true, |
| 93 | trim: true, |
| 94 | relax_column_count: true, |
| 95 | relax_quotes: true, |
| 96 | skip_records_with_error: true, |
| 97 | cast: false, |
| 98 | bom: true, |
| 99 | delimiter, |
| 100 | }) |
| 101 | |
| 102 | // Re-feed the sniffed prefix, then drain the rest of the source into the parser. |
| 103 | async function* rejoin() { |
| 104 | for (const chunk of sniffed) yield chunk |
| 105 | while (true) { |
| 106 | const { value, done } = await reader.next() |
no test coverage detected