* Parse a single knowledge file's metadata without loading its full content. We still read the file once to extract the heading and front-matter — these files are tiny, so it's cheap. Full content is loaded later only for selected files.
(filePath, rootDir)
| 105 | * tiny so it's cheap. Full content is loaded later only for selected files. |
| 106 | */ |
| 107 | _parseEntry(filePath, rootDir) { |
| 108 | let stat; |
| 109 | try { stat = fs.statSync(filePath); } catch { return null; } |
| 110 | // Hard cap on file size — we don't want a rogue 50MB file blowing up memory |
| 111 | if (stat.size > 100 * 1024) return null; |
| 112 | |
| 113 | let head; |
| 114 | try { head = fs.readFileSync(filePath, 'utf-8'); } |
| 115 | catch { return null; } |
| 116 | |
| 117 | const rel = path.relative(rootDir, filePath); |
| 118 | const name = path.basename(filePath, path.extname(filePath)); |
| 119 | // Slug components from path become keywords (binary-search.md → ["binary","search"]) |
| 120 | const pathTokens = rel.toLowerCase().split(/[\\/_\-.\s]+/).filter(Boolean); |
| 121 | |
| 122 | // Optional YAML-ish front-matter: --- ... --- |
| 123 | let keywords = []; |
| 124 | let heading = ''; |
| 125 | let body = head; |
| 126 | const fmMatch = head.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/); |
| 127 | if (fmMatch) { |
| 128 | const fm = fmMatch[1]; |
| 129 | body = fmMatch[2]; |
| 130 | const kwMatch = fm.match(/keywords?:\s*(.+)/i); |
| 131 | if (kwMatch) { |
| 132 | keywords = kwMatch[1] |
| 133 | .replace(/^\[|\]$/g, '') |
| 134 | .split(/[,\s]+/) |
| 135 | .map(s => s.trim().toLowerCase().replace(/^['"]|['"]$/g, '')) |
| 136 | .filter(Boolean); |
| 137 | } |
| 138 | } |
| 139 | const hMatch = body.match(/^#\s+(.+)/m); |
| 140 | if (hMatch) heading = hMatch[1].trim(); |
| 141 | |
| 142 | // Combine all keyword sources |
| 143 | const allKeywords = new Set([ |
| 144 | ...pathTokens, |
| 145 | ...keywords, |
| 146 | ...heading.toLowerCase().split(/\W+/).filter(t => t.length > 2), |
| 147 | name.toLowerCase(), |
| 148 | ]); |
| 149 | |
| 150 | return { |
| 151 | path: filePath, |
| 152 | relPath: rel, |
| 153 | name, |
| 154 | heading, |
| 155 | keywords: [...allKeywords], |
| 156 | size: stat.size, |
| 157 | _bodyCache: body, // already-read; keep for tiny files |
| 158 | }; |
| 159 | } |
| 160 | |
| 161 | /** |
| 162 | * Tokenize a query into normalized lowercase words ≥ 3 chars. |