applyContentPatterns samples file content and matches regex patterns.
(lines []string, cats map[string]struct{})
| 201 | |
| 202 | // applyContentPatterns samples file content and matches regex patterns. |
| 203 | func (c *Classifier) applyContentPatterns(lines []string, cats map[string]struct{}) { |
| 204 | // Count pattern matches per category |
| 205 | counts := make(map[string]int) |
| 206 | for _, line := range lines { |
| 207 | for _, cp := range c.contentREs { |
| 208 | if cp.re.MatchString(line) { |
| 209 | if canonical, ok := c.taxonomy.Lookup(cp.category); ok { |
| 210 | counts[canonical]++ |
| 211 | } |
| 212 | } |
| 213 | } |
| 214 | } |
| 215 | |
| 216 | // A category needs at least 10% of sampled lines matching to be assigned |
| 217 | threshold := len(lines) / 10 |
| 218 | if threshold < 3 { |
| 219 | threshold = 3 |
| 220 | } |
| 221 | for cat, count := range counts { |
| 222 | if count >= threshold { |
| 223 | cats[cat] = struct{}{} |
| 224 | } |
| 225 | } |
| 226 | } |
| 227 | |
| 228 | // applySourceTags uses source-level tags as a weak classification signal. |
| 229 | func (c *Classifier) applySourceTags(sourceName string, cats map[string]struct{}) { |