Build TF-IDF index from CVE SQLite database.
(self, cve_db_path: str, interactive: bool = True)
| 115 | logger.info(f"RAG index saved: {self.index_path} ({len(self._cve_meta)} CVEs)") |
| 116 | |
def build(self, cve_db_path: str, interactive: bool = True) -> bool:
    """Build the TF-IDF index from the CVE SQLite database.

    Each CVE row becomes one document made of its description followed by
    its space-joined technologies; rows whose document is empty are
    skipped. On success the fitted vectorizer, the TF-IDF matrix and the
    per-CVE metadata are stored on the instance and the index is marked
    as loaded.

    Args:
        cve_db_path: Path to the CVE SQLite database file.
        interactive: Forwarded to ``_ensure_sklearn``.

    Returns:
        True when an index was built. False when scikit-learn is
        unavailable, the database file does not exist, there are no
        usable documents, or the vectorizer cannot derive a vocabulary
        from the documents.
    """
    if not _ensure_sklearn(interactive=interactive):
        logger.warning("RAG: scikit-learn unavailable; skipping index build")
        return False

    if not Path(cve_db_path).exists():
        logger.warning(f"RAG: CVE DB not found at {cve_db_path}")
        return False

    # Imported here rather than at module level: only reached after
    # _ensure_sklearn() has reported that scikit-learn can be used.
    from sklearn.feature_extraction.text import TfidfVectorizer

    rows = self._load_cve_rows(cve_db_path)
    if not rows:
        logger.info("RAG: CVE table empty; skipping build")
        return False

    # documents[i] and meta[i] always describe the same CVE, so a matrix
    # row index can be mapped straight back to its metadata.
    documents = []
    meta = []
    for row in rows:
        cve_id, description, cvss_score, severity, technologies = row
        tech_str = " ".join(technologies) if technologies else ""
        doc = f"{description or ''} {tech_str}".strip()
        if not doc:
            # Nothing to index for this CVE.
            continue
        documents.append(doc)
        meta.append(
            {
                "cve_id": cve_id,
                "description": description or "",
                "cvss_score": cvss_score or 0.0,
                "severity": severity or "",
                "affected_products": technologies or [],
            }
        )

    if not documents:
        logger.info("RAG: no documents after filter; skipping build")
        return False

    vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        max_features=50000,
        sublinear_tf=True,
    )
    try:
        matrix = vectorizer.fit_transform(documents)
    except ValueError as exc:
        # scikit-learn raises ValueError ("empty vocabulary ...") when no
        # term survives tokenisation and stop-word removal. Report it the
        # same way as every other "nothing to index" case instead of
        # letting the exception escape to the caller.
        logger.warning(f"RAG: TF-IDF fit failed ({exc}); skipping build")
        return False

    # Instance state is only replaced once the fit has succeeded, so a
    # failed build leaves any previously loaded index untouched.
    self._vectorizer = vectorizer
    self._matrix = matrix
    self._cve_meta = meta
    self._loaded = True
    logger.info(f"RAG index built: {len(meta)} CVEs, vocab={len(vectorizer.vocabulary_)}")
    return True
| 171 | |
| 172 | def _load_cve_rows(self, cve_db_path: str) -> List[tuple]: |
| 173 | """Fetch CVE rows + aggregated technologies.""" |