MCPcopy
hub / github.com/zakirkun/deep-eye / build

Method build

modules/cve_intelligence/rag_index.py:117–170  ·  view source on GitHub ↗

Build TF-IDF index from CVE SQLite database.

(self, cve_db_path: str, interactive: bool = True)

Source from the content-addressed store, hash-verified

115 logger.info(f"RAG index saved: {self.index_path} ({len(self._cve_meta)} CVEs)")
116
def build(self, cve_db_path: str, interactive: bool = True) -> bool:
    """Build the TF-IDF index from a CVE SQLite database.

    Each row returned by ``_load_cve_rows`` becomes one document made of
    the CVE description followed by its technologies. Rows that produce an
    empty document are skipped. On success the fitted vectorizer, the
    document-term matrix and the per-CVE metadata are stored on the
    instance and the index is marked as loaded.

    Args:
        cve_db_path: Path to the CVE SQLite database file.
        interactive: Forwarded unchanged to ``_ensure_sklearn``.

    Returns:
        True when the index was built. False when scikit-learn is
        unavailable, the database file does not exist, no rows were
        loaded, every row produced an empty document, or the vectorizer
        could not derive a vocabulary from the documents. Instance state
        is left untouched whenever False is returned.
    """
    if not _ensure_sklearn(interactive=interactive):
        logger.warning("RAG: scikit-learn unavailable; skipping index build")
        return False

    if not Path(cve_db_path).exists():
        logger.warning(f"RAG: CVE DB not found at {cve_db_path}")
        return False

    # Imported only after the availability check above has passed.
    from sklearn.feature_extraction.text import TfidfVectorizer

    rows = self._load_cve_rows(cve_db_path)
    if not rows:
        logger.info("RAG: CVE table empty; skipping build")
        return False

    # `documents` and `meta` are appended in lockstep so that row i of the
    # matrix corresponds to meta[i].
    documents = []
    meta = []
    for row in rows:
        cve_id, description, cvss_score, severity, technologies = row
        # NOTE(review): assumes `technologies` is a list of strings (or
        # empty/None); a plain string here would be joined per character.
        # Confirm against _load_cve_rows.
        tech_str = " ".join(technologies) if technologies else ""
        doc = f"{description or ''} {tech_str}".strip()
        if not doc:
            continue
        documents.append(doc)
        meta.append(
            {
                "cve_id": cve_id,
                "description": description or "",
                "cvss_score": cvss_score or 0.0,
                "severity": severity or "",
                "affected_products": technologies or [],
            }
        )

    if not documents:
        logger.info("RAG: no documents after filter; skipping build")
        return False

    vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        max_features=50000,
        sublinear_tf=True,
    )
    try:
        matrix = vectorizer.fit_transform(documents)
    except ValueError as exc:
        # scikit-learn raises ValueError ("empty vocabulary") when no term
        # survives tokenization and stop-word removal. Report it like the
        # other unbuildable cases instead of propagating to the caller.
        logger.warning(f"RAG: unable to build index from {len(documents)} documents: {exc}")
        return False

    # Publish the new state only after fitting succeeded, so a failed
    # build never leaves a partially updated index behind.
    self._vectorizer = vectorizer
    self._matrix = matrix
    self._cve_meta = meta
    self._loaded = True
    logger.info(f"RAG index built: {len(meta)} CVEs, vocab={len(vectorizer.vocabulary_)}")
    return True
171
172 def _load_cve_rows(self, cve_db_path: str) -> List[tuple]:
173 """Fetch CVE rows + aggregated technologies."""

Callers 14

scanMethod · 0.95
mainFunction · 0.95
test_build_missing_dbMethod · 0.95
test_build_empty_dbMethod · 0.95
test_search_basicMethod · 0.95
test_search_strutsMethod · 0.95
test_search_min_scoreMethod · 0.95
test_search_top_kMethod · 0.95
test_save_and_loadMethod · 0.95

Calls 2

_load_cve_rowsMethod · 0.95
_ensure_sklearnFunction · 0.85

Tested by 11

test_build_missing_dbMethod · 0.76
test_build_empty_dbMethod · 0.76
test_search_basicMethod · 0.76
test_search_strutsMethod · 0.76
test_search_min_scoreMethod · 0.76
test_search_top_kMethod · 0.76
test_save_and_loadMethod · 0.76