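Prepare the retriever for a repository. Will load database from local storage if available. Args: repo_url_or_path: URL or local path to the repository access_token: Optional access token for private repositories excluded_dirs: Optional list of directories to exclude from processing excluded_files: Optional list of file patterns to exclude from processing included_dirs: Optional list of directories to include exclusively included_files: Optional list of file patterns to include exclusively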
(self, repo_url_or_path: str, type: str = "github", access_token: str = None,
excluded_dirs: List[str] = None, excluded_files: List[str] = None,
included_dirs: List[str] = None, included_files: List[str] = None)
| 343 | return valid_documents |
| 344 | |
| 345 | def prepare_retriever(self, repo_url_or_path: str, type: str = "github", access_token: str = None, |
| 346 | excluded_dirs: List[str] = None, excluded_files: List[str] = None, |
| 347 | included_dirs: List[str] = None, included_files: List[str] = None): |
| 348 | """ |
| 349 | Prepare the retriever for a repository. |
| 350 | Will load database from local storage if available. |
| 351 | |
| 352 | Args: |
| 353 | repo_url_or_path: URL or local path to the repository |
| 354 | access_token: Optional access token for private repositories |
| 355 | excluded_dirs: Optional list of directories to exclude from processing |
| 356 | excluded_files: Optional list of file patterns to exclude from processing |
| 357 | included_dirs: Optional list of directories to include exclusively |
| 358 | included_files: Optional list of file patterns to include exclusively |
| 359 | """ |
| 360 | self.initialize_db_manager() |
| 361 | self.repo_url_or_path = repo_url_or_path |
| 362 | self.transformed_docs = self.db_manager.prepare_database( |
| 363 | repo_url_or_path, |
| 364 | type, |
| 365 | access_token, |
| 366 | embedder_type=self.embedder_type, |
| 367 | excluded_dirs=excluded_dirs, |
| 368 | excluded_files=excluded_files, |
| 369 | included_dirs=included_dirs, |
| 370 | included_files=included_files |
| 371 | ) |
| 372 | logger.info(f"Loaded {len(self.transformed_docs)} documents for retrieval") |
| 373 | |
| 374 | # Validate and filter embeddings to ensure consistent sizes |
| 375 | self.transformed_docs = self._validate_and_filter_embeddings(self.transformed_docs) |
| 376 | |
| 377 | if not self.transformed_docs: |
| 378 | raise ValueError("No valid documents with embeddings found. Cannot create retriever.") |
| 379 | |
| 380 | logger.info(f"Using {len(self.transformed_docs)} documents with valid embeddings for retrieval") |
| 381 | |
| 382 | try: |
| 383 | # Use the appropriate embedder for retrieval |
| 384 | retrieve_embedder = self.query_embedder if self.is_ollama_embedder else self.embedder |
| 385 | self.retriever = FAISSRetriever( |
| 386 | **configs["retriever"], |
| 387 | embedder=retrieve_embedder, |
| 388 | documents=self.transformed_docs, |
| 389 | document_map_func=lambda doc: doc.vector, |
| 390 | ) |
| 391 | logger.info("FAISS retriever created successfully") |
| 392 | except Exception as e: |
| 393 | logger.error(f"Error creating FAISS retriever: {str(e)}") |
| 394 | # Try to provide more specific error information |
| 395 | if "All embeddings should be of the same size" in str(e): |
| 396 | logger.error("Embedding size validation failed. This suggests there are still inconsistent embedding sizes.") |
| 397 | # Log embedding sizes for debugging |
| 398 | sizes = [] |
| 399 | for i, doc in enumerate(self.transformed_docs[:10]): # Check first 10 docs |
| 400 | if hasattr(doc, 'vector') and doc.vector is not None: |
| 401 | try: |
| 402 | if isinstance(doc.vector, list): |
no test coverage detected