MCPcopy
hub / github.com/AsyncFuncAI/deepwiki-open / prepare_retriever

Method prepare_retriever

api/rag.py:345–414  ·  view source on GitHub ↗

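Prepare the retriever for a repository. Will load database from local storage if available. Args: repo_url_or_path: URL or local path to the repository; access_token: Optional access token for private repositories; excluded_dirs: Optional list of directories to exclude from processing; excluded_files: Optional list of file patterns to exclude from processing; included_dirs: Optional list of directories to include exclusively; included_files: Optional list of file patterns to include exclusively.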

(self, repo_url_or_path: str, type: str = "github", access_token: str = None,
                      excluded_dirs: List[str] = None, excluded_files: List[str] = None,
                      included_dirs: List[str] = None, included_files: List[str] = None)

Source from the content-addressed store, hash-verified

343 return valid_documents
344
345 def prepare_retriever(self, repo_url_or_path: str, type: str = "github", access_token: str = None,
346 excluded_dirs: List[str] = None, excluded_files: List[str] = None,
347 included_dirs: List[str] = None, included_files: List[str] = None):
348 """
349 Prepare the retriever for a repository.
350 Will load database from local storage if available.
351
352 Args:
353 repo_url_or_path: URL or local path to the repository
354 access_token: Optional access token for private repositories
355 excluded_dirs: Optional list of directories to exclude from processing
356 excluded_files: Optional list of file patterns to exclude from processing
357 included_dirs: Optional list of directories to include exclusively
358 included_files: Optional list of file patterns to include exclusively
359 """
360 self.initialize_db_manager()
361 self.repo_url_or_path = repo_url_or_path
362 self.transformed_docs = self.db_manager.prepare_database(
363 repo_url_or_path,
364 type,
365 access_token,
366 embedder_type=self.embedder_type,
367 excluded_dirs=excluded_dirs,
368 excluded_files=excluded_files,
369 included_dirs=included_dirs,
370 included_files=included_files
371 )
372 logger.info(f"Loaded {len(self.transformed_docs)} documents for retrieval")
373
374 # Validate and filter embeddings to ensure consistent sizes
375 self.transformed_docs = self._validate_and_filter_embeddings(self.transformed_docs)
376
377 if not self.transformed_docs:
378 raise ValueError("No valid documents with embeddings found. Cannot create retriever.")
379
380 logger.info(f"Using {len(self.transformed_docs)} documents with valid embeddings for retrieval")
381
382 try:
383 # Use the appropriate embedder for retrieval
384 retrieve_embedder = self.query_embedder if self.is_ollama_embedder else self.embedder
385 self.retriever = FAISSRetriever(
386 **configs["retriever"],
387 embedder=retrieve_embedder,
388 documents=self.transformed_docs,
389 document_map_func=lambda doc: doc.vector,
390 )
391 logger.info("FAISS retriever created successfully")
392 except Exception as e:
393 logger.error(f"Error creating FAISS retriever: {str(e)}")
394 # Try to provide more specific error information
395 if "All embeddings should be of the same size" in str(e):
396 logger.error("Embedding size validation failed. This suggests there are still inconsistent embedding sizes.")
397 # Log embedding sizes for debugging
398 sizes = []
399 for i, doc in enumerate(self.transformed_docs[:10]): # Check first 10 docs
400 if hasattr(doc, 'vector') and doc.vector is not None:
401 try:
402 if isinstance(doc.vector, list):

Callers 2

handle_websocket_chat — Function · 0.95
chat_completions_stream — Function · 0.95

Calls 3

initialize_db_manager — Method · 0.95
prepare_database — Method · 0.80

Tested by

no test coverage detected