MCPcopy Index your code
hub / github.com/AsyncFuncAI/deepwiki-open / prepare_db_index

Method prepare_db_index

api/data_pipeline.py:839–921  ·  view source on GitHub ↗

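Prepare the indexed database for the repository. Args: embedder_type (str, optional): Embedder type to use ('openai', 'google', 'ollama'). If None, will be determined from configuration. is_ollama_embedder (bool, optional): DEPRECATED. Use embedder_type instead. If None, will be determined from configuration. excluded_dirs (List[str], optional): List of directories to exclude from processing. excluded_files (List[str], optional): List of file patterns to exclude from processing. included_dirs (List[str], optional): List of directories to include exclusively. included_files (List[str], optional): List of file patterns to include exclusively. Returns: List[Document]: List of Document objects.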

(self, embedder_type: str = None, is_ollama_embedder: bool = None, 
                        excluded_dirs: List[str] = None, excluded_files: List[str] = None,
                        included_dirs: List[str] = None, included_files: List[str] = None)

Source from the content-addressed store, hash-verified

837 raise
838
839 def prepare_db_index(self, embedder_type: str = None, is_ollama_embedder: bool = None,
840 excluded_dirs: List[str] = None, excluded_files: List[str] = None,
841 included_dirs: List[str] = None, included_files: List[str] = None) -> List[Document]:
842 """
843 Prepare the indexed database for the repository.
844
845 Args:
846 embedder_type (str, optional): Embedder type to use ('openai', 'google', 'ollama').
847 If None, will be determined from configuration.
848 is_ollama_embedder (bool, optional): DEPRECATED. Use embedder_type instead.
849 If None, will be determined from configuration.
850 excluded_dirs (List[str], optional): List of directories to exclude from processing
851 excluded_files (List[str], optional): List of file patterns to exclude from processing
852 included_dirs (List[str], optional): List of directories to include exclusively
853 included_files (List[str], optional): List of file patterns to include exclusively
854
855 Returns:
856 List[Document]: List of Document objects
857 """
858 def _embedding_vector_length(doc: Document) -> int:
859 vector = getattr(doc, "vector", None)
860 if vector is None:
861 return 0
862 try:
863 if hasattr(vector, "shape"):
864 if len(vector.shape) == 0:
865 return 0
866 return int(vector.shape[-1])
867 if hasattr(vector, "__len__"):
868 return int(len(vector))
869 except Exception:
870 return 0
871 return 0
872
873 # Handle backward compatibility
874 if embedder_type is None and is_ollama_embedder is not None:
875 embedder_type = 'ollama' if is_ollama_embedder else None
876 # check the database
877 if self.repo_paths and os.path.exists(self.repo_paths["save_db_file"]):
878 logger.info("Loading existing database...")
879 try:
880 self.db = LocalDB.load_state(self.repo_paths["save_db_file"])
881 documents = self.db.get_transformed_data(key="split_and_embed")
882 if documents:
883 lengths = [_embedding_vector_length(doc) for doc in documents]
884 non_empty = sum(1 for n in lengths if n > 0)
885 empty = len(lengths) - non_empty
886 sample_sizes = sorted({n for n in lengths if n > 0})[:3]
887 logger.info(
888 "Loaded %s documents from existing database (embeddings: %s non-empty, %s empty; sample_dims=%s)",
889 len(documents),
890 non_empty,
891 empty,
892 sample_sizes,
893 )
894
895 if non_empty == 0:
896 logger.warning(

Callers 1

prepare_database · Method · 0.95

Calls 2

read_all_documents · Function · 0.85

Tested by

no test coverage detected