Recursively reads all documents in a directory and its subdirectories. Args: path (str): The root directory path. embedder_type (str, optional): The embedder type ('openai', 'google', 'ollama'). If None, will be determined from configuration.
(path: str, embedder_type: str = None, is_ollama_embedder: bool = None,
excluded_dirs: List[str] = None, excluded_files: List[str] = None,
included_dirs: List[str] = None, included_files: List[str] = None)
| 159 | download_github_repo = download_repo |
| 160 | |
| 161 | def read_all_documents(path: str, embedder_type: str = None, is_ollama_embedder: bool = None, |
| 162 | excluded_dirs: List[str] = None, excluded_files: List[str] = None, |
| 163 | included_dirs: List[str] = None, included_files: List[str] = None): |
| 164 | """ |
| 165 | Recursively reads all documents in a directory and its subdirectories. |
| 166 | |
| 167 | Args: |
| 168 | path (str): The root directory path. |
| 169 | embedder_type (str, optional): The embedder type ('openai', 'google', 'ollama'). |
| 170 | If None, will be determined from configuration. |
| 171 | is_ollama_embedder (bool, optional): DEPRECATED. Use embedder_type instead. |
| 172 | If None, will be determined from configuration. |
| 173 | excluded_dirs (List[str], optional): List of directories to exclude from processing. |
| 174 | Overrides the default configuration if provided. |
| 175 | excluded_files (List[str], optional): List of file patterns to exclude from processing. |
| 176 | Overrides the default configuration if provided. |
| 177 | included_dirs (List[str], optional): List of directories to include exclusively. |
| 178 | When provided, only files in these directories will be processed. |
| 179 | included_files (List[str], optional): List of file patterns to include exclusively. |
| 180 | When provided, only files matching these patterns will be processed. |
| 181 | |
| 182 | Returns: |
| 183 | list: A list of Document objects with metadata. |
| 184 | """ |
| 185 | # Handle backward compatibility |
| 186 | if embedder_type is None and is_ollama_embedder is not None: |
| 187 | embedder_type = 'ollama' if is_ollama_embedder else None |
| 188 | documents = [] |
| 189 | # File extensions to look for, prioritizing code files |
| 190 | code_extensions = [".py", ".js", ".ts", ".java", ".cpp", ".c", ".h", ".hpp", ".go", ".rs", |
| 191 | ".jsx", ".tsx", ".html", ".css", ".php", ".swift", ".cs"] |
| 192 | doc_extensions = [".md", ".txt", ".rst", ".json", ".yaml", ".yml"] |
| 193 | |
| 194 | # Determine filtering mode: inclusion or exclusion |
| 195 | use_inclusion_mode = (included_dirs is not None and len(included_dirs) > 0) or (included_files is not None and len(included_files) > 0) |
| 196 | |
| 197 | if use_inclusion_mode: |
| 198 | # Inclusion mode: only process specified directories and files |
| 199 | final_included_dirs = set(included_dirs) if included_dirs else set() |
| 200 | final_included_files = set(included_files) if included_files else set() |
| 201 | |
| 202 | logger.info(f"Using inclusion mode") |
| 203 | logger.info(f"Included directories: {list(final_included_dirs)}") |
| 204 | logger.info(f"Included files: {list(final_included_files)}") |
| 205 | |
| 206 | # Convert to lists for processing |
| 207 | included_dirs = list(final_included_dirs) |
| 208 | included_files = list(final_included_files) |
| 209 | excluded_dirs = [] |
| 210 | excluded_files = [] |
| 211 | else: |
| 212 | # Exclusion mode: use default exclusions plus any additional ones |
| 213 | final_excluded_dirs = set(DEFAULT_EXCLUDED_DIRS) |
| 214 | final_excluded_files = set(DEFAULT_EXCLUDED_FILES) |
| 215 | |
| 216 | # Add any additional excluded directories from config |
| 217 | if "file_filters" in configs and "excluded_dirs" in configs["file_filters"]: |
| 218 | final_excluded_dirs.update(configs["file_filters"]["excluded_dirs"]) |
no test coverage detected