Count the number of tokens in a text string using tiktoken. Args: text (str): The text to count tokens for. embedder_type (str, optional): The embedder type ('openai', 'google', 'ollama', 'bedrock'). If None, will be determined from configuration.
(text: str, embedder_type: str = None, is_ollama_embedder: bool = None)
# Token ceiling associated with embedding inputs.
# NOTE(review): 8192 presumably mirrors the embedding model's input limit —
# confirm against the code that reads this constant.
MAX_EMBEDDING_TOKENS = 8192
| 26 | |
def count_tokens(text: str, embedder_type: str = None, is_ollama_embedder: bool = None) -> int:
    """
    Count the number of tokens in a text string using tiktoken.

    For 'ollama', 'google' and 'bedrock' the count is a rough estimate taken
    with the ``cl100k_base`` encoding. Any other embedder type (including
    'openai') uses the encoding tiktoken associates with
    ``text-embedding-3-small``.

    Args:
        text (str): The text to count tokens for. Empty or None text counts
            as 0 tokens without touching configuration or the tokenizer.
        embedder_type (str, optional): The embedder type ('openai', 'google',
            'ollama', 'bedrock'). If None, will be determined from configuration.
        is_ollama_embedder (bool, optional): DEPRECATED. Use embedder_type instead.
            Only consulted when embedder_type is None: a truthy value selects
            'ollama'; False or None defers to configuration.

    Returns:
        int: The number of tokens in the text. If anything in the lookup or
        tokenization fails, a warning is logged and ``len(text) // 4`` is
        returned as an approximation.
    """
    # Nothing to tokenize; also keeps the fallback below from calling len(None).
    if not text:
        return 0

    try:
        # Handle backward compatibility with the deprecated boolean flag
        if embedder_type is None and is_ollama_embedder:
            embedder_type = 'ollama'

        # Determine embedder type if not specified
        if embedder_type is None:
            from api.config import get_embedder_type
            embedder_type = get_embedder_type()

        # Choose encoding based on embedder type
        if embedder_type in ('ollama', 'google', 'bedrock'):
            # None of these providers ship a tiktoken encoding; a common
            # GPT-like encoding is used for rough estimation.
            encoding = tiktoken.get_encoding("cl100k_base")
        else:  # OpenAI or default
            # Use OpenAI embedding model encoding
            encoding = tiktoken.encoding_for_model("text-embedding-3-small")

        # disallowed_special=() makes tiktoken treat special-token markers that
        # happen to appear in the text (e.g. "<|endoftext|>" in a source file)
        # as ordinary text instead of raising ValueError, which would otherwise
        # push the call into the crude fallback below.
        return len(encoding.encode(text, disallowed_special=()))
    except Exception as e:
        # Fallback to a simple approximation if tiktoken fails
        logger.warning(f"Error counting tokens with tiktoken: {e}")
        # Rough approximation: 4 characters per token
        return len(text) // 4
| 71 | |
| 72 | def download_repo(repo_url: str, local_path: str, repo_type: str = None, access_token: str = None) -> str: |
| 73 | """ |