MCPcopy
hub / github.com/AsyncFuncAI/deepwiki-open / count_tokens

Function count_tokens

api/data_pipeline.py:27–70  ·  view source on GitHub ↗

Count the number of tokens in a text string using tiktoken. Args: text (str): The text to count tokens for. embedder_type (str, optional): The embedder type ('openai', 'google', 'ollama', 'bedrock'). If None, will be determined from configuration.

(text: str, embedder_type: str = None, is_ollama_embedder: bool = None)

Source from the content-addressed store, hash-verified

25MAX_EMBEDDING_TOKENS = 8192
26
def count_tokens(text: str, embedder_type: str = None, is_ollama_embedder: bool = None) -> int:
    """
    Count the number of tokens in a text string using tiktoken.

    The 'ollama', 'google' and 'bedrock' embedder types are all counted with
    the ``cl100k_base`` encoding as a rough estimate. Any other value
    (OpenAI or default) is counted with the encoding tiktoken associates with
    the ``text-embedding-3-small`` model.

    Args:
        text (str): The text to count tokens for.
        embedder_type (str, optional): The embedder type ('openai', 'google', 'ollama', 'bedrock').
            If None, will be determined from configuration.
        is_ollama_embedder (bool, optional): DEPRECATED. Use embedder_type instead.
            Only consulted when embedder_type is None: a truthy value selects
            'ollama', anything else defers to configuration.

    Returns:
        int: The number of tokens in the text. If anything on the tiktoken
            path raises, a warning is logged and the approximation
            ``len(text) // 4`` is returned instead.
    """
    try:
        # Handle backward compatibility: the deprecated flag can only force
        # 'ollama'; a falsy flag leaves the decision to configuration below.
        if embedder_type is None and is_ollama_embedder:
            embedder_type = 'ollama'

        # Determine embedder type if not specified
        if embedder_type is None:
            # Imported lazily, as in the original code (presumably to avoid a
            # circular import with api.config -- confirm before hoisting).
            from api.config import get_embedder_type
            embedder_type = get_embedder_type()

        # Choose encoding based on embedder type
        if embedder_type in ('ollama', 'google', 'bedrock'):
            # None of these providers has a tiktoken encoding of its own;
            # cl100k_base is used as a GPT-like rough estimation.
            encoding = tiktoken.get_encoding("cl100k_base")
        else:  # OpenAI or default
            # Use OpenAI embedding model encoding
            encoding = tiktoken.encoding_for_model("text-embedding-3-small")

        # By default tiktoken raises ValueError when the text contains a
        # special-token string such as "<|endoftext|>". Disabling that check
        # encodes such strings as ordinary text, so these inputs get a real
        # count instead of dropping to the character-based fallback.
        return len(encoding.encode(text, disallowed_special=()))
    except Exception as e:
        # Fallback to a simple approximation if tiktoken fails
        logger.warning(f"Error counting tokens with tiktoken: {e}")
        # Rough approximation: 4 characters per token
        return len(text) // 4
71
72def download_repo(repo_url: str, local_path: str, repo_type: str = None, access_token: str = None) -> str:
73 """

Callers 5

test_count_tokensMethod · 0.90
handle_websocket_chatFunction · 0.90
chat_completions_streamFunction · 0.90
read_all_documentsFunction · 0.85

Calls 1

get_embedder_typeFunction · 0.90

Tested by 2

test_count_tokensMethod · 0.72