MCPcopy
hub / github.com/HKUDS/LightRAG / encode

Method encode

lightrag/utils.py:2504–2531  ·  view source on GitHub ↗

Encodes a string into a list of tokens using the underlying tokenizer. Args: content: The string to encode. Returns: A list of integer tokens.

(self, content: str)

Source from the content-addressed store, hash-verified

2502 self.tokenizer: TokenizerInterface = tokenizer
2503
def encode(self, content: str) -> List[int]:
    """
    Tokenize ``content`` with the wrapped tokenizer.

    The text is first encoded with the tokenizer's default settings. If
    that fails with a ``ValueError`` whose message mentions
    "special token", the call is retried once with
    ``disallowed_special=()`` so that literal special-token strings are
    encoded as ordinary text.

    Args:
        content: The string to encode.

    Returns:
        A list of integer tokens.

    Raises:
        ValueError: If the tokenizer rejects the content for a reason
            unrelated to special tokens, or if the retry is not possible
            because the tokenizer does not accept the
            ``disallowed_special`` keyword (the original error is
            re-raised in that case).
    """
    try:
        token_ids = self.tokenizer.encode(content)
    except ValueError as first_error:
        # tiktoken (and some other tokenizers) raise ValueError when the
        # text contains a literal special-token string such as
        # "<|endoftext|>", since disallowed_special defaults to the full
        # special-token set. Such strings are common in documentation,
        # notes, or captured model output, and would otherwise abort
        # document indexing.
        mentions_special_token = "special token" in str(first_error)
        if not mentions_special_token:
            # Unrelated failure: propagate it untouched.
            raise
        try:
            # Second attempt: treat special-token strings as plain text.
            token_ids = self.tokenizer.encode(content, disallowed_special=())
        except TypeError:
            # The tokenizer does not understand the keyword argument, so
            # surface the original error rather than the TypeError.
            raise first_error
    return token_ids
2532
2533 def decode(self, tokens: List[int]) -> str:
2534 """

Calls

no outgoing calls