Initialise a Language object. vocab (Vocab): A `Vocab` object. If `True`, a vocab is created. meta (dict): Custom meta data for the Language class. Is written to by models to add model meta data. max_length (int): Maximum number of characters in a single text. Th
(
self,
vocab: Union[Vocab, bool] = True,
*,
max_length: int = 10**6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
batch_size: int = 1000,
**kwargs,
)
| 154 | _factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory |
| 155 | |
| 156 | def __init__( |
| 157 | self, |
| 158 | vocab: Union[Vocab, bool] = True, |
| 159 | *, |
| 160 | max_length: int = 10**6, |
| 161 | meta: Dict[str, Any] = {}, |
| 162 | create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, |
| 163 | create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None, |
| 164 | batch_size: int = 1000, |
| 165 | **kwargs, |
| 166 | ) -> None: |
| 167 | """Initialise a Language object. |
| 168 | |
| 169 | vocab (Vocab): A `Vocab` object. If `True`, a vocab is created. |
| 170 | meta (dict): Custom meta data for the Language class. Is written to by |
| 171 | models to add model meta data. |
| 172 | max_length (int): Maximum number of characters in a single text. The |
| 173 | current models may run out of memory on extremely long texts, due to |
| 174 | large internal allocations. You should segment these texts into |
| 175 | meaningful units, e.g. paragraphs, subsections etc, before passing |
| 176 | them to spaCy. Default maximum length is 1,000,000 characters (1mb). As |
| 177 | a rule of thumb, if all pipeline components are enabled, spaCy's |
| 178 | default models currently require roughly 1GB of temporary memory per |
| 179 | 100,000 characters in one text. |
| 180 | create_tokenizer (Callable): Function that takes the nlp object and |
| 181 | returns a tokenizer. |
| 182 | batch_size (int): Default batch size for pipe and evaluate. |
| 183 | |
| 184 | DOCS: https://spacy.io/api/language#init |
| 185 | """ |
| 186 | from .pipeline.factories import register_factories |
| 187 | |
| 188 | register_factories() |
| 189 | # We're only calling this to import all factories provided via entry |
| 190 | # points. The factory decorator applied to these functions takes care |
| 191 | # of the rest. |
| 192 | util.registry._entry_point_factories.get_all() |
| 193 | |
| 194 | self._config = DEFAULT_CONFIG.merge(self.default_config) |
| 195 | self._meta = dict(meta) |
| 196 | self._path = None |
| 197 | self._optimizer: Optional[Optimizer] = None |
| 198 | # Component meta and configs are only needed on the instance |
| 199 | self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component |
| 200 | self._pipe_configs: Dict[str, Config] = {} # config by component |
| 201 | |
| 202 | if not isinstance(vocab, Vocab) and vocab is not True: |
| 203 | raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) |
| 204 | if vocab is True: |
| 205 | vectors_name = meta.get("vectors", {}).get("name") |
| 206 | vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name) |
| 207 | if not create_vectors: |
| 208 | vectors_cfg = {"vectors": self._config["nlp"]["vectors"]} |
| 209 | create_vectors = registry.resolve(vectors_cfg)["vectors"] |
| 210 | vocab.vectors = create_vectors(vocab) |
| 211 | else: |
| 212 | if (self.lang and vocab.lang) and (self.lang != vocab.lang): |
| 213 | raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) |
nothing calls this directly
no test coverage detected