MCPcopy
hub / github.com/explosion/spaCy / __init__

Method __init__

spacy/language.py:156–226  ·  view source on GitHub ↗

Initialise a Language object. Parameters — vocab (Vocab): A `Vocab` object. If `True`, a vocab is created; meta (dict): Custom meta data for the Language class. It is written to by models to add model meta data; max_length (int): Maximum number of characters in a single text. … (summary truncated; see the full docstring in the source listing below)

(
        self,
        vocab: Union[Vocab, bool] = True,
        *,
        max_length: int = 10**6,
        meta: Dict[str, Any] = {},
        create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
        create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
        batch_size: int = 1000,
        **kwargs,
    )

Source from the content-addressed store, hash-verified

154 _factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
155
156 def __init__(
157 self,
158 vocab: Union[Vocab, bool] = True,
159 *,
160 max_length: int = 10**6,
161 meta: Dict[str, Any] = {},
162 create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
163 create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
164 batch_size: int = 1000,
165 **kwargs,
166 ) -> None:
167 """Initialise a Language object.
168
169 vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
170 meta (dict): Custom meta data for the Language class. Is written to by
171 models to add model meta data.
172 max_length (int): Maximum number of characters in a single text. The
173 current models may run out memory on extremely long texts, due to
174 large internal allocations. You should segment these texts into
175 meaningful units, e.g. paragraphs, subsections etc, before passing
176 them to spaCy. Default maximum length is 1,000,000 charas (1mb). As
177 a rule of thumb, if all pipeline components are enabled, spaCy's
178 default models currently requires roughly 1GB of temporary memory per
179 100,000 characters in one text.
180 create_tokenizer (Callable): Function that takes the nlp object and
181 returns a tokenizer.
182 batch_size (int): Default batch size for pipe and evaluate.
183
184 DOCS: https://spacy.io/api/language#init
185 """
186 from .pipeline.factories import register_factories
187
188 register_factories()
189 # We're only calling this to import all factories provided via entry
190 # points. The factory decorator applied to these functions takes care
191 # of the rest.
192 util.registry._entry_point_factories.get_all()
193
194 self._config = DEFAULT_CONFIG.merge(self.default_config)
195 self._meta = dict(meta)
196 self._path = None
197 self._optimizer: Optional[Optimizer] = None
198 # Component meta and configs are only needed on the instance
199 self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component
200 self._pipe_configs: Dict[str, Config] = {} # config by component
201
202 if not isinstance(vocab, Vocab) and vocab is not True:
203 raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
204 if vocab is True:
205 vectors_name = meta.get("vectors", {}).get("name")
206 vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
207 if not create_vectors:
208 vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
209 create_vectors = registry.resolve(vectors_cfg)["vectors"]
210 vocab.vectors = create_vectors(vocab)
211 else:
212 if (self.lang and vocab.lang) and (self.lang != vocab.lang):
213 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))

Callers

nothing calls this directly

Calls 4

register_factories — Function · 0.85
merge — Method · 0.80
create_tokenizer — Function · 0.70
get — Method · 0.45

Tested by

no test coverage detected