hub / github.com/explosion/spaCy / __init__

Method `__init__`

spacy/language.py:156–226 · view source on GitHub ↗

Initialise a Language object. vocab (Vocab): A `Vocab` object. If `True`, a vocab is created. meta (dict): Custom meta data for the Language class. Is written to by models to add model meta data. max_length (int): Maximum number of characters in a single text. Th

(
        self,
        vocab: Union[Vocab, bool] = True,
        *,
        max_length: int = 10**6,
        meta: Dict[str, Any] = {},
        create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
        create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
        batch_size: int = 1000,
        **kwargs,
    )

Source from the content-addressed store, hash-verified

154	_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
155
156	def __init__(
157	self,
158	vocab: Union[Vocab, bool] = True,
159	*,
160	max_length: int = 10**6,
161	meta: Dict[str, Any] = {},
162	create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
163	create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
164	batch_size: int = 1000,
165	**kwargs,
166	) -> None:
167	"""Initialise a Language object.
168
169	vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
170	meta (dict): Custom meta data for the Language class. Is written to by
171	models to add model meta data.
172	max_length (int): Maximum number of characters in a single text. The
173	current models may run out of memory on extremely long texts, due to
174	large internal allocations. You should segment these texts into
175	meaningful units, e.g. paragraphs, subsections etc, before passing
176	them to spaCy. Default maximum length is 1,000,000 characters (1mb). As
177	a rule of thumb, if all pipeline components are enabled, spaCy's
178	default models currently require roughly 1GB of temporary memory per
179	100,000 characters in one text.
180	create_tokenizer (Callable): Function that takes the nlp object and
181	returns a tokenizer.
182	batch_size (int): Default batch size for pipe and evaluate.
183
184	DOCS: https://spacy.io/api/language#init
185	"""
186	from .pipeline.factories import register_factories
187
188	register_factories()
189	# We're only calling this to import all factories provided via entry
190	# points. The factory decorator applied to these functions takes care
191	# of the rest.
192	util.registry._entry_point_factories.get_all()
193
194	self._config = DEFAULT_CONFIG.merge(self.default_config)
195	self._meta = dict(meta)
196	self._path = None
197	self._optimizer: Optional[Optimizer] = None
198	# Component meta and configs are only needed on the instance
199	self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component
200	self._pipe_configs: Dict[str, Config] = {} # config by component
201
202	if not isinstance(vocab, Vocab) and vocab is not True:
203	raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
204	if vocab is True:
205	vectors_name = meta.get("vectors", {}).get("name")
206	vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
207	if not create_vectors:
208	vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
209	create_vectors = registry.resolve(vectors_cfg)["vectors"]
210	vocab.vectors = create_vectors(vocab)
211	else:
212	if (self.lang and vocab.lang) and (self.lang != vocab.lang):
213	raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))

Callers

nothing calls this directly

Calls 4

register_factoriesFunction · 0.85

mergeMethod · 0.80

create_tokenizerFunction · 0.70

getMethod · 0.45

Tested by

no test coverage detected

Method __init__

Source from the content-addressed store, hash-verified

Callers

Calls 4

Tested by

Method `__init__`