Initialize the pipe for training, using data examples if available. get_examples (Callable[[], Iterable[Example]]): Optional function that returns gold-standard Example objects. sgd (Optional[Optimizer]): An optimizer to use for updates. If not provided, will be created using the .create_optimizer() method.
(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
)
| 1291 | return self.initialize(get_examples, sgd=sgd) |
| 1292 | |
| 1293 | def initialize( |
| 1294 | self, |
| 1295 | get_examples: Optional[Callable[[], Iterable[Example]]] = None, |
| 1296 | *, |
| 1297 | sgd: Optional[Optimizer] = None, |
| 1298 | ) -> Optimizer: |
| 1299 | """Initialize the pipe for training, using data examples if available. |
| 1300 | |
| 1301 | get_examples (Callable[[], Iterable[Example]]): Optional function that |
| 1302 | returns gold-standard Example objects. |
| 1303 | sgd (Optional[Optimizer]): An optimizer to use for updates. If not |
| 1304 | provided, will be created using the .create_optimizer() method. |
| 1305 | RETURNS (thinc.api.Optimizer): The optimizer. |
| 1306 | |
| 1307 | DOCS: https://spacy.io/api/language#initialize |
| 1308 | """ |
| 1309 | if get_examples is None: |
| 1310 | util.logger.debug( |
| 1311 | "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples" |
| 1312 | ) |
| 1313 | doc = Doc(self.vocab, words=["x", "y", "z"]) |
| 1314 | |
| 1315 | def get_examples(): |
| 1316 | return [Example.from_dict(doc, {})] |
| 1317 | |
| 1318 | if not hasattr(get_examples, "__call__"): |
| 1319 | err = Errors.E930.format( |
| 1320 | method="Language.initialize", obj=type(get_examples) |
| 1321 | ) |
| 1322 | raise TypeError(err) |
| 1323 | # Make sure the config is interpolated so we can resolve subsections |
| 1324 | config = self.config.interpolate() |
| 1325 | # These are the settings provided in the [initialize] block in the config |
| 1326 | I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) # type: ignore[arg-type] |
| 1327 | before_init = I["before_init"] |
| 1328 | if before_init is not None: |
| 1329 | before_init(self) |
| 1330 | try: |
| 1331 | init_vocab( |
| 1332 | self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"] |
| 1333 | ) |
| 1334 | except IOError: |
| 1335 | raise IOError(Errors.E884.format(vectors=I["vectors"])) |
| 1336 | if self.vocab.vectors.shape[1] >= 1: |
| 1337 | ops = get_current_ops() |
| 1338 | self.vocab.vectors.to_ops(ops) |
| 1339 | if hasattr(self.tokenizer, "initialize"): |
| 1340 | tok_settings = validate_init_settings( |
| 1341 | self.tokenizer.initialize, # type: ignore[union-attr] |
| 1342 | I["tokenizer"], |
| 1343 | section="tokenizer", |
| 1344 | name="tokenizer", |
| 1345 | ) |
| 1346 | self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) # type: ignore[union-attr] |
| 1347 | for name, proc in self.pipeline: |
| 1348 | if isinstance(proc, ty.InitializableComponent): |
| 1349 | p_settings = I["components"].get(name, {}) |
| 1350 | p_settings = validate_init_settings( |