MCPcopy
hub / github.com/explosion/spaCy / test_serialize_doc_bin

Function test_serialize_doc_bin

spacy/tests/serialize/test_serialize_docbin.py:43–74  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

41
42
def test_serialize_doc_bin():
    """Round-trip annotated docs through ``DocBin`` bytes and check the result.

    Three texts are piped through ``English()``. Each doc gets categories, a
    span (with label, id and kb_id) stored under the ``"start"`` span key, and
    a custom norm and ent_id on its first token. The docs are added to a
    ``DocBin``, serialized with ``to_bytes()``, loaded back with
    ``from_bytes()`` against a fresh blank English vocab, and every annotation
    set above is asserted on the reloaded docs.
    """
    doc_bin = DocBin(
        attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True
    )
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        span = doc[0:2]
        span.label_ = "UNUSUAL_SPAN_LABEL"
        span.id_ = "UNUSUAL_SPAN_ID"
        span.kb_id_ = "UNUSUAL_SPAN_KB_ID"
        doc.spans["start"] = [span]
        doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
        doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    # Guard against a vacuous pass: if no docs (or too few) come back, the
    # loop below would silently skip the per-doc assertions.
    assert len(reloaded_docs) == len(texts)
    for i, doc in enumerate(reloaded_docs):
        assert doc.text == texts[i]
        assert doc.cats == cats
        assert len(doc.spans) == 1
        assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL"
        assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID"
        assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID"
        assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
        assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
75
76
77def test_serialize_doc_bin_unknown_spaces(en_vocab):

Callers

nothing calls this directly

Calls 7

add · Method · 0.95
to_bytes · Method · 0.95
get_docs · Method · 0.95
DocBin · Class · 0.90
English · Class · 0.90
pipe · Method · 0.45
from_bytes · Method · 0.45

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…