| 41 | |
| 42 | |
def test_serialize_doc_bin():
    """Round-trip annotated docs through ``DocBin`` bytes.

    Each doc gets categories, a labelled span stored under the "start" span
    key, and a custom norm / entity ID on its first token. After
    ``to_bytes`` / ``from_bytes`` the reloaded docs must carry the same
    text, cats, span attributes and token attributes.
    """
    doc_bin = DocBin(
        attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"],
        store_user_data=True,
    )
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        # Deliberately unusual strings so the assertions below can only pass
        # if these exact values are present on the reloaded docs.
        span = doc[0:2]
        span.label_ = "UNUSUAL_SPAN_LABEL"
        span.id_ = "UNUSUAL_SPAN_ID"
        span.kb_id_ = "UNUSUAL_SPAN_KB_ID"
        doc.spans["start"] = [span]
        doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
        doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    # Guard against a vacuous pass: without this check the loop below would
    # succeed even if no docs (or too few docs) were restored.
    assert len(reloaded_docs) == len(texts)
    for doc, text in zip(reloaded_docs, texts):
        assert doc.text == text
        assert doc.cats == cats
        assert len(doc.spans) == 1
        restored_span = doc.spans["start"][0]
        assert restored_span.label_ == "UNUSUAL_SPAN_LABEL"
        assert restored_span.id_ == "UNUSUAL_SPAN_ID"
        assert restored_span.kb_id_ == "UNUSUAL_SPAN_KB_ID"
        first_token = doc[0]
        assert first_token.norm_ == "UNUSUAL_TOKEN_NORM"
        assert first_token.ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
| 75 | |
| 76 | |
| 77 | def test_serialize_doc_bin_unknown_spaces(en_vocab): |