Add a Doc's annotations to the DocBin for serialization. doc (Doc): The Doc object to add. DOCS: https://spacy.io/api/docbin#add
(self, doc: Doc)
| 89 | return len(self.tokens) |
| 90 | |
def add(self, doc: Doc) -> None:
    """Record the annotations of a Doc in the DocBin so they can be serialized.

    doc (Doc): The Doc object whose annotations are stored.

    DOCS: https://spacy.io/api/docbin#add
    """
    # Attribute values exported for self.attrs; a flat result is turned into
    # a single-column 2D array so every stored entry has two dimensions.
    attr_values = doc.to_array(self.attrs)
    if len(attr_values.shape) == 1:
        attr_values = attr_values.reshape((attr_values.shape[0], 1))
    self.tokens.append(attr_values)

    # SPACY values are kept separately, as a boolean single-column array.
    space_values = doc.to_array(SPACY)
    # Both exports must have the same number of rows; a mismatch should
    # never happen.
    assert attr_values.shape[0] == space_values.shape[0]
    space_column = space_values.reshape((space_values.shape[0], 1))
    self.spaces.append(numpy.asarray(space_column, dtype=bool))
    self.flags.append({"has_unknown_spaces": doc.has_unknown_spaces})

    # Collect every string the token-level annotations refer to.
    remember = self.strings.add
    for token in doc:
        remember(token.text)
        remember(token.tag_)
        remember(token.lemma_)
        remember(token.norm_)
        remember(str(token.morph))
        remember(token.dep_)
        remember(token.ent_type_)
        remember(token.ent_kb_id_)
        remember(token.ent_id_)

    self.cats.append(doc.cats)
    # User data is only stored (msgpack-encoded) when the DocBin asks for it.
    if self.store_user_data:
        self.user_data.append(srsly.msgpack_dumps(doc.user_data))

    # Span groups are stored as bytes; their strings are collected as well.
    self.span_groups.append(doc.spans.to_bytes())
    for group in doc.spans.values():
        for span in group:
            remember(span.label_)
            vocab_strings = span.doc.vocab.strings
            # The KB ID and span ID strings are added only when their IDs
            # are present in the doc's vocab.strings.
            if span.kb_id in vocab_strings:
                remember(span.kb_id_)
            if span.id in vocab_strings:
                remember(span.id_)
| 128 | |
| 129 | def get_docs(self, vocab: Vocab) -> Iterator[Doc]: |
| 130 | """Recover Doc objects from the annotations, using the given vocab. |