(
data_path: Path,
output_file: Path,
model: str,
json_field: str,
batch_size: int,
n_process: int,
)
| 106 | |
| 107 | |
def apply(
    data_path: Path,
    output_file: Path,
    model: str,
    json_field: str,
    batch_size: int,
    n_process: int,
) -> None:
    """Run a pipeline over the files found under ``data_path`` and save a DocBin.

    The paths returned by ``walk_directory(data_path)`` are dispatched by
    file suffix: ``.spacy`` files go to ``_stream_docbin``, ``.jsonl`` files
    go to ``_stream_jsonl`` and every other file is collected and handed to
    ``_stream_texts`` in one call. All resulting streams are chained, fed
    through ``nlp.pipe`` and the processed docs are stored in a ``DocBin``
    (created with ``store_user_data=True``) that is written to
    ``output_file``.

    If no paths are found, an empty ``DocBin`` is written, a warning is
    printed and the function returns without loading the model.

    Args:
        data_path: Location handed to ``walk_directory`` to discover inputs.
        output_file: Destination of the serialized ``DocBin``. When it has
            no suffix, ``.spacy`` is appended.
        model: Name or path passed to ``load_model``.
        json_field: Passed to ``_stream_jsonl`` for ``.jsonl`` inputs
            (presumably the key holding the text -- confirm in that helper).
        batch_size: Forwarded to ``nlp.pipe``.
        n_process: Forwarded to ``nlp.pipe``.
    """
    # Normalize the output name up front so the empty-input branch and the
    # regular branch write to the same location. Previously the early return
    # wrote to the un-suffixed path while the normal path added ".spacy".
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    docbin = DocBin(store_user_data=True)
    paths = walk_directory(data_path)
    if not paths:
        # Nothing to process: write the empty DocBin anyway, then bail out
        # before paying the cost of loading the model.
        docbin.to_disk(output_file)
        msg.warn(
            f"Did not find data to process, {data_path} seems to be an empty directory."
        )
        return
    nlp = load_model(model)
    msg.good(f"Loaded model {model}")
    vocab = nlp.vocab
    streams: List[DocOrStrStream] = []
    text_files: List[Path] = []
    for path in paths:
        if path.suffix == ".spacy":
            streams.append(_stream_docbin(path, vocab))
        elif path.suffix == ".jsonl":
            streams.append(_stream_jsonl(path, json_field))
        else:
            # Any other suffix is treated as a plain text file; these are
            # gathered and streamed together after the loop.
            text_files.append(path)
    if text_files:
        # The combined text stream is appended last, so text files are
        # processed after all .spacy/.jsonl inputs regardless of path order.
        streams.append(_stream_texts(text_files))
    datagen = cast(DocOrStrStream, chain(*streams))
    # NOTE(review): per the tqdm docs, disable=None turns the progress bar
    # off automatically on non-TTY output -- confirm this is the intent.
    for doc in tqdm.tqdm(
        nlp.pipe(datagen, batch_size=batch_size, n_process=n_process), disable=None
    ):
        docbin.add(doc)
    docbin.to_disk(output_file)
searching dependent graphs…