Function apply

spacy/cli/apply.py:108–145 · view source on GitHub ↗

(
    data_path: Path,
    output_file: Path,
    model: str,
    json_field: str,
    batch_size: int,
    n_process: int,
)

Source from the content-addressed store, hash-verified

106
107
def apply(
    data_path: Path,
    output_file: Path,
    model: str,
    json_field: str,
    batch_size: int,
    n_process: int,
) -> None:
    """Run a pipeline over the files under a path and save the docs to disk.

    The paths returned by ``walk_directory(data_path)`` are dispatched on
    their suffix: ``.spacy`` files are read with ``_stream_docbin``,
    ``.jsonl`` files with ``_stream_jsonl``, and every other file is
    collected and handed to ``_stream_texts`` in a single call. The
    resulting streams are chained, piped through the loaded pipeline and
    accumulated in a ``DocBin`` created with ``store_user_data=True``.

    data_path (Path): Location that is searched for input files.
    output_file (Path): Where the DocBin is written. If the path has no
        suffix, ".spacy" is appended. This applies to the empty-input case
        as well, so the output location never depends on whether input
        was found.
    model (str): Value passed to ``load_model`` to obtain the pipeline.
    json_field (str): Value passed to ``_stream_jsonl`` for ``.jsonl`` input.
    batch_size (int): Forwarded to ``nlp.pipe``.
    n_process (int): Forwarded to ``nlp.pipe``.
    """
    # Normalise the output path before the first possible write so that the
    # early return below and the regular exit save to the same location.
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    docbin = DocBin(store_user_data=True)
    paths = walk_directory(data_path)
    if not paths:
        # Still write an (empty) DocBin so the output file exists, then warn.
        docbin.to_disk(output_file)
        msg.warn(
            f"Did not find data to process, {data_path} seems to be an empty directory."
        )
        return
    nlp = load_model(model)
    msg.good(f"Loaded model {model}")
    vocab = nlp.vocab
    streams: List[DocOrStrStream] = []
    text_files = []
    for path in paths:
        if path.suffix == ".spacy":
            streams.append(_stream_docbin(path, vocab))
        elif path.suffix == ".jsonl":
            streams.append(_stream_jsonl(path, json_field))
        else:
            # Anything else is treated as a text file; these are gathered
            # and streamed together after the loop.
            text_files.append(path)
    if text_files:
        streams.append(_stream_texts(text_files))
    datagen = cast(DocOrStrStream, chain(*streams))
    # disable=None leaves it to tqdm to switch the bar off on non-TTY output.
    for doc in tqdm.tqdm(
        nlp.pipe(datagen, batch_size=batch_size, n_process=n_process), disable=None
    ):
        docbin.add(doc)
    docbin.to_disk(output_file)

test_applycli_empty_dirFunction · 0.90

test_applycli_docbinFunction · 0.90

test_applycli_jsonlFunction · 0.90

test_applycli_txtFunction · 0.90

test_applycli_mixedFunction · 0.90

test_applycli_user_dataFunction · 0.90

apply_cliFunction · 0.85

to_diskMethod · 0.95

addMethod · 0.95

DocBinClass · 0.85

walk_directoryFunction · 0.85

load_modelFunction · 0.85

_stream_docbinFunction · 0.85

_stream_jsonlFunction · 0.85

_stream_textsFunction · 0.85

appendMethod · 0.80

pipeMethod · 0.45

test_applycli_empty_dirFunction · 0.72

test_applycli_docbinFunction · 0.72

test_applycli_jsonlFunction · 0.72

test_applycli_txtFunction · 0.72

test_applycli_mixedFunction · 0.72

test_applycli_user_dataFunction · 0.72

searching dependent graphs…