MCPcopy
hub / github.com/explosion/spaCy / debug_data

Function debug_data

spacy/cli/debug_data.py:123–778  ·  view source on GitHub ↗
(
    config_path: Path,
    *,
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
)

Source from the content-addressed store, hash-verified

121
122
123def debug_data(
124 config_path: Path,
125 *,
126 config_overrides: Dict[str, Any] = {},
127 ignore_warnings: bool = False,
128 verbose: bool = False,
129 no_format: bool = True,
130 silent: bool = True,
131):
132 msg = Printer(
133 no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
134 )
135 # Make sure all files and paths exists if they are needed
136 with show_validation_error(config_path):
137 cfg = util.load_config(config_path, overrides=config_overrides)
138 nlp = util.load_model_from_config(cfg)
139 config = nlp.config.interpolate()
140 T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type]
141 # Use original config here, not resolved version
142 sourced_components = get_sourced_components(cfg)
143 frozen_components = T["frozen_components"]
144 resume_components = [p for p in sourced_components if p not in frozen_components]
145 pipeline = nlp.pipe_names
146 factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
147 msg.divider("Data file validation")
148
149 # Create the gold corpus to be able to better analyze data
150 dot_names = [T["train_corpus"], T["dev_corpus"]]
151 train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
152
153 nlp.initialize(lambda: train_corpus(nlp))
154 msg.good("Pipeline can be initialized with data")
155
156 train_dataset = list(train_corpus(nlp))
157 dev_dataset = list(dev_corpus(nlp))
158 msg.good("Corpus is loadable")
159
160 # Create all gold data here to avoid iterating over the train_dataset constantly
161 gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
162 gold_train_unpreprocessed_data = _compile_gold(
163 train_dataset, factory_names, nlp, make_proj=False
164 )
165 gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)
166
167 train_texts = gold_train_data["texts"]
168 dev_texts = gold_dev_data["texts"]
169 frozen_components = T["frozen_components"]
170
171 msg.divider("Training stats")
172 msg.text(f"Language: {nlp.lang}")
173 msg.text(f"Training pipeline: {', '.join(pipeline)}")
174 if resume_components:
175 msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
176 if frozen_components:
177 msg.text(f"Frozen components: {', '.join(frozen_components)}")
178 msg.text(f"{len(train_dataset)} training docs")
179 msg.text(f"{len(dev_dataset)} evaluation docs")
180

Callers 1

debug_data_cli — Function · 0.85

Calls 15

show_validation_error — Function · 0.85
get_sourced_components — Function · 0.85
resolve_dot_names — Function · 0.85
_compile_gold — Function · 0.85
_format_labels — Function · 0.85
_get_labels_from_spancat — Function · 0.85
_format_freqs — Function · 0.85

Tested by

no test coverage detected

Used in the wild: real call sites across dependent graphs

searching dependent graphs…