(
config_path: Path,
*,
config_overrides: Dict[str, Any] = {},
ignore_warnings: bool = False,
verbose: bool = False,
no_format: bool = True,
silent: bool = True,
)
| 121 | |
| 122 | |
| 123 | def debug_data( |
| 124 | config_path: Path, |
| 125 | *, |
| 126 | config_overrides: Dict[str, Any] = {}, |
| 127 | ignore_warnings: bool = False, |
| 128 | verbose: bool = False, |
| 129 | no_format: bool = True, |
| 130 | silent: bool = True, |
| 131 | ): |
| 132 | msg = Printer( |
| 133 | no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings |
| 134 | ) |
| 135 | # Make sure all files and paths exist if they are needed |
| 136 | with show_validation_error(config_path): |
| 137 | cfg = util.load_config(config_path, overrides=config_overrides) |
| 138 | nlp = util.load_model_from_config(cfg) |
| 139 | config = nlp.config.interpolate() |
| 140 | T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] |
| 141 | # Use original config here, not resolved version |
| 142 | sourced_components = get_sourced_components(cfg) |
| 143 | frozen_components = T["frozen_components"] |
| 144 | resume_components = [p for p in sourced_components if p not in frozen_components] |
| 145 | pipeline = nlp.pipe_names |
| 146 | factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] |
| 147 | msg.divider("Data file validation") |
| 148 | |
| 149 | # Create the gold corpus to be able to better analyze data |
| 150 | dot_names = [T["train_corpus"], T["dev_corpus"]] |
| 151 | train_corpus, dev_corpus = resolve_dot_names(config, dot_names) |
| 152 | |
| 153 | nlp.initialize(lambda: train_corpus(nlp)) |
| 154 | msg.good("Pipeline can be initialized with data") |
| 155 | |
| 156 | train_dataset = list(train_corpus(nlp)) |
| 157 | dev_dataset = list(dev_corpus(nlp)) |
| 158 | msg.good("Corpus is loadable") |
| 159 | |
| 160 | # Create all gold data here to avoid iterating over the train_dataset constantly |
| 161 | gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True) |
| 162 | gold_train_unpreprocessed_data = _compile_gold( |
| 163 | train_dataset, factory_names, nlp, make_proj=False |
| 164 | ) |
| 165 | gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True) |
| 166 | |
| 167 | train_texts = gold_train_data["texts"] |
| 168 | dev_texts = gold_dev_data["texts"] |
| 169 | frozen_components = T["frozen_components"] |
| 170 | |
| 171 | msg.divider("Training stats") |
| 172 | msg.text(f"Language: {nlp.lang}") |
| 173 | msg.text(f"Training pipeline: {', '.join(pipeline)}") |
| 174 | if resume_components: |
| 175 | msg.text(f"Components from other pipelines: {', '.join(resume_components)}") |
| 176 | if frozen_components: |
| 177 | msg.text(f"Frozen components: {', '.join(frozen_components)}") |
| 178 | msg.text(f"{len(train_dataset)} training docs") |
| 179 | msg.text(f"{len(dev_dataset)} evaluation docs") |
| 180 |
no test coverage detected
searching dependent graphs…