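Main function. Normalizes tokens from written to spoken form, e.g. 12 kg -> twelve kilograms. Args: text: string that may include semiotic classes; verbose: whether to print intermediate meta information; punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]; punct_post_process: whether to normalize punctuation. Returns: spoken form.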
(
self,
text: str,
verbose: bool = False,
punct_pre_process: bool = False,
punct_post_process: bool = False,
)
| 267 | return splits |
| 268 | |
| 269 | def normalize( |
| 270 | self, |
| 271 | text: str, |
| 272 | verbose: bool = False, |
| 273 | punct_pre_process: bool = False, |
| 274 | punct_post_process: bool = False, |
| 275 | ) -> str: |
| 276 | """ |
| 277 | Main function. Normalizes tokens from written to spoken form |
| 278 | e.g. 12 kg -> twelve kilograms |
| 279 | |
| 280 | Args: |
| 281 | text: string that may include semiotic classes |
| 282 | verbose: whether to print intermediate meta information |
| 283 | punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ] |
| 284 | punct_post_process: whether to normalize punctuation |
| 285 | |
| 286 | Returns: spoken form |
| 287 | """ |
| 288 | if len(text.split()) > 500: |
| 289 | print( |
| 290 | "WARNING! Your input is too long and could take a long time to normalize." |
| 291 | "Use split_text_into_sentences() to make the input shorter and then call normalize_list()." |
| 292 | ) |
| 293 | |
| 294 | original_text = text |
| 295 | if punct_pre_process: |
| 296 | text = pre_process(text) |
| 297 | text = text.strip() |
| 298 | if not text: |
| 299 | if verbose: |
| 300 | print(text) |
| 301 | return text |
| 302 | text = pynini.escape(text) |
| 303 | tagged_lattice = self.find_tags(text) |
| 304 | tagged_text = self.select_tag(tagged_lattice) |
| 305 | if verbose: |
| 306 | print(tagged_text) |
| 307 | self.parser(tagged_text) |
| 308 | tokens = self.parser.parse() |
| 309 | split_tokens = self._split_tokens_to_reduce_number_of_permutations(tokens) |
| 310 | output = "" |
| 311 | for s in split_tokens: |
| 312 | tags_reordered = self.generate_permutations(s) |
| 313 | verbalizer_lattice = None |
| 314 | for tagged_text in tags_reordered: |
| 315 | tagged_text = pynini.escape(tagged_text) |
| 316 | |
| 317 | verbalizer_lattice = self.find_verbalizer(tagged_text) |
| 318 | if verbalizer_lattice.num_states() != 0: |
| 319 | break |
| 320 | if verbalizer_lattice is None: |
| 321 | raise ValueError(f"No permutations were generated from tokens {s}") |
| 322 | output += " " + self.select_verbalizer(verbalizer_lattice) |
| 323 | output = SPACE_DUP.sub(" ", output[1:]) |
| 324 | |
| 325 | if self.lang == "en" and hasattr(self, "post_processor"): |
| 326 | output = self.post_process(output) |
no test coverage detected