(text, keep_punctuation_exact_string=None)
| 22 | |
| 23 | |
def canonicalize(text, keep_punctuation_exact_string=None):
    """Return a normalized form of ``text``.

    The steps are applied in this order:

    1. Underscores are replaced by spaces.
    2. Every character in ``string.punctuation`` is removed. When
       ``keep_punctuation_exact_string`` is given, occurrences of that exact
       substring are preserved while punctuation elsewhere is still removed
       (e.g. with ``'{}'`` a literal ``{}`` survives but a lone ``{`` or
       ``}`` does not).
    3. The text is lower-cased (this includes the preserved substring).
    4. Runs of whitespace collapse to a single space and leading/trailing
       whitespace is stripped.

    Args:
        text: The string to normalize.
        keep_punctuation_exact_string: Optional substring to protect from
            punctuation removal. ``None`` or an empty string disables the
            protection. Because underscores are replaced first, a value that
            contains ``_`` can never match.

    Returns:
        The normalized string.
    """
    # Build the deletion table once instead of once per split part.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    text = text.replace('_', ' ')
    if keep_punctuation_exact_string:
        # Split on the protected substring, clean each piece, then stitch the
        # pieces back together with the protected substring in between.
        text = keep_punctuation_exact_string.join(
            part.translate(strip_punctuation)
            for part in text.split(keep_punctuation_exact_string))
    else:
        # Truthiness check above also routes '' here; str.split('') would
        # raise ValueError.
        text = text.translate(strip_punctuation)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
| 35 | |
| 36 | |
| 37 | class HuggingfaceTokenizer: |