| 1234 | self.tokenizer = TokenizerFast.from_file(rec_char_dict_path) |
| 1235 | |
def post_process(self, s: str) -> str:
    """Strip redundant whitespace from a LaTeX-like string.

    Two passes are applied:

    1. Every ``\\operatorname``, ``\\mathrm``, ``\\text`` or ``\\mathbf``
       command written with a space before its opening brace
       (e.g. ``\\mathrm {a b}``) has all spaces removed from the whole
       command, braces and argument included.
    2. Whitespace between two adjacent characters is removed unless both
       characters are ASCII letters. This repeats until the string stops
       changing. A backslash immediately followed by a space is never
       used as the left-hand character, so ``"\\ "`` is kept.

    Args:
        s: The string to clean up.

    Returns:
        The string with the whitespace described above removed.
    """
    text_reg = r"(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})"
    letter = r"[a-zA-Z]"
    # Raw string: a plain literal here contains the invalid escapes ``\W``
    # and ``\d`` (SyntaxWarning on Python 3.12+). The ``^`` is not the first
    # character of the class, so it is a literal caret, not a negation.
    noletter = r"[\W_^\d]"

    # Group 1 spans the whole match, so the replacement can be computed
    # straight from the match object; no pre-built list to pop from.
    s = re.sub(text_reg, lambda match: match.group(1).replace(" ", ""), s)

    # The patterns do not change between iterations; compile them once.
    # The ``(?!\\ )`` lookahead stops a backslash-space pair from being
    # used as the left-hand character.
    noletter_noletter = re.compile(r"(?!\\ )(%s)\s+?(%s)" % (noletter, noletter))
    noletter_letter = re.compile(r"(?!\\ )(%s)\s+?(%s)" % (noletter, letter))
    letter_noletter = re.compile(r"(%s)\s+?(%s)" % (letter, noletter))

    # re.sub only replaces non-overlapping matches, so one pass can miss
    # gaps that share a character with an earlier match ("= 1" after
    # "} =" was consumed). Repeat until a fixed point is reached.
    while True:
        squeezed = noletter_noletter.sub(r"\1\2", s)
        squeezed = noletter_letter.sub(r"\1\2", squeezed)
        squeezed = letter_noletter.sub(r"\1\2", squeezed)
        if squeezed == s:
            return s
        s = squeezed
| 1251 | |
| 1252 | def decode(self, tokens): |
| 1253 | if len(tokens.shape) == 1: |