(self, remove_punc=True)
| 617 | return self.norm_text |
| 618 | |
| 619 | def normalize(self, remove_punc=True): |
| 620 | text = self.raw_text |
| 621 | |
| 622 | # 规范化日期 |
| 623 | pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)") |
| 624 | matchers = pattern.findall(text) |
| 625 | if matchers: |
| 626 | # print('date') |
| 627 | for matcher in matchers: |
| 628 | text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1) |
| 629 | |
| 630 | # 规范化金钱 |
| 631 | pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)") |
| 632 | matchers = pattern.findall(text) |
| 633 | if matchers: |
| 634 | # print('money') |
| 635 | for matcher in matchers: |
| 636 | text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1) |
| 637 | |
| 638 | # 规范化固话/手机号码 |
| 639 | # 手机 |
| 640 | # http://www.jihaoba.com/news/show/13680 |
| 641 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 |
| 642 | # 联通:130、131、132、156、155、186、185、176 |
| 643 | # 电信:133、153、189、180、181、177 |
| 644 | pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D") |
| 645 | matchers = pattern.findall(text) |
| 646 | if matchers: |
| 647 | # print('telephone') |
| 648 | for matcher in matchers: |
| 649 | text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1) |
| 650 | # 固话 |
| 651 | pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D") |
| 652 | matchers = pattern.findall(text) |
| 653 | if matchers: |
| 654 | # print('fixed telephone') |
| 655 | for matcher in matchers: |
| 656 | text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1) |
| 657 | |
| 658 | # 规范化分数 |
| 659 | pattern = re.compile(r"(\d+/\d+)") |
| 660 | matchers = pattern.findall(text) |
| 661 | if matchers: |
| 662 | # print('fraction') |
| 663 | for matcher in matchers: |
| 664 | text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1) |
| 665 | |
| 666 | # 规范化百分数 |
| 667 | text = text.replace('%', '%') |
| 668 | pattern = re.compile(r"(\d+(\.\d+)?%)") |
| 669 | matchers = pattern.findall(text) |
| 670 | if matchers: |
| 671 | # print('percentage') |
| 672 | for matcher in matchers: |
| 673 | text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1) |
| 674 | |
| 675 | # 规范化纯数+量词 |
| 676 | pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS) |
no test coverage detected