MCPcopy Index your code
hub / github.com/MoonInTheRiver/DiffSinger / normalize

Method normalize

utils/text_norm.py:619–709  ·  view source on GitHub ↗
(self, remove_punc=True)

Source from the content-addressed store, hash-verified

617 return self.norm_text
618
619 def normalize(self, remove_punc=True):
620 text = self.raw_text
621
622 # 规范化日期
623 pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
624 matchers = pattern.findall(text)
625 if matchers:
626 # print('date')
627 for matcher in matchers:
628 text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
629
630 # 规范化金钱
631 pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
632 matchers = pattern.findall(text)
633 if matchers:
634 # print('money')
635 for matcher in matchers:
636 text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
637
638 # 规范化固话/手机号码
639 # 手机
640 # http://www.jihaoba.com/news/show/13680
641 # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
642 # 联通:130、131、132、156、155、186、185、176
643 # 电信:133、153、189、180、181、177
644 pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
645 matchers = pattern.findall(text)
646 if matchers:
647 # print('telephone')
648 for matcher in matchers:
649 text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
650 # 固话
651 pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
652 matchers = pattern.findall(text)
653 if matchers:
654 # print('fixed telephone')
655 for matcher in matchers:
656 text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
657
658 # 规范化分数
659 pattern = re.compile(r"(\d+/\d+)")
660 matchers = pattern.findall(text)
661 if matchers:
662 # print('fraction')
663 for matcher in matchers:
664 text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
665
666 # 规范化百分数
667 text = text.replace('%', '%')
668 pattern = re.compile(r"(\d+(\.\d+)?%)")
669 matchers = pattern.findall(text)
670 if matchers:
671 # print('percentage')
672 for matcher in matchers:
673 text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
674
675 # 规范化纯数+量词
676 pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)

Callers 5

nsw_test_case · Function · 0.80
text_norm.py · File · 0.80
process_utterance · Function · 0.80
preprocess_text · Method · 0.80
preprocess_text · Method · 0.80

Calls 15

_particular · Method · 0.95
Date · Class · 0.85
Money · Class · 0.85
TelePhone · Class · 0.85
Fraction · Class · 0.85
Percentage · Class · 0.85
Cardinal · Class · 0.85
Digit · Class · 0.85
date2chntext · Method · 0.80
money2chntext · Method · 0.80
telephone2chntext · Method · 0.80
fraction2chntext · Method · 0.80

Tested by

no test coverage detected