hub / github.com/MoonInTheRiver/DiffSinger / normalize

Method normalize

utils/text_norm.py:619–709 · view source on GitHub ↗

(self, remove_punc=True)

Source from the content-addressed store, hash-verified

617	return self.norm_text
618
619	def normalize(self, remove_punc=True):
620	text = self.raw_text
621
622	# 规范化日期
623	pattern = re.compile(r"\D+((([089]\d\|(19\|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
624	matchers = pattern.findall(text)
625	if matchers:
626	# print('date')
627	for matcher in matchers:
628	text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
629
630	# 规范化金钱
631	pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
632	matchers = pattern.findall(text)
633	if matchers:
634	# print('money')
635	for matcher in matchers:
636	text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
637
638	# 规范化固话/手机号码
639	# 手机
640	# http://www.jihaoba.com/news/show/13680
641	# 移动：139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
642	# 联通：130、131、132、156、155、186、185、176
643	# 电信：133、153、189、180、181、177
644	pattern = re.compile(r"\D((\+?86 ?)?1([38]\d\|5[0-35-9]\|7[678]\|9[89])\d{8})\D")
645	matchers = pattern.findall(text)
646	if matchers:
647	# print('telephone')
648	for matcher in matchers:
649	text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
650	# 固话
651	pattern = re.compile(r"\D((0(10\|2[1-3]\|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
652	matchers = pattern.findall(text)
653	if matchers:
654	# print('fixed telephone')
655	for matcher in matchers:
656	text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
657
658	# 规范化分数
659	pattern = re.compile(r"(\d+/\d+)")
660	matchers = pattern.findall(text)
661	if matchers:
662	# print('fraction')
663	for matcher in matchers:
664	text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
665
666	# 规范化百分数
667	text = text.replace('％', '%')
668	pattern = re.compile(r"(\d+(\.\d+)?%)")
669	matchers = pattern.findall(text)
670	if matchers:
671	# print('percentage')
672	for matcher in matchers:
673	text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
674
675	# 规范化纯数+量词
676	pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)

Callers 5

nsw_test_case · Function · 0.80

text_norm.py · File · 0.80

process_utterance · Function · 0.80

preprocess_text · Method · 0.80

Calls 15

_particular · Method · 0.95

Date · Class · 0.85

Money · Class · 0.85

TelePhone · Class · 0.85

Fraction · Class · 0.85

Percentage · Class · 0.85

Cardinal · Class · 0.85

Digit · Class · 0.85

date2chntext · Method · 0.80

money2chntext · Method · 0.80

telephone2chntext · Method · 0.80

fraction2chntext · Method · 0.80

Tested by

no test coverage detected