MCPcopy
hub / github.com/Mimino666/langdetect / normalize

Method normalize

langdetect/utils/ngram.py:65–97  ·  view source on GitHub ↗
(cls, ch)

Source from the content-addressed store, hash-verified

63
@classmethod
def normalize(cls, ch):
    """Return the normalized form of the character ``ch``.

    The Unicode block reported by ``unicode_block(ch)`` selects the rule:
    the character is blanked to a space, replaced by a fixed
    representative of its block, remapped individually, or returned
    unchanged when no rule applies.
    """
    blk = unicode_block(ch)

    if blk == UNICODE_BASIC_LATIN:
        # Only the ASCII letters A-Z and a-z survive; the rest becomes a space.
        is_ascii_letter = ('A' <= ch <= 'Z') or ('a' <= ch <= 'z')
        return ch if is_ascii_letter else ' '

    if blk == UNICODE_LATIN_1_SUPPLEMENT:
        # Characters listed in LATIN1_EXCLUDED are blanked out.
        return ' ' if ch in cls.LATIN1_EXCLUDED else ch

    if blk == UNICODE_LATIN_EXTENDED_B:
        # normalization for Romanian
        if ch == six.u('\u0219'):
            # Small S with comma below => with cedilla
            return six.u('\u015f')
        if ch == six.u('\u021b'):
            # Small T with comma below => with cedilla
            return six.u('\u0163')
        return ch

    if blk == UNICODE_GENERAL_PUNCTUATION:
        return ' '

    if blk == UNICODE_ARABIC:
        # Farsi yeh => Arabic yeh
        return six.u('\u064a') if ch == six.u('\u06cc') else ch

    if blk == UNICODE_LATIN_EXTENDED_ADDITIONAL:
        # Everything from U+1EA0 upwards shares one representative.
        return six.u('\u1ec3') if ch >= six.u('\u1ea0') else ch

    # The following blocks collapse to a single representative character.
    if blk == UNICODE_HIRAGANA:
        return six.u('\u3042')
    if blk == UNICODE_KATAKANA:
        return six.u('\u30a2')
    if blk in (UNICODE_BOPOMOFO, UNICODE_BOPOMOFO_EXTENDED):
        return six.u('\u3105')

    if blk == UNICODE_CJK_UNIFIED_IDEOGRAPHS:
        # Ideographs missing from CJK_MAP are kept as they are.
        return cls.CJK_MAP.get(ch, ch)

    if blk == UNICODE_HANGUL_SYLLABLES:
        return six.u('\uac00')

    # No rule for this block: hand the character back untouched.
    return ch
98
99 @classmethod
100 def normalize_vi(cls, text):

Callers 4

add_charMethod · 0.95

Calls 2

unicode_blockFunction · 0.90
getMethod · 0.80