| 63 | |
@classmethod
def normalize(cls, ch):
    """Return the normalized replacement for the single character ``ch``.

    The Unicode block of ``ch`` (as reported by ``unicode_block``)
    selects the rule that is applied: some blocks blank the character
    out to a space, some remap individual code points, and some
    collapse the whole block onto one representative character.
    A character whose block matches none of the rules is returned
    unchanged.
    """
    blk = unicode_block(ch)

    if blk == UNICODE_BASIC_LATIN:
        # Only A-Z and a-z survive; anything else in this block is blanked.
        is_ascii_letter = ('A' <= ch <= 'Z') or ('a' <= ch <= 'z')
        return ch if is_ascii_letter else ' '

    if blk == UNICODE_LATIN_1_SUPPLEMENT:
        # Characters listed in LATIN1_EXCLUDED are blanked.
        if cls.LATIN1_EXCLUDED.find(ch) != -1:
            return ' '
        return ch

    if blk == UNICODE_LATIN_EXTENDED_B:
        # Normalization for Romanian: comma-below forms => cedilla forms.
        if ch == six.u('\u0219'):  # small s with comma below
            return six.u('\u015f')
        if ch == six.u('\u021b'):  # small t with comma below
            return six.u('\u0163')
        return ch

    if blk == UNICODE_GENERAL_PUNCTUATION:
        return ' '

    if blk == UNICODE_ARABIC:
        # Farsi yeh => Arabic yeh
        return six.u('\u064a') if ch == six.u('\u06cc') else ch

    if blk == UNICODE_LATIN_EXTENDED_ADDITIONAL:
        # Everything from U+1EA0 upward collapses onto U+1EC3.
        return six.u('\u1ec3') if ch >= six.u('\u1ea0') else ch

    # The next blocks are each folded onto one fixed code point.
    if blk == UNICODE_HIRAGANA:
        return six.u('\u3042')

    if blk == UNICODE_KATAKANA:
        return six.u('\u30a2')

    if blk == UNICODE_BOPOMOFO or blk == UNICODE_BOPOMOFO_EXTENDED:
        return six.u('\u3105')

    if blk == UNICODE_CJK_UNIFIED_IDEOGRAPHS:
        # Remap through CJK_MAP; characters without an entry pass through.
        return cls.CJK_MAP.get(ch, ch)

    if blk == UNICODE_HANGUL_SYLLABLES:
        return six.u('\uac00')

    return ch
| 98 | |
| 99 | @classmethod |
| 100 | def normalize_vi(cls, text): |