Normalize title
(title)
| 675 | |
| 676 | |
def normalizeTitle(title):
    """Normalize a page title to its canonical form.

    Runs of whitespace and underscores are collapsed to a single space and
    removed from both ends of the title.  If the title contains a colon
    followed by a non-blank remainder, the part before the first colon is
    passed to ``normalizeNamespace`` and the result is looked up in
    ``knownNamespaces``:

    * known namespace: the result is ``<namespace>:<Rest>`` with no space
      after the colon (e.g. "Category: Births" -> "Category:Births");
    * anything else: the prefix is kept, a space after the colon (if there
      was one) is preserved, and both parts get ``ucfirst`` applied.

    Titles without such a colon only have ``ucfirst`` applied.

    :param title: the raw title string.
    :return: the normalized title string.
    """
    # Collapse every run of whitespace/underscore characters into one space
    # *before* trimming.  Trimming only ' ' and '_' up front would leave a
    # leading/trailing tab or newline in place, which the substitution then
    # turns into a stray edge space (so the first letter is never upcased).
    title = re.sub(r'[\s_]+', ' ', title).strip(' ')

    m = re.match(r'([^:]*):(\s*)(\S(?:.*))', title)
    if not m:
        # no namespace, just capitalize first letter
        return ucfirst(title)

    prefix, gap, rest = m.groups()

    ns = normalizeNamespace(prefix)
    if ns in knownNamespaces:
        # If the prefix designates a known namespace, then it might be
        # followed by optional whitespace that should be removed to get
        # the canonical page name
        # (e.g., "Category:  Births" should become "Category:Births").
        return ns + ":" + ucfirst(rest)

    # If the part before the colon is not a known namespace, then we must
    # not remove the space after the colon (if any), e.g.,
    # "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey".
    # However, to get the canonical page name we must contract multiple
    # spaces into one, because
    # "3001:   The_Final_Odyssey" != "3001: The_Final_Odyssey".
    separator = ' ' if gap else ''
    return ucfirst(prefix) + ":" + separator + ucfirst(rest)
| 713 | |
| 714 | |
| 715 | def unescape(text): |
nothing calls this directly
no test coverage detected