(self, base_dir: Path | str | None = None)
| 61 | } |
| 62 | |
def __init__(self, base_dir: Path | str | None = None):
    """Locate the project root and load the name/audit configuration.

    Args:
        base_dir: Directory to start from. When falsy, the parent of the
            directory containing this source file is used instead.

    The project root is the first directory, checking the starting point
    and then each of its ancestors in turn, that contains a ``config``
    entry. If none does, the starting point itself is kept.
    """
    if base_dir:
        start = Path(base_dir).resolve()
    else:
        start = Path(__file__).resolve().parent.parent
    self.project_root = next(
        (
            candidate
            for candidate in (start, *start.parents)
            if (candidate / "config").exists()
        ),
        start,
    )

    # Both settings files live under <project_root>/config; the class-level
    # defaults are handed to _load_json (presumably used when a file is
    # missing or unreadable -- confirm against _load_json).
    settings_dir = self.project_root / "config"
    self.name_config = _load_json(
        settings_dir / "name_normalization.json", self.DEFAULT_NAME_CONFIG
    )
    self.audit_config = _load_json(
        settings_dir / "content_audit.json", self.DEFAULT_AUDIT_CONFIG
    )

    # --- name normalization settings ---
    names = self.name_config
    self.require_pure_chinese = bool(names.get("require_pure_chinese", True))
    self.min_length = int(names.get("min_length", 2))
    self.max_length = int(names.get("max_length", 16))
    self.generic_blacklist = set(names.get("generic_blacklist", []))
    self.reject_name_patterns = list(
        map(re.compile, names.get("reject_patterns", []))
    )
    self.drop_ascii_suffixes = tuple(names.get("drop_ascii_suffixes", []))
    self.token_replacements = names.get("token_replacements", {})

    # Domain keys are normalized once here so later lookups can use the
    # normalized form directly.
    domain_map = {}
    for raw_domain, canonical_name in names.get("domain_to_canonical", {}).items():
        domain_map[self._normalize_domain(raw_domain)] = canonical_name
    self.domain_to_canonical = domain_map

    # Each alias is registered twice: under its normalized spelling and
    # under its whitespace-stripped raw spelling.
    alias_map = self.alias_to_canonical = {}
    for raw_alias, canonical_name in names.get("alias_to_canonical", {}).items():
        for key in (normalize_source_name(raw_alias), raw_alias.strip()):
            alias_map[key] = canonical_name

    # --- content audit settings ---
    def compile_rules(section):
        """Return (compiled regex, reason) pairs for one audit section."""
        rules = []
        for entry in self.audit_config.get(section, []):
            rules.append((re.compile(entry["pattern"]), entry["reason"]))
        return rules

    self.text_patterns = compile_rules("text_patterns")
    self.url_patterns = compile_rules("url_patterns")
    self.allow_patterns = tuple(self.audit_config.get("allow_patterns", []))
| 102 | |
| 103 | def _normalize_domain(self, domain: str) -> str: |
| 104 | cleaned = domain.lower().strip() |
nothing calls this directly
no test coverage detected