书源准入策略。
| 42 | |
| 43 | |
class SourcePolicy:
    """Admission policy for book sources.

    On construction the policy locates the project's ``config`` directory,
    loads ``name_normalization.json`` and ``content_audit.json`` from it via
    ``_load_json``, and stores the individual settings as instance attributes.
    """

    # Handed to ``_load_json`` together with ``name_normalization.json``
    # (presumably the fallback when the file is unavailable; confirm
    # against ``_load_json``).
    DEFAULT_NAME_CONFIG = {
        "require_pure_chinese": True,
        "min_length": 2,
        "max_length": 16,
        "drop_ascii_suffixes": [],
        "token_replacements": {},
        "domain_to_canonical": {},
        "alias_to_canonical": {},
        "generic_blacklist": [],
    }
    # Handed to ``_load_json`` together with ``content_audit.json``
    # (same caveat as above).
    DEFAULT_AUDIT_CONFIG = {
        "text_patterns": [],
        "url_patterns": [],
        "allow_patterns": [],
    }

    def __init__(self, base_dir: Path | str | None = None):
        """Load both configuration files and derive the lookup attributes.

        Args:
            base_dir: Directory to start from. When falsy, the grandparent
                directory of this source file is used instead. If the start
                directory has no ``config`` subdirectory, the closest
                ancestor that has one becomes ``project_root``; when no
                ancestor qualifies, the start directory is kept.

        Raises:
            KeyError: If an entry under ``text_patterns`` or ``url_patterns``
                lacks a ``"pattern"`` or ``"reason"`` key.
            re.error: If any configured regular expression fails to compile.
        """
        if base_dir:
            start = Path(base_dir).resolve()
        else:
            start = Path(__file__).resolve().parent.parent

        # First directory, from ``start`` upward, that contains "config";
        # ``start`` itself is the fallback when none does.
        self.project_root = next(
            (
                candidate
                for candidate in (start, *start.parents)
                if (candidate / "config").exists()
            ),
            start,
        )

        settings_dir = self.project_root / "config"
        self.name_config = _load_json(
            settings_dir / "name_normalization.json", self.DEFAULT_NAME_CONFIG
        )
        self.audit_config = _load_json(
            settings_dir / "content_audit.json", self.DEFAULT_AUDIT_CONFIG
        )

        names = self.name_config
        self.require_pure_chinese = bool(names.get("require_pure_chinese", True))
        self.min_length = int(names.get("min_length", 2))
        self.max_length = int(names.get("max_length", 16))
        self.generic_blacklist = set(names.get("generic_blacklist", []))
        self.reject_name_patterns = list(
            map(re.compile, names.get("reject_patterns", []))
        )
        self.drop_ascii_suffixes = tuple(names.get("drop_ascii_suffixes", []))
        self.token_replacements = names.get("token_replacements", {})

        # Keys are passed through ``_normalize_domain`` before being stored.
        # The attribute is only bound once the whole mapping has been built.
        domain_map = {}
        for domain, canonical in names.get("domain_to_canonical", {}).items():
            domain_map[self._normalize_domain(domain)] = canonical
        self.domain_to_canonical = domain_map

        # Each alias is registered twice: under its normalized form and under
        # its whitespace-stripped raw form.
        alias_map = self.alias_to_canonical = {}
        for raw_alias, canonical in names.get("alias_to_canonical", {}).items():
            alias_map[normalize_source_name(raw_alias)] = canonical
            alias_map[raw_alias.strip()] = canonical

        def compile_rules(section):
            # Turn one audit section into (compiled regex, reason) pairs.
            compiled = []
            for rule in self.audit_config.get(section, []):
                compiled.append((re.compile(rule["pattern"]), rule["reason"]))
            return compiled

        self.text_patterns = compile_rules("text_patterns")
        self.url_patterns = compile_rules("url_patterns")
        # Allow patterns are stored as given, not compiled.
        self.allow_patterns = tuple(self.audit_config.get("allow_patterns", []))
no outgoing calls