MCPcopy
hub / github.com/tickmao/Novel / SourcePolicy

Class SourcePolicy

scripts/source_policy.py:44–332  ·  view source on GitHub ↗

书源准入策略。

Source from the content-addressed store, hash-verified

42
43
44class SourcePolicy:
45 """书源准入策略。"""
46
47 DEFAULT_NAME_CONFIG = {
48 "require_pure_chinese": True,
49 "min_length": 2,
50 "max_length": 16,
51 "drop_ascii_suffixes": [],
52 "token_replacements": {},
53 "domain_to_canonical": {},
54 "alias_to_canonical": {},
55 "generic_blacklist": [],
56 }
57 DEFAULT_AUDIT_CONFIG = {
58 "text_patterns": [],
59 "url_patterns": [],
60 "allow_patterns": [],
61 }
62
def __init__(self, base_dir: Path | str | None = None):
    """Locate the project root, load both policy configs, and cache derived settings.

    Args:
        base_dir: Directory to start from. When falsy, the directory two
            levels above this file is used instead.

    The project root is the first directory among the starting directory
    and its ancestors that contains a ``config`` entry; if none does, the
    starting directory itself is kept.
    """
    if base_dir:
        root = Path(base_dir).resolve()
    else:
        root = Path(__file__).resolve().parent.parent

    # First candidate (start dir, then each ancestor) holding "config" wins;
    # otherwise fall back to the start dir.
    self.project_root = next(
        (candidate for candidate in (root, *root.parents) if (candidate / "config").exists()),
        root,
    )

    config_dir = self.project_root / "config"
    # NOTE(review): the second argument is presumably the fallback used by
    # _load_json when the file is missing or unreadable -- confirm there.
    self.name_config = _load_json(config_dir / "name_normalization.json", self.DEFAULT_NAME_CONFIG)
    self.audit_config = _load_json(config_dir / "content_audit.json", self.DEFAULT_AUDIT_CONFIG)

    # --- Name-normalization settings -------------------------------------
    names = self.name_config
    self.require_pure_chinese = bool(names.get("require_pure_chinese", True))
    self.min_length = int(names.get("min_length", 2))
    self.max_length = int(names.get("max_length", 16))
    self.generic_blacklist = set(names.get("generic_blacklist", []))
    self.reject_name_patterns = list(map(re.compile, names.get("reject_patterns", [])))
    self.drop_ascii_suffixes = tuple(names.get("drop_ascii_suffixes", []))
    self.token_replacements = names.get("token_replacements", {})

    # Domain keys are stored in normalized form.
    domain_map = {}
    for domain, canonical in names.get("domain_to_canonical", {}).items():
        domain_map[self._normalize_domain(domain)] = canonical
    self.domain_to_canonical = domain_map

    # Each alias is registered under both its normalized and its stripped
    # spelling, in that order.
    self.alias_to_canonical = {}
    for raw_alias, canonical in names.get("alias_to_canonical", {}).items():
        for key in (normalize_source_name(raw_alias), raw_alias.strip()):
            self.alias_to_canonical[key] = canonical

    # --- Content-audit settings ------------------------------------------
    audit = self.audit_config

    def _compile_rules(section):
        # Turn each {"pattern", "reason"} entry into (compiled regex, reason).
        compiled = []
        for rule in audit.get(section, []):
            compiled.append((re.compile(rule["pattern"]), rule["reason"]))
        return compiled

    self.text_patterns = _compile_rules("text_patterns")
    self.url_patterns = _compile_rules("url_patterns")
    # Allow patterns are kept as the raw configured values (not compiled here).
    self.allow_patterns = tuple(audit.get("allow_patterns", []))

Callers 4

__init__Method · 0.90
__init__Method · 0.90
clean_adult_sourcesFunction · 0.90
setUpMethod · 0.90

Calls

no outgoing calls

Tested by 1

setUpMethod · 0.72