hub / github.com/twitter/communitynotes / apply_scoring_rules

Function apply_scoring_rules

scoring/src/scoring/scoring_rules.py:1646–1743 · view source on GitHub ↗

Apply a list of ScoringRules to note inputs and return noteStats augmented with scoring results. This function applies a list of ScoringRules in order. Once each rule has run, a final ratingStatus is set for each note. An additional column is added to capture which rules acted on the note, and any additional columns generated by the ScoringRules are merged with the scored notes to generate the final return value.

(
  noteStats: pd.DataFrame,
  rules: List[ScoringRule],
  statusColumn: str,
  ruleColumn: str,
  decidedByColumn: Optional[str] = None,
)

Source from the content-addressed store, hash-verified

1644
1645
1646	def apply_scoring_rules(
1647	noteStats: pd.DataFrame,
1648	rules: List[ScoringRule],
1649	statusColumn: str,
1650	ruleColumn: str,
1651	decidedByColumn: Optional[str] = None,
1652	) -> pd.DataFrame:
1653	"""Apply a list of ScoringRules to note inputs and return noteStats augmented with scoring results.
1654
1655	This function applies a list of ScoringRules in order. Once each rule has run
1656	a final ratingStatus is set for each note. An additional column is added to capture
1657	which rules acted on the note and any additional columns generated by the ScoringRules
1658	are merged with the scored notes to generate the final return value.
1659
1660	Args:
1661	noteStats: attributes, aggregates and raw scoring signals for each note.
1662	rules: ScoringRules which will be applied in the order given.
1663	statusColumn: str indicating column where status should be assigned.
1664	ruleColumn: str indicating column where active rules should be stored.
1665	decidedByColumn: None or str indicating column where the last rule to act on a note is stored.
1666
1667	Returns:
1668	noteStats with additional columns representing scoring results.
1669	"""
1670	# Initialize empty dataframes to store labels for each note and which rules impacted
1671	# scoring for each note.
1672	noteLabels = pd.DataFrame.from_dict(
1673	{c.noteIdKey: pd.Series([], dtype=np.int64), statusColumn: pd.Series([], dtype=object)}
1674	)
1675	noteRules = pd.DataFrame.from_dict(
1676	{c.noteIdKey: pd.Series([], dtype=np.int64), ruleColumn: pd.Series([], dtype=object)}
1677	)
1678	noteColumns = pd.DataFrame.from_dict({c.noteIdKey: pd.Series([], dtype=np.int64)})
1679
1680	# Establish state to enforce rule dependencies.
1681	ruleIDs: Set[RuleID] = set()
1682
1683	# Successively apply each rule
1684	for rule in rules:
1685	with c.time_block(f"Applying scoring rule: {rule.get_name()}"):
1686	logger.info(f"Applying scoring rule: {rule.get_name()}")
1687	rule.check_dependencies(ruleIDs)
1688	assert rule.get_rule_id() not in ruleIDs, f"repeat ruleID: {rule.get_name()}"
1689	ruleIDs.add(rule.get_rule_id())
1690	with c.time_block(f"Calling score_notes: {rule.get_name()}"):
1691	noteStatusUpdates, additionalColumns = rule.score_notes(noteStats, noteLabels, statusColumn)
1692	if (
1693	additionalColumns is not None
1694	# This rule updates both status and NmrDueToStableCrhTime (in additional column), they can
1695	# be on different rows.
1696	and rule.get_rule_id() != RuleID.NMR_DUE_TO_MIN_STABLE_CRH_TIME
1697	):
1698	assert set(noteStatusUpdates[c.noteIdKey]) == set(additionalColumns[c.noteIdKey])
1699
1700	# Update noteLabels, which will always hold at most one label per note.
1701	noteLabels = pd.concat([noteLabels, noteStatusUpdates]).groupby(c.noteIdKey).tail(1)
1702	# Update note rules to have one row per rule which was active for a note
1703	noteRules = pd.concat(

Callers

nothing calls this directly

Calls 5

time_blockMethod · 0.80

check_dependenciesMethod · 0.80

get_rule_idMethod · 0.80

get_nameMethod · 0.45

score_notesMethod · 0.45

Tested by

no test coverage detected