Grade a candidate patch against a verified crash. Spins a fresh container, applies the diff, walks the ladder. Short-circuits on the first failing tier (T0→T1→T2→re-attack). Re-attack commits the patched container to a temp image and runs a 50-turn find-agent against it; any crash fails the tier.
(
target: TargetConfig,
crash: CrashArtifact,
diff: bytes | list[bytes],
model: str,
container_name: str = "pgrade",
run_reattack: bool = True,
reattack_with_diff: bool = False,
run_style: bool = False,
reattack_focus: str | None = None,
agent_env: dict[str, str] | None = None,
progress_prefix: str | None = None,
transcript_path: str | None = None,
system_prompt: str | None = None,
)
| 35 | |
| 36 | |
| 37 | async def grade_patch( |
| 38 | target: TargetConfig, |
| 39 | crash: CrashArtifact, |
| 40 | diff: bytes | list[bytes], |
| 41 | model: str, |
| 42 | container_name: str = "pgrade", |
| 43 | run_reattack: bool = True, |
| 44 | reattack_with_diff: bool = False, |
| 45 | run_style: bool = False, |
| 46 | reattack_focus: str | None = None, |
| 47 | agent_env: dict[str, str] | None = None, |
| 48 | progress_prefix: str | None = None, |
| 49 | transcript_path: str | None = None, |
| 50 | system_prompt: str | None = None, |
| 51 | ) -> PatchVerdict: |
| 52 | """Grade a candidate patch against a verified crash. |
| 53 | |
| 54 | Spins a fresh container, applies the diff, walks the ladder. Short-circuits |
| 55 | on the first failing tier (T0→T1→T2→re-attack). Re-attack commits the |
| 56 | patched container to a temp image and runs a 50-turn find-agent against it; |
| 57 | any crash fails the tier. |
| 58 | """ |
| 59 | if not target.build_command: |
| 60 | raise ValueError( |
| 61 | f"target {target.name!r} has no build_command — patch grading " |
| 62 | f"requires an in-container rebuild step (set it in config.yaml)" |
| 63 | ) |
| 64 | if crash.poc_path not in crash.reproduction_command: |
| 65 | raise ValueError( |
| 66 | f"poc_path {crash.poc_path!r} not in reproduction_command " |
| 67 | f"{crash.reproduction_command!r}" |
| 68 | ) |
| 69 | |
| 70 | evidence: dict[str, str] = {} |
| 71 | timings: dict[str, float] = {} |
| 72 | t0 = t1 = re_clean = False |
| 73 | t2: bool | None = None |
| 74 | t3: float | None = None |
| 75 | |
| 76 | patched_tag: str | None = None |
| 77 | try: |
| 78 | # T0–T2 only run target code (apply diff, rebuild, replay PoC, test |
| 79 | # suite) via docker_ops.exec_sh — never `claude -p`. So: |
| 80 | # auth=None — don't put the API credential in the env of a |
| 81 | # container running a binary the PoC was crafted to |
| 82 | # crash. |
| 83 | # network=none — it never calls the API, so it needs no egress. |
| 84 | # Without this the sandbox default would give it |
| 85 | # vp-internal (sandboxed) or bridge (--dangerously- |
| 86 | # no-sandbox); the latter is full egress for a |
| 87 | # process fed attacker-controlled input. |
| 88 | # The re-attack and style-judge stages spawn their own containers with |
| 89 | # agent_env + network where they need it. |
| 90 | with sandbox.agent_container( |
| 91 | target.image_tag, |
| 92 | container_name, |
| 93 | None, |
| 94 | memory=target.memory_limit, |