MCPcopy Index your code
hub / github.com/google/adk-python / TauBenchAdapter

Class TauBenchAdapter

contributing/samples/integrations/gepa/experiment.py:253–442  ·  view source on GitHub ↗

A GEPA adapter for evaluating agent performance on the tau-bench benchmark.

Source from the content-addressed store, hash-verified

251
252
253class TauBenchAdapter(
254 GEPAAdapter[
255 TauBenchDataInst,
256 TauBenchTrajectory,
257 TauBenchRolloutOutput,
258 ]
259):
260 """A GEPA adapter for evaluating agent performance on tau-bench benchmark."""
261
def __init__(
    self,
    env_name: str,
    agent_model: str = 'gemini-2.5-flash',
    agent_model_provider: str = 'vertex_ai',
    user_model: str = 'gemini-2.5-pro',
    user_model_provider: str = 'vertex_ai',
    agent_strategy: str = 'tool-calling',
    user_strategy: str = 'llm',
    system_instruction_name: str = 'system_instruction',
    max_concurrency: int = 4,
    rater: rater_lib.Rater | None = None,
    log_dir: str | None = None,
):
    """Records the evaluation configuration on the adapter.

    The constructor performs no validation and no I/O; every argument is
    stored unchanged on a private attribute of the same name (prefixed with
    an underscore).

    Args:
      env_name: Name of the tau-bench environment to evaluate in. Stored
        as-is; it is not checked here.
      agent_model: Model used for the agent under evaluation.
      agent_model_provider: Provider that serves the agent model.
      user_model: Model used to simulate the user.
      user_model_provider: Provider that serves the user model.
      agent_strategy: Agent strategy to use (e.g., 'tool-calling').
      user_strategy: User simulation strategy (e.g., 'llm').
      system_instruction_name: Key in the candidate dictionary under which
        the system instruction is found.
      max_concurrency: Maximum number of tasks to run in parallel.
      rater: Optional rater for scoring the agent's performance; `None`
        when no rater is supplied.
      log_dir: Directory where traces and other logs are saved; `None`
        when no directory is supplied.
    """
    # Environment and candidate lookup key.
    self._env_name = env_name
    self._system_instruction_name = system_instruction_name

    # Agent side of the conversation.
    self._agent_model = agent_model
    self._agent_model_provider = agent_model_provider
    self._agent_strategy = agent_strategy

    # Simulated-user side of the conversation.
    self._user_model = user_model
    self._user_model_provider = user_model_provider
    self._user_strategy = user_strategy

    # Execution, scoring and logging options.
    self._max_concurrency = max_concurrency
    self._rater = rater
    self._log_dir = log_dir
303
304 def evaluate(
305 self,
306 batch: list[TauBenchDataInst],
307 candidate: dict[str, str],
308 capture_traces: bool = False,
309 ) -> EvaluationBatch[TauBenchTrajectory, TauBenchRolloutOutput]:
310 """Evaluates a candidate prompt on a batch of tau-bench tasks.

Callers 1

run_gepaFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected