MCPcopy Index your code
hub / github.com/zai-org/CodeGeeX2 / evaluate_functional_correctness

Function evaluate_functional_correctness

evaluation/evaluation.py:94–211  ·  view source on GitHub ↗
(
    input_path: str = None,
    output_path: str = None,
    log_path: str = None,
    tmp_dir: str = "./",
    n_workers: int = 32,
    timeout: float = 5.0,
    k: List[int] = [1, 10, 100],
    model_name: str = None,
    problem_file: str = None,
    language_type: str = None,
    dataset_type: str = "humanevalx",
    generation_mode: str = "completion",
    test_groundtruth: bool = False,
)

Source from the content-addressed store, hash-verified

92
93
94def evaluate_functional_correctness(
95 input_path: str = None,
96 output_path: str = None,
97 log_path: str = None,
98 tmp_dir: str = "./",
99 n_workers: int = 32,
100 timeout: float = 5.0,
101 k: List[int] = [1, 10, 100],
102 model_name: str = None,
103 problem_file: str = None,
104 language_type: str = None,
105 dataset_type: str = "humanevalx",
106 generation_mode: str = "completion",
107 test_groundtruth: bool = False,
108):
109 if log_path is None:
110 log_path = os.path.join(output_path, "evaluation.log")
111 logger = Logger(__name__, log_file=log_path)
112
113 if os.path.isdir(input_path):
114 input_list = glob.glob(input_path + '/*generation*.jsonl')
115 sample_jsonl = []
116 for input_file in input_list:
117 sample_jsonl += stream_jsonl_all(input_file)
118 else:
119 input_file = input_path
120 sample_jsonl = stream_jsonl_all(input_file)
121
122 problems = read_dataset(problem_file, dataset_type=dataset_type)
123
124 if output_path is not None:
125 os.makedirs(output_path, exist_ok=True)
126
127 with ThreadPoolExecutor(max_workers=n_workers) as executor:
128
129 futures = []
130 completion_id = Counter()
131 n_samples = 0
132 results = defaultdict(list)
133
134 if test_groundtruth:
135 logger.info("Testing ground truth...")
136 else:
137 logger.info("Testing generation...")
138 for sample in sample_jsonl:
139 task_id = sample["task_id"]
140 if language_type is None:
141 language_type = LANGUAGE_NAME[task_id.split("/")[0]]
142 if test_groundtruth:
143 if dataset_type == "humanevalx":
144 sample["generation"] = sample["canonical_solution"]
145 sample["prompt"] = problems[task_id]["prompt"]
146 if dataset_type == "mbpp":
147 sample["generation"] = sample["code"]
148 sample["prompt"] = problems[task_id]["prompt"]
149 sample = postprocess_generation(sample, generation_mode)
150 sample["test_code"] = process_test(sample, problems, dataset_type, language_type, generation_mode)
151 if sample["test_code"] is None:

Callers

nothing calls this directly

Calls 7

info — Method · 0.95
Logger — Class · 0.90
stream_jsonl_all — Function · 0.90
read_dataset — Function · 0.90
estimate_pass_at_k — Function · 0.90
postprocess_generation — Function · 0.85
process_test — Function · 0.85

Tested by

no test coverage detected