hub / github.com/zai-org/CodeGeeX2 / read_translation_dataset

Function read_translation_dataset

evaluation/utils.py:257–287 · view source on GitHub ↗

(
    data_file_src: str = None,
    data_file_tgt: str = None,
    lang_src: str = None,
    lang_tgt: str = None,
    dataset_type: str = "humanevalx",
)

Source from the content-addressed store, hash-verified

255
256
def _translation_language_header(lang: str) -> str:
    """Return the language label line (label, colon, newline) for a prompt."""
    if lang == "cpp":
        return "C++:\n"
    if lang == "js":
        return "JavaScript:\n"
    # str.capitalize upper-cases the first character and lower-cases the rest,
    # e.g. "python" -> "Python:\n".
    return f"{lang}:\n".capitalize()


def read_translation_dataset(
    data_file_src: str = None,
    data_file_tgt: str = None,
    lang_src: str = None,
    lang_tgt: str = None,
    dataset_type: str = "humanevalx",
) -> Dict:
    """Load a source/target task pair and attach a code-translation prompt.

    Both files are read with ``stream_jsonl``. Every source record is expected
    to provide ``task_id``, ``declaration`` and ``canonical_solution``; every
    target record ``task_id`` and ``declaration``. Source and target tasks are
    paired by the part of ``task_id`` that follows the last ``/``.

    The prompt stored under ``"prompt"`` on each source task is::

        code translation
        <source language>:
        <source declaration>
        <source canonical solution, trailing whitespace stripped>
        <target language>:
        <target declaration>

    Args:
        data_file_src: Path to the source-language dataset file.
        data_file_tgt: Path to the target-language dataset file.
        lang_src: Source language identifier (``"cpp"`` and ``"js"`` are shown
            as ``C++`` and ``JavaScript``; anything else is capitalized).
        lang_tgt: Target language identifier, formatted like ``lang_src``.
        dataset_type: Dataset family; must contain ``"humanevalx"``
            (case-insensitive).

    Returns:
        The source tasks keyed by their full ``task_id``, each mutated in place
        to carry the generated ``"prompt"``.

    Raises:
        ValueError: If ``dataset_type`` is not a supported dataset.
        KeyError: If a source task has no counterpart in the target file.
    """
    if "humanevalx" not in dataset_type.lower():
        # Raising a plain string is itself a TypeError in Python 3, which would
        # hide this message; raise a real exception type instead.
        raise ValueError(f"Dataset: {dataset_type} not supported.")

    dataset_src = {task["task_id"]: task for task in stream_jsonl(data_file_src)}
    dataset_tgt = {
        task["task_id"].split("/")[-1]: task for task in stream_jsonl(data_file_tgt)
    }

    # The language headers do not depend on the task, so build them once.
    src_header = _translation_language_header(lang_src)
    tgt_header = _translation_language_header(lang_tgt)

    for task_id, sample in dataset_src.items():
        target = dataset_tgt[task_id.split("/")[-1]]
        sample["prompt"] = (
            "code translation\n"
            + src_header
            + sample["declaration"]
            + "\n"
            + sample["canonical_solution"].rstrip()
            + "\n"
            + tgt_header
            + target["declaration"]
        )

    return dataset_src
288
289
290	def process_extra_prompt(

Callers

nothing calls this directly

Calls 1

stream_jsonl · Function · 0.85

Tested by

no test coverage detected