(model_name, prompt_list)
| 38 | |
| 39 | # Run the benchmark test |
def run_test(model_name, prompt_list):
    """Run every benchmark entry through the model and print the results.

    For each entry the query is sent to the loaded model together with its
    context, then the model response, the gold answer, the elapsed time and
    the evidence checks (numbers, comparison stats, sources) are printed.

    Args:
        model_name: Name handed to ``Prompt().load_model``.
        prompt_list: Iterable of mappings; each entry must provide the
            ``"query"``, ``"context"`` and ``"answer"`` keys.

    Returns:
        Always ``0``.
    """

    print(f"\n > Loading model '{model_name}'")
    prompter = Prompt().load_model(model_name)

    # NOTE(review): the "200 questions" count in this banner is hard-coded
    # and is not derived from prompt_list - confirm it matches the dataset.
    print(f"\n > Running RAG Benchmark Test against '{model_name}' - 200 questions")

    for number, entry in enumerate(prompt_list, start=1):

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed (or go negative) if the system clock is adjusted mid-run.
        start_time = time.perf_counter()

        response = prompter.prompt_main(
            entry["query"],
            context=entry["context"],
            prompt_name="default_with_context",
            temperature=0.3,
        )

        # Only the inference call is timed; the evidence checks below are not.
        time_taken = round(time.perf_counter() - start_time, 2)

        # Print results
        print("\n")
        print(f"{number}. llm_response - {response['llm_response']}")
        print(f"{number}. gold_answer - {entry['answer']}")
        print(f"{number}. time_taken - {time_taken}")

        # Fact checking - all three checks are computed before any of them
        # is printed, matching the original call order.
        fc = prompter.evidence_check_numbers(response)
        sc = prompter.evidence_comparison_stats(response)
        sr = prompter.evidence_check_sources(response)

        for fc_entry in fc:
            for f, facts in enumerate(fc_entry["fact_check"]):
                print(f"{number}. fact_check - {f} {facts}")

        for sc_entry in sc:
            print(f"{number}. comparison_stats - {sc_entry['comparison_stats']}")

        for sr_entry in sr:
            for s, source in enumerate(sr_entry["source_review"]):
                print(f"{number}. source - {s} {source}")

    return 0
| 77 | |
| 78 | |
| 79 | if __name__ == "__main__": |
no test coverage detected