from multivon_eval import EvalSuite, EvalCase, Faithfulness
# Evaluation cases: each one pairs a prompt with the context its answer was
# grounded in. No expected output is attached here.
cases = [
    EvalCase(input=question, context=grounding)
    for question, grounding in (
        (
            "What's the refund window?",
            "Refunds are available within 30 days of purchase.",
        ),
        (
            "Do you ship internationally?",
            "We ship to the US and Canada only.",
        ),
    )
]
# Answers already captured in production. The key is the exact prompt string,
# so each key must match the corresponding case's `input` character for
# character for the lookup to succeed.
logged = {
    "What's the refund window?": "You can request a refund within 30 days.",
    "Do you ship internationally?": "We ship to the US and Canada.",
}
def _replay_logged_answer(prompt):
    """Return the previously logged answer for *prompt*.

    Raises KeyError when no answer was logged for that exact prompt string.
    """
    return logged[prompt]


# Assemble the suite, then score by replaying logged answers: the callable
# handed to run() looks each prompt up in `logged` rather than producing a
# fresh answer.
suite = EvalSuite("Replay scoring")
suite.add_cases(cases)
suite.add_evaluators(Faithfulness())

report = suite.run(_replay_logged_answer)
print(f"Pass rate {report.pass_rate:.0%}")