EvalForge is a standalone, provider-agnostic harness that lets you define test cases, run any agent against them, score results with multiple strategies, generate reports, and track regressions over time. No vendor lock-in. No hidden magic. Just clean, composable eval primitives.
# Install
Python
$ pip install evalforge
# With all extras:
$ pip install "evalforge[all]"
TypeScript / Node.js
$ npm install evalforge
# Quick Start
Python
from evalforge import EvalHarness, TestCase
from evalforge.scorer import fuzzy_match, exact_match
def my_agent(prompt: str) -> str:
    """Return a canned answer; a stand-in for a real agent.

    Args:
        prompt: The input text. This stub does not read it.

    Returns:
        The fixed sentence "The capital of France is Paris."
    """
    return "The capital of France is Paris."
# Bind the stub agent to a named suite.
harness = EvalHarness(agent=my_agent, suite_name="geo-smoke")

# Describe one test case: the prompt, the expected answer and how to score it.
france_capital = TestCase(
    id="france-capital",
    description="Knows EU capitals",
    input="What is the capital of France?",
    expected_output="Paris",
    scoring=fuzzy_match(threshold=0.8),
    tags=["geography"],
)
harness.add(france_capital)

# Run the suite, passing the path for the HTML report.
result = harness.run(report_html="reports/run.html")
# Prints a rich table to the terminal
# Saves an HTML report to reports/run.html
TypeScript
import { EvalHarness, TestCase } from "evalforge";
import { fuzzyMatch } from "evalforge/scorer";
// Create the harness with a stub agent that resolves to the same answer
// for every input.
const harness = new EvalHarness({
  agent: async (input) => "The capital of France is Paris.",
  suiteName: "geo-smoke",
});

// Describe one test case, then register it with the harness.
const franceCapital = new TestCase({
  id: "france-capital",
  input: "What is the capital of France?",
  expectedOutput: "Paris",
  scoring: fuzzyMatch(0.8),
  tags: ["geography"],
});
harness.add(franceCapital);

// Run the suite, passing the path for the HTML report.
const result = await harness.run({ reportHtml: "reports/run.html" });
# Regression Tracking
# Passing history_path makes the harness append every run to that file.
history_file = "eval_history.jsonl"
harness = EvalHarness(
    agent=my_agent,
    suite_name="my-suite",
    history_path=history_file,
)
result = harness.run()
# Or use RegressionTracker directly:
from evalforge.reporter import RegressionTracker
# Hand the run result to the tracker; `regressions` holds whatever
# compare_and_save returns.
tracker = RegressionTracker("eval_history.jsonl")
regressions = tracker.compare_and_save(result)
if regressions:
    # Only report when the returned value is truthy (e.g. non-empty).
    print(f"Regressions: {regressions}")
# Register reusable suites and agents
from evalforge.registry import registry
@registry.suite("support-faq")
def support_faq():
    """Return the test cases for the suite registered as "support-faq".

    Returns:
        A one-element list holding the "refund" test case.
    """
    # NOTE(review): TestCase and contains_match must already be in scope;
    # the lines shown here do not import contains_match - confirm its module.
    return [
        TestCase(
            id="refund",
            input="What is your refund policy?",
            expected_output="30-day",
            scoring=contains_match(),
        )
    ]
@registry.agent("support-bot")
async def support_bot(prompt):
    """Return a canned support answer; registered as "support-bot".

    Args:
        prompt: The input text. This stub does not read it.

    Returns:
        The fixed sentence "We offer a 30-day money back guarantee."
    """
    return "We offer a 30-day money back guarantee."
# Run by name: pass the registered suite and agent names to the registry.
import asyncio  # required by asyncio.run below; not imported in the lines shown

result = asyncio.run(registry.run("support-faq", "support-bot"))