EvalRunner

from aevyra_verdict import EvalRunner
The runner orchestrates completions and scoring across all configured models and metrics.

EvalRunner(config=None)

Construct a runner, optionally with a custom RunConfig.

from aevyra_verdict.runner import RunConfig

runner = EvalRunner()
runner = EvalRunner(config=RunConfig(max_workers=5))

.add_provider(provider_name, model, *, label=None, api_key=None, base_url=None)

Add a model to evaluate. Returns self for chaining.
runner.add_provider("openai", "gpt-5.4-nano")
runner.add_provider("openrouter", "qwen/qwen3.5-9b", label="qwen3.5-9b")
runner.add_provider("openai", "llama", base_url="http://localhost:8000/v1", api_key="none")

.add_provider_instance(label, provider)

Add a pre-configured Provider instance.
from aevyra_verdict.providers import get_provider

provider = get_provider("openrouter", "meta-llama/llama-3.1-8b-instruct")
runner.add_provider_instance("llama-openrouter", provider)

.add_metric(metric)

Add a scoring metric. Returns self for chaining.
from aevyra_verdict import RougeScore, LLMJudge
from aevyra_verdict.providers import get_provider

runner.add_metric(RougeScore())
runner.add_metric(LLMJudge(judge_provider=get_provider("openai", "gpt-5.4")))

.run(dataset, show_progress=True)

Run the eval. Returns an EvalResults object.
results = runner.run(dataset)
results = runner.run(dataset, show_progress=False)
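
Putting the pieces together, a minimal end-to-end run might look like the sketch below. It uses only the API shown on this page and assumes dataset has already been constructed:

from aevyra_verdict import EvalRunner, RougeScore

runner = (
    EvalRunner()
    .add_provider("openai", "gpt-5.4-nano")
    .add_metric(RougeScore())
)

results = runner.run(dataset)  # dataset is assumed to be prepared elsewhere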

RunConfig

from aevyra_verdict.runner import RunConfig
| Parameter | Default | Description |
| --- | --- | --- |
| temperature | 0.0 | Sampling temperature for completions. |
| max_tokens | 1024 | Max tokens per completion. |
| max_workers | 10 | Concurrent requests per model. |
| max_model_workers | 4 | Models evaluated concurrently. |
| num_retries | 4 | Retry attempts after the first failure. |
| retry_base_delay | 1.0 | Initial backoff delay in seconds. |
| retry_max_delay | 60.0 | Maximum backoff delay in seconds. |
| retry_jitter | 0.25 | ±fraction of random jitter added to each delay. |
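
For intuition, the four retry parameters typically combine into an exponential backoff schedule. The sketch below assumes the delay doubles between attempts; the actual schedule inside the runner may differ:

import random

def backoff_delay(attempt, base=1.0, cap=60.0, jitter=0.25):
    # Grow exponentially from retry_base_delay, clamped to retry_max_delay
    # (doubling is an assumption, not a documented guarantee).
    delay = min(base * (2 ** attempt), cap)
    # retry_jitter spreads each delay by +/- that fraction at random.
    return delay * (1 + random.uniform(-jitter, jitter))

# With num_retries=4, attempts 0-3 wait roughly 1s, 2s, 4s, 8s before jitter.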