Skip to content

scorers

scorers

Checklist scorers.

ChecklistScorer

Configurable checklist scorer that supports batch and per-item modes.

Consolidates the former BatchScorer, ItemScorer, WeightedScorer, and NormalizedScorer into a single class. All three aggregate metrics (pass_rate, weighted_score, normalized_score) are always computed.

Parameters:

Name Type Description Default
mode str

"batch" (one LLM call) or "item" (one call per item).

'batch'
capture_reasoning bool

Item mode only — include per-item reasoning.

False
use_logprobs bool

Item mode only — use logprobs for confidence scoring.

False
primary_metric str

Which metric Score.primary_score aliases. One of "pass" (pass_rate), "weighted" (weighted_score), "normalized" (normalized_score).

'pass'
custom_prompt Optional[Union[str, Path]]

Override the default prompt template (str text or Path).

None
model Optional[str]

LLM model identifier.

None
temperature float

Sampling temperature.

0.0
api_key Optional[str]

Provider API key.

None
provider Optional[str]

LLM provider name.

None
base_url Optional[str]

Override base URL.

None
client Any

Pre-configured LLM client.

None
api_format Optional[str]

API format ("chat" or "responses").

None
max_tokens int

Maximum response tokens.

2048
reasoning_effort Optional[str]

Reasoning effort hint for supported models.

None
Example

scorer = ChecklistScorer(mode="batch")
score = scorer.score(checklist, target="The response text...")
print(score.primary_score)  # uses primary_metric

Source code in autochecklist/scorers/base.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
class ChecklistScorer:
    """Configurable checklist scorer that supports batch and per-item modes.

    Consolidates the former BatchScorer, ItemScorer, WeightedScorer, and
    NormalizedScorer into a single class.  All three aggregate metrics
    (pass_rate, weighted_score, normalized_score) are always computed.

    Args:
        mode: ``"batch"`` (one LLM call) or ``"item"`` (one call per item).
        capture_reasoning: Item mode only — include per-item reasoning.
        use_logprobs: Item mode only — use logprobs for confidence scoring.
        primary_metric: Which metric ``Score.primary_score`` aliases.
            One of ``"pass"`` (pass_rate), ``"weighted"`` (weighted_score),
            ``"normalized"`` (normalized_score).
        custom_prompt: Override the default prompt template (str text or Path).
        model: LLM model identifier.
        temperature: Sampling temperature.
        api_key: Provider API key.
        provider: LLM provider name.
        base_url: Override base URL.
        client: Pre-configured LLM client.
        api_format: API format (``"chat"`` or ``"responses"``).
        max_tokens: Maximum response tokens.
        reasoning_effort: Reasoning effort hint for supported models.

    Example:
        >>> scorer = ChecklistScorer(mode="batch")
        >>> score = scorer.score(checklist, target="The response text...")
        >>> print(score.primary_score)  # uses primary_metric
    """

    def __init__(
        self,
        mode: str = "batch",
        capture_reasoning: bool = False,
        use_logprobs: bool = False,
        primary_metric: str = "pass",
        custom_prompt: Optional[Union[str, Path]] = None,
        # LLM config
        model: Optional[str] = None,
        temperature: float = 0.0,
        api_key: Optional[str] = None,
        provider: Optional[str] = None,
        base_url: Optional[str] = None,
        client: Any = None,
        api_format: Optional[str] = None,
        max_tokens: int = 2048,
        reasoning_effort: Optional[str] = None,
    ):
        # Validate the enum-like string options up front so misconfiguration
        # fails at construction time rather than mid-scoring.
        if mode not in _VALID_MODES:
            raise ValueError(
                f"mode must be one of {_VALID_MODES!r}, got {mode!r}"
            )
        if primary_metric not in _VALID_PRIMARY_METRICS:
            raise ValueError(
                f"primary_metric must be one of {_VALID_PRIMARY_METRICS!r}, "
                f"got {primary_metric!r}"
            )

        self.mode = mode
        self.capture_reasoning = capture_reasoning
        self.use_logprobs = use_logprobs
        self.primary_metric = primary_metric

        # LLM settings
        config = get_config()
        self.model = model or config.scorer_model.model_id
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.api_key = api_key
        self._client = client
        self._provider = provider or config.scorer_model.provider or "openrouter"
        self._base_url = base_url
        self._api_format = api_format or "chat"
        self.reasoning_effort = reasoning_effort

        # Load prompt template and format
        self._template, self._format_text = self._load_prompt_and_format(
            custom_prompt
        )

        # Logprobs availability check.  NOTE: this runs eagerly here (and may
        # create a client) whenever use_logprobs is requested; it is skipped
        # only when use_logprobs is False.
        self._logprobs_available = False
        if self.use_logprobs:
            client_instance = self._get_or_create_client()
            if client_instance.supports_logprobs(self.model):
                self._logprobs_available = True
            else:
                warnings.warn(
                    f"Model '{self.model}' does not support logprobs. "
                    "Scorer will use text-based scoring with confidence=None. "
                    "normalized_score will equal pass_rate (unweighted).",
                    UserWarning,
                    stacklevel=2,
                )

    # ── Public properties ──────────────────────────────────────────────────

    @property
    def scoring_method(self) -> str:
        """Backward-compat scoring method string for Score metadata."""
        if self.mode == "batch":
            return "batch"
        if self.use_logprobs:
            return "normalized"
        if self.primary_metric == "weighted":
            return "weighted"
        # capture_reasoning and plain item mode both report "item".
        return "item"

    @property
    def prompt_text(self) -> str:
        """The raw prompt template text."""
        return self._template.template

    # ── Core scoring API ───────────────────────────────────────────────────

    def score(
        self,
        checklist: Checklist,
        target: str,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> Score:
        """Score a target response against a checklist.

        Args:
            checklist: The checklist to evaluate against.
            target: The target text to score.
            input: Optional input/context (falls back to checklist.input).
            **kwargs: Additional arguments (ignored).

        Returns:
            Score object with item-level and all aggregate scores.
        """
        if self.mode == "batch":
            return self._score_batch(checklist, target, input)
        else:
            return self._score_items_dispatch(checklist, target, input)

    def score_batch(
        self,
        checklist: Checklist,
        targets: List[str],
        inputs: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[Score]:
        """Score multiple targets sequentially.

        If ``inputs`` is given it is indexed in lockstep with ``targets``
        (so it should be at least as long as ``targets``).
        """
        results = []
        for i, target in enumerate(targets):
            input_text = inputs[i] if inputs else None
            results.append(self.score(checklist, target, input_text, **kwargs))
        return results

    # ── Response parsing helpers ───────────────────────────────────────────

    @staticmethod
    def _parse_json_response(raw: str) -> Any:
        """Parse LLM output as JSON, falling back to embedded-JSON extraction."""
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return extract_json(raw)

    @staticmethod
    def _to_answer(answer_str: str) -> "ChecklistItemAnswer":
        """Map a raw answer string to the enum; anything other than "YES" is NO."""
        return (
            ChecklistItemAnswer.YES
            if answer_str == "YES"
            else ChecklistItemAnswer.NO
        )

    # ── Batch scoring ──────────────────────────────────────────────────────

    def _score_batch(
        self,
        checklist: Checklist,
        target: str,
        input: Optional[str],
    ) -> Score:
        """Evaluate ALL checklist items in a single LLM call."""
        inst = input or checklist.input or ""

        # Items are presented to the judge 1-indexed (Q1, Q2, ...); the
        # response's question_index values are matched against i + 1 below.
        checklist_text = "\n".join(
            f"Q{i}: {item.question}"
            for i, item in enumerate(checklist.items, 1)
        )

        prompt = self._template.format(
            input=inst, target=target, checklist=checklist_text,
        )
        full_prompt = prompt + "\n\n" + self._format_text

        if self.capture_reasoning:
            rf = to_response_format(
                BatchScoringResponseReasoned, "batch_scoring_reasoned"
            )
            raw = self._call_model(full_prompt, response_format=rf)
            validated = BatchScoringResponseReasoned.model_validate(
                self._parse_json_response(raw)
            )
            answer_map = {a.question_index: a.answer for a in validated.answers}
            reasoning_map = {
                a.question_index: a.reasoning for a in validated.answers
            }
        else:
            rf = to_response_format(BatchScoringResponse, "batch_scoring")
            raw = self._call_model(full_prompt, response_format=rf)
            validated = BatchScoringResponse.model_validate(
                self._parse_json_response(raw)
            )
            answer_map = {a.question_index: a.answer for a in validated.answers}
            reasoning_map = {}

        item_scores = []
        for i, item in enumerate(checklist.items):
            # Items the judge omitted default to "NO" rather than raising.
            answer = self._to_answer(answer_map.get(i + 1, "NO"))
            item_scores.append(
                ItemScore(
                    item_id=item.id,
                    answer=answer,
                    reasoning=reasoning_map.get(i + 1),
                )
            )

        return self._build_score(checklist, item_scores, raw_response=raw)

    # ── Item scoring ───────────────────────────────────────────────────────

    def _score_items_dispatch(
        self,
        checklist: Checklist,
        target: str,
        input: Optional[str],
    ) -> Score:
        """Evaluate each checklist item individually (one call per item)."""
        inst = input or checklist.input or ""
        item_scores = []

        client = self._get_or_create_client()

        for item in checklist.items:
            prompt = self._template.format(
                input=inst, target=target, question=item.question,
                # normalized template may use {history}
                history="",
            )

            if self.use_logprobs and self._logprobs_available:
                # Logprobs path — confidence scoring
                messages = [{"role": "user", "content": prompt}]
                probs = client.get_logprobs(
                    model=self.model,
                    messages=messages,
                    temperature=self.temperature,
                )
                confidence, answer, level = self._interpret_probs(probs)
                item_scores.append(
                    ItemScore(
                        item_id=item.id,
                        answer=answer,
                        confidence=confidence,
                        confidence_level=level,
                    )
                )
            else:
                # Structured output path
                full_prompt = prompt + "\n\n" + self._format_text

                if self.capture_reasoning:
                    rf = to_response_format(
                        ItemScoringResponseReasoned, "item_scoring_reasoned"
                    )
                    raw = self._call_model(full_prompt, response_format=rf)
                    validated = ItemScoringResponseReasoned.model_validate(
                        self._parse_json_response(raw)
                    )
                    item_scores.append(
                        ItemScore(
                            item_id=item.id,
                            answer=self._to_answer(validated.answer),
                            reasoning=validated.reasoning,
                        )
                    )
                else:
                    rf = to_response_format(
                        ItemScoringResponse, "item_scoring"
                    )
                    raw = self._call_model(full_prompt, response_format=rf)
                    validated = ItemScoringResponse.model_validate(
                        self._parse_json_response(raw)
                    )
                    item_scores.append(
                        ItemScore(
                            item_id=item.id,
                            answer=self._to_answer(validated.answer),
                        )
                    )

        return self._build_score(
            checklist,
            item_scores,
            num_calls=len(checklist.items),
        )

    # ── Score assembly ─────────────────────────────────────────────────────

    def _build_score(
        self,
        checklist: Checklist,
        item_scores: List[ItemScore],
        raw_response: Optional[str] = None,
        num_calls: Optional[int] = None,
    ) -> Score:
        """Build a Score with all three aggregate metrics computed."""
        yes_count = sum(
            1 for s in item_scores if s.answer == ChecklistItemAnswer.YES
        )
        total = len(item_scores)
        total_score = yes_count / total if total > 0 else 0.0

        # Weighted score
        weighted_score = self._calculate_weighted_score(item_scores, checklist)

        # Normalized score: mean confidence when logprobs produced any,
        # otherwise falls back to the plain pass rate.
        confidences = [
            s.confidence for s in item_scores if s.confidence is not None
        ]
        normalized_score = (
            sum(confidences) / len(confidences) if confidences else total_score
        )

        metadata: Dict[str, Any] = {}
        if raw_response is not None:
            metadata["raw_response"] = raw_response
        if num_calls is not None:
            metadata["num_calls"] = num_calls

        return Score(
            checklist_id=checklist.id,
            item_scores=item_scores,
            total_score=total_score,
            weighted_score=weighted_score,
            normalized_score=normalized_score,
            primary_metric=self.primary_metric,
            judge_model=self.model,
            scoring_method=self.scoring_method,
            metadata=metadata,
        )

    # ── Weighted score calculation ─────────────────────────────────────────

    def _calculate_weighted_score(
        self,
        item_scores: List[ItemScore],
        checklist: Checklist,
    ) -> float:
        """Calculate weighted score: sum(weight_i * score_i) / sum(weight_i)."""
        item_weights = {item.id: item.weight for item in checklist.items}

        weighted_sum = 0.0
        total_weight = 0.0

        for s in item_scores:
            # Unknown item ids get a neutral weight of 1.0.
            weight = item_weights.get(s.item_id, 1.0)
            total_weight += weight
            if s.answer == ChecklistItemAnswer.YES:
                weighted_sum += weight

        return weighted_sum / total_weight if total_weight > 0 else 0.0

    # ── Logprobs helpers ───────────────────────────────────────────────────

    def _interpret_probs(
        self,
        probs: Dict[str, float],
    ) -> tuple:
        """Interpret Yes/No probabilities into confidence and answer.

        Confidence = P(Yes) / (P(Yes) + P(No))
        """
        yes_prob = probs.get("yes", 0.0)
        no_prob = probs.get("no", 0.0)

        # Degenerate case: neither token got probability mass.
        if yes_prob + no_prob < 1e-10:
            return 0.5, ChecklistItemAnswer.NO, ConfidenceLevel.UNSURE

        confidence = yes_prob / (yes_prob + no_prob)
        answer, level = self._confidence_to_level(confidence)
        return confidence, answer, level

    def _confidence_to_level(
        self,
        confidence: float,
    ) -> tuple:
        """Map confidence value to answer and confidence level."""
        if confidence < 0.2:
            return ChecklistItemAnswer.NO, ConfidenceLevel.NO_10
        elif confidence < 0.4:
            return ChecklistItemAnswer.NO, ConfidenceLevel.NO_30
        elif confidence < 0.6:
            return ChecklistItemAnswer.NO, ConfidenceLevel.UNSURE
        elif confidence < 0.8:
            return ChecklistItemAnswer.YES, ConfidenceLevel.YES_70
        else:
            return ChecklistItemAnswer.YES, ConfidenceLevel.YES_90

    # ── Prompt loading ─────────────────────────────────────────────────────

    def _load_prompt_and_format(
        self,
        custom_prompt: Optional[Union[str, Path]],
    ) -> tuple:
        """Load prompt template and format instruction based on mode."""
        # Determine template text
        if custom_prompt is not None:
            if isinstance(custom_prompt, Path):
                template_text = custom_prompt.read_text(encoding="utf-8")
            else:
                template_text = custom_prompt
        else:
            template_text = self._default_template_text()

        template = PromptTemplate(template_text)

        # Determine format text
        format_text = self._default_format_text()

        return template, format_text

    def _default_template_text(self) -> str:
        """Load the default prompt template for the current mode config.

        Simplified: just batch vs item. Pipeline presets override via
        ``scorer_prompt`` key for paper-specific prompts (rlcf, rocketeval).
        """
        if self.mode == "batch":
            return load_template("scoring", "batch")
        return load_template("scoring", "item")

    def _default_format_text(self) -> str:
        """Load the default format instruction for the current mode config."""
        if self.mode == "batch":
            if self.capture_reasoning:
                return load_format("batch_scoring_reasoned")
            return load_format("batch_scoring")
        if self.capture_reasoning:
            return load_format("item_scoring_reasoned")
        return load_format("item_scoring")

    # ── LLM client management ─────────────────────────────────────────────

    def _get_or_create_client(self) -> Any:
        """Get injected client or create one from provider settings."""
        if self._client is not None:
            return self._client
        from ..providers.factory import get_client
        return get_client(
            provider=self._provider,
            api_key=self.api_key,
            base_url=self._base_url,
            model=self.model,
            api_format=self._api_format,
        )

    def _retry_without_response_format(
        self,
        client: Any,
        messages: List[Dict[str, str]],
        kwargs: Dict[str, Any],
        error: Exception,
    ) -> Any:
        """Retry a chat completion with response_format stripped.

        Used as the fallback path when the provider rejects structured
        output; the schema instructions are already embedded in the prompt.
        """
        logger.warning(
            "Structured output failed (%s), retrying without "
            "response_format (fallback to schema-in-prompt).",
            error,
        )
        fallback_kwargs = {
            k: v for k, v in kwargs.items() if k != "response_format"
        }
        return client.chat_completion(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            **fallback_kwargs,
        )

    def _call_model(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        response_format: Optional[dict] = None,
    ) -> str:
        """Call the LLM and return the response text.

        When ``response_format`` is set, falls back to schema-in-prompt on
        client-side errors (ValueError/KeyError/TypeError) and on HTTP 400
        responses; all other errors propagate.
        """
        messages: List[Dict[str, str]] = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        kwargs: Dict[str, Any] = {}
        if response_format is not None:
            kwargs["response_format"] = response_format
        if self.reasoning_effort is not None:
            kwargs["reasoning_effort"] = self.reasoning_effort

        client = self._get_or_create_client()

        try:
            response = client.chat_completion(
                model=self.model,
                messages=messages,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                **kwargs,
            )
        except (ValueError, KeyError, TypeError) as e:
            if response_format is None:
                raise
            response = self._retry_without_response_format(
                client, messages, kwargs, e
            )
        except Exception as e:
            # httpx is imported lazily so the module loads without it.
            import httpx
            if (
                response_format is not None
                and isinstance(e, httpx.HTTPStatusError)
                and e.response.status_code == 400
            ):
                response = self._retry_without_response_format(
                    client, messages, kwargs, e
                )
            else:
                raise

        return response["choices"][0]["message"]["content"]

scoring_method property

Backward-compat scoring method string for Score metadata.

prompt_text property

The raw prompt template text.

score(checklist, target, input=None, **kwargs)

Score a target response against a checklist.

Parameters:

Name Type Description Default
checklist Checklist

The checklist to evaluate against.

required
target str

The target text to score.

required
input Optional[str]

Optional input/context (falls back to checklist.input).

None
**kwargs Any

Additional arguments (ignored).

{}

Returns:

Type Description
Score

Score object with item-level and all aggregate scores.

Source code in autochecklist/scorers/base.py
def score(
    self,
    checklist: Checklist,
    target: str,
    input: Optional[str] = None,
    **kwargs: Any,
) -> Score:
    """Evaluate ``target`` against ``checklist`` and return a Score.

    Args:
        checklist: Checklist to evaluate against.
        target: Target text to be scored.
        input: Optional input/context; defaults to ``checklist.input``.
        **kwargs: Extra arguments (ignored).

    Returns:
        A Score carrying item-level results and every aggregate metric.
    """
    handler = (
        self._score_batch
        if self.mode == "batch"
        else self._score_items_dispatch
    )
    return handler(checklist, target, input)

score_batch(checklist, targets, inputs=None, **kwargs)

Score multiple targets sequentially.

Source code in autochecklist/scorers/base.py
def score_batch(
    self,
    checklist: Checklist,
    targets: List[str],
    inputs: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[Score]:
    """Score each target one after another and collect the Scores."""
    return [
        self.score(
            checklist,
            target,
            inputs[idx] if inputs else None,
            **kwargs,
        )
        for idx, target in enumerate(targets)
    ]

BatchScorer(**kwargs)

Deprecated: use ChecklistScorer(mode='batch').

Source code in autochecklist/scorers/__init__.py
def BatchScorer(**kwargs):
    """Deprecated alias; construct ``ChecklistScorer(mode='batch')`` instead."""
    message = "BatchScorer is deprecated, use ChecklistScorer(mode='batch')"
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return ChecklistScorer(mode="batch", **kwargs)

ItemScorer(**kwargs)

Deprecated: use ChecklistScorer(mode='item', capture_reasoning=True).

Source code in autochecklist/scorers/__init__.py
def ItemScorer(**kwargs):
    """Deprecated alias; construct ``ChecklistScorer(mode='item', capture_reasoning=True)`` instead."""
    message = "ItemScorer is deprecated, use ChecklistScorer(mode='item', capture_reasoning=True)"
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    # Caller-supplied kwargs win over the capture_reasoning default.
    options = {"capture_reasoning": True, **kwargs}
    return ChecklistScorer(mode="item", **options)

WeightedScorer(**kwargs)

Deprecated: use ChecklistScorer(mode='item', primary_metric='weighted').

Source code in autochecklist/scorers/__init__.py
def WeightedScorer(**kwargs):
    """Deprecated alias; construct ``ChecklistScorer(mode='item', primary_metric='weighted')`` instead."""
    message = "WeightedScorer is deprecated, use ChecklistScorer(mode='item', primary_metric='weighted')"
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    # Caller-supplied kwargs win over the primary_metric default.
    options = {"primary_metric": "weighted", **kwargs}
    return ChecklistScorer(mode="item", **options)

NormalizedScorer(**kwargs)

Deprecated: use ChecklistScorer(mode='item', use_logprobs=True, primary_metric='normalized').

Source code in autochecklist/scorers/__init__.py
def NormalizedScorer(**kwargs):
    """Deprecated alias; construct ``ChecklistScorer(mode='item', use_logprobs=True, primary_metric='normalized')`` instead."""
    message = "NormalizedScorer is deprecated, use ChecklistScorer(mode='item', use_logprobs=True, primary_metric='normalized')"
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    # Caller-supplied kwargs win over both defaults.
    options = {"primary_metric": "normalized", "use_logprobs": True, **kwargs}
    return ChecklistScorer(mode="item", **options)