Skip to content

pipeline

pipeline

Composable pipeline for checklist generation and scoring.

This module provides a pipeline API for end-to-end checklist workflows: Generator → Refiners → Scorer.

Example

from autochecklist import pipeline
pipe = pipeline("tick", generator_model="openai/gpt-4o-mini")
result = pipe("Write a haiku", target="Leaves fall gently...")
print(result.pass_rate)  # 0.85

Split models

pipe = pipeline("tick", generator_model="gpt-4o", scorer_model="gpt-4o-mini")

Generate only (no target → no scoring)

checklists = pipe.generate_batch(inputs=["Write a haiku", "Write a poem"])

Resumable batch

result = pipe.run_batch(data, output_path="results.jsonl")

PipelineResult dataclass

Result from a single pipeline execution.

Attributes:

Name Type Description
checklist Checklist

Generated (and optionally refined) checklist

score Optional[Score]

Score object if target was provided, None otherwise

Source code in autochecklist/pipeline.py
@dataclass
class PipelineResult:
    """Outcome of one generate → refine → (optional) score run.

    Attributes:
        checklist: The checklist produced (and refined) by the pipeline.
        score: Score for the evaluated target, or None when no target
            was supplied to the pipeline call.
    """
    checklist: Checklist
    score: Optional[Score] = None

    @property
    def pass_rate(self) -> Optional[float]:
        """Convenience accessor for ``score.pass_rate`` (None when unscored)."""
        if not self.score:
            return None
        return self.score.pass_rate

    @property
    def weighted_score(self) -> Optional[float]:
        """Convenience accessor for ``score.weighted_score`` (None when unscored)."""
        if not self.score:
            return None
        return self.score.weighted_score

    @property
    def normalized_score(self) -> Optional[float]:
        """Convenience accessor for ``score.normalized_score`` (None when unscored)."""
        if not self.score:
            return None
        return self.score.normalized_score

pass_rate property

Shortcut to score.pass_rate.

weighted_score property

Shortcut to score.weighted_score.

normalized_score property

Shortcut to score.normalized_score.

BatchResult dataclass

Result from batch corpus evaluation.

Attributes:

Name Type Description
checklist Optional[Checklist]

The checklist used for evaluation (shared if provided, otherwise each score references its own checklist)

scores List[Score]

List of Score objects, one per input

data List[Dict[str, Any]]

Original input data

checklists List[Checklist]

Individual checklists when not using shared checklist

Source code in autochecklist/pipeline.py
@dataclass
class BatchResult:
    """Result from batch corpus evaluation.

    Attributes:
        checklist: The checklist used for evaluation (shared if provided,
            otherwise each score references its own checklist)
        scores: List of Score objects, one per input
        data: Original input data
        checklists: Individual checklists when not using shared checklist
    """
    scores: List[Score]
    data: List[Dict[str, Any]]
    checklist: Optional[Checklist] = None
    checklists: List[Checklist] = field(default_factory=list)

    @property
    def macro_pass_rate(self) -> float:
        """Macro-averaged pass rate across all scored examples.

        Computes pass_rate for each example independently, then averages.
        Each example contributes equally regardless of checklist size.

        Example: If example A scores 2/4 (0.5) and example B scores 3/3 (1.0),
        macro_pass_rate = (0.5 + 1.0) / 2 = 0.75
        """
        if not self.scores:
            return 0.0
        return sum(s.pass_rate for s in self.scores) / len(self.scores)

    @property
    def micro_pass_rate(self) -> float:
        """Micro-averaged pass rate (DFPR: Decomposed Requirements Following Ratio).

        Pools all checklist items across all examples into a single count.
        Examples with more checklist items have proportionally more influence.

        Example: If example A scores 2/4 and example B scores 3/3,
        micro_pass_rate = (2 + 3) / (4 + 3) = 5/7 ≈ 0.714
        """
        total = sum(len(score.item_scores) for score in self.scores)
        total_yes = sum(
            1
            for score in self.scores
            for item_score in score.item_scores
            if item_score.answer == ChecklistItemAnswer.YES
        )
        return total_yes / total if total > 0 else 0.0

    @property
    def mean_score(self) -> float:
        """Mean of Score.primary_score across all examples.

        Respects each Score's primary_metric — averages weighted_score
        for weighted pipelines, normalized_score for normalized, pass_rate for pass.
        """
        if not self.scores:
            return 0.0
        return sum(s.primary_score for s in self.scores) / len(self.scores)

    def per_category_pass_rates(self) -> List[Dict[str, float]]:
        """Compute per-category pass rates for each example.

        Uses the checklist(s) to map item IDs to categories, then computes
        pass rates per category for each scored example.

        Returns:
            List of dicts, one per example, mapping category -> pass_rate
        """
        results = []
        for i, score in enumerate(self.scores):
            # Get the checklist for this example: shared one wins, otherwise
            # fall back to the per-example checklist at the same index.
            if self.checklist is not None:
                cl = self.checklist
            elif i < len(self.checklists):
                cl = self.checklists[i]
            else:
                # No checklist available — no category mapping possible.
                results.append({})
                continue

            # Build item_id -> category mapping ("ungrouped" for uncategorized items)
            id_to_cat = {item.id: (item.category or "ungrouped") for item in cl.items}

            # Group item scores by category
            cat_yes: Dict[str, int] = {}
            cat_total: Dict[str, int] = {}
            for item_score in score.item_scores:
                cat = id_to_cat.get(item_score.item_id, "ungrouped")
                cat_total[cat] = cat_total.get(cat, 0) + 1
                if item_score.answer == ChecklistItemAnswer.YES:
                    cat_yes[cat] = cat_yes.get(cat, 0) + 1

            rates = {}
            for cat, total in cat_total.items():
                rates[cat] = cat_yes.get(cat, 0) / total if total > 0 else 0.0
            results.append(rates)

        return results

    def to_dataframe(self) -> "pd.DataFrame":
        """Export results to pandas DataFrame.

        One row per scored example; item answers become ``item_<id>`` columns.
        Optional weighted/normalized columns appear only when the scores carry them.
        """
        try:
            import pandas as pd
        except ImportError:
            raise ImportError("pandas is required for to_dataframe(). Install with: pip install pandas")

        rows = []
        for i, (item, score) in enumerate(zip(self.data, self.scores)):
            row = {
                "index": i,
                "input": item.get("input", ""),
                "target": item.get("target", ""),
                "pass_rate": score.pass_rate,
            }
            if score.weighted_score is not None:
                row["weighted_score"] = score.weighted_score
            if score.normalized_score is not None:
                row["normalized_score"] = score.normalized_score

            for item_score in score.item_scores:
                row[f"item_{item_score.item_id}"] = item_score.answer.value

            rows.append(row)

        return pd.DataFrame(rows)

    def to_jsonl(self, path: str) -> None:
        """Export results to JSONL file.

        Args:
            path: Destination file path (overwritten if it exists).
        """
        # JSON/JSONL is UTF-8 by convention — don't rely on the platform
        # default encoding (e.g. cp1252 on Windows would break non-ASCII text).
        with open(path, "w", encoding="utf-8") as f:
            for item, score in zip(self.data, self.scores):
                record = {
                    "input": item.get("input", ""),
                    "target": item.get("target", ""),
                    "pass_rate": score.pass_rate,
                    "item_scores": [
                        {
                            "item_id": s.item_id,
                            "answer": s.answer.value,
                            "reasoning": s.reasoning,
                        }
                        for s in score.item_scores
                    ],
                }
                if score.weighted_score is not None:
                    record["weighted_score"] = score.weighted_score
                if score.normalized_score is not None:
                    record["normalized_score"] = score.normalized_score
                f.write(json.dumps(record) + "\n")

macro_pass_rate property

Macro-averaged pass rate across all scored examples.

Computes pass_rate for each example independently, then averages. Each example contributes equally regardless of checklist size.

Example: If example A scores 2/4 (0.5) and example B scores 3/3 (1.0), macro_pass_rate = (0.5 + 1.0) / 2 = 0.75

micro_pass_rate property

Micro-averaged pass rate (DFPR: Decomposed Requirements Following Ratio).

Pools all checklist items across all examples into a single count. Examples with more checklist items have proportionally more influence.

Example: If example A scores 2/4 and example B scores 3/3, micro_pass_rate = (2 + 3) / (4 + 3) = 5/7 ≈ 0.714

mean_score property

Mean of Score.primary_score across all examples.

Respects each Score's primary_metric — averages weighted_score for weighted pipelines, normalized_score for normalized, pass_rate for pass.

per_category_pass_rates()

Compute per-category pass rates for each example.

Uses the checklist(s) to map item IDs to categories, then computes pass rates per category for each scored example.

Returns:

Type Description
List[Dict[str, float]]

List of dicts, one per example, mapping category -> pass_rate

Source code in autochecklist/pipeline.py
def per_category_pass_rates(self) -> List[Dict[str, float]]:
    """Compute per-category pass rates for each scored example.

    Item IDs are mapped to categories via the shared checklist when one is
    set, otherwise via the per-example checklist at the same index. An
    example with no matching checklist yields an empty dict.

    Returns:
        One dict per example, mapping category name -> pass_rate
    """
    per_example: List[Dict[str, float]] = []
    for idx, example_score in enumerate(self.scores):
        # Select the checklist that applies to this example.
        if self.checklist is not None:
            source_cl = self.checklist
        elif idx < len(self.checklists):
            source_cl = self.checklists[idx]
        else:
            per_example.append({})
            continue

        # Map each item id to its category ("ungrouped" when absent).
        category_of = {
            entry.id: (entry.category or "ungrouped") for entry in source_cl.items
        }

        # Tally totals and YES answers per category.
        yes_counts: Dict[str, int] = {}
        totals: Dict[str, int] = {}
        for graded in example_score.item_scores:
            bucket = category_of.get(graded.item_id, "ungrouped")
            totals[bucket] = totals.get(bucket, 0) + 1
            if graded.answer == ChecklistItemAnswer.YES:
                yes_counts[bucket] = yes_counts.get(bucket, 0) + 1

        per_example.append({
            bucket: (yes_counts.get(bucket, 0) / count if count > 0 else 0.0)
            for bucket, count in totals.items()
        })

    return per_example

to_dataframe()

Export results to pandas DataFrame.

Source code in autochecklist/pipeline.py
def to_dataframe(self) -> "pd.DataFrame":
    """Export results to a pandas DataFrame, one row per scored example.

    Base columns are index/input/target/pass_rate; weighted and normalized
    scores are added only when present, and each item answer becomes an
    ``item_<id>`` column.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas is required for to_dataframe(). Install with: pip install pandas")

    records = []
    for idx, (example, example_score) in enumerate(zip(self.data, self.scores)):
        record = {
            "index": idx,
            "input": example.get("input", ""),
            "target": example.get("target", ""),
            "pass_rate": example_score.pass_rate,
        }
        if example_score.weighted_score is not None:
            record["weighted_score"] = example_score.weighted_score
        if example_score.normalized_score is not None:
            record["normalized_score"] = example_score.normalized_score
        for graded in example_score.item_scores:
            record[f"item_{graded.item_id}"] = graded.answer.value
        records.append(record)

    return pd.DataFrame(records)

to_jsonl(path)

Export results to JSONL file.

Source code in autochecklist/pipeline.py
def to_jsonl(self, path: str) -> None:
    """Export results to a JSONL file.

    Writes one JSON object per scored example containing input, target,
    pass_rate, per-item answers/reasoning, and — when present — the
    weighted and normalized scores.

    Args:
        path: Destination file path (overwritten if it exists).
    """
    # JSON/JSONL is UTF-8 by convention — don't rely on the platform
    # default encoding (e.g. cp1252 on Windows would break non-ASCII text).
    with open(path, "w", encoding="utf-8") as f:
        for item, score in zip(self.data, self.scores):
            record = {
                "input": item.get("input", ""),
                "target": item.get("target", ""),
                "pass_rate": score.pass_rate,
                "item_scores": [
                    {
                        "item_id": s.item_id,
                        "answer": s.answer.value,
                        "reasoning": s.reasoning,
                    }
                    for s in score.item_scores
                ],
            }
            if score.weighted_score is not None:
                record["weighted_score"] = score.weighted_score
            if score.normalized_score is not None:
                record["normalized_score"] = score.normalized_score
            f.write(json.dumps(record) + "\n")

ChecklistPipeline

Chains: Generator → Refiners → Scorer.

A composable pipeline for checklist-based evaluation. Three construction modes:

  1. Preset: ChecklistPipeline(from_preset="tick") — resolves generator AND auto-attaches the preset's default scorer.
  2. Explicit components: ChecklistPipeline(generator="tick", scorer="batch") — resolves each component by name. No auto scorer.
  3. Pre-configured instances: ChecklistPipeline(generator=my_gen, scorer=my_scorer)

The :func:pipeline factory is equivalent to mode 1.

Parameters:

Name Type Description Default
generator Optional[Any]

Generator name string (e.g., "tick", "rlcf_direct") or a pre-configured generator instance.

None
refiners Optional[List[Union[str, Any]]]

Optional list of refiner instances or name strings.

None
scorer Optional[Union[str, Any]]

Optional scorer instance or name string (e.g., "batch", "weighted"). Not auto-resolved unless using from_preset.

None
generator_model Optional[str]

Model for the generator (used when generator is a string).

None
scorer_model Optional[str]

Model for the scorer (used when scorer is a string).

None
provider Optional[str]

LLM provider ("openrouter", "openai", "vllm").

None
base_url Optional[str]

Override base URL for the LLM provider.

None
client Any

Injected LLM client instance.

None
api_key Optional[str]

API key for the provider.

None
api_format Optional[str]

API format ("chat" or "responses").

None
generator_kwargs Optional[Dict[str, Any]]

Extra kwargs passed to generator constructor.

None
scorer_kwargs Optional[Dict[str, Any]]

Extra kwargs passed to scorer constructor.

None
from_preset Optional[str]

Pipeline preset name (e.g., "tick"). Resolves generator and auto-attaches default scorer. Mutually exclusive with generator.

None
Example

pipe = ChecklistPipeline(from_preset="tick",
    generator_model="gpt-4o", scorer_model="gpt-4o-mini")

pipe = ChecklistPipeline(generator="tick", scorer="batch")

gen = DirectGenerator(method_name="tick", model="gpt-4o")
pipe = ChecklistPipeline(generator=gen, scorer="batch")

Source code in autochecklist/pipeline.py
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
class ChecklistPipeline:
    """Chains: Generator → Refiners → Scorer.

    A composable pipeline for checklist-based evaluation. Three construction
    modes:

    1. **Preset**: ``ChecklistPipeline(from_preset="tick")`` — resolves
       generator AND auto-attaches the preset's default scorer.
    2. **Explicit components**: ``ChecklistPipeline(generator="tick", scorer="batch")``
       — resolves each component by name. No auto scorer.
    3. **Pre-configured instances**: ``ChecklistPipeline(generator=my_gen, scorer=my_scorer)``

    The :func:`pipeline` factory is equivalent to mode 1.

    Args:
        generator: Generator name string (e.g., ``"tick"``, ``"rlcf_direct"``)
            or a pre-configured generator instance.
        refiners: Optional list of refiner instances or name strings.
        scorer: Optional scorer instance or name string (e.g., ``"batch"``,
            ``"weighted"``). Not auto-resolved unless using ``from_preset``.
        generator_model: Model for the generator (used when generator is a string).
        scorer_model: Model for the scorer (used when scorer is a string).
        provider: LLM provider ("openrouter", "openai", "vllm").
        base_url: Override base URL for the LLM provider.
        client: Injected LLM client instance.
        api_key: API key for the provider.
        api_format: API format ("chat" or "responses").
        generator_kwargs: Extra kwargs passed to generator constructor.
        scorer_kwargs: Extra kwargs passed to scorer constructor.
        from_preset: Pipeline preset name (e.g., ``"tick"``). Resolves generator
            and auto-attaches default scorer. Mutually exclusive with
            ``generator``.

    Example:
        >>> pipe = ChecklistPipeline(from_preset="tick",
        ...     generator_model="gpt-4o", scorer_model="gpt-4o-mini")

        >>> pipe = ChecklistPipeline(generator="tick", scorer="batch")

        >>> gen = DirectGenerator(method_name="tick", model="gpt-4o")
        >>> pipe = ChecklistPipeline(generator=gen, scorer="batch")
    """

    def __init__(
        self,
        generator: Optional[Any] = None,
        refiners: Optional[List[Union[str, Any]]] = None,
        scorer: Optional[Union[str, Any]] = None,
        # Model config
        generator_model: Optional[str] = None,
        scorer_model: Optional[str] = None,
        # Provider config (shared)
        provider: Optional[str] = None,
        base_url: Optional[str] = None,
        client: Any = None,
        api_key: Optional[str] = None,
        api_format: Optional[str] = None,
        reasoning_effort: Optional[str] = None,
        # Component-specific kwargs passthrough
        generator_kwargs: Optional[Dict[str, Any]] = None,
        scorer_kwargs: Optional[Dict[str, Any]] = None,
        # Preset loading (resolves generator + default scorer)
        from_preset: Optional[str] = None,
        # Custom prompt override for generator
        custom_prompt: Optional[Union[str, Path]] = None,
    ):
        # Thread custom_prompt into generator_kwargs. Copy first so a
        # caller-supplied generator_kwargs dict is never mutated (the scorer
        # preset branch below already copies for the same reason).
        if custom_prompt is not None:
            generator_kwargs = dict(generator_kwargs) if generator_kwargs else {}
            generator_kwargs["custom_prompt"] = custom_prompt

        # from_preset: resolve generator string AND auto-attach default scorer
        if from_preset is not None:
            if generator is not None:
                raise ValueError("Cannot specify both from_preset and generator")
            generator = from_preset
            # Auto-resolve scorer from preset if not explicitly provided
            if scorer is None:
                scorer = DEFAULT_SCORERS.get(from_preset, "batch")

        # Shared provider kwargs; unset (None) entries are dropped so component
        # constructors fall back to their own defaults.
        provider_kwargs = {k: v for k, v in {
            "provider": provider, "base_url": base_url,
            "client": client, "api_key": api_key, "api_format": api_format,
            "reasoning_effort": reasoning_effort,
        }.items() if v is not None}

        # -- Generator resolution --
        # Name string → registry lookup + construction; instance → used as-is.
        if isinstance(generator, str):
            gen_cls = get_generator(generator)
            gen_kw = {**provider_kwargs, **(generator_kwargs or {})}
            if generator_model:
                gen_kw["model"] = generator_model
            self.generator = gen_cls(**gen_kw)
            self._generator_name = generator
        elif generator is not None:
            self.generator = generator
            self._generator_name = getattr(generator, 'method_name', 'custom')
        else:
            raise ValueError("Must provide generator (name string or instance)")

        # -- Refiner resolution --
        # Each entry may be a registry name or a pre-built refiner instance.
        self.refiners = []
        if refiners:
            for refiner in refiners:
                if isinstance(refiner, str):
                    ref_cls = get_refiner(refiner)
                    ref_kw = {**provider_kwargs}
                    self.refiners.append(ref_cls(**ref_kw))
                else:
                    self.refiners.append(refiner)

        # -- Scorer resolution --
        if isinstance(scorer, dict):
            # Config dict from pipeline presets (new style)
            # Resolve scorer_prompt key → custom_prompt before constructing
            scorer = dict(scorer)  # avoid mutating the preset dict
            if "scorer_prompt" in scorer:
                prompt_ref = scorer.pop("scorer_prompt")
                try:
                    from .prompts import load_template
                    scorer["custom_prompt"] = load_template(
                        "scoring", prompt_ref
                    )
                except FileNotFoundError:
                    # Not a built-in name — treat as inline prompt text
                    scorer["custom_prompt"] = prompt_ref
            from .scorers import ChecklistScorer
            scorer_kw = {**provider_kwargs, **scorer, **(scorer_kwargs or {})}
            if scorer_model:
                scorer_kw["model"] = scorer_model
            self.scorer = ChecklistScorer(**scorer_kw)
        elif isinstance(scorer, str):
            scorer_cls = get_scorer(scorer)
            scorer_kw = {**provider_kwargs, **(scorer_kwargs or {})}
            if scorer_model:
                scorer_kw["model"] = scorer_model
            self.scorer = scorer_cls(**scorer_kw)
        elif scorer is not None:
            self.scorer = scorer  # pre-configured instance
        else:
            self.scorer = None  # generator instance without scorer

    @property
    def is_instance_level(self) -> bool:
        """True when the configured generator operates per-instance."""
        level = self.generator.generation_level
        return level == "instance"

    @property
    def is_corpus_level(self) -> bool:
        """True when the configured generator operates over a whole corpus."""
        level = self.generator.generation_level
        return level == "corpus"

    def __call__(
        self,
        input: Optional[str] = None,
        target: Optional[str] = None,
        **kwargs: Any,
    ) -> PipelineResult:
        """Run the full pipeline: generate → refine → score.

        For instance-level generators, pass input and target. For corpus-level
        generators, pass the appropriate inputs via kwargs
        (e.g., feedback=..., dimensions=...).

        Args:
            input: Input instruction/query (for instance-level)
            target: Target response to evaluate (optional for generation-only)
            **kwargs: Additional arguments passed to generator

        Returns:
            PipelineResult with checklist and optional score
        """
        generated = self.generate(input=input, **kwargs)

        # No target → generation-only run; skip scoring entirely.
        if target is None:
            return PipelineResult(checklist=generated, score=None)

        graded = self.score(generated, target, input=input)
        return PipelineResult(checklist=generated, score=graded)

    def generate(
        self,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> Checklist:
        """Generate a checklist (without scoring), then run refiners.

        Args:
            input: Input instruction/query (required for instance-level generators)
            **kwargs: Additional arguments forwarded to the generator

        Returns:
            Generated and refined checklist
        """
        if self.generator is None:
            raise RuntimeError("No generator configured. Provide generator (name or instance) to use generate().")

        if not self.is_instance_level:
            # Corpus-level generators take their inputs purely via kwargs.
            raw = self.generator.generate(**kwargs)
        else:
            if input is None:
                raise ValueError("input is required for instance-level generators")
            raw = self.generator.generate(input=input, **kwargs)

        return self.refine(raw)

    def refine(self, checklist: Checklist) -> Checklist:
        """Run every configured refiner over the checklist, in order."""
        current = checklist
        for stage in self.refiners:
            current = stage.refine(current)
        return current

    def score(
        self,
        checklist: Checklist,
        target: str,
        input: Optional[str] = None,
    ) -> Score:
        """Score a target response against a checklist.

        Args:
            checklist: Checklist to evaluate against
            target: Target response to score
            input: Optional input for context

        Returns:
            Score object

        Raises:
            RuntimeError: If no scorer has been configured on this pipeline.
        """
        active_scorer = self.scorer
        if active_scorer is None:
            raise RuntimeError(
                "No scorer configured. Provide scorer (name or instance) to use score(). Use pipeline() for automatic scorer defaults."
            )
        return active_scorer.score(checklist=checklist, target=target, input=input)

    def score_group(
        self,
        sub_checklists: Dict[str, Checklist],
        target: str,
        input: Optional[str] = None,
    ) -> "GroupedScore":
        """Score a target against sub-checklists (one per category).

        Typically used with ``checklist.by_category()`` output.

        Args:
            sub_checklists: Dict mapping category name to sub-Checklist
            target: Target response to score
            input: Optional input for context

        Returns:
            GroupedScore with per-category Score objects
        """
        if self.scorer is None:
            raise RuntimeError("No scorer configured.")
        per_category = {
            category: self.score(sub_cl, target, input=input)
            for category, sub_cl in sub_checklists.items()
        }
        return GroupedScore(scores=per_category)

    def score_batch(
        self,
        checklist: Checklist,
        targets: List[str],
        inputs: Optional[List[str]] = None,
        show_progress: bool = False,
        on_progress: Optional[Callable[[int, int], None]] = None,
    ) -> List[Score]:
        """Score multiple targets against a single shared checklist.

        Thin pass-through to the internal batch-scoring loop.
        """
        batch_args = dict(
            checklist=checklist,
            targets=targets,
            inputs=inputs,
            show_progress=show_progress,
            on_progress=on_progress,
        )
        return self._run_batch_scoring(**batch_args)

    def generate_batch(
        self,
        data: Optional[List[Dict[str, Any]]] = None,
        inputs: Optional[List[str]] = None,
        show_progress: bool = False,
        on_progress: Optional[Callable[[int, int], None]] = None,
        output_path: Optional[str] = None,
        overwrite: bool = False,
    ) -> List[Checklist]:
        """Generate checklists for a batch of inputs (no scoring).

        Only works for instance-level generators (1:1 input → checklist).
        For corpus-level generators, call generator.generate() directly.

        Args:
            data: List of dicts with "input" key
            inputs: List of input strings (convenience alternative to data)
            show_progress: Show progress bar
            on_progress: Callback(completed, total) fired after each item
            output_path: Path to JSONL file for incremental writes + resume
            overwrite: If True, delete existing output_path before starting

        Returns:
            List of Checklist objects
        """
        if self.generator is None:
            raise RuntimeError("No generator configured.")
        if not self.is_instance_level:
            raise RuntimeError(
                "generate_batch() only works for instance-level generators. "
                "Corpus-level generators produce one checklist from all inputs — "
                "call generator.generate() directly."
            )

        if data is None:
            if inputs is None:
                raise ValueError("Provide either 'data' or 'inputs'")
            data = [{"input": inp} for inp in inputs]

        total = len(data)

        # Resume logic: previously completed indices are loaded from the
        # JSONL output file and skipped on this run.
        if overwrite and output_path and os.path.exists(output_path):
            os.remove(output_path)
        completed = _load_completed_indices(output_path) if output_path else {}

        checklists: List[Optional[Checklist]] = [None] * total
        for idx, record in completed.items():
            if 0 <= idx < total:
                checklists[idx] = _checklist_from_record(record)

        # JSONL is UTF-8 by convention — pin the encoding rather than relying
        # on the platform default (which may not be UTF-8, e.g. on Windows).
        out_file = open(output_path, "a", encoding="utf-8") if output_path else None
        try:
            for i, item in enumerate(data):
                if i in completed:
                    if on_progress:
                        on_progress(i + 1, total)
                    continue
                # Forward any extra per-item keys (besides input/target) to the generator.
                checklist = self.generate(input=item.get("input", ""), **{
                    k: v for k, v in item.items() if k not in ("input", "target")
                })
                checklists[i] = checklist
                if out_file:
                    record = _checklist_record(i, item, checklist)
                    out_file.write(json.dumps(record) + "\n")
                    out_file.flush()  # flush per item so a crash loses at most one record
                if on_progress:
                    on_progress(i + 1, total)
        finally:
            if out_file:
                out_file.close()

        return [c for c in checklists if c is not None]

    def run_batch(
        self,
        data: Optional[List[Dict[str, Any]]] = None,
        checklist: Optional[Checklist] = None,
        inputs: Optional[List[str]] = None,
        targets: Optional[List[str]] = None,
        show_progress: bool = False,
        on_progress: Optional[Callable[[int, int], None]] = None,
        output_path: Optional[str] = None,
        overwrite: bool = False,
    ) -> BatchResult:
        """Run batch evaluation on a corpus.

        Can be called with either:
        1. data: List of dicts with "input" and "target" keys
        2. inputs + targets: Separate lists

        Args:
            data: List of dicts with input/target pairs
            checklist: Optional shared checklist to use for all evaluations
            inputs: Optional list of inputs (alternative to data)
            targets: Optional list of targets (alternative to data)
            show_progress: Show progress bar
            on_progress: Optional callback
            output_path: Path to JSONL file for incremental writes + resume
            overwrite: If True, delete existing output_path before starting

        Returns:
            BatchResult with scores and aggregated metrics
        """
        # Normalize the two calling conventions into a single `data` list.
        if data is None:
            if inputs is None or targets is None:
                raise ValueError("Provide either 'data' or both 'inputs' and 'targets'")
            if len(inputs) != len(targets):
                raise ValueError("inputs and targets must have same length")
            data = [{"input": inp, "target": tgt} for inp, tgt in zip(inputs, targets)]

        if output_path is not None and checklist is not None:
            raise ValueError(
                "output_path with shared checklist is not supported. "
                "Use score_batch() for shared-checklist scoring."
            )

        # Without a shared checklist: generate + score per example.
        if checklist is None:
            return self._run_batch_generation_and_scoring(
                data=data,
                show_progress=show_progress,
                on_progress=on_progress,
                output_path=output_path,
                overwrite=overwrite,
            )

        # Shared-checklist path: score every target against the same checklist.
        shared_scores = self._run_batch_scoring(
            checklist=checklist,
            targets=[row.get("target", "") for row in data],
            inputs=[row.get("input") for row in data],
            show_progress=show_progress,
            on_progress=on_progress,
        )
        return BatchResult(scores=shared_scores, data=data, checklist=checklist)

    def _run_batch_generation_and_scoring(
        self,
        data: List[Dict[str, Any]],
        show_progress: bool = False,
        on_progress: Optional[Callable[[int, int], None]] = None,
        output_path: Optional[str] = None,
        overwrite: bool = False,
    ) -> BatchResult:
        """Generate and score for each item in the batch.

        Args:
            data: List of dicts with "input"/"target" keys; any extra keys
                are forwarded to the generator.
            show_progress: Show tqdm progress bars (generation + scoring).
            on_progress: Callback(completed, total) fired after each item.
            output_path: JSONL file for incremental writes + resume.
            overwrite: If True, delete existing output_path before starting.

        Returns:
            BatchResult over all successfully completed items.
        """
        total = len(data)

        # Resume logic: wipe the file on overwrite, otherwise reload
        # previously persisted records keyed by item index.
        if overwrite and output_path and os.path.exists(output_path):
            os.remove(output_path)
        completed = _load_completed_indices(output_path) if output_path else {}

        gen_pbar = None
        score_pbar = None
        if show_progress:
            try:
                from tqdm import tqdm
                # Start both bars at the resume point so they reflect prior runs.
                num_completed = len(completed)
                gen_pbar = tqdm(total=total, desc="Generating", initial=num_completed)
                score_pbar = tqdm(total=total, desc="Scoring", initial=num_completed)
            except ImportError:
                # tqdm is optional; degrade silently to no progress bars.
                pass

        checklists: List[Optional[Checklist]] = [None] * total
        scores: List[Optional[Score]] = [None] * total

        # Load completed results from the resume file into their slots.
        for idx, record in completed.items():
            if 0 <= idx < total:
                checklists[idx] = _checklist_from_record(record)
                scores[idx] = _score_from_record(record)

        out_file = open(output_path, "a") if output_path else None
        try:
            for i, item in enumerate(data):
                if i in completed:
                    # Finished in a previous run: report progress, skip work.
                    if on_progress:
                        on_progress(i + 1, total)
                    continue

                inp = item.get("input", "")
                tgt = item.get("target", "")

                # Extra keys (beyond input/target) are passed to the generator.
                cl = self.generate(input=inp, **{k: v for k, v in item.items() if k not in ("input", "target")})
                checklists[i] = cl
                if gen_pbar:
                    gen_pbar.update(1)

                sc = self.score(cl, tgt, input=inp)
                scores[i] = sc
                if score_pbar:
                    score_pbar.update(1)

                if out_file:
                    # Persist and flush immediately so a crash can resume here.
                    record = _record_from_result(i, item, cl, sc)
                    out_file.write(json.dumps(record) + "\n")
                    out_file.flush()

                if on_progress:
                    on_progress(i + 1, total)

            return BatchResult(
                scores=[s for s in scores if s is not None],
                data=data,
                checklists=[c for c in checklists if c is not None],
            )
        finally:
            # Cleanup runs even when generation/scoring raises mid-batch.
            if out_file:
                out_file.close()
            if gen_pbar:
                gen_pbar.close()
            if score_pbar:
                score_pbar.close()

    def _run_batch_scoring(
        self,
        checklist: Checklist,
        targets: List[str],
        inputs: Optional[List[str]] = None,
        show_progress: bool = False,
        on_progress: Optional[Callable[[int, int], None]] = None,
    ) -> List[Score]:
        """Score multiple targets against one shared checklist, sequentially.

        NOTE(review): an earlier docstring claimed "concurrency support",
        but scoring here is strictly one target at a time.

        Args:
            checklist: Shared checklist to evaluate every target against.
            targets: Target responses to score, in order.
            inputs: Optional per-target inputs for context (parallel to targets).
            show_progress: Show a tqdm progress bar if tqdm is installed.
            on_progress: Callback(completed, total) fired after each target.

        Returns:
            List of Score objects, one per target, in input order.
        """
        pbar = None
        total = len(targets)

        if show_progress:
            try:
                from tqdm import tqdm
                pbar = tqdm(total=total, desc="Scoring")
            except ImportError:
                # tqdm is optional; degrade silently to no progress bar.
                pass

        try:
            scores = []
            for i, target in enumerate(targets):
                inp = inputs[i] if inputs else None
                score = self.score(checklist, target, input=inp)
                scores.append(score)
                if pbar:
                    pbar.update(1)
                if on_progress:
                    on_progress(i + 1, total)
            return scores
        finally:
            # Close the bar even if a scorer call raises mid-batch.
            if pbar:
                pbar.close()

    def run_batch_from_file(
        self,
        path: str,
        checklist: Optional[Checklist] = None,
        input_key: str = "input",
        target_key: str = "target",
        show_progress: bool = False,
    ) -> BatchResult:
        """Run batch evaluation from a JSONL file.

        Args:
            path: Path to a JSONL file (one JSON object per line).
            checklist: Optional shared checklist for all evaluations.
            input_key: Key holding the input text in each record.
            target_key: Key holding the target text in each record.
            show_progress: Show progress bar.

        Returns:
            BatchResult with scores and aggregated metrics.
        """
        data = []
        # JSONL is UTF-8 by convention; don't rely on the platform default.
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank/trailing lines instead of crashing on them.
                    continue
                obj = json.loads(line)
                data.append({
                    "input": obj.get(input_key, ""),
                    "target": obj.get(target_key, ""),
                })

        return self.run_batch(
            data=data,
            checklist=checklist,
            show_progress=show_progress,
        )

is_instance_level property

Check if the generator is instance-level.

is_corpus_level property

Check if the generator is corpus-level.

__call__(input=None, target=None, **kwargs)

Run the full pipeline: generate → refine → score.

For instance-level generators, pass input and target. For corpus-level generators, pass the appropriate inputs via kwargs (e.g., feedback=..., dimensions=...).

Parameters:

Name Type Description Default
input Optional[str]

Input instruction/query (for instance-level)

None
target Optional[str]

Target response to evaluate (optional for generation-only)

None
**kwargs Any

Additional arguments passed to generator

{}

Returns:

Type Description
PipelineResult

PipelineResult with checklist and optional score

Source code in autochecklist/pipeline.py
def __call__(
    self,
    input: Optional[str] = None,
    target: Optional[str] = None,
    **kwargs: Any,
) -> PipelineResult:
    """Run the full pipeline: generate → refine → score.

    For instance-level generators, pass input and target.
    For corpus-level generators, pass the appropriate inputs via kwargs
    (e.g., feedback=..., dimensions=...).

    Args:
        input: Input instruction/query (for instance-level)
        target: Target response to evaluate (optional for generation-only)
        **kwargs: Additional arguments passed to generator

    Returns:
        PipelineResult with checklist and optional score
    """
    checklist = self.generate(input=input, **kwargs)

    # Scoring is opt-in: no target means generation-only.
    if target is None:
        return PipelineResult(checklist=checklist)

    return PipelineResult(
        checklist=checklist,
        score=self.score(checklist, target, input=input),
    )

generate(input=None, **kwargs)

Generate a checklist (without scoring).

Parameters:

Name Type Description Default
input Optional[str]

Input instruction/query (for instance-level)

None
**kwargs Any

Additional arguments for generator

{}

Returns:

Type Description
Checklist

Generated and refined checklist

Source code in autochecklist/pipeline.py
def generate(
    self,
    input: Optional[str] = None,
    **kwargs: Any,
) -> Checklist:
    """Generate a checklist (without scoring).

    Args:
        input: Input instruction/query (for instance-level)
        **kwargs: Additional arguments for generator

    Returns:
        Generated and refined checklist
    """
    if self.generator is None:
        raise RuntimeError("No generator configured. Provide generator (name or instance) to use generate().")

    if not self.is_instance_level:
        # Corpus-level generators take all of their inputs via kwargs.
        raw = self.generator.generate(**kwargs)
    elif input is None:
        raise ValueError("input is required for instance-level generators")
    else:
        raw = self.generator.generate(input=input, **kwargs)

    # Every generated checklist passes through the configured refiners.
    return self.refine(raw)

refine(checklist)

Apply refiners to a checklist.

Source code in autochecklist/pipeline.py
def refine(self, checklist: Checklist) -> Checklist:
    """Run the checklist through each configured refiner, in order."""
    result = checklist
    for stage in self.refiners:
        result = stage.refine(result)
    return result

score(checklist, target, input=None)

Score a target response against a checklist.

Parameters:

Name Type Description Default
checklist Checklist

Checklist to evaluate against

required
target str

Target response to score

required
input Optional[str]

Optional input for context

None

Returns:

Type Description
Score

Score object

Source code in autochecklist/pipeline.py
def score(
    self,
    checklist: Checklist,
    target: str,
    input: Optional[str] = None,
) -> Score:
    """Score a target response against a checklist.

    Args:
        checklist: Checklist to evaluate against
        target: Target response to score
        input: Optional input for context

    Returns:
        Score object
    """
    scorer = self.scorer
    if scorer is None:
        raise RuntimeError(
            "No scorer configured. Provide scorer (name or instance) to use score(). Use pipeline() for automatic scorer defaults."
        )
    # Delegate to the configured scorer; input is optional context.
    return scorer.score(checklist=checklist, target=target, input=input)

score_group(sub_checklists, target, input=None)

Score a target against sub-checklists (one per category).

Typically used with checklist.by_category() output.

Parameters:

Name Type Description Default
sub_checklists Dict[str, Checklist]

Dict mapping category name to sub-Checklist

required
target str

Target response to score

required
input Optional[str]

Optional input for context

None

Returns:

Type Description
GroupedScore

GroupedScore with per-category Score objects

Source code in autochecklist/pipeline.py
def score_group(
    self,
    sub_checklists: Dict[str, Checklist],
    target: str,
    input: Optional[str] = None,
) -> "GroupedScore":
    """Score a target against sub-checklists (one per category).

    Typically used with ``checklist.by_category()`` output.

    Args:
        sub_checklists: Dict mapping category name to sub-Checklist
        target: Target response to score
        input: Optional input for context

    Returns:
        GroupedScore with per-category Score objects
    """
    if self.scorer is None:
        raise RuntimeError("No scorer configured.")
    # Each category is scored independently with the shared scorer.
    per_category = {
        name: self.score(sub_cl, target, input=input)
        for name, sub_cl in sub_checklists.items()
    }
    return GroupedScore(scores=per_category)

score_batch(checklist, targets, inputs=None, show_progress=False, on_progress=None)

Score multiple targets against a single checklist.

Source code in autochecklist/pipeline.py
def score_batch(
    self,
    checklist: Checklist,
    targets: List[str],
    inputs: Optional[List[str]] = None,
    show_progress: bool = False,
    on_progress: Optional[Callable[[int, int], None]] = None,
) -> List[Score]:
    """Score multiple targets against a single shared checklist.

    Thin public wrapper over the internal sequential scoring loop; see
    that helper for progress-reporting details.
    """
    # Pass everything straight through unchanged.
    return self._run_batch_scoring(
        checklist=checklist,
        targets=targets,
        inputs=inputs,
        show_progress=show_progress,
        on_progress=on_progress,
    )

generate_batch(data=None, inputs=None, show_progress=False, on_progress=None, output_path=None, overwrite=False)

Generate checklists for a batch of inputs (no scoring).

Only works for instance-level generators (1:1 input → checklist). For corpus-level generators, call generator.generate() directly.

Parameters:

Name Type Description Default
data Optional[List[Dict[str, Any]]]

List of dicts with "input" key

None
inputs Optional[List[str]]

List of input strings (convenience alternative to data)

None
show_progress bool

Show progress bar

False
on_progress Optional[Callable[[int, int], None]]

Callback(completed, total) fired after each item

None
output_path Optional[str]

Path to JSONL file for incremental writes + resume

None
overwrite bool

If True, delete existing output_path before starting

False

Returns:

Type Description
List[Checklist]

List of Checklist objects

Source code in autochecklist/pipeline.py
def generate_batch(
    self,
    data: Optional[List[Dict[str, Any]]] = None,
    inputs: Optional[List[str]] = None,
    show_progress: bool = False,
    on_progress: Optional[Callable[[int, int], None]] = None,
    output_path: Optional[str] = None,
    overwrite: bool = False,
) -> List[Checklist]:
    """Generate checklists for a batch of inputs (no scoring).

    Only works for instance-level generators (1:1 input → checklist).
    For corpus-level generators, call generator.generate() directly.

    Args:
        data: List of dicts with "input" key
        inputs: List of input strings (convenience alternative to data)
        show_progress: Show progress bar
        on_progress: Callback(completed, total) fired after each item
        output_path: Path to JSONL file for incremental writes + resume
        overwrite: If True, delete existing output_path before starting

    Returns:
        List of Checklist objects
    """
    if self.generator is None:
        raise RuntimeError("No generator configured.")
    if not self.is_instance_level:
        raise RuntimeError(
            "generate_batch() only works for instance-level generators. "
            "Corpus-level generators produce one checklist from all inputs — "
            "call generator.generate() directly."
        )

    # Normalize the convenience `inputs` form into the `data` form.
    if data is None:
        if inputs is None:
            raise ValueError("Provide either 'data' or 'inputs'")
        data = [{"input": inp} for inp in inputs]

    total = len(data)

    # Resume logic: wipe on overwrite, otherwise reload persisted records
    # keyed by item index.
    if overwrite and output_path and os.path.exists(output_path):
        os.remove(output_path)
    completed = _load_completed_indices(output_path) if output_path else {}

    checklists: List[Optional[Checklist]] = [None] * total
    # Rehydrate checklists for items completed in a previous run.
    for idx, record in completed.items():
        if 0 <= idx < total:
            checklists[idx] = _checklist_from_record(record)

    out_file = open(output_path, "a") if output_path else None
    try:
        for i, item in enumerate(data):
            if i in completed:
                # Finished in a previous run: report progress, skip work.
                if on_progress:
                    on_progress(i + 1, total)
                continue
            # Extra keys (beyond input/target) are forwarded to the generator.
            checklist = self.generate(input=item.get("input", ""), **{
                k: v for k, v in item.items() if k not in ("input", "target")
            })
            checklists[i] = checklist
            if out_file:
                # Persist and flush immediately so a crash can resume here.
                record = _checklist_record(i, item, checklist)
                out_file.write(json.dumps(record) + "\n")
                out_file.flush()
            if on_progress:
                on_progress(i + 1, total)
    finally:
        if out_file:
            out_file.close()

    # Drop any slots left unfilled (e.g. out-of-range resume records).
    return [c for c in checklists if c is not None]

run_batch(data=None, checklist=None, inputs=None, targets=None, show_progress=False, on_progress=None, output_path=None, overwrite=False)

Run batch evaluation on a corpus.

Can be called with either: 1. data: List of dicts with "input" and "target" keys 2. inputs + targets: Separate lists

Parameters:

Name Type Description Default
data Optional[List[Dict[str, Any]]]

List of dicts with input/target pairs

None
checklist Optional[Checklist]

Optional shared checklist to use for all evaluations

None
inputs Optional[List[str]]

Optional list of inputs (alternative to data)

None
targets Optional[List[str]]

Optional list of targets (alternative to data)

None
show_progress bool

Show progress bar

False
on_progress Optional[Callable[[int, int], None]]

Optional callback

None
output_path Optional[str]

Path to JSONL file for incremental writes + resume

None
overwrite bool

If True, delete existing output_path before starting

False

Returns:

Type Description
BatchResult

BatchResult with scores and aggregated metrics

Source code in autochecklist/pipeline.py
def run_batch(
    self,
    data: Optional[List[Dict[str, Any]]] = None,
    checklist: Optional[Checklist] = None,
    inputs: Optional[List[str]] = None,
    targets: Optional[List[str]] = None,
    show_progress: bool = False,
    on_progress: Optional[Callable[[int, int], None]] = None,
    output_path: Optional[str] = None,
    overwrite: bool = False,
) -> BatchResult:
    """Run batch evaluation on a corpus.

    Accepts either calling convention:
    1. ``data``: list of dicts with "input" and "target" keys
    2. ``inputs`` + ``targets``: two parallel lists

    Args:
        data: List of dicts with input/target pairs
        checklist: Optional shared checklist to use for all evaluations
        inputs: Optional list of inputs (alternative to data)
        targets: Optional list of targets (alternative to data)
        show_progress: Show progress bar
        on_progress: Optional callback(completed, total)
        output_path: Path to JSONL file for incremental writes + resume
        overwrite: If True, delete existing output_path before starting

    Returns:
        BatchResult with scores and aggregated metrics
    """
    # Normalize both calling conventions into a single `data` list.
    if data is None:
        if inputs is None or targets is None:
            raise ValueError("Provide either 'data' or both 'inputs' and 'targets'")
        if len(inputs) != len(targets):
            raise ValueError("inputs and targets must have same length")
        data = [{"input": inp, "target": tgt} for inp, tgt in zip(inputs, targets)]

    # Resumable output only makes sense for per-item generation.
    if checklist is not None and output_path is not None:
        raise ValueError(
            "output_path with shared checklist is not supported. "
            "Use score_batch() for shared-checklist scoring."
        )

    if checklist is None:
        # Per-item path: generate a checklist for each item, then score it.
        return self._run_batch_generation_and_scoring(
            data=data,
            show_progress=show_progress,
            on_progress=on_progress,
            output_path=output_path,
            overwrite=overwrite,
        )

    # Shared-checklist path: score every target against the one checklist.
    scores = self._run_batch_scoring(
        checklist=checklist,
        targets=[d.get("target", "") for d in data],
        inputs=[d.get("input") for d in data],
        show_progress=show_progress,
        on_progress=on_progress,
    )
    return BatchResult(scores=scores, data=data, checklist=checklist)

run_batch_from_file(path, checklist=None, input_key='input', target_key='target', show_progress=False)

Run batch evaluation from a JSONL file.

Source code in autochecklist/pipeline.py
def run_batch_from_file(
    self,
    path: str,
    checklist: Optional[Checklist] = None,
    input_key: str = "input",
    target_key: str = "target",
    show_progress: bool = False,
) -> BatchResult:
    """Run batch evaluation from a JSONL file.

    Args:
        path: Path to a JSONL file (one JSON object per line).
        checklist: Optional shared checklist for all evaluations.
        input_key: Key holding the input text in each record.
        target_key: Key holding the target text in each record.
        show_progress: Show progress bar.

    Returns:
        BatchResult with scores and aggregated metrics.
    """
    data = []
    # JSONL is UTF-8 by convention; don't rely on the platform default.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of crashing on them.
                continue
            obj = json.loads(line)
            data.append({
                "input": obj.get(input_key, ""),
                "target": obj.get(target_key, ""),
            })

    return self.run_batch(
        data=data,
        checklist=checklist,
        show_progress=show_progress,
    )

pipeline(task=None, generator_model=None, scorer_model=None, refiners=None, scorer=None, provider=None, base_url=None, client=None, api_key=None, api_format=None, reasoning_effort=None, generator_kwargs=None, scorer_kwargs=None, custom_prompt=None)

Convenience factory that creates a ChecklistPipeline from a pipeline preset.

Equivalent to ChecklistPipeline(from_preset=task, ...). Resolves both the generator and the preset's default scorer automatically.

Can also be called with custom_prompt= instead of a task name to create a pipeline from a custom prompt without registration.

Parameters:

Name Type Description Default
task Optional[str]

Pipeline name or alias. Options: - Instance-level: "tick", "rlcf_direct", "rocketeval", "rlcf_candidate", "rlcf_candidates_only" - Corpus-level: "feedback", "checkeval", "interacteval" Can be None if custom_prompt is provided.

None
generator_model Optional[str]

Model for the generator

None
scorer_model Optional[str]

Model for the scorer

None
refiners Optional[List[Union[str, Any]]]

Optional list of refiner names or instances

None
scorer Optional[Union[str, Any]]

Optional scorer name or instance (overrides preset default)

None
provider Optional[str]

LLM provider ("openrouter", "openai", "vllm")

None
base_url Optional[str]

Override base URL (e.g., vLLM server URL)

None
client Any

Injected LLM client (e.g., VLLMOfflineClient)

None
api_key Optional[str]

API key for the provider

None
api_format Optional[str]

API format ("chat" or "responses")

None

reasoning_effort Optional[str]

Optional reasoning-effort setting, forwarded unchanged to ChecklistPipeline

None
generator_kwargs Optional[Dict[str, Any]]

Extra kwargs for generator (e.g., candidate config)

None
scorer_kwargs Optional[Dict[str, Any]]

Extra kwargs for scorer

None
custom_prompt Optional[Union[str, Path]]

Custom generator prompt. Pass a Path to load from file, or a str for raw prompt text. No registration needed.

None

Returns:

Type Description
ChecklistPipeline

Configured ChecklistPipeline

Example

pipe = pipeline("tick", generator_model="gpt-4o-mini") pipe = pipeline("tick", generator_model="gpt-4o", scorer_model="gpt-4o-mini") pipe = pipeline(custom_prompt=Path("my_eval.md"), scorer_model="gpt-4o-mini")

Source code in autochecklist/pipeline.py
def pipeline(
    task: Optional[str] = None,
    generator_model: Optional[str] = None,
    scorer_model: Optional[str] = None,
    refiners: Optional[List[Union[str, Any]]] = None,
    scorer: Optional[Union[str, Any]] = None,
    provider: Optional[str] = None,
    base_url: Optional[str] = None,
    client: Any = None,
    api_key: Optional[str] = None,
    api_format: Optional[str] = None,
    reasoning_effort: Optional[str] = None,
    generator_kwargs: Optional[Dict[str, Any]] = None,
    scorer_kwargs: Optional[Dict[str, Any]] = None,
    custom_prompt: Optional[Union[str, Path]] = None,
) -> ChecklistPipeline:
    """Convenience factory that creates a ChecklistPipeline from a pipeline preset.

    Equivalent to ``ChecklistPipeline(from_preset=task, ...)``. Resolves
    both the generator and the preset's default scorer automatically.

    Can also be called with ``custom_prompt=`` instead of a task name to
    create a pipeline from a custom prompt without registration.

    Args:
        task: Pipeline name or alias. Options:
            - Instance-level: "tick", "rlcf_direct", "rocketeval",
              "rlcf_candidate", "rlcf_candidates_only"
            - Corpus-level: "feedback", "checkeval", "interacteval"
            Can be None if custom_prompt is provided.
        generator_model: Model for the generator
        scorer_model: Model for the scorer
        refiners: Optional list of refiner names or instances
        scorer: Optional scorer name or instance (overrides preset default)
        provider: LLM provider ("openrouter", "openai", "vllm")
        base_url: Override base URL (e.g., vLLM server URL)
        client: Injected LLM client (e.g., VLLMOfflineClient)
        api_key: API key for the provider
        api_format: API format ("chat" or "responses")
        reasoning_effort: Optional reasoning-effort setting, forwarded
            unchanged to ChecklistPipeline
        generator_kwargs: Extra kwargs for generator (e.g., candidate config)
        scorer_kwargs: Extra kwargs for scorer
        custom_prompt: Custom generator prompt. Pass a Path to load from
            file, or a str for raw prompt text. No registration needed.

    Returns:
        Configured ChecklistPipeline

    Example:
        >>> pipe = pipeline("tick", generator_model="gpt-4o-mini")
        >>> pipe = pipeline("tick", generator_model="gpt-4o", scorer_model="gpt-4o-mini")
        >>> pipe = pipeline(custom_prompt=Path("my_eval.md"), scorer_model="gpt-4o-mini")
    """
    # If custom_prompt provided without task, default to "tick" preset
    if task is None and custom_prompt is not None:
        task = "tick"

    if task is None:
        raise ValueError("Must provide 'task' (pipeline name) or 'custom_prompt'")

    # Everything is forwarded verbatim; ChecklistPipeline does the resolution.
    return ChecklistPipeline(
        from_preset=task,
        generator_model=generator_model,
        scorer_model=scorer_model,
        refiners=refiners,
        scorer=scorer,
        provider=provider,
        base_url=base_url,
        client=client,
        api_key=api_key,
        api_format=api_format,
        reasoning_effort=reasoning_effort,
        generator_kwargs=generator_kwargs,
        scorer_kwargs=scorer_kwargs,
        custom_prompt=custom_prompt,
    )