providers

LLM providers.

LLMClient

Bases: Protocol

Protocol that all LLM providers must satisfy.

Returns OpenAI-format dicts everywhere so existing parsing code works unchanged regardless of provider.
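
For illustration, a hedged sketch of downstream code that depends only on the protocol; the ask helper is hypothetical and not part of the package:

from autochecklist.providers.base import LLMClient

def ask(client: LLMClient, model: str, prompt: str) -> str:
    # Any provider (HTTP or offline) satisfying the protocol works here;
    # the return value is always an OpenAI-format dict.
    response = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=256,
    )
    return response["choices"][0]["message"]["content"]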

Source code in autochecklist/providers/base.py
@runtime_checkable
class LLMClient(Protocol):
    """Protocol that all LLM providers must satisfy.

    Returns OpenAI-format dicts everywhere so existing parsing code
    works unchanged regardless of provider.
    """

    def chat_completion(
        self,
        model: str,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        **kwargs: Any,
    ) -> Dict[str, Any]: ...

    def get_logprobs(
        self,
        model: str,
        messages: List[Dict[str, str]],
        **kwargs: Any,
    ) -> Dict[str, float]: ...

    def supports_logprobs(self, model: str) -> bool: ...

    def batch_completions(
        self,
        requests: List[Dict[str, Any]],
        concurrency: int = 5,
        progress_callback: Optional[Callable[[int], None]] = None,
    ) -> List[Dict[str, Any]]: ...

    def close(self) -> None: ...

    def __enter__(self) -> "LLMClient": ...

    def __exit__(self, *args: Any) -> None: ...

ProviderConfig dataclass

Configuration preset for an LLM provider.

Source code in autochecklist/providers/base.py
@dataclass
class ProviderConfig:
    """Configuration preset for an LLM provider."""

    name: str
    base_url: Optional[str] = None
    api_key_env_var: Optional[str] = None
    default_headers: Dict[str, str] = field(default_factory=dict)
    requires_api_key: bool = True
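
A minimal sketch of defining a custom preset; the gateway name, URL, env var, and header below are illustrative assumptions rather than shipped presets:

from autochecklist.providers.base import ProviderConfig

local_gateway = ProviderConfig(
    name="local-gateway",
    base_url="http://localhost:8080/v1",
    api_key_env_var="LOCAL_GATEWAY_API_KEY",
    default_headers={"X-Client": "autochecklist"},
    requires_api_key=False,
)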

LLMHTTPClient

HTTP client for OpenAI-compatible LLM APIs.

Works with OpenRouter, OpenAI, and vLLM server mode. Supports both Chat Completions and Responses API formats.
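
A hedged usage sketch, assuming an OpenRouter API key is available via the provider's configured environment variable or global config; the model slug is only an example:

from autochecklist.providers.http_client import LLMHTTPClient

with LLMHTTPClient(provider="openrouter") as client:
    reply = client.chat_completion(
        model="openai/gpt-4o-mini",  # example model slug
        messages=[{"role": "user", "content": "Say hello."}],
        max_tokens=64,
    )
    print(reply["choices"][0]["message"]["content"])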

Source code in autochecklist/providers/http_client.py
class LLMHTTPClient:
    """HTTP client for OpenAI-compatible LLM APIs.

    Works with OpenRouter, OpenAI, and vLLM server mode. Supports both
    Chat Completions and Responses API formats.
    """

    # Class-level cache for model capabilities, keyed by (provider, base_url)
    _models_cache: Dict[tuple, List[Dict[str, Any]]] = {}
    _models_cache_time: Dict[tuple, float] = {}
    _CACHE_TTL = 3600  # 1 hour

    def __init__(
        self,
        provider: str = "openrouter",
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        timeout: int = 60,
        api_format: str = "chat",
    ):
        config = get_provider_config(provider, base_url=base_url)
        self.provider = provider
        self.api_format = api_format

        # Resolve base URL
        self.base_url = base_url or config.base_url or ""

        # Resolve API key: explicit > env var > config
        self.api_key = self._resolve_api_key(api_key, config)

        self.timeout = timeout
        self._provider_config = config

        # Build headers
        headers = {
            "Content-Type": "application/json",
            **config.default_headers,
        }
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        self._client = httpx.Client(
            base_url=self.base_url,
            headers=headers,
            timeout=self.timeout,
        )

    def _resolve_api_key(
        self,
        explicit_key: Optional[str],
        config: Any,
    ) -> Optional[str]:
        """Resolve API key from explicit param, env var, or global config."""
        if explicit_key:
            return explicit_key

        # Try provider-specific env var
        if config.api_key_env_var:
            env_key = os.getenv(config.api_key_env_var)
            if env_key:
                return env_key

        # Try global config for backward compat
        if self.provider == "openrouter":
            from ..config import get_config
            cfg = get_config()
            if cfg.openrouter_api_key:
                return cfg.openrouter_api_key

        if config.requires_api_key:
            env_var = config.api_key_env_var or "API_KEY"
            raise ValueError(
                f"{self.provider.title()} API key required. "
                f"Set {env_var} env var or pass api_key parameter."
            )

        return None

    def close(self) -> None:
        """Close the HTTP client."""
        self._client.close()

    def __enter__(self) -> "LLMHTTPClient":
        return self

    def __exit__(self, *args: Any) -> None:
        self.close()

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception(_is_retryable),
    )
    def chat_completion(
        self,
        model: str,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Make a chat completion request.

        For api_format="responses", translates to/from the Responses API format.
        Always returns normalized OpenAI Chat Completions format.
        """
        model = _normalize_model_name(model, self.provider)

        if self.api_format == "responses":
            return self._chat_completion_responses(
                model, messages, temperature, max_tokens, **kwargs
            )

        # Reasoning models require max_completion_tokens and only support temperature=1
        is_reasoning = _is_reasoning_model(model)
        token_key = "max_completion_tokens" if is_reasoning else "max_tokens"
        body = {
            "model": model,
            "messages": messages,
            token_key: max_tokens,
            **kwargs,
        }
        if not is_reasoning:
            body["temperature"] = temperature
        response = self._client.post("/chat/completions", json=body)
        _raise_with_detail(response)
        raw = response.json()
        return self._normalize_response(raw)

    def _chat_completion_responses(
        self,
        model: str,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Make a Responses API request and normalize to Chat Completions format."""
        # Build Responses API request body
        body: Dict[str, Any] = {
            "model": model,
            "input": messages,
            "max_output_tokens": max_tokens,
        }
        if not _is_reasoning_model(model):
            body["temperature"] = temperature

        # Handle response_format for Responses API
        # Chat Completions uses: {"response_format": {"type": "json_schema", "json_schema": {"name": ..., "schema": ...}}}
        # Responses API uses:    {"text": {"format": {"type": "json_schema", "name": ..., "schema": ...}}}
        response_format = kwargs.pop("response_format", None)
        if response_format is not None:
            if (
                isinstance(response_format, dict)
                and response_format.get("type") == "json_schema"
                and "json_schema" in response_format
            ):
                # Flatten: unwrap json_schema wrapper for Responses API
                inner = response_format["json_schema"]
                body["text"] = {"format": {
                    "type": "json_schema",
                    **inner,
                }}
            else:
                body["text"] = {"format": response_format}

        # Map Chat Completions kwargs to Responses API equivalents
        if kwargs.get("logprobs"):
            if "text" not in body:
                body["text"] = {"format": {"type": "text"}}
            body["include"] = ["message.output_text.logprobs"]
            # Responses API uses top_logprobs at request level
            if "top_logprobs" in kwargs:
                body["top_logprobs"] = kwargs.pop("top_logprobs")
            kwargs.pop("logprobs", None)

        # Pass through remaining kwargs
        for k, v in kwargs.items():
            if k not in ("max_tokens",):
                body[k] = v

        response = self._client.post("/responses", json=body)
        _raise_with_detail(response)
        raw = response.json()
        return self._normalize_response(raw)

    def _normalize_response(self, raw: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize response to Chat Completions format.

        Handles both Chat Completions (passthrough) and Responses API
        (translation) formats.
        """
        # Chat Completions format — already has "choices"
        if "choices" in raw:
            return raw

        # Responses API format — has "output"
        if "output" in raw:
            return self._normalize_responses_api(raw)

        # Unknown format — return as-is
        return raw

    def _normalize_responses_api(self, raw: Dict[str, Any]) -> Dict[str, Any]:
        """Translate Responses API format to Chat Completions format."""
        # Find message outputs
        text_parts = []
        all_logprobs = []

        for item in raw.get("output", []):
            if item.get("type") != "message":
                continue
            for content in item.get("content", []):
                if content.get("type") == "output_text":
                    text_parts.append(content.get("text", ""))
                    if "logprobs" in content and content["logprobs"]:
                        all_logprobs.extend(content["logprobs"])

        combined_text = "".join(text_parts)

        choice: Dict[str, Any] = {
            "message": {
                "role": "assistant",
                "content": combined_text,
            },
            "finish_reason": "stop",
        }

        # Include logprobs if present
        if all_logprobs:
            choice["logprobs"] = {"content": all_logprobs}

        result: Dict[str, Any] = {
            "choices": [choice],
            "model": raw.get("model", ""),
        }

        if "usage" in raw:
            result["usage"] = raw["usage"]

        return result

    def chat_completion_stream(
        self,
        model: str,
        messages: List[Dict[str, str]],
        **kwargs: Any,
    ) -> Iterator[str]:
        """Stream chat completion. Yields content chunks."""
        model = _normalize_model_name(model, self.provider)
        with self._client.stream(
            "POST",
            "/chat/completions",
            json={
                "model": model,
                "messages": messages,
                "stream": True,
                **kwargs,
            },
        ) as response:
            for line in response.iter_lines():
                if line.startswith("data: "):
                    data = line[6:]
                    if data != "[DONE]":
                        try:
                            chunk = json.loads(data)
                            if content := chunk["choices"][0]["delta"].get("content"):
                                yield content
                        except (json.JSONDecodeError, KeyError, IndexError):
                            pass

    def get_logprobs(
        self,
        model: str,
        messages: List[Dict[str, str]],
        **kwargs: Any,
    ) -> Dict[str, float]:
        """Get Yes/No log probabilities for normalized scoring.

        Returns dict with "yes" and "no" probability values.
        """
        response = self.chat_completion(
            model=model,
            messages=messages,
            logprobs=True,
            top_logprobs=5,
            max_tokens=512,
            **kwargs,
        )

        try:
            logprobs_data = response["choices"][0].get("logprobs")
            if logprobs_data is None:
                return {"yes": 0.0, "no": 0.0}

            logprobs = logprobs_data["content"][0]["top_logprobs"]
            probs = {
                lp["token"].lower().strip(): math.exp(lp["logprob"])
                for lp in logprobs
            }
            return {
                "yes": probs.get("yes", 0.0),
                "no": probs.get("no", 0.0),
            }
        except (KeyError, IndexError, TypeError):
            return {"yes": 0.0, "no": 0.0}

    def _get_models(self) -> List[Dict[str, Any]]:
        """Get models list (OpenRouter only), using cache if fresh."""
        cache_key = (self.provider, self.base_url)
        now = time.time()

        cached = LLMHTTPClient._models_cache.get(cache_key)
        cached_time = LLMHTTPClient._models_cache_time.get(cache_key, 0)

        if cached is not None and now - cached_time < self._CACHE_TTL:
            return cached

        try:
            response = self._client.get("/models")
            response.raise_for_status()
            models = response.json().get("data", [])
            LLMHTTPClient._models_cache[cache_key] = models
            LLMHTTPClient._models_cache_time[cache_key] = now
            return models
        except Exception:
            return []

    def supports_logprobs(self, model: str) -> bool:
        """Check if a model supports logprobs.

        vLLM and OpenAI always support logprobs.
        OpenRouter queries the /models endpoint.
        """
        if self.provider in ("vllm", "openai"):
            return True

        # OpenRouter: query models endpoint
        models = self._get_models()
        for m in models:
            if m.get("id") == model:
                supported = m.get("supported_parameters", [])
                return "logprobs" in supported
        return False

    async def chat_completion_async(
        self,
        model: str,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Async chat completion request."""
        headers = {
            "Content-Type": "application/json",
            **self._provider_config.default_headers,
        }
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        model = _normalize_model_name(model, self.provider)

        async with httpx.AsyncClient(
            base_url=self.base_url,
            headers=headers,
            timeout=self.timeout,
        ) as client:
            is_reasoning = _is_reasoning_model(model)
            token_key = "max_completion_tokens" if is_reasoning else "max_tokens"
            body = {
                "model": model,
                "messages": messages,
                token_key: max_tokens,
                **kwargs,
            }
            if not is_reasoning:
                body["temperature"] = temperature
            response = await client.post("/chat/completions", json=body)
            _raise_with_detail(response)
            return response.json()

    async def _batch_completions_async(
        self,
        requests: List[Dict[str, Any]],
        concurrency: int = 5,
        progress_callback: Optional[Callable[[int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Process multiple requests concurrently (async implementation)."""
        semaphore = asyncio.Semaphore(concurrency)
        results: List[Optional[Dict[str, Any]]] = [None] * len(requests)
        completed = 0

        async def limited_request(idx: int, req: Dict[str, Any]):
            nonlocal completed
            async with semaphore:
                try:
                    result = await self.chat_completion_async(**req)
                    results[idx] = result
                except Exception as e:
                    results[idx] = {"error": str(e)}
                finally:
                    completed += 1
                    if progress_callback:
                        progress_callback(completed)

        await asyncio.gather(*[
            limited_request(i, req) for i, req in enumerate(requests)
        ])

        return results  # type: ignore

    def batch_completions(
        self,
        requests: List[Dict[str, Any]],
        concurrency: int = 5,
        progress_callback: Optional[Callable[[int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Process multiple requests concurrently (sync wrapper)."""
        try:
            loop = asyncio.get_event_loop()
            if loop.is_running():
                import concurrent.futures
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    future = executor.submit(
                        asyncio.run,
                        self._batch_completions_async(requests, concurrency, progress_callback)
                    )
                    return future.result()
            else:
                return loop.run_until_complete(
                    self._batch_completions_async(requests, concurrency, progress_callback)
                )
        except RuntimeError:
            return asyncio.run(
                self._batch_completions_async(requests, concurrency, progress_callback)
            )

close()

Close the HTTP client.

Source code in autochecklist/providers/http_client.py
def close(self) -> None:
    """Close the HTTP client."""
    self._client.close()

chat_completion(model, messages, temperature=0.7, max_tokens=2048, **kwargs)

Make a chat completion request.

For api_format="responses", translates to/from the Responses API format. Always returns normalized OpenAI Chat Completions format.

Source code in autochecklist/providers/http_client.py
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    retry=retry_if_exception(_is_retryable),
)
def chat_completion(
    self,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: int = 2048,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Make a chat completion request.

    For api_format="responses", translates to/from the Responses API format.
    Always returns normalized OpenAI Chat Completions format.
    """
    model = _normalize_model_name(model, self.provider)

    if self.api_format == "responses":
        return self._chat_completion_responses(
            model, messages, temperature, max_tokens, **kwargs
        )

    # Reasoning models require max_completion_tokens and only support temperature=1
    is_reasoning = _is_reasoning_model(model)
    token_key = "max_completion_tokens" if is_reasoning else "max_tokens"
    body = {
        "model": model,
        "messages": messages,
        token_key: max_tokens,
        **kwargs,
    }
    if not is_reasoning:
        body["temperature"] = temperature
    response = self._client.post("/chat/completions", json=body)
    _raise_with_detail(response)
    raw = response.json()
    return self._normalize_response(raw)

chat_completion_stream(model, messages, **kwargs)

Stream chat completion. Yields content chunks.
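
A brief sketch of consuming the stream chunk by chunk; the client construction assumes an API key is configured and the model slug is an example:

from autochecklist.providers.http_client import LLMHTTPClient

client = LLMHTTPClient(provider="openrouter")  # assumes API key is configured
for chunk in client.chat_completion_stream(
    model="openai/gpt-4o-mini",  # example model slug
    messages=[{"role": "user", "content": "Stream a short greeting."}],
):
    print(chunk, end="", flush=True)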

Source code in autochecklist/providers/http_client.py
def chat_completion_stream(
    self,
    model: str,
    messages: List[Dict[str, str]],
    **kwargs: Any,
) -> Iterator[str]:
    """Stream chat completion. Yields content chunks."""
    model = _normalize_model_name(model, self.provider)
    with self._client.stream(
        "POST",
        "/chat/completions",
        json={
            "model": model,
            "messages": messages,
            "stream": True,
            **kwargs,
        },
    ) as response:
        for line in response.iter_lines():
            if line.startswith("data: "):
                data = line[6:]
                if data != "[DONE]":
                    try:
                        chunk = json.loads(data)
                        if content := chunk["choices"][0]["delta"].get("content"):
                            yield content
                    except (json.JSONDecodeError, KeyError, IndexError):
                        pass

get_logprobs(model, messages, **kwargs)

Get Yes/No log probabilities for normalized scoring.

Returns dict with "yes" and "no" probability values.
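
A sketch of turning the returned probabilities into a normalized yes-score; the normalization step is the caller's choice, not part of the method, and the model slug is an example:

from autochecklist.providers.http_client import LLMHTTPClient

client = LLMHTTPClient(provider="openrouter")  # assumes API key is configured
probs = client.get_logprobs(
    model="openai/gpt-4o-mini",  # example model with logprob support
    messages=[{"role": "user", "content": "Is Paris in France? Answer Yes or No."}],
)
total = probs["yes"] + probs["no"]
yes_score = probs["yes"] / total if total > 0 else 0.0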

Source code in autochecklist/providers/http_client.py
def get_logprobs(
    self,
    model: str,
    messages: List[Dict[str, str]],
    **kwargs: Any,
) -> Dict[str, float]:
    """Get Yes/No log probabilities for normalized scoring.

    Returns dict with "yes" and "no" probability values.
    """
    response = self.chat_completion(
        model=model,
        messages=messages,
        logprobs=True,
        top_logprobs=5,
        max_tokens=512,
        **kwargs,
    )

    try:
        logprobs_data = response["choices"][0].get("logprobs")
        if logprobs_data is None:
            return {"yes": 0.0, "no": 0.0}

        logprobs = logprobs_data["content"][0]["top_logprobs"]
        probs = {
            lp["token"].lower().strip(): math.exp(lp["logprob"])
            for lp in logprobs
        }
        return {
            "yes": probs.get("yes", 0.0),
            "no": probs.get("no", 0.0),
        }
    except (KeyError, IndexError, TypeError):
        return {"yes": 0.0, "no": 0.0}

supports_logprobs(model)

Check if a model supports logprobs.

vLLM and OpenAI always support logprobs. OpenRouter queries the /models endpoint.
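
A sketch of guarding logprob-based scoring behind the capability check; the model slug is illustrative and an API key is assumed to be configured:

from autochecklist.providers.http_client import LLMHTTPClient

client = LLMHTTPClient(provider="openrouter")
model = "openai/gpt-4o-mini"  # example model slug
if client.supports_logprobs(model):
    probs = client.get_logprobs(
        model=model,
        messages=[{"role": "user", "content": "Is water wet? Answer Yes or No."}],
    )
else:
    probs = {"yes": 0.0, "no": 0.0}  # fall back to neutral scores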

Source code in autochecklist/providers/http_client.py
def supports_logprobs(self, model: str) -> bool:
    """Check if a model supports logprobs.

    vLLM and OpenAI always support logprobs.
    OpenRouter queries the /models endpoint.
    """
    if self.provider in ("vllm", "openai"):
        return True

    # OpenRouter: query models endpoint
    models = self._get_models()
    for m in models:
        if m.get("id") == model:
            supported = m.get("supported_parameters", [])
            return "logprobs" in supported
    return False

chat_completion_async(model, messages, temperature=0.7, max_tokens=2048, **kwargs) async

Async chat completion request.
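
A minimal async sketch, assuming an API key is configured; the model slug is an example:

import asyncio

from autochecklist.providers.http_client import LLMHTTPClient

async def main() -> None:
    client = LLMHTTPClient(provider="openrouter")  # assumes API key is configured
    reply = await client.chat_completion_async(
        model="openai/gpt-4o-mini",  # example model slug
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(reply["choices"][0]["message"]["content"])
    client.close()

asyncio.run(main())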

Source code in autochecklist/providers/http_client.py
async def chat_completion_async(
    self,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: int = 2048,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Async chat completion request."""
    headers = {
        "Content-Type": "application/json",
        **self._provider_config.default_headers,
    }
    if self.api_key:
        headers["Authorization"] = f"Bearer {self.api_key}"

    model = _normalize_model_name(model, self.provider)

    async with httpx.AsyncClient(
        base_url=self.base_url,
        headers=headers,
        timeout=self.timeout,
    ) as client:
        is_reasoning = _is_reasoning_model(model)
        token_key = "max_completion_tokens" if is_reasoning else "max_tokens"
        body = {
            "model": model,
            "messages": messages,
            token_key: max_tokens,
            **kwargs,
        }
        if not is_reasoning:
            body["temperature"] = temperature
        response = await client.post("/chat/completions", json=body)
        _raise_with_detail(response)
        return response.json()

batch_completions(requests, concurrency=5, progress_callback=None)

Process multiple requests concurrently (sync wrapper).
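
A sketch of batching several prompts with a simple progress callback; the model slug is illustrative and an API key is assumed to be configured:

from autochecklist.providers.http_client import LLMHTTPClient

client = LLMHTTPClient(provider="openrouter")
requests = [
    {
        "model": "openai/gpt-4o-mini",  # example model slug
        "messages": [{"role": "user", "content": f"Summarize item {i}."}],
    }
    for i in range(10)
]
results = client.batch_completions(
    requests,
    concurrency=5,
    progress_callback=lambda done: print(f"{done}/{len(requests)} complete"),
)
# Failed requests come back as {"error": "..."} entries in the same order.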

Source code in autochecklist/providers/http_client.py
def batch_completions(
    self,
    requests: List[Dict[str, Any]],
    concurrency: int = 5,
    progress_callback: Optional[Callable[[int], None]] = None,
) -> List[Dict[str, Any]]:
    """Process multiple requests concurrently (sync wrapper)."""
    try:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(
                    asyncio.run,
                    self._batch_completions_async(requests, concurrency, progress_callback)
                )
                return future.result()
        else:
            return loop.run_until_complete(
                self._batch_completions_async(requests, concurrency, progress_callback)
            )
    except RuntimeError:
        return asyncio.run(
            self._batch_completions_async(requests, concurrency, progress_callback)
        )

VLLMOfflineClient

Offline inference client using vLLM's Python API.

Loads a model once at init and reuses it for all calls. The model parameter in method signatures is ignored — the model is fixed at construction time.

Context manager is a no-op: the model stays loaded. This is critical because existing code does with Client() as client: in tight loops.
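
A usage sketch, assuming vllm is installed locally; the checkpoint name is just an example:

from autochecklist.providers.vllm_offline import VLLMOfflineClient

client = VLLMOfflineClient(
    model="meta-llama/Llama-3.1-8B-Instruct",  # example checkpoint
    gpu_memory_utilization=0.85,
)
reply = client.chat_completion(
    model="ignored",  # the per-call model argument is ignored in offline mode
    messages=[{"role": "user", "content": "Say hello."}],
    temperature=0.2,
)
print(reply["choices"][0]["message"]["content"])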

Source code in autochecklist/providers/vllm_offline.py
class VLLMOfflineClient:
    """Offline inference client using vLLM's Python API.

    Loads a model once at __init__ and reuses it for all calls.
    The model parameter in method signatures is ignored — the model
    is fixed at construction time.

    Context manager is a no-op: the model stays loaded. This is critical
    because existing code does `with Client() as client:` in tight loops.
    """

    def __init__(
        self,
        model: str,
        tensor_parallel_size: int = 1,
        gpu_memory_utilization: float = 0.9,
        max_model_len: Optional[int] = None,
        **vllm_kwargs: Any,
    ):
        try:
            from vllm import LLM, SamplingParams
        except (ImportError, ModuleNotFoundError):
            raise ImportError(
                "vllm is required for offline inference. "
                "Install with: pip install vllm"
            )

        self._SamplingParams = SamplingParams
        self._model_name = model
        self._llm = LLM(
            model=model,
            tensor_parallel_size=tensor_parallel_size,
            gpu_memory_utilization=gpu_memory_utilization,
            max_model_len=max_model_len,
            **vllm_kwargs,
        )
        self._tokenizer = self._llm.get_tokenizer()

    def _apply_chat_template(self, messages: List[Dict[str, str]]) -> str:
        """Convert chat messages to a prompt string using the model's template."""
        if hasattr(self._tokenizer, "apply_chat_template"):
            return self._tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        # Fallback for models without chat templates
        parts = []
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                parts.append(f"System: {content}")
            elif role == "user":
                parts.append(f"User: {content}")
            elif role == "assistant":
                parts.append(f"Assistant: {content}")
        parts.append("Assistant:")
        return "\n\n".join(parts)

    def chat_completion(
        self,
        model: str,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Generate completion, returning OpenAI-format dict.

        The model parameter is ignored — uses the model loaded at init.
        """
        prompt = self._apply_chat_template(messages)

        logprobs_count = kwargs.pop("top_logprobs", None)
        request_logprobs = kwargs.pop("logprobs", False)
        kwargs.pop("reasoning_effort", None)  # Not supported by vLLM

        # Handle response_format → guided decoding
        response_format = kwargs.pop("response_format", None)
        guided_params = self._build_guided_params(response_format)

        sampling_params = self._SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
            logprobs=logprobs_count if request_logprobs else None,
            guided_decoding=guided_params,
        )

        outputs = self._llm.generate([prompt], sampling_params)
        output = outputs[0].outputs[0]

        # Build OpenAI-format response
        response: Dict[str, Any] = {
            "choices": [{
                "message": {
                    "role": "assistant",
                    "content": output.text,
                },
                "finish_reason": output.finish_reason,
            }],
            "model": self._model_name,
            "usage": {
                "prompt_tokens": len(outputs[0].prompt_token_ids),
                "completion_tokens": len(output.token_ids),
            },
        }

        # Include logprobs if requested
        if request_logprobs and output.logprobs:
            response["choices"][0]["logprobs"] = self._format_logprobs(
                output.logprobs, logprobs_count or 5
            )

        return response

    @staticmethod
    def _build_guided_params(response_format: Optional[Dict[str, Any]]) -> Any:
        """Convert OpenAI-style response_format to vLLM GuidedDecodingParams.

        Returns None when no response_format is provided.
        """
        if response_format is None:
            return None
        from vllm import GuidedDecodingParams

        json_schema = response_format
        if isinstance(response_format, dict) and "json_schema" in response_format:
            json_schema = response_format["json_schema"].get("schema", response_format)
        return GuidedDecodingParams(json=json_schema)

    def _format_logprobs(
        self,
        vllm_logprobs: List,
        top_n: int,
    ) -> Dict[str, Any]:
        """Convert vLLM logprobs to OpenAI format.

        vLLM returns: List[Dict[int, Logprob]] per token
        OpenAI format: {"content": [{"token": str, "logprob": float, "top_logprobs": [...]}]}
        """
        content = []
        for token_logprobs in vllm_logprobs:
            if token_logprobs is None:
                continue

            sorted_lps = sorted(
                token_logprobs.values(),
                key=lambda lp: lp.logprob,
                reverse=True,
            )[:top_n]

            top_logprobs_list = []
            for lp in sorted_lps:
                token_str = (
                    lp.decoded_token
                    if hasattr(lp, "decoded_token") and lp.decoded_token
                    else str(lp.rank if hasattr(lp, "rank") else "")
                )
                top_logprobs_list.append({
                    "token": token_str,
                    "logprob": lp.logprob,
                })

            if top_logprobs_list:
                content.append({
                    "token": top_logprobs_list[0]["token"],
                    "logprob": top_logprobs_list[0]["logprob"],
                    "top_logprobs": top_logprobs_list,
                })

        return {"content": content}

    def get_logprobs(
        self,
        model: str,
        messages: List[Dict[str, str]],
        **kwargs: Any,
    ) -> Dict[str, float]:
        """Get Yes/No log probabilities."""
        response = self.chat_completion(
            model=model,
            messages=messages,
            logprobs=True,
            top_logprobs=5,
            max_tokens=512,
            **kwargs,
        )

        try:
            logprobs_data = response["choices"][0].get("logprobs")
            if logprobs_data is None:
                return {"yes": 0.0, "no": 0.0}

            logprobs = logprobs_data["content"][0]["top_logprobs"]
            probs = {
                lp["token"].lower().strip(): math.exp(lp["logprob"])
                for lp in logprobs
            }
            return {
                "yes": probs.get("yes", 0.0),
                "no": probs.get("no", 0.0),
            }
        except (KeyError, IndexError, TypeError):
            return {"yes": 0.0, "no": 0.0}

    def supports_logprobs(self, model: str) -> bool:
        """vLLM always supports logprobs."""
        return True

    def batch_completions(
        self,
        requests: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Process batch using vLLM's native batching."""
        prompts = []
        sampling_params_list = []

        for req in requests:
            msgs = req["messages"]
            prompt = self._apply_chat_template(msgs)
            prompts.append(prompt)
            guided_params = self._build_guided_params(
                req.get("response_format")
            )
            sampling_params_list.append(self._SamplingParams(
                temperature=req.get("temperature", 0.7),
                max_tokens=req.get("max_tokens", 2048),
                guided_decoding=guided_params,
            ))

        all_outputs = self._llm.generate(prompts, sampling_params_list)

        results = []
        for i, output in enumerate(all_outputs):
            results.append({
                "choices": [{
                    "message": {
                        "role": "assistant",
                        "content": output.outputs[0].text,
                    },
                }],
                "model": self._model_name,
            })
            if progress_callback:
                progress_callback(i + 1)

        return results

    def close(self) -> None:
        """No-op — model stays loaded until garbage collection."""
        pass

    def __enter__(self) -> "VLLMOfflineClient":
        return self

    def __exit__(self, *args: Any) -> None:
        # Do NOT unload model on context exit
        pass

chat_completion(model, messages, temperature=0.7, max_tokens=2048, **kwargs)

Generate completion, returning OpenAI-format dict.

The model parameter is ignored — uses the model loaded at init.

Source code in autochecklist/providers/vllm_offline.py
def chat_completion(
    self,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: int = 2048,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Generate completion, returning OpenAI-format dict.

    The model parameter is ignored — uses the model loaded at init.
    """
    prompt = self._apply_chat_template(messages)

    logprobs_count = kwargs.pop("top_logprobs", None)
    request_logprobs = kwargs.pop("logprobs", False)
    kwargs.pop("reasoning_effort", None)  # Not supported by vLLM

    # Handle response_format → guided decoding
    response_format = kwargs.pop("response_format", None)
    guided_params = self._build_guided_params(response_format)

    sampling_params = self._SamplingParams(
        temperature=temperature,
        max_tokens=max_tokens,
        logprobs=logprobs_count if request_logprobs else None,
        guided_decoding=guided_params,
    )

    outputs = self._llm.generate([prompt], sampling_params)
    output = outputs[0].outputs[0]

    # Build OpenAI-format response
    response: Dict[str, Any] = {
        "choices": [{
            "message": {
                "role": "assistant",
                "content": output.text,
            },
            "finish_reason": output.finish_reason,
        }],
        "model": self._model_name,
        "usage": {
            "prompt_tokens": len(outputs[0].prompt_token_ids),
            "completion_tokens": len(output.token_ids),
        },
    }

    # Include logprobs if requested
    if request_logprobs and output.logprobs:
        response["choices"][0]["logprobs"] = self._format_logprobs(
            output.logprobs, logprobs_count or 5
        )

    return response

get_logprobs(model, messages, **kwargs)

Get Yes/No log probabilities.

Source code in autochecklist/providers/vllm_offline.py
def get_logprobs(
    self,
    model: str,
    messages: List[Dict[str, str]],
    **kwargs: Any,
) -> Dict[str, float]:
    """Get Yes/No log probabilities."""
    response = self.chat_completion(
        model=model,
        messages=messages,
        logprobs=True,
        top_logprobs=5,
        max_tokens=512,
        **kwargs,
    )

    try:
        logprobs_data = response["choices"][0].get("logprobs")
        if logprobs_data is None:
            return {"yes": 0.0, "no": 0.0}

        logprobs = logprobs_data["content"][0]["top_logprobs"]
        probs = {
            lp["token"].lower().strip(): math.exp(lp["logprob"])
            for lp in logprobs
        }
        return {
            "yes": probs.get("yes", 0.0),
            "no": probs.get("no", 0.0),
        }
    except (KeyError, IndexError, TypeError):
        return {"yes": 0.0, "no": 0.0}

supports_logprobs(model)

vLLM always supports logprobs.

Source code in autochecklist/providers/vllm_offline.py
def supports_logprobs(self, model: str) -> bool:
    """vLLM always supports logprobs."""
    return True

batch_completions(requests, progress_callback=None)

Process batch using vLLM's native batching.

Source code in autochecklist/providers/vllm_offline.py
def batch_completions(
    self,
    requests: List[Dict[str, Any]],
    progress_callback: Optional[Callable[[int], None]] = None,
) -> List[Dict[str, Any]]:
    """Process batch using vLLM's native batching."""
    prompts = []
    sampling_params_list = []

    for req in requests:
        msgs = req["messages"]
        prompt = self._apply_chat_template(msgs)
        prompts.append(prompt)
        guided_params = self._build_guided_params(
            req.get("response_format")
        )
        sampling_params_list.append(self._SamplingParams(
            temperature=req.get("temperature", 0.7),
            max_tokens=req.get("max_tokens", 2048),
            guided_decoding=guided_params,
        ))

    all_outputs = self._llm.generate(prompts, sampling_params_list)

    results = []
    for i, output in enumerate(all_outputs):
        results.append({
            "choices": [{
                "message": {
                    "role": "assistant",
                    "content": output.outputs[0].text,
                },
            }],
            "model": self._model_name,
        })
        if progress_callback:
            progress_callback(i + 1)

    return results

close()

No-op — model stays loaded until garbage collection.

Source code in autochecklist/providers/vllm_offline.py
def close(self) -> None:
    """No-op — model stays loaded until garbage collection."""
    pass

get_provider_config(provider, base_url=None)

Get provider config with optional overrides.

Parameters:

    provider (str): Provider name ("openrouter", "openai", "vllm"). Required.
    base_url (Optional[str]): Override the default base URL. Default: None.

Returns:

    ProviderConfig: ProviderConfig with overrides applied.

Source code in autochecklist/providers/base.py
def get_provider_config(
    provider: str,
    base_url: Optional[str] = None,
) -> ProviderConfig:
    """Get provider config with optional overrides.

    Args:
        provider: Provider name ("openrouter", "openai", "vllm")
        base_url: Override the default base URL

    Returns:
        ProviderConfig with overrides applied
    """
    if provider not in PROVIDER_PRESETS:
        raise ValueError(
            f"Unknown provider: {provider}. "
            f"Available: {', '.join(PROVIDER_PRESETS.keys())}"
        )

    preset = PROVIDER_PRESETS[provider]

    if base_url is not None:
        # Return a copy with overridden base_url
        return ProviderConfig(
            name=preset.name,
            base_url=base_url,
            api_key_env_var=preset.api_key_env_var,
            default_headers=preset.default_headers,
            requires_api_key=preset.requires_api_key,
        )

    return preset
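
A short sketch of resolving a preset and overriding its base URL; the localhost URL is an assumption about a locally running vLLM server:

from autochecklist.providers.base import get_provider_config

cfg = get_provider_config("vllm", base_url="http://localhost:8000/v1")
print(cfg.name, cfg.base_url, cfg.requires_api_key)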

get_client(provider='openrouter', base_url=None, api_key=None, model=None, api_format=None, **kwargs)

Create an LLM client for the given provider.

Parameters:

    provider (str): Provider name ("openrouter", "openai", "vllm"). Default: 'openrouter'.
    base_url (Optional[str]): Override the default base URL. For vLLM, None means offline mode. Default: None.
    api_key (Optional[str]): API key (resolved from env if not provided). Default: None.
    model (Optional[str]): Model name (required for vLLM offline mode). Default: None.
    api_format (Optional[str]): API format ("chat" or "responses"). Defaults to "responses" for OpenAI and "chat" for other providers.
    **kwargs (Any): Additional kwargs passed to the client constructor. Default: {}.

Returns:

    LLMClient: An LLMClient instance.

Raises:

    ValueError: If the provider is unknown, or if vLLM offline mode is requested without a model.

Source code in autochecklist/providers/factory.py
def get_client(
    provider: str = "openrouter",
    base_url: Optional[str] = None,
    api_key: Optional[str] = None,
    model: Optional[str] = None,
    api_format: Optional[str] = None,
    **kwargs: Any,
) -> LLMClient:
    """Create an LLM client for the given provider.

    Args:
        provider: Provider name ("openrouter", "openai", "vllm")
        base_url: Override the default base URL. For vLLM, None means offline mode.
        api_key: API key (resolved from env if not provided)
        model: Model name (required for vLLM offline mode)
        api_format: API format ("chat" or "responses"). Defaults to "responses"
            for OpenAI, "chat" for other providers.
        **kwargs: Additional kwargs passed to the client constructor

    Returns:
        An LLMClient instance

    Raises:
        ValueError: If provider is unknown or vLLM offline mode missing model
    """
    if provider == "vllm" and base_url is None:
        # Offline mode — direct Python inference
        from .vllm_offline import VLLMOfflineClient

        if model is None:
            raise ValueError(
                "model is required for vLLM offline mode. "
                "Pass model='your-model-name' or set base_url for server mode."
            )
        return VLLMOfflineClient(model=model, **kwargs)

    # HTTP mode (OpenRouter, OpenAI, vLLM server)
    from .http_client import LLMHTTPClient

    # Default to Responses API for OpenAI (handles reasoning models natively)
    if api_format is None:
        api_format = "responses" if provider == "openai" else "chat"

    return LLMHTTPClient(
        provider=provider,
        api_key=api_key,
        base_url=base_url,
        api_format=api_format,
        **kwargs,
    )
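
A closing sketch of the two vLLM paths the factory supports; the checkpoint name and server URL are illustrative:

from autochecklist.providers.factory import get_client

# HTTP mode: talk to a running vLLM server (OpenAI-compatible endpoint)
server_client = get_client(provider="vllm", base_url="http://localhost:8000/v1")

# Offline mode: base_url omitted, so the model must be named explicitly
offline_client = get_client(
    provider="vllm",
    model="meta-llama/Llama-3.1-8B-Instruct",  # example checkpoint
)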