Skip to content

vllm_offline

vllm_offline

vLLM offline inference client.

Wraps vllm.LLM for direct Python inference without an HTTP server. Satisfies the LLMClient protocol so it can be used anywhere the HTTP client is used.

VLLMOfflineClient

Offline inference client using vLLM's Python API.

Loads a model once at init and reuses it for all calls. The model parameter in method signatures is ignored — the model is fixed at construction time.

Context manager is a no-op: the model stays loaded. This is critical because existing code does with Client() as client: in tight loops.

Source code in autochecklist/providers/vllm_offline.py
class VLLMOfflineClient:
    """Offline inference client using vLLM's Python API.

    Loads a model once at __init__ and reuses it for all calls.
    The model parameter in method signatures is ignored — the model
    is fixed at construction time.

    Context manager is a no-op: the model stays loaded. This is critical
    because existing code does `with Client() as client:` in tight loops.
    """

    def __init__(
        self,
        model: str,
        tensor_parallel_size: int = 1,
        gpu_memory_utilization: float = 0.9,
        max_model_len: Optional[int] = None,
        **vllm_kwargs: Any,
    ):
        try:
            from vllm import LLM, SamplingParams
        except (ImportError, ModuleNotFoundError):
            raise ImportError(
                "vllm is required for offline inference. "
                "Install with: pip install vllm"
            )

        self._SamplingParams = SamplingParams
        self._model_name = model
        self._llm = LLM(
            model=model,
            tensor_parallel_size=tensor_parallel_size,
            gpu_memory_utilization=gpu_memory_utilization,
            max_model_len=max_model_len,
            **vllm_kwargs,
        )
        self._tokenizer = self._llm.get_tokenizer()

    def _apply_chat_template(self, messages: List[Dict[str, str]]) -> str:
        """Convert chat messages to a prompt string using the model's template."""
        if hasattr(self._tokenizer, "apply_chat_template"):
            return self._tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        # Fallback for models without chat templates
        parts = []
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                parts.append(f"System: {content}")
            elif role == "user":
                parts.append(f"User: {content}")
            elif role == "assistant":
                parts.append(f"Assistant: {content}")
        parts.append("Assistant:")
        return "\n\n".join(parts)

    def chat_completion(
        self,
        model: str,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Generate completion, returning OpenAI-format dict.

        The model parameter is ignored — uses the model loaded at init.
        """
        prompt = self._apply_chat_template(messages)

        logprobs_count = kwargs.pop("top_logprobs", None)
        request_logprobs = kwargs.pop("logprobs", False)
        kwargs.pop("reasoning_effort", None)  # Not supported by vLLM

        # Handle response_format → guided decoding
        response_format = kwargs.pop("response_format", None)
        guided_params = self._build_guided_params(response_format)

        sampling_params = self._SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
            logprobs=logprobs_count if request_logprobs else None,
            guided_decoding=guided_params,
        )

        outputs = self._llm.generate([prompt], sampling_params)
        output = outputs[0].outputs[0]

        # Build OpenAI-format response
        response: Dict[str, Any] = {
            "choices": [{
                "message": {
                    "role": "assistant",
                    "content": output.text,
                },
                "finish_reason": output.finish_reason,
            }],
            "model": self._model_name,
            "usage": {
                "prompt_tokens": len(outputs[0].prompt_token_ids),
                "completion_tokens": len(output.token_ids),
            },
        }

        # Include logprobs if requested
        if request_logprobs and output.logprobs:
            response["choices"][0]["logprobs"] = self._format_logprobs(
                output.logprobs, logprobs_count or 5
            )

        return response

    @staticmethod
    def _build_guided_params(response_format: Optional[Dict[str, Any]]) -> Any:
        """Convert OpenAI-style response_format to vLLM GuidedDecodingParams.

        Returns None when no response_format is provided.
        """
        if response_format is None:
            return None
        from vllm import GuidedDecodingParams

        json_schema = response_format
        if isinstance(response_format, dict) and "json_schema" in response_format:
            json_schema = response_format["json_schema"].get("schema", response_format)
        return GuidedDecodingParams(json=json_schema)

    def _format_logprobs(
        self,
        vllm_logprobs: List,
        top_n: int,
    ) -> Dict[str, Any]:
        """Convert vLLM logprobs to OpenAI format.

        vLLM returns: List[Dict[int, Logprob]] per token
        OpenAI format: {"content": [{"token": str, "logprob": float, "top_logprobs": [...]}]}
        """
        content = []
        for token_logprobs in vllm_logprobs:
            if token_logprobs is None:
                continue

            sorted_lps = sorted(
                token_logprobs.values(),
                key=lambda lp: lp.logprob,
                reverse=True,
            )[:top_n]

            top_logprobs_list = []
            for lp in sorted_lps:
                token_str = (
                    lp.decoded_token
                    if hasattr(lp, "decoded_token") and lp.decoded_token
                    else str(lp.rank if hasattr(lp, "rank") else "")
                )
                top_logprobs_list.append({
                    "token": token_str,
                    "logprob": lp.logprob,
                })

            if top_logprobs_list:
                content.append({
                    "token": top_logprobs_list[0]["token"],
                    "logprob": top_logprobs_list[0]["logprob"],
                    "top_logprobs": top_logprobs_list,
                })

        return {"content": content}

    def get_logprobs(
        self,
        model: str,
        messages: List[Dict[str, str]],
        **kwargs: Any,
    ) -> Dict[str, float]:
        """Get Yes/No log probabilities."""
        response = self.chat_completion(
            model=model,
            messages=messages,
            logprobs=True,
            top_logprobs=5,
            max_tokens=512,
            **kwargs,
        )

        try:
            logprobs_data = response["choices"][0].get("logprobs")
            if logprobs_data is None:
                return {"yes": 0.0, "no": 0.0}

            logprobs = logprobs_data["content"][0]["top_logprobs"]
            probs = {
                lp["token"].lower().strip(): math.exp(lp["logprob"])
                for lp in logprobs
            }
            return {
                "yes": probs.get("yes", 0.0),
                "no": probs.get("no", 0.0),
            }
        except (KeyError, IndexError, TypeError):
            return {"yes": 0.0, "no": 0.0}

    def supports_logprobs(self, model: str) -> bool:
        """vLLM always supports logprobs."""
        return True

    def batch_completions(
        self,
        requests: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Process batch using vLLM's native batching."""
        prompts = []
        sampling_params_list = []

        for req in requests:
            msgs = req["messages"]
            prompt = self._apply_chat_template(msgs)
            prompts.append(prompt)
            guided_params = self._build_guided_params(
                req.get("response_format")
            )
            sampling_params_list.append(self._SamplingParams(
                temperature=req.get("temperature", 0.7),
                max_tokens=req.get("max_tokens", 2048),
                guided_decoding=guided_params,
            ))

        all_outputs = self._llm.generate(prompts, sampling_params_list)

        results = []
        for i, output in enumerate(all_outputs):
            results.append({
                "choices": [{
                    "message": {
                        "role": "assistant",
                        "content": output.outputs[0].text,
                    },
                }],
                "model": self._model_name,
            })
            if progress_callback:
                progress_callback(i + 1)

        return results

    def close(self) -> None:
        """No-op — model stays loaded until garbage collection."""
        pass

    def __enter__(self) -> "VLLMOfflineClient":
        return self

    def __exit__(self, *args: Any) -> None:
        # Do NOT unload model on context exit
        pass

chat_completion(model, messages, temperature=0.7, max_tokens=2048, **kwargs)

Generate completion, returning OpenAI-format dict.

The model parameter is ignored — uses the model loaded at init.

Source code in autochecklist/providers/vllm_offline.py
def chat_completion(
    self,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: int = 2048,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Generate completion, returning OpenAI-format dict.

    The model parameter is ignored — uses the model loaded at init.
    """
    prompt = self._apply_chat_template(messages)

    logprobs_count = kwargs.pop("top_logprobs", None)
    request_logprobs = kwargs.pop("logprobs", False)
    kwargs.pop("reasoning_effort", None)  # Not supported by vLLM

    # Handle response_format → guided decoding
    response_format = kwargs.pop("response_format", None)
    guided_params = self._build_guided_params(response_format)

    sampling_params = self._SamplingParams(
        temperature=temperature,
        max_tokens=max_tokens,
        logprobs=logprobs_count if request_logprobs else None,
        guided_decoding=guided_params,
    )

    outputs = self._llm.generate([prompt], sampling_params)
    output = outputs[0].outputs[0]

    # Build OpenAI-format response
    response: Dict[str, Any] = {
        "choices": [{
            "message": {
                "role": "assistant",
                "content": output.text,
            },
            "finish_reason": output.finish_reason,
        }],
        "model": self._model_name,
        "usage": {
            "prompt_tokens": len(outputs[0].prompt_token_ids),
            "completion_tokens": len(output.token_ids),
        },
    }

    # Include logprobs if requested
    if request_logprobs and output.logprobs:
        response["choices"][0]["logprobs"] = self._format_logprobs(
            output.logprobs, logprobs_count or 5
        )

    return response

get_logprobs(model, messages, **kwargs)

Get Yes/No log probabilities.

Source code in autochecklist/providers/vllm_offline.py
def get_logprobs(
    self,
    model: str,
    messages: List[Dict[str, str]],
    **kwargs: Any,
) -> Dict[str, float]:
    """Get Yes/No log probabilities."""
    response = self.chat_completion(
        model=model,
        messages=messages,
        logprobs=True,
        top_logprobs=5,
        max_tokens=512,
        **kwargs,
    )

    try:
        logprobs_data = response["choices"][0].get("logprobs")
        if logprobs_data is None:
            return {"yes": 0.0, "no": 0.0}

        logprobs = logprobs_data["content"][0]["top_logprobs"]
        probs = {
            lp["token"].lower().strip(): math.exp(lp["logprob"])
            for lp in logprobs
        }
        return {
            "yes": probs.get("yes", 0.0),
            "no": probs.get("no", 0.0),
        }
    except (KeyError, IndexError, TypeError):
        return {"yes": 0.0, "no": 0.0}

supports_logprobs(model)

vLLM always supports logprobs.

Source code in autochecklist/providers/vllm_offline.py
def supports_logprobs(self, model: str) -> bool:
    """vLLM always supports logprobs."""
    return True

batch_completions(requests, progress_callback=None)

Process batch using vLLM's native batching.

Source code in autochecklist/providers/vllm_offline.py
def batch_completions(
    self,
    requests: List[Dict[str, Any]],
    progress_callback: Optional[Callable[[int], None]] = None,
) -> List[Dict[str, Any]]:
    """Process batch using vLLM's native batching."""
    prompts = []
    sampling_params_list = []

    for req in requests:
        msgs = req["messages"]
        prompt = self._apply_chat_template(msgs)
        prompts.append(prompt)
        guided_params = self._build_guided_params(
            req.get("response_format")
        )
        sampling_params_list.append(self._SamplingParams(
            temperature=req.get("temperature", 0.7),
            max_tokens=req.get("max_tokens", 2048),
            guided_decoding=guided_params,
        ))

    all_outputs = self._llm.generate(prompts, sampling_params_list)

    results = []
    for i, output in enumerate(all_outputs):
        results.append({
            "choices": [{
                "message": {
                    "role": "assistant",
                    "content": output.outputs[0].text,
                },
            }],
            "model": self._model_name,
        })
        if progress_callback:
            progress_callback(i + 1)

    return results

close()

No-op — model stays loaded until garbage collection.

Source code in autochecklist/providers/vllm_offline.py
def close(self) -> None:
    """No-op — model stays loaded until garbage collection."""
    pass