group-wbl/.venv/lib/python3.13/site-packages/langsmith/evaluation/llm_evaluator.py
"""Contains the `LLMEvaluator` class for building LLM-as-a-judge evaluators."""
from typing import Any, Callable, Optional, Union, cast
from pydantic import BaseModel
from langsmith._internal._beta_decorator import warn_beta
from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator
from langsmith.schemas import Example, Run


class CategoricalScoreConfig(BaseModel):
    """Configuration for a categorical score."""

    key: str
    choices: list[str]
    description: str
    include_explanation: bool = False
    explanation_description: Optional[str] = None


class ContinuousScoreConfig(BaseModel):
    """Configuration for a continuous score."""

    key: str
    min: float = 0
    max: float = 1
    description: str
    include_explanation: bool = False
    explanation_description: Optional[str] = None
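# Illustration (not part of the library): constructing the two score configs.
# The keys, choices, and descriptions below are hypothetical.
#
#   correctness = CategoricalScoreConfig(
#       key="correctness",
#       choices=["correct", "incorrect"],
#       description="Whether the answer matches the reference.",
#       include_explanation=True,
#   )
#   quality = ContinuousScoreConfig(
#       key="quality",
#       description="Answer quality on a 0-1 scale.",
#   )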


def _create_score_json_schema(
    score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
    """Build the JSON schema used for the model's structured scoring output."""
properties: dict[str, Any] = {}
if isinstance(score_config, CategoricalScoreConfig):
properties["score"] = {
"type": "string",
"enum": score_config.choices,
"description": f"The score for the evaluation, one of "
f"{', '.join(score_config.choices)}.",
}
elif isinstance(score_config, ContinuousScoreConfig):
properties["score"] = {
"type": "number",
"minimum": score_config.min,
"maximum": score_config.max,
"description": f"The score for the evaluation, between "
f"{score_config.min} and {score_config.max}, inclusive.",
}
else:
raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")
if score_config.include_explanation:
properties["explanation"] = {
"type": "string",
"description": (
"The explanation for the score."
if score_config.explanation_description is None
else score_config.explanation_description
),
}
return {
"title": score_config.key,
"description": score_config.description,
"type": "object",
"properties": properties,
"required": (
["score", "explanation"] if score_config.include_explanation else ["score"]
),
}
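# Illustration (not part of the library): the schema produced for the hypothetical
# `correctness` config sketched above.
#
#   _create_score_json_schema(correctness) == {
#       "title": "correctness",
#       "description": "Whether the answer matches the reference.",
#       "type": "object",
#       "properties": {
#           "score": {
#               "type": "string",
#               "enum": ["correct", "incorrect"],
#               "description": "The score for the evaluation, one of correct, incorrect.",
#           },
#           "explanation": {
#               "type": "string",
#               "description": "The explanation for the score.",
#           },
#       },
#       "required": ["score", "explanation"],
#   }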


class LLMEvaluator(RunEvaluator):
    """A class for building LLM-as-a-judge evaluators.

    .. deprecated:: 0.5.0
        LLMEvaluator is deprecated. Use openevals instead:
        https://github.com/langchain-ai/openevals
    """

    def __init__(
self,
*,
prompt_template: Union[str, list[tuple[str, str]]],
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
model_name: str = "gpt-4o",
model_provider: str = "openai",
**kwargs,
):
"""Initialize the `LLMEvaluator`.
Args:
prompt_template (Union[str, List[Tuple[str, str]]): The prompt
template to use for the evaluation. If a string is provided, it is
assumed to be a human / user message.
score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
The configuration for the score, either categorical or continuous.
map_variables (Optional[Callable[[Run, Example], dict]], optional):
A function that maps the run and example to the variables in the
prompt.
If `None`, it is assumed that the prompt only requires 'input',
'output', and 'expected'.
model_name (Optional[str], optional): The model to use for the evaluation.
model_provider (Optional[str], optional): The model provider to use
for the evaluation.
"""
try:
from langchain.chat_models import ( # type: ignore[import-not-found]
init_chat_model,
)
except ImportError as e:
raise ImportError(
"LLMEvaluator requires langchain to be installed. "
"Please install langchain by running `pip install langchain`."
) from e
chat_model = init_chat_model(
model=model_name, model_provider=model_provider, **kwargs
)
self._initialize(prompt_template, score_config, map_variables, chat_model)

    @classmethod
def from_model(
cls,
model: Any,
*,
prompt_template: Union[str, list[tuple[str, str]]],
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
):
"""Create an `LLMEvaluator` instance from a `BaseChatModel` instance.
Args:
model (BaseChatModel): The chat model instance to use for the evaluation.
prompt_template (Union[str, List[Tuple[str, str]]): The prompt
template to use for the evaluation. If a string is provided, it is
assumed to be a system message.
score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
The configuration for the score, either categorical or continuous.
map_variables (Optional[Callable[[Run, Example]], dict]], optional):
A function that maps the run and example to the variables in the
prompt.
If `None`, it is assumed that the prompt only requires 'input',
'output', and 'expected'.
Returns:
LLMEvaluator: An instance of `LLMEvaluator`.
"""
instance = cls.__new__(cls)
instance._initialize(prompt_template, score_config, map_variables, model)
return instance
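    # Sketch of `from_model` usage with an explicitly constructed chat model.
    # Not part of the library; assumes `langchain-openai` is installed, and the
    # model name and prompt text are hypothetical.
    #
    #   from langchain_openai import ChatOpenAI
    #
    #   evaluator = LLMEvaluator.from_model(
    #       ChatOpenAI(model="gpt-4o-mini"),
    #       prompt_template=(
    #           "Rate the answer.\n\nQuestion: {input}\nAnswer: {output}\n"
    #           "Reference: {expected}"
    #       ),
    #       score_config=ContinuousScoreConfig(
    #           key="quality", description="Answer quality on a 0-1 scale."
    #       ),
    #   )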

    def _initialize(
self,
prompt_template: Union[str, list[tuple[str, str]]],
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
chat_model: Any,
):
"""Shared initialization code for `__init__` and `from_model`.
Args:
prompt_template (Union[str, List[Tuple[str, str]]): The prompt template.
score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
The score configuration.
map_variables (Optional[Callable[[Run, Example]], dict]]):
Function to map variables.
chat_model (BaseChatModel): The chat model instance.
"""
try:
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
except ImportError as e:
raise ImportError(
"LLMEvaluator requires langchain-core to be installed. "
"Please install langchain-core by running `pip install langchain-core`."
) from e
if not (
isinstance(chat_model, BaseChatModel)
and hasattr(chat_model, "with_structured_output")
):
raise ValueError(
"chat_model must be an instance of "
"BaseLanguageModel and support structured output."
)
if isinstance(prompt_template, str):
self.prompt = ChatPromptTemplate.from_messages([("human", prompt_template)])
else:
self.prompt = ChatPromptTemplate.from_messages(prompt_template)
if set(self.prompt.input_variables) - {"input", "output", "expected"}:
if not map_variables:
raise ValueError(
"map_inputs must be provided if the prompt template contains "
"variables other than 'input', 'output', and 'expected'"
)
self.map_variables = map_variables
self.score_config = score_config
self.score_schema = _create_score_json_schema(self.score_config)
chat_model = chat_model.with_structured_output(self.score_schema)
self.runnable = self.prompt | chat_model
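    # After `_initialize`, `self.runnable` maps the prompt's input variables to
    # a dict matching `self.score_schema`. Hypothetical values for a categorical
    # config with explanations enabled:
    #
    #   self.runnable.invoke({"input": "2+2?", "output": "4", "expected": "4"})
    #   # -> {"score": "correct", "explanation": "The answer matches the reference."}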

    @warn_beta
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
"""Evaluate a run."""
variables = self._prepare_variables(run, example)
output: dict = cast(dict, self.runnable.invoke(variables))
return self._parse_output(output)

    @warn_beta
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
"""Asynchronously evaluate a run."""
variables = self._prepare_variables(run, example)
output: dict = cast(dict, await self.runnable.ainvoke(variables))
return self._parse_output(output)

    def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
"""Prepare variables for model invocation."""
if self.map_variables:
return self.map_variables(run, example)
variables = {}
if "input" in self.prompt.input_variables:
if len(run.inputs) == 0:
raise ValueError(
"No input keys are present in run.inputs but the prompt "
"requires 'input'."
)
if len(run.inputs) != 1:
raise ValueError(
"Multiple input keys are present in run.inputs. Please provide "
"a map_variables function."
)
variables["input"] = list(run.inputs.values())[0]
if "output" in self.prompt.input_variables:
            if not run.outputs:
                raise ValueError(
                    "No output keys are present in run.outputs but the prompt "
                    "requires 'output'."
                )
if len(run.outputs) != 1:
raise ValueError(
"Multiple output keys are present in run.outputs. Please "
"provide a map_variables function."
)
variables["output"] = list(run.outputs.values())[0]
if "expected" in self.prompt.input_variables:
            if not example or not example.outputs:
                raise ValueError(
                    "No example or example outputs are provided but the prompt "
                    "requires 'expected'."
                )
if len(example.outputs) != 1:
raise ValueError(
"Multiple output keys are present in example.outputs. Please "
"provide a map_variables function."
)
variables["expected"] = list(example.outputs.values())[0]
return variables
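    # Sketch of a custom `map_variables` hook that bypasses the logic above.
    # Illustrative only; the key names "question", "answer", and "reference"
    # are hypothetical run/example keys.
    #
    #   def map_variables(run: Run, example: Optional[Example]) -> dict:
    #       return {
    #           "input": run.inputs["question"],
    #           "output": (run.outputs or {}).get("answer", ""),
    #           "expected": (example.outputs or {}).get("reference", "") if example else "",
    #       }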

    def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
"""Parse the model output into an evaluation result."""
if isinstance(self.score_config, CategoricalScoreConfig):
value = output["score"]
explanation = output.get("explanation", None)
return EvaluationResult(
key=self.score_config.key, value=value, comment=explanation
)
elif isinstance(self.score_config, ContinuousScoreConfig):
score = output["score"]
explanation = output.get("explanation", None)
return EvaluationResult(
key=self.score_config.key, score=score, comment=explanation
)
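# Usage sketch (not part of the library). The prompt text, dataset name, and
# target function are hypothetical; requires `langchain` (for `init_chat_model`)
# and credentials for the default "gpt-4o" model from the "openai" provider.
#
#   from langsmith import evaluate
#
#   evaluator = LLMEvaluator(
#       prompt_template=(
#           "Is the answer correct?\n\nQuestion: {input}\nAnswer: {output}\n"
#           "Reference: {expected}"
#       ),
#       score_config=CategoricalScoreConfig(
#           key="correctness",
#           choices=["correct", "incorrect"],
#           description="Whether the answer matches the reference.",
#           include_explanation=True,
#       ),
#   )
#   results = evaluate(my_app, data="my-dataset", evaluators=[evaluator])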