"""Contains the `LLMEvaluator` class for building LLM-as-a-judge evaluators."""

from typing import Any, Callable, Optional, Union, cast

from pydantic import BaseModel

from langsmith._internal._beta_decorator import warn_beta
from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator
from langsmith.schemas import Example, Run


class CategoricalScoreConfig(BaseModel):
    """Configuration for a categorical score."""

    key: str
    choices: list[str]
    description: str
    include_explanation: bool = False
    explanation_description: Optional[str] = None


class ContinuousScoreConfig(BaseModel):
    """Configuration for a continuous score."""

    key: str
    min: float = 0
    max: float = 1
    description: str
    include_explanation: bool = False
    explanation_description: Optional[str] = None
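

# Example (illustrative sketch, not part of the library API): these configs describe the
# structured score the judge model is asked to return. The field values below are
# hypothetical.
#
#     correctness = CategoricalScoreConfig(
#         key="correctness",
#         choices=["correct", "incorrect"],
#         description="Whether the answer is factually correct.",
#         include_explanation=True,
#     )
#     relevance = ContinuousScoreConfig(
#         key="relevance",
#         description="How relevant the answer is to the question, from 0 to 1.",
#     )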


def _create_score_json_schema(
    score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
    """Build the JSON schema used for the model's structured score output."""
    properties: dict[str, Any] = {}
    if isinstance(score_config, CategoricalScoreConfig):
        properties["score"] = {
            "type": "string",
            "enum": score_config.choices,
            "description": f"The score for the evaluation, one of "
            f"{', '.join(score_config.choices)}.",
        }
    elif isinstance(score_config, ContinuousScoreConfig):
        properties["score"] = {
            "type": "number",
            "minimum": score_config.min,
            "maximum": score_config.max,
            "description": f"The score for the evaluation, between "
            f"{score_config.min} and {score_config.max}, inclusive.",
        }
    else:
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

    if score_config.include_explanation:
        properties["explanation"] = {
            "type": "string",
            "description": (
                "The explanation for the score."
                if score_config.explanation_description is None
                else score_config.explanation_description
            ),
        }

    return {
        "title": score_config.key,
        "description": score_config.description,
        "type": "object",
        "properties": properties,
        "required": (
            ["score", "explanation"] if score_config.include_explanation else ["score"]
        ),
    }
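
# For reference (hand-written sketch, not generated output): a CategoricalScoreConfig with
# key="correctness", choices=["correct", "incorrect"], and include_explanation=True
# produces a schema shaped roughly like:
#
#     {
#         "title": "correctness",
#         "description": "...",
#         "type": "object",
#         "properties": {
#             "score": {"type": "string", "enum": ["correct", "incorrect"], "description": "..."},
#             "explanation": {"type": "string", "description": "..."},
#         },
#         "required": ["score", "explanation"],
#     }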


class LLMEvaluator(RunEvaluator):
    """A class for building LLM-as-a-judge evaluators.

    .. deprecated:: 0.5.0

        LLMEvaluator is deprecated. Use openevals instead:
        https://github.com/langchain-ai/openevals
    """

    def __init__(
        self,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
        model_name: str = "gpt-4o",
        model_provider: str = "openai",
        **kwargs,
    ):
        """Initialize the `LLMEvaluator`.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]], optional):
                A function that maps the run and example to the variables in the
                prompt.

                If `None`, it is assumed that the prompt only requires 'input',
                'output', and 'expected'.
            model_name (str, optional): The model to use for the evaluation.
            model_provider (str, optional): The model provider to use
                for the evaluation.
        """
        try:
            from langchain.chat_models import (  # type: ignore[import-not-found]
                init_chat_model,
            )
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain to be installed. "
                "Please install langchain by running `pip install langchain`."
            ) from e

        chat_model = init_chat_model(
            model=model_name, model_provider=model_provider, **kwargs
        )

        self._initialize(prompt_template, score_config, map_variables, chat_model)
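
    # Example (illustrative sketch, not executed here): building a judge from a prompt
    # string. Assumes `langchain` plus a provider package (e.g. `langchain-openai`) are
    # installed and the corresponding API key is available in the environment.
    #
    #     evaluator = LLMEvaluator(
    #         prompt_template=(
    #             "Question: {input}\nAnswer: {output}\n"
    #             "Is the answer correct?"
    #         ),
    #         score_config=CategoricalScoreConfig(
    #             key="correctness",
    #             choices=["correct", "incorrect"],
    #             description="Whether the answer is factually correct.",
    #             include_explanation=True,
    #         ),
    #         model_name="gpt-4o",
    #         model_provider="openai",
    #     )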

    @classmethod
    def from_model(
        cls,
        model: Any,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
    ):
        """Create an `LLMEvaluator` instance from a `BaseChatModel` instance.

        Args:
            model (BaseChatModel): The chat model instance to use for the evaluation.
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]], optional):
                A function that maps the run and example to the variables in the
                prompt.

                If `None`, it is assumed that the prompt only requires 'input',
                'output', and 'expected'.

        Returns:
            LLMEvaluator: An instance of `LLMEvaluator`.
        """
        instance = cls.__new__(cls)
        instance._initialize(prompt_template, score_config, map_variables, model)
        return instance
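
    # Example (hypothetical sketch): wrapping an existing chat model instance. Assumes
    # `langchain-openai` is installed; `ChatOpenAI` is used purely for illustration, and
    # any chat model supporting `with_structured_output` should work.
    #
    #     from langchain_openai import ChatOpenAI
    #
    #     evaluator = LLMEvaluator.from_model(
    #         ChatOpenAI(model="gpt-4o-mini"),
    #         prompt_template="Grade this response: {output}",
    #         score_config=ContinuousScoreConfig(
    #             key="quality", description="Overall response quality from 0 to 1."
    #         ),
    #     )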

    def _initialize(
        self,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
        chat_model: Any,
    ):
        """Shared initialization code for `__init__` and `from_model`.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt template.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The score configuration.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                Function to map variables.
            chat_model (BaseChatModel): The chat model instance.
        """
        try:
            from langchain_core.language_models.chat_models import BaseChatModel
            from langchain_core.prompts import ChatPromptTemplate
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain-core to be installed. "
                "Please install langchain-core by running `pip install langchain-core`."
            ) from e

        if not (
            isinstance(chat_model, BaseChatModel)
            and hasattr(chat_model, "with_structured_output")
        ):
            raise ValueError(
                "chat_model must be an instance of "
                "BaseChatModel and support structured output."
            )

        if isinstance(prompt_template, str):
            self.prompt = ChatPromptTemplate.from_messages([("human", prompt_template)])
        else:
            self.prompt = ChatPromptTemplate.from_messages(prompt_template)

        if set(self.prompt.input_variables) - {"input", "output", "expected"}:
            if not map_variables:
                raise ValueError(
                    "map_variables must be provided if the prompt template contains "
                    "variables other than 'input', 'output', and 'expected'"
                )
        self.map_variables = map_variables

        self.score_config = score_config
        self.score_schema = _create_score_json_schema(self.score_config)

        chat_model = chat_model.with_structured_output(self.score_schema)
        self.runnable = self.prompt | chat_model

    @warn_beta
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate a run."""
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, self.runnable.invoke(variables))
        return self._parse_output(output)

    @warn_beta
    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Asynchronously evaluate a run."""
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, await self.runnable.ainvoke(variables))
        return self._parse_output(output)
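
    # Example (hedged sketch): evaluators like this are typically passed to LangSmith's
    # `evaluate` entry point rather than invoked directly. Assumes a dataset named
    # "my-dataset" exists and that the installed SDK exposes `evaluate` at the top level;
    # `my_app` is a hypothetical target function.
    #
    #     from langsmith import evaluate
    #
    #     results = evaluate(
    #         lambda inputs: {"answer": my_app(inputs["question"])},
    #         data="my-dataset",
    #         evaluators=[evaluator],
    #     )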

    def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
        """Prepare variables for model invocation."""
        if self.map_variables:
            return self.map_variables(run, example)

        variables = {}
        if "input" in self.prompt.input_variables:
            if len(run.inputs) == 0:
                raise ValueError(
                    "No input keys are present in run.inputs but the prompt "
                    "requires 'input'."
                )
            if len(run.inputs) != 1:
                raise ValueError(
                    "Multiple input keys are present in run.inputs. Please provide "
                    "a map_variables function."
                )
            variables["input"] = list(run.inputs.values())[0]

        if "output" in self.prompt.input_variables:
            if not run.outputs:
                raise ValueError(
                    "No output keys are present in run.outputs but the prompt "
                    "requires 'output'."
                )
            if len(run.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in run.outputs. Please "
                    "provide a map_variables function."
                )
            variables["output"] = list(run.outputs.values())[0]

        if "expected" in self.prompt.input_variables:
            if not example or not example.outputs:
                raise ValueError(
                    "No example or example outputs are provided but the prompt "
                    "requires 'expected'."
                )
            if len(example.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in example.outputs. Please "
                    "provide a map_variables function."
                )
            variables["expected"] = list(example.outputs.values())[0]

        return variables
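
    # Example (illustrative sketch): a custom `map_variables` is needed when the prompt
    # uses variables other than 'input', 'output', and 'expected', or when runs carry
    # multiple input/output keys. The key names below are hypothetical.
    #
    #     def map_variables(run: Run, example: Optional[Example]) -> dict:
    #         return {
    #             "question": run.inputs["question"],
    #             "answer": (run.outputs or {}).get("answer", ""),
    #             "reference": (example.outputs or {}).get("answer", "") if example else "",
    #         }
    #
    #     evaluator = LLMEvaluator(
    #         prompt_template="Q: {question}\nA: {answer}\nReference: {reference}\nGrade it.",
    #         score_config=CategoricalScoreConfig(
    #             key="correctness",
    #             choices=["correct", "incorrect"],
    #             description="Whether the answer matches the reference.",
    #         ),
    #         map_variables=map_variables,
    #     )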

    def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
        """Parse the model output into an evaluation result."""
        if isinstance(self.score_config, CategoricalScoreConfig):
            value = output["score"]
            explanation = output.get("explanation", None)
            return EvaluationResult(
                key=self.score_config.key, value=value, comment=explanation
            )
        elif isinstance(self.score_config, ContinuousScoreConfig):
            score = output["score"]
            explanation = output.get("explanation", None)
            return EvaluationResult(
                key=self.score_config.key, score=score, comment=explanation
            )
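

# Note (editorial sketch of `_parse_output` above): a categorical result is surfaced via
# `EvaluationResult.value`, a continuous result via `EvaluationResult.score`, and any
# explanation becomes the `comment` in both cases. For example:
#
#     {"score": "correct", "explanation": "Matches the reference."}
#         -> EvaluationResult(key="correctness", value="correct", comment="Matches the reference.")
#     {"score": 0.8}
#         -> EvaluationResult(key="quality", score=0.8, comment=None)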