fmeval.eval_algorithms.eval_algorithm

from abc import ABC, abstractmethod
from typing import Optional, List, Union

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms import EvalScore, EvalOutput
from fmeval.eval_algorithms.save_strategy import SaveStrategy
from fmeval.model_runners.model_runner import ModelRunner


class EvalAlgorithmConfig:
    """Configuration class to be inherited from to provide evaluation algorithm-specific parameters."""


class EvalAlgorithmInterface(ABC):
    """Interface for evaluation algorithms.

    This interface defines two required methods that all evaluation algorithms must implement.
    """

    def __init__(self, eval_algorithm_config: EvalAlgorithmConfig):
        """Initialize an evaluation algorithm instance.

        :param eval_algorithm_config: Contains all configurable parameters for the evaluation algorithm.
        """

    @abstractmethod
    def evaluate_sample(
        self,
        model_input: Optional[str] = None,
        target_output: Optional[str] = None,
        model_output: Optional[str] = None,
    ) -> List[EvalScore]:
        """Compute metrics for a single sample, where a sample is defined by the particular algorithm.

        Each algorithm's `evaluate_sample` implementation uses a subset of these input parameters;
        not all of them are required.

        :param model_input: The input passed to `model`. If this parameter is not None,
            `model` should likewise not be None.
        :param target_output: The reference output that `model_output` will be compared against.
        :param model_output: The output from invoking a model.
        :returns: A list of EvalScore objects, where each EvalScore represents a single
            score/metric that is computed by the evaluation algorithm.
        """

    @abstractmethod
    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 100,
        save: bool = False,
        save_strategy: Optional[SaveStrategy] = None,
    ) -> List[EvalOutput]:
        """Compute metrics on all samples in one or more datasets.

        :param model: An instance of ModelRunner representing the model being evaluated.
        :param dataset_config: Configures a single dataset or a list of datasets used for the
            evaluation. If not provided, this method will run evaluations using all of its
            supported built-in datasets.
        :param prompt_template: A template used to generate prompts from raw text inputs.
            This parameter is not required if you wish to run evaluations using the built-in
            datasets, as they have their own default prompt templates pre-configured.
        :param num_records: The number of records to be randomly sampled from the input dataset
            that is used for the evaluation.
        :param save: If set to True, prompt responses and scores will be saved to a file.
        :param save_strategy: Specifies the strategy to use to save the localized outputs of the
            evaluations. If not specified, results are saved to the path configured by the
            EVAL_RESULTS_PATH environment variable; if that environment variable is also not set,
            they are saved to a default local path.

        :returns: A list of EvalOutput objects, where an EvalOutput encapsulates
            the EvalScores (and optionally, CategoryScores) generated by the evaluation,
            as well as additional metadata regarding the evaluation.
        """
class EvalAlgorithmConfig:

Configuration class to be inherited from to provide evaluation algorithm-specific parameters.
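
For illustration, a concrete algorithm typically subclasses this as a small frozen dataclass that holds its tunable parameters. The sketch below is not part of this module; WordCountConfig and its min_words field are hypothetical.

from dataclasses import dataclass

from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig


@dataclass(frozen=True)
class WordCountConfig(EvalAlgorithmConfig):
    """Hypothetical configuration for a word-count evaluation algorithm."""

    min_words: int = 1  # hypothetical parameter; real algorithms define their own fields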

class EvalAlgorithmInterface(abc.ABC):

Interface for evaluation algorithms.

This interface defines two required methods that all evaluation algorithms must implement.
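
As a minimal sketch of implementing this interface (the ExactMatch class below is hypothetical and not part of fmeval; only the interface and the types it imports come from this module):

from typing import List, Optional, Union

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms import EvalOutput, EvalScore
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig, EvalAlgorithmInterface
from fmeval.eval_algorithms.save_strategy import SaveStrategy
from fmeval.model_runners.model_runner import ModelRunner


class ExactMatch(EvalAlgorithmInterface):
    """Hypothetical algorithm: scores 1.0 when model_output matches target_output, else 0.0."""

    def __init__(self, eval_algorithm_config: EvalAlgorithmConfig = EvalAlgorithmConfig()):
        super().__init__(eval_algorithm_config)

    def evaluate_sample(
        self,
        model_input: Optional[str] = None,
        target_output: Optional[str] = None,
        model_output: Optional[str] = None,
    ) -> List[EvalScore]:
        # This particular algorithm only needs target_output and model_output.
        score = float((model_output or "").strip() == (target_output or "").strip())
        return [EvalScore(name="exact_match", value=score)]

    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 100,
        save: bool = False,
        save_strategy: Optional[SaveStrategy] = None,
    ) -> List[EvalOutput]:
        # A full implementation would load the dataset(s), invoke the model on each record,
        # aggregate the per-sample EvalScores, and optionally persist the results.
        raise NotImplementedError

Calling ExactMatch().evaluate_sample(target_output="4", model_output="4") would then return a list containing a single EvalScore with value 1.0.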

EvalAlgorithmInterface(eval_algorithm_config: EvalAlgorithmConfig)

Initialize an evaluation algorithm instance.

Parameters
  • eval_algorithm_config: Contains all configurable parameters for the evaluation algorithm.
@abstractmethod
def evaluate_sample(self, model_input: Optional[str] = None, target_output: Optional[str] = None, model_output: Optional[str] = None) -> List[fmeval.eval_algorithms.EvalScore]:

Compute metrics for a single sample, where a sample is defined by the particular algorithm.

Each algorithm's evaluate_sample implementation uses a subset of these input parameters; not all of them are required.

Parameters
  • model_input: The input passed to model. If this parameter is not None, model should likewise not be None.
  • target_output: The reference output that model_output will be compared against.
  • model_output: The output from invoking a model.

Returns
A list of EvalScore objects, where each EvalScore represents a single score/metric that is computed by the evaluation algorithm.
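
For example, assuming the built-in FactualKnowledge algorithm (from fmeval.eval_algorithms.factual_knowledge, following the library's own examples; the strings below are illustrative), a single precomputed response can be scored without any dataset or ModelRunner:

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

eval_algo = FactualKnowledge(FactualKnowledgeConfig(target_output_delimiter="<OR>"))

# Score one model response against its reference answer.
scores = eval_algo.evaluate_sample(
    target_output="London",
    model_output="The capital of England is London.",
)
print(scores)  # e.g. [EvalScore(name="factual_knowledge", value=1.0)]
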
@abstractmethod
def evaluate(self, model: Optional[fmeval.model_runners.model_runner.ModelRunner] = None, dataset_config: Union[fmeval.data_loaders.data_config.DataConfig, List[fmeval.data_loaders.data_config.DataConfig], NoneType] = None, prompt_template: Optional[str] = None, num_records: int = 100, save: bool = False, save_strategy: Optional[fmeval.eval_algorithms.save_strategy.SaveStrategy] = None) -> List[fmeval.eval_algorithms.EvalOutput]:

Compute metrics on all samples in one or more datasets.

Parameters
  • model: An instance of ModelRunner representing the model being evaluated.
  • dataset_config: Configures a single dataset or list of datasets used for the evaluation. If not provided, this method will run evaluations using all of its supported built-in datasets.
  • prompt_template: A template used to generate prompts from raw text inputs. This parameter is not required if you wish to run evaluations using the built-in datasets, as they have their own default prompt templates pre-configured.
  • num_records: The number of records to be randomly sampled from the input dataset that is used for the evaluation.
  • save: If set to True, prompt responses and scores will be saved to a file.
  • save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations. If not specified, results are saved to the path configured by the EVAL_RESULTS_PATH environment variable; if that environment variable is also not set, they are saved to a default local path.

Returns
A list of EvalOutput objects, where an EvalOutput encapsulates the EvalScores (and optionally, CategoryScores) generated by the evaluation, as well as additional metadata regarding the evaluation.
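
As a usage sketch for evaluate, hedged on the following assumptions: the file path, JMESPath locations, and record fields are hypothetical, the DataConfig fields follow the patterns used in the library's examples, and because the dataset already carries precomputed model outputs, no ModelRunner is passed.

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

# Hypothetical local JSON Lines dataset whose records already contain model outputs,
# e.g. {"question": "...", "answer": "...", "model_answer": "..."}.
config = DataConfig(
    dataset_name="custom_qa",
    dataset_uri="./custom_qa.jsonl",
    dataset_mime_type="application/jsonlines",
    model_input_location="question",      # JMESPath expressions into each record
    target_output_location="answer",
    model_output_location="model_answer",
)

eval_algo = FactualKnowledge(FactualKnowledgeConfig(target_output_delimiter="<OR>"))
eval_outputs = eval_algo.evaluate(
    dataset_config=config,
    num_records=50,  # randomly sample at most 50 records
    save=True,       # also persist per-record responses and scores
)
for output in eval_outputs:
    print(output.dataset_name, output.dataset_scores)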