fmeval.eval_algorithms.eval_algorithm
from abc import ABC, abstractmethod
from typing import Optional, List, Union

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms import EvalScore, EvalOutput
from fmeval.eval_algorithms.save_strategy import SaveStrategy
from fmeval.model_runners.model_runner import ModelRunner


class EvalAlgorithmConfig:
    """Configuration class to be inherited from to provide evaluation algorithm-specific parameters."""


class EvalAlgorithmInterface(ABC):
    """Interface for evaluation algorithms.

    This interface defines two required methods that all evaluation algorithms must implement.
    """

    def __init__(self, eval_algorithm_config: EvalAlgorithmConfig):
        """Initialize an evaluation algorithm instance.

        :param eval_algorithm_config: Contains all configurable parameters for the evaluation algorithm.
        """

    @abstractmethod
    def evaluate_sample(
        self,
        model_input: Optional[str] = None,
        target_output: Optional[str] = None,
        model_output: Optional[str] = None,
    ) -> List[EvalScore]:
        """Compute metrics for a single sample, where a sample is defined by the particular algorithm.

        The `evaluate_sample` method implemented by different algorithms should use a subset of
        these input parameters, but not all of them are required.

        :param model_input: The input passed to `model`. If this parameter is not None,
            `model` should likewise not be None.
        :param target_output: The reference output that `model_output` will be compared against.
        :param model_output: The output from invoking a model.
        :returns: A list of EvalScore objects, where each EvalScore represents a single
            score/metric that is computed by the evaluation algorithm.
        """

    @abstractmethod
    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 100,
        save: bool = False,
        save_strategy: Optional[SaveStrategy] = None,
    ) -> List[EvalOutput]:
        """Compute metrics on all samples in one or more datasets.

        :param model: An instance of ModelRunner representing the model being evaluated.
        :param dataset_config: Configures a single dataset or list of datasets used for the
            evaluation. If not provided, this method will run evaluations using all of its
            supported built-in datasets.
        :param prompt_template: A template used to generate prompts from raw text inputs.
            This parameter is not required if you wish to run evaluations using the built-in
            datasets, as they have their own default prompt templates pre-configured.
        :param num_records: The number of records to be randomly sampled from the input dataset
            that is used for the evaluation.
        :param save: If set to true, prompt responses and scores will be saved to a file.
        :param save_strategy: Specifies the strategy to use to save the localized outputs of the
            evaluations. If not specified, results are saved to the path configured by the
            EVAL_RESULTS_PATH environment variable; if that environment variable is also not
            configured, they are saved to a default local path.

        :returns: A list of EvalOutput objects, where an EvalOutput encapsulates
            the EvalScores (and optionally, CategoryScores) generated by the evaluation,
            as well as additional metadata regarding the evaluation.
        """
class EvalAlgorithmConfig:
Configuration class to be inherited from to provide evaluation algorithm-specific parameters.
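A concrete algorithm can group its parameters in a small dataclass that inherits from EvalAlgorithmConfig. The sketch below is illustrative only; ExactMatchConfig and its strip_whitespace field are hypothetical names, not part of fmeval.

    from dataclasses import dataclass

    from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig


    @dataclass(frozen=True)
    class ExactMatchConfig(EvalAlgorithmConfig):
        """Hypothetical config for a simple exact-match algorithm."""

        strip_whitespace: bool = True  # illustrative parameter, not a real fmeval option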
class EvalAlgorithmInterface(ABC):
Interface for evaluation algorithms.
This interface defines two required methods that all evaluation algorithms must implement.
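To make the contract concrete, here is a minimal sketch of an implementation of this interface. All names (ExactMatch, the "exact_match" score) are hypothetical; the evaluate body is left as a placeholder because a full dataset-level implementation would rely on fmeval's data loading and aggregation utilities, and the sketch assumes EvalScore accepts name and value arguments.

    from typing import List, Optional, Union

    from fmeval.data_loaders.data_config import DataConfig
    from fmeval.eval_algorithms import EvalOutput, EvalScore
    from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig, EvalAlgorithmInterface
    from fmeval.eval_algorithms.save_strategy import SaveStrategy
    from fmeval.model_runners.model_runner import ModelRunner


    class ExactMatch(EvalAlgorithmInterface):
        """Hypothetical algorithm: scores 1.0 when the model output equals the target output."""

        def __init__(self, eval_algorithm_config: EvalAlgorithmConfig = EvalAlgorithmConfig()):
            super().__init__(eval_algorithm_config)
            self.config = eval_algorithm_config

        def evaluate_sample(
            self,
            model_input: Optional[str] = None,
            target_output: Optional[str] = None,
            model_output: Optional[str] = None,
        ) -> List[EvalScore]:
            # This algorithm only needs the target output and the model output.
            value = float((target_output or "").strip() == (model_output or "").strip())
            return [EvalScore(name="exact_match", value=value)]  # assumes EvalScore(name=..., value=...)

        def evaluate(
            self,
            model: Optional[ModelRunner] = None,
            dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
            prompt_template: Optional[str] = None,
            num_records: int = 100,
            save: bool = False,
            save_strategy: Optional[SaveStrategy] = None,
        ) -> List[EvalOutput]:
            # Placeholder: a real implementation would load the dataset(s), invoke
            # `model` on each sampled record, aggregate the per-sample scores into
            # EvalOutput objects, and honor `save`/`save_strategy`.
            raise NotImplementedError("Dataset-level evaluation is omitted in this sketch.")

A real algorithm would typically accept a config subclass such as the ExactMatchConfig sketched earlier rather than the base EvalAlgorithmConfig.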
def __init__(self, eval_algorithm_config: EvalAlgorithmConfig):
Initialize an evaluation algorithm instance.
Parameters
- eval_algorithm_config: Contains all configurable parameters for the evaluation algorithm.
@abstractmethod
def evaluate_sample(
    self,
    model_input: Optional[str] = None,
    target_output: Optional[str] = None,
    model_output: Optional[str] = None,
) -> List[EvalScore]:
Compute metrics for a single sample, where a sample is defined by the particular algorithm.
The evaluate_sample method implemented by different algorithms should use a subset of these input parameters, but not all of them are required.

Parameters

- model_input: The input passed to model. If this parameter is not None, model should likewise not be None.
- target_output: The reference output that model_output will be compared against.
- model_output: The output from invoking a model.

Returns

A list of EvalScore objects, where each EvalScore represents a single score/metric that is computed by the evaluation algorithm.
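Continuing the hypothetical ExactMatch sketch from above, a single-sample call passes only the parameters that algorithm needs:

    algorithm = ExactMatch()
    scores = algorithm.evaluate_sample(target_output="Paris", model_output="paris")
    print(scores)  # e.g. [EvalScore(name="exact_match", value=0.0)] -- the comparison is case-sensitive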
@abstractmethod
def evaluate(
    self,
    model: Optional[ModelRunner] = None,
    dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
    prompt_template: Optional[str] = None,
    num_records: int = 100,
    save: bool = False,
    save_strategy: Optional[SaveStrategy] = None,
) -> List[EvalOutput]:
Compute metrics on all samples in one or more datasets.
Parameters
- model: An instance of ModelRunner representing the model being evaluated.
- dataset_config: Configures a single dataset or list of datasets used for the evaluation. If not provided, this method will run evaluations using all of its supported built-in datasets.
- prompt_template: A template used to generate prompts from raw text inputs. This parameter is not required if you wish to run evaluations using the built-in datasets, as they have their own default prompt templates pre-configured.
- num_records: The number of records to be randomly sampled from the input dataset that is used for the evaluation.
- save: If set to true, prompt responses and scores will be saved to a file.
- save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations. If not specified, results are saved to the path configured by the EVAL_RESULTS_PATH environment variable; if that environment variable is also not configured, they are saved to a default local path.
Returns

A list of EvalOutput objects, where an EvalOutput encapsulates the EvalScores (and optionally, CategoryScores) generated by the evaluation, as well as additional metadata regarding the evaluation.
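The call pattern below sketches a typical dataset-level evaluation. It is not runnable as written: algorithm stands for any concrete implementation of this interface, model_runner for a concrete ModelRunner, and the DataConfig field names, prompt placeholder syntax, and EvalOutput attributes are assumptions that should be checked against fmeval.data_loaders.data_config.DataConfig and fmeval.eval_algorithms.EvalOutput.

    from fmeval.data_loaders.data_config import DataConfig

    data_config = DataConfig(
        dataset_name="my_dataset",                      # field names here are assumptions
        dataset_uri="s3://my-bucket/my_dataset.jsonl",
        dataset_mime_type="application/jsonlines",
        model_input_location="question",
        target_output_location="answer",
    )

    eval_outputs = algorithm.evaluate(    # `algorithm`: any concrete EvalAlgorithmInterface
        model=model_runner,               # `model_runner`: any concrete ModelRunner
        dataset_config=data_config,
        prompt_template="Answer the following question: $model_input",  # placeholder syntax assumed
        num_records=50,
        save=True,
    )
    for output in eval_outputs:
        print(output.eval_name, output.dataset_scores)  # attribute names assumed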