fmeval.eval_algorithms.common
import logging
from typing import List, Optional

from ray.data import Dataset

from fmeval.constants import EVAL_OUTPUT_RECORDS_BATCH_SIZE, MEAN, DatasetColumns
from fmeval.eval_algorithms import EvalOutput, get_default_prompt_template
from fmeval.eval_algorithms.save_strategy import SaveStrategy, FileSaveStrategy
from fmeval.eval_algorithms.util import (
    EvalOutputRecord,
    aggregate_evaluation_scores,
    validate_dataset,
    generate_output_dataset_path,
    create_model_invocation_pipeline,
)
from fmeval.exceptions import EvalAlgorithmClientError
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.perf_util import timed_block
from fmeval.transforms.transform_pipeline import TransformPipeline

logger = logging.getLogger(__name__)
def save_dataset(dataset: Dataset, score_names: List[str], save_strategy: SaveStrategy) -> None:  # pragma: no cover
    """
    Writes the dataset to a JSON Lines file, where each JSON Lines object
    is the JSON representation of an `EvalOutputRecord`.

    :param dataset: a Ray Dataset that is produced during the execution of
        an EvalAlgorithm's `evaluate` method. This dataset is expected
        to include columns for every score computed by the evaluation algorithm.
    :param score_names: the names of the score columns in `dataset`.
    :param save_strategy: the SaveStrategy to be used to save the outputs.

    Example Dataset:

        -------------------------------------------------
        | "model_input" | "aux" | "rouge" | "bert_score" |
        -------------------------------------------------
        | "hello"       | 0.189 | 0.5     | 0.42         |
        -------------------------------------------------
        | "world"       | 0.162 | 0.314   | 0.271        |
        -------------------------------------------------

    Note that the "aux" column name does not belong to constants.COLUMN_NAMES, meaning that this
    column won't be included in the saved outputs. See the docstring for EvalOutputRecord.from_row
    for more details.

    Corresponding JSON Lines file contents:

        {"model_input": "hello", "scores": [{"name": "rouge", "value": 0.5}, {"name": "bert_score", "value": 0.42}]}
        {"model_input": "world", "scores": [{"name": "rouge", "value": 0.314}, {"name": "bert_score", "value": 0.271}]}
    """
    with timed_block("Saving dataset to file", logger):
        # We need the outer dict that wraps the EvalOutputRecord because map() requires
        # whatever is returned from the lambda function to be a dict.
        dataset = dataset.map(lambda row: {"record": EvalOutputRecord.from_row(row, score_names)})
        # Without this line, dataset.iter_batches() below is not guaranteed to return the rows
        # in the same order that they appear in `dataset`.
        dataset.materialize()

        with save_strategy:
            for batch in dataset.iter_batches(batch_size=EVAL_OUTPUT_RECORDS_BATCH_SIZE):
                save_strategy.save(batch["record"])
Writes the dataset to a JSON Lines file, where each JSON Lines object
is the JSON representation of an `EvalOutputRecord`.

Parameters
- dataset: a Ray Dataset that is produced during the execution of an EvalAlgorithm's
  `evaluate` method. This dataset is expected to include columns for every score computed
  by the evaluation algorithm.
- score_names: the names of the score columns in `dataset`.
- save_strategy: the SaveStrategy to be used to save the outputs.

Example Dataset:

    -------------------------------------------------
    | "model_input" | "aux" | "rouge" | "bert_score" |
    -------------------------------------------------
    | "hello"       | 0.189 | 0.5     | 0.42         |
    -------------------------------------------------
    | "world"       | 0.162 | 0.314   | 0.271        |
    -------------------------------------------------

Note that the "aux" column name does not belong to constants.COLUMN_NAMES, meaning that this
column won't be included in the saved outputs. See the docstring for EvalOutputRecord.from_row
for more details.

Corresponding JSON Lines file contents:

    {"model_input": "hello", "scores": [{"name": "rouge", "value": 0.5}, {"name": "bert_score", "value": 0.42}]}
    {"model_input": "world", "scores": [{"name": "rouge", "value": 0.314}, {"name": "bert_score", "value": 0.271}]}
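For orientation, here is a minimal sketch of calling save_dataset directly on a small in-memory Ray dataset. It assumes that the "model_input" column name matches DatasetColumns.MODEL_INPUT and that FileSaveStrategy can be constructed from a local output file path, as it is in evaluate_dataset below; the file path is hypothetical, and in normal use the dataset argument is produced by an evaluation algorithm's pipeline rather than built by hand.

    import ray.data

    from fmeval.eval_algorithms.common import save_dataset
    from fmeval.eval_algorithms.save_strategy import FileSaveStrategy

    # Toy dataset with a model input column, an auxiliary column, and one score column.
    # Column names here are illustrative; a dataset produced by an evaluation algorithm
    # uses the names defined in fmeval.constants.DatasetColumns.
    ds = ray.data.from_items(
        [
            {"model_input": "hello", "aux": 0.189, "rouge": 0.5},
            {"model_input": "world", "aux": 0.162, "rouge": 0.314},
        ]
    )

    # Each row is converted to an EvalOutputRecord and written as one JSON Lines object;
    # the "aux" column is dropped because it is neither a known column nor a listed score.
    save_dataset(
        dataset=ds,
        score_names=["rouge"],
        save_strategy=FileSaveStrategy("/tmp/rouge_output.jsonl"),  # hypothetical output path
    )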
def evaluate_dataset(
    dataset: Dataset,
    pipeline: TransformPipeline,
    dataset_name: str,
    eval_name: str,
    metric_names: List[str],
    eval_results_path: str,
    model: Optional[ModelRunner] = None,
    prompt_template: Optional[str] = None,
    agg_method: str = MEAN,
    save: bool = False,
    save_strategy: Optional[SaveStrategy] = None,
) -> EvalOutput:
    """Execute an evaluation algorithm's pipeline on a dataset.

    :param dataset: The dataset to be evaluated.
    :param pipeline: The evaluation algorithm's pipeline, to be executed on the dataset.
    :param dataset_name: The name of the dataset being evaluated. This is metadata that
        will be included in the returned EvalOutput object.
    :param eval_name: The name of the evaluation algorithm.
    :param metric_names: The names of the metrics that this evaluation algorithm computes.
        These metric names are algorithm-specific.
    :param eval_results_path: The directory under which the file containing evaluation
        results will be stored.
    :param model: An instance of ModelRunner representing the model under evaluation.
        If this argument is None, model responses cannot be obtained. In such cases,
        the provided dataset should already contain a column for model outputs.
    :param prompt_template: A template used to generate prompts that are fed to the model.
        If set to None, a default value will be used. Note that if this argument is not None,
        `model` must also not be None.
    :param agg_method: The aggregation method to use when aggregating the computed metric values.
        Currently, only MEAN is supported.
    :param save: If set to True, prompt responses and scores will be saved to a file.
        The location of this file is configured by `eval_results_path`.
    :param save_strategy: The SaveStrategy used to save the outputs when `save` is True.
        If None, a FileSaveStrategy that writes to the generated output path is used.

    :return: An EvalOutput object encapsulating the results of the evaluation.
    """
    if model:
        try:
            validate_dataset(dataset, [DatasetColumns.MODEL_INPUT.value.name])
        except EvalAlgorithmClientError:
            raise EvalAlgorithmClientError(
                "evaluate_dataset has been given a ModelRunner to obtain outputs from "
                "but the provided dataset does not contain a model input column."
            )
        prompt_template = get_default_prompt_template(dataset_name) if not prompt_template else prompt_template
        model_invocation_pipeline = create_model_invocation_pipeline(model, prompt_template)
        pipeline = TransformPipeline([model_invocation_pipeline, pipeline])
    else:
        if prompt_template:
            logger.warning(
                "A prompt template, but no corresponding model, was provided. "
                "Model outputs from the dataset will be used, and this prompt template will be ignored."
            )
        try:
            validate_dataset(dataset, [DatasetColumns.MODEL_OUTPUT.value.name])
        except EvalAlgorithmClientError:
            raise EvalAlgorithmClientError(
                "evaluate_dataset has been given a dataset with no model output column "
                "and no ModelRunner to obtain outputs from. Please either provide a model "
                "or use a dataset that contains model outputs already."
            )

    with timed_block(f"Computing score and aggregation on dataset {dataset_name}", logger):
        dataset = pipeline.execute(dataset)
        dataset_scores, category_scores = aggregate_evaluation_scores(dataset, metric_names, agg_method=agg_method)

        output_path = generate_output_dataset_path(
            path_to_parent_dir=eval_results_path,
            eval_name=eval_name,
            dataset_name=dataset_name,
        )
        eval_output = EvalOutput(
            eval_name=eval_name,
            dataset_name=dataset_name,
            prompt_template=prompt_template,
            dataset_scores=dataset_scores,
            category_scores=category_scores,
            output_path=output_path,
        )

        if save:  # pragma: no branch
            save_dataset(
                dataset=dataset,
                score_names=metric_names,
                save_strategy=save_strategy if save_strategy else FileSaveStrategy(output_path),
            )

    return eval_output
Execute an evaluation algorithm's pipeline on a dataset.

Parameters
- dataset: The dataset to be evaluated.
- pipeline: The evaluation algorithm's pipeline, to be executed on the dataset.
- dataset_name: The name of the dataset being evaluated. This is metadata that will be
  included in the returned EvalOutput object.
- eval_name: The name of the evaluation algorithm.
- metric_names: The names of the metrics that this evaluation algorithm computes. These
  metric names are algorithm-specific.
- eval_results_path: The directory under which the file containing evaluation results will
  be stored.
- model: An instance of ModelRunner representing the model under evaluation. If this
  argument is None, model responses cannot be obtained. In such cases, the provided dataset
  should already contain a column for model outputs.
- prompt_template: A template used to generate prompts that are fed to the model. If set to
  None, a default value will be used. Note that if this argument is not None, `model` must
  also not be None.
- agg_method: The aggregation method to use when aggregating the computed metric values.
  Currently, only MEAN is supported.
- save: If set to True, prompt responses and scores will be saved to a file. The location
  of this file is configured by `eval_results_path`.
- save_strategy: The SaveStrategy used to save the outputs when `save` is True. If None, a
  FileSaveStrategy that writes to the generated output path is used.

Returns
An EvalOutput object encapsulating the results of the evaluation.
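As a sketch of how an evaluation algorithm might delegate to this helper, the example below assumes `scoring_transform` is an already-constructed fmeval Transform that appends an "exact_match" score column to each record, and that the dataset already contains model outputs so no ModelRunner is needed. The dataset contents, the names "my_eval" and "my_dataset", and the results path are illustrative only.

    import ray.data

    from fmeval.constants import DatasetColumns
    from fmeval.eval_algorithms.common import evaluate_dataset
    from fmeval.transforms.transform_pipeline import TransformPipeline

    # Toy dataset that already contains model outputs, so `model` can be left as None.
    ds = ray.data.from_items(
        [
            {DatasetColumns.MODEL_INPUT.value.name: "1 + 1 =", DatasetColumns.MODEL_OUTPUT.value.name: "2"},
            {DatasetColumns.MODEL_INPUT.value.name: "2 + 2 =", DatasetColumns.MODEL_OUTPUT.value.name: "4"},
        ]
    )

    # `scoring_transform` is hypothetical: any Transform that adds an "exact_match"
    # score column would do. A real evaluation algorithm constructs its own transforms.
    pipeline = TransformPipeline([scoring_transform])

    eval_output = evaluate_dataset(
        dataset=ds,
        pipeline=pipeline,
        dataset_name="my_dataset",
        eval_name="my_eval",
        metric_names=["exact_match"],
        eval_results_path="/tmp/eval_results",
        model=None,  # model outputs are already present in the dataset
        save=True,   # with no save_strategy given, a FileSaveStrategy at the generated output path is used
    )
    print(eval_output.dataset_scores)

When a ModelRunner and prompt template are supplied instead, evaluate_dataset prepends a model-invocation pipeline that generates prompts from the model input column and records the model's responses before the scoring transforms run.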