fmeval.eval_algorithms.common

import logging
from typing import List, Optional

from ray.data import Dataset

from fmeval.constants import EVAL_OUTPUT_RECORDS_BATCH_SIZE, MEAN, DatasetColumns
from fmeval.eval_algorithms import EvalOutput, get_default_prompt_template
from fmeval.eval_algorithms.save_strategy import SaveStrategy, FileSaveStrategy
from fmeval.eval_algorithms.util import (
    EvalOutputRecord,
    aggregate_evaluation_scores,
    validate_dataset,
    generate_output_dataset_path,
    create_model_invocation_pipeline,
)
from fmeval.exceptions import EvalAlgorithmClientError
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.perf_util import timed_block
from fmeval.transforms.transform_pipeline import TransformPipeline

logger = logging.getLogger(__name__)


def save_dataset(dataset: Dataset, score_names: List[str], save_strategy: SaveStrategy) -> None:  # pragma: no cover
    """
    Writes the dataset to a JSON Lines file, where each JSON Lines object
    is the JSON representation of an `EvalOutputRecord`.

    :param dataset: a Ray Dataset that is produced during the execution of
        an EvalAlgorithm's `evaluate` method. This dataset is expected
        to include columns for every score computed by the evaluation algorithm.
    :param score_names: the names of the score columns in `dataset`.
    :param save_strategy: the SaveStrategy to be used to save the outputs.

        Example Dataset:
         ________________________________________________
        | "model_input" | "aux" | "rouge" | "bert_score" |
        -------------------------------------------------
        |    "hello"    | 0.189 |   0.5   |     0.42     |
        -------------------------------------------------
        |    "world"    | 0.162 |  0.314  |    0.271     |
        -------------------------------------------------

        Note that the "aux" column name does not belong to constants.COLUMN_NAMES, meaning that this column
        won't get included in the saved outputs. See the docstring for EvalOutputRecord.from_row for more details.

        Corresponding JSON Lines file contents:
        {"model_input" : "hello", "scores" : [{"name": "rouge", "value": 0.5}, {"name": "bert_score", "value": 0.42}]}
        {"model_input" : "world", "scores" : [{"name": "rouge", "value": 0.314}, {"name": "bert_score", "value": 0.271}]}
    """
    with timed_block("Saving dataset to file", logger):
        # We need the outer dict that wraps the EvalOutputRecord because map() requires
        # whatever is returned from the lambda function to be a dict.
        dataset = dataset.map(lambda row: {"record": EvalOutputRecord.from_row(row, score_names)})
        # Without this line, dataset.iter_batches() below is not guaranteed to return the rows
        # in the same order that they appear in `dataset`.
        dataset.materialize()

        with save_strategy:
            for batch in dataset.iter_batches(batch_size=EVAL_OUTPUT_RECORDS_BATCH_SIZE):
                save_strategy.save(batch["record"])


def evaluate_dataset(
    dataset: Dataset,
    pipeline: TransformPipeline,
    dataset_name: str,
    eval_name: str,
    metric_names: List[str],
    eval_results_path: str,
    model: Optional[ModelRunner] = None,
    prompt_template: Optional[str] = None,
    agg_method: str = MEAN,
    save: bool = False,
    save_strategy: Optional[SaveStrategy] = None,
) -> EvalOutput:
    """Execute an evaluation algorithm's pipeline on a dataset.

    :param dataset: The dataset to be evaluated.
    :param pipeline: The evaluation algorithm's pipeline, to be executed on the dataset.
    :param dataset_name: The name of the dataset being evaluated. This is metadata that
        will be included in the returned EvalOutput object.
    :param eval_name: The name of the evaluation algorithm.
    :param metric_names: The names of the metrics that this evaluation algorithm computes.
        This parameter is algorithm-specific.
    :param eval_results_path: The path to the directory where the file containing
        evaluation results will be stored.
    :param model: An instance of ModelRunner representing the model under evaluation.
        If this argument is None, model responses cannot be obtained. In such cases,
        the provided dataset should already contain a column for model outputs.
    :param prompt_template: A template used to generate prompts that are fed to the model.
        If set to None, a default value will be used. Note that if this argument is not None,
        `model` must also not be None.
    :param agg_method: The aggregation method to use when aggregating the computed metric values.
        Currently, only MEAN is supported.
    :param save: If set to true, prompt responses and scores will be saved to a file.
        The path that this file is stored at is configured by `eval_results_path`.
    :param save_strategy: The SaveStrategy to use when saving the records. If None and `save`
        is True, a FileSaveStrategy that writes to the generated output path is used.

    :return: An EvalOutput object encapsulating the results of the evaluation.
    """
    if model:
        try:
            validate_dataset(dataset, [DatasetColumns.MODEL_INPUT.value.name])
        except EvalAlgorithmClientError:
            raise EvalAlgorithmClientError(
                "evaluate_dataset has been given a ModelRunner to obtain outputs from "
                "but the provided dataset does not contain a model input column."
            )
        prompt_template = get_default_prompt_template(dataset_name) if not prompt_template else prompt_template
        model_invocation_pipeline = create_model_invocation_pipeline(model, prompt_template)
        pipeline = TransformPipeline([model_invocation_pipeline, pipeline])
    else:
        if prompt_template:
            logger.warning(
                "A prompt template, but no corresponding model, was provided. "
                "Model outputs from the dataset will be used, and this prompt template will be ignored."
            )
        try:
            validate_dataset(dataset, [DatasetColumns.MODEL_OUTPUT.value.name])
        except EvalAlgorithmClientError:
            raise EvalAlgorithmClientError(
                "evaluate_dataset has been given a dataset with no model output column "
                "and no ModelRunner to obtain outputs from. Please either provide a model "
                "or use a dataset that contains model outputs already."
            )

    with timed_block(f"Computing score and aggregation on dataset {dataset_name}", logger):
        dataset = pipeline.execute(dataset)
        dataset_scores, category_scores = aggregate_evaluation_scores(dataset, metric_names, agg_method=agg_method)

        output_path = generate_output_dataset_path(
            path_to_parent_dir=eval_results_path,
            eval_name=eval_name,
            dataset_name=dataset_name,
        )
        eval_output = EvalOutput(
            eval_name=eval_name,
            dataset_name=dataset_name,
            prompt_template=prompt_template,
            dataset_scores=dataset_scores,
            category_scores=category_scores,
            output_path=output_path,
        )

        if save:  # pragma: no branch
            save_dataset(
                dataset=dataset,
                score_names=metric_names,
                save_strategy=save_strategy if save_strategy else FileSaveStrategy(output_path),
            )

        return eval_output
logger = <Logger fmeval.eval_algorithms.common (WARNING)>
def save_dataset(dataset: ray.data.dataset.Dataset, score_names: List[str], save_strategy: fmeval.eval_algorithms.save_strategy.SaveStrategy) -> None:

Writes the dataset to a JSON Lines file, where each JSON Lines object is the JSON representation of an EvalOutputRecord.

Parameters
  • dataset: a Ray Dataset that is produced during the execution of an EvalAlgorithm's evaluate method. This dataset is expected to include columns for every score computed by the evaluation algorithm.
  • score_names: the names of the score columns in dataset.
  • save_strategy: the SaveStrategy to be used to save the outputs.

    Example Dataset:
     ________________________________________________
    | "model_input" | "aux" | "rouge" | "bert_score" |
    -------------------------------------------------
    |    "hello"    | 0.189 |   0.5   |     0.42     |
    -------------------------------------------------
    |    "world"    | 0.162 |  0.314  |    0.271     |
    -------------------------------------------------

    Note that the "aux" column name does not belong to constants.COLUMN_NAMES, meaning that this column won't get included in the saved outputs. See the docstring for EvalOutputRecord.from_row for more details.

    Corresponding JSON Lines file contents:
    {"model_input" : "hello", "scores" : [{"name": "rouge", "value": 0.5}, {"name": "bert_score", "value": 0.42}]}
    {"model_input" : "world", "scores" : [{"name": "rouge", "value": 0.314}, {"name": "bert_score", "value": 0.271}]}
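
    For concreteness, here is a minimal usage sketch of calling save_dataset directly on a small in-memory Ray dataset. The column names mirror the example table above and are assumed to match fmeval's DatasetColumns constants; the output path is hypothetical.

    # Hedged sketch: build a tiny Ray dataset with the score columns shown above
    # and persist it with a FileSaveStrategy. Real datasets are normally produced
    # by an EvalAlgorithm's `evaluate` method rather than constructed by hand.
    import ray.data

    from fmeval.eval_algorithms.common import save_dataset
    from fmeval.eval_algorithms.save_strategy import FileSaveStrategy

    ds = ray.data.from_items(
        [
            {"model_input": "hello", "aux": 0.189, "rouge": 0.5, "bert_score": 0.42},
            {"model_input": "world", "aux": 0.162, "rouge": 0.314, "bert_score": 0.271},
        ]
    )

    # Only "rouge" and "bert_score" are treated as score columns; per the note above,
    # the unrecognized "aux" column is dropped from the saved records.
    save_dataset(
        dataset=ds,
        score_names=["rouge", "bert_score"],
        save_strategy=FileSaveStrategy("/tmp/example_eval_output.jsonl"),  # hypothetical path
    )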

def evaluate_dataset(
    dataset: ray.data.dataset.Dataset,
    pipeline: fmeval.transforms.transform_pipeline.TransformPipeline,
    dataset_name: str,
    eval_name: str,
    metric_names: List[str],
    eval_results_path: str,
    model: Optional[fmeval.model_runners.model_runner.ModelRunner] = None,
    prompt_template: Optional[str] = None,
    agg_method: str = 'mean',
    save: bool = False,
    save_strategy: Optional[fmeval.eval_algorithms.save_strategy.SaveStrategy] = None,
) -> fmeval.eval_algorithms.EvalOutput:

Execute an evaluation algorithm's pipeline on a dataset.

Parameters
  • dataset: The dataset to be evaluated.
  • pipeline: The evaluation algorithm's pipeline, to be executed on the dataset.
  • dataset_name: The name of the dataset being evaluated. This is metadata that will be included in the returned EvalOutput object.
  • eval_name: The name of the evaluation algorithm.
  • metric_names: The names of the metrics that this evaluation algorithm computes. This parameter is algorithm-specific.
  • eval_results_path: The path to the directory where the file containing evaluation results will be stored.
  • model: An instance of ModelRunner representing the model under evaluation. If this argument is None, model responses cannot be obtained. In such cases, the provided dataset should already contain a column for model outputs.
  • prompt_template: A template used to generate prompts that are fed to the model. If set to None, a default value will be used. Note that if this argument is not None, model must also not be None.
  • agg_method: The aggregation method to use when aggregating the computed metric values. Currently, only MEAN is supported.
  • save: If set to true, prompt responses and scores will be saved to a file. The path that this file is stored at is configured by eval_results_path.
  • save_strategy: The SaveStrategy to use when saving the records. If None and save is True, a FileSaveStrategy that writes to the generated output path is used.
Returns

An EvalOutput object encapsulating the results of the evaluation.
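
As a usage illustration, here is a hedged sketch of calling evaluate_dataset on a dataset that already contains a model output column, so no ModelRunner is needed. The column names are assumed to match fmeval's DatasetColumns constants, and `scoring_pipeline` stands in for a TransformPipeline built by a concrete evaluation algorithm; it is an assumption, not something defined by this module.

    # Hedged sketch: evaluate a dataset that already has a "model_output" column,
    # so `model` and `prompt_template` are omitted. `scoring_pipeline` is a placeholder
    # for a TransformPipeline produced by a concrete eval algorithm (e.g. one that adds
    # a "rouge" score column).
    import ray.data

    from fmeval.eval_algorithms.common import evaluate_dataset

    ds = ray.data.from_items(
        [
            {"model_input": "first input", "model_output": "a short summary"},
            {"model_input": "second input", "model_output": "another summary"},
        ]
    )

    eval_output = evaluate_dataset(
        dataset=ds,
        pipeline=scoring_pipeline,              # hypothetical TransformPipeline computing "rouge"
        dataset_name="my_dataset",
        eval_name="summarization_accuracy",     # illustrative eval name
        metric_names=["rouge"],
        eval_results_path="/tmp/eval_results",  # hypothetical directory
        save=True,                              # records are written via FileSaveStrategy(output_path)
    )
    print(eval_output.dataset_scores)

Passing a ModelRunner via `model` (optionally with `prompt_template`) would instead prepend a model-invocation pipeline that generates the model output column before the scoring pipeline runs, as shown in the source above.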