fmeval.eval_algorithms.util
import json
import logging
import os
import ray.data
import string

import fmeval.util as util

from ray.data import Dataset
from collections import OrderedDict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from fmeval.constants import (
    DatasetColumns,
    EVAL_OUTPUT_RECORDS_BATCH_SIZE,
    MEAN,
    NUM_ROWS_DETERMINISTIC,
    DATASET_COLUMNS,
)
from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms import (
    EvalScore,
    CategoryScore,
    DATASET_CONFIGS,
    EVAL_DATASETS,
    EvalOutput,
    get_default_prompt_template,
)
from fmeval.exceptions import EvalAlgorithmInternalError
from fmeval.model_runners.composers.composers import PromptComposer
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.perf_util import timed_block
from fmeval.transforms.common import GeneratePrompt, GetModelOutputs
from fmeval.transforms.transform_pipeline import TransformPipeline
from fmeval.util import get_num_actors

# punctuation and articles for the normalize function
ENGLISH_ARTICLES = ["a", "an", "the"]
ENGLISH_PUNCTUATIONS = string.punctuation

logger = logging.getLogger(__name__)
def get_dataset_configs(data_config: Optional[Union[DataConfig, List[DataConfig]]], eval_name: str) -> List[DataConfig]:
    if not data_config:
        return [DATASET_CONFIGS[dataset_name] for dataset_name in EVAL_DATASETS[eval_name]]
    elif isinstance(data_config, list):
        return data_config
    elif isinstance(data_config, tuple):
        return [cfg for cfg in data_config]
    else:
        return [data_config]
def generate_model_predict_response_for_dataset(
    model: ModelRunner,
    data: Dataset,
    model_input_column_name: str,
    model_output_column_name: Optional[str] = None,
    model_log_probability_column_name: Optional[str] = None,
) -> Dataset:
    """
    Runs the model on the given data. Output will be written to the
    `model_output_column_name` column, and log_probability will be
    written to the `model_log_probability_column_name` column.

    :param model: ModelRunner to get predictions from.
    :param data: The dataset containing model inputs to feed to `model`.
    :param model_input_column_name: The name of the column containing the model input.
    :param model_output_column_name: The name of the column to write the model output to.
    :param model_log_probability_column_name: The name of the column to write the model log probability to.
    :return: The dataset with a model output column and model log probability column added.
        Note that both columns are optional, i.e. it is possible that a model output
        column is added, but a log probability column is not added (and vice versa).
    """
    with timed_block(f"Performing inference on dataset on {model}", logger):

        class ModelRunnerWrapper:  # pragma: no cover
            """
            This class represents the Ray Actor that gets model predictions
            by feeding model inputs from the dataset to the model runner.

            We use Ray Actors instead of Tasks because the Actor approach minimizes
            the number of times that the ModelRunner `model` gets deserialized.
            With Tasks, Ray will serialize and deserialize `model` for every single
            prediction. With Actors, `model` gets deserialized once per Actor when
            the Actor gets initialized.
            """

            def __init__(self):
                self.model_runner = model
                logger.setLevel(logging.DEBUG)

            def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]:
                predict_output = self.model_runner.predict(row[model_input_column_name])
                if model_output_column_name:
                    row[model_output_column_name] = predict_output[0]
                if model_log_probability_column_name:
                    row[model_log_probability_column_name] = predict_output[1]
                return row

        data = data.map(
            ModelRunnerWrapper, compute=ray.data.ActorPoolStrategy(size=get_num_actors())  # type: ignore[arg-type]
        ).materialize()
    return data
Runs the model on the given data. Output will be written to the `model_output_column_name` column, and log_probability will be written to the `model_log_probability_column_name` column.

Parameters
- model: ModelRunner to get predictions from.
- data: The dataset containing model inputs to feed to `model`.
- model_input_column_name: The name of the column containing the model input.
- model_output_column_name: The name of the column to write the model output to.
- model_log_probability_column_name: The name of the column to write the model log probability to.
Returns
The dataset with a model output column and model log probability column added. Note that both columns are optional, i.e. it is possible that a model output column is added, but a log probability column is not added (and vice versa).
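A minimal usage sketch is shown below. `DummyModelRunner` is a hypothetical stand-in for a concrete ModelRunner implementation, and the column names are illustrative.

import ray.data
from fmeval.eval_algorithms.util import generate_model_predict_response_for_dataset


class DummyModelRunner:
    # Hypothetical stand-in for a concrete fmeval ModelRunner; a real runner
    # (e.g. one backed by a hosted endpoint) is used the same way.
    def predict(self, prompt):
        # A ModelRunner's predict returns a (model_output, log_probability) pair.
        return prompt.upper(), None


data = ray.data.from_items(
    [
        {"model_input": "What is the capital of France?"},
        {"model_input": "Name a prime number."},
    ]
)

data_with_outputs = generate_model_predict_response_for_dataset(
    model=DummyModelRunner(),
    data=data,
    model_input_column_name="model_input",
    model_output_column_name="model_output",
)
print(data_with_outputs.take(2))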
def generate_prompt_column_for_dataset(
    prompt_template: str, data: Dataset, model_input_column_name: str, prompt_column_name: str
) -> Dataset:
    """
    Generates prompts column for a given input dataset and prompt_template
    :param prompt_template: Prompt template
    :param data: the dataset where each instance is a row in the dataset.
    :param model_input_column_name: the name of the column containing the model input.
    :param prompt_column_name: Output column name to which composed prompts are added
    :return: the dataset with the composed prompts added.
    """
    with timed_block(f"Generating prompt column", logger):
        prompt_composer = PromptComposer(prompt_template)

        def _generate_prompt_column(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
            """
            Map function for generating the prompt column value given a dataset row.
            """
            row[prompt_column_name] = prompt_composer.compose(row[model_input_column_name])
            return row

        data = data.map(_generate_prompt_column).materialize()
    return data
Generates a prompt column for a given input dataset and prompt_template.
Parameters
- prompt_template: Prompt template
- data: the dataset where each instance is a row in the dataset.
- model_input_column_name: the name of the column containing the model input.
- prompt_column_name: Output column name to which composed prompts are added
Returns
the dataset with the composed prompts added.
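A short sketch. The `$model_input` placeholder is an assumption about the template syntax expected by PromptComposer; use whatever placeholder your templates rely on.

import ray.data
from fmeval.eval_algorithms.util import generate_prompt_column_for_dataset

data = ray.data.from_items([{"model_input": "What is 2 + 2?"}])

data_with_prompts = generate_prompt_column_for_dataset(
    prompt_template="Answer the following question: $model_input",  # placeholder name assumed
    data=data,
    model_input_column_name="model_input",
    prompt_column_name="prompt",
)
print(data_with_prompts.take(1))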
def validate_dataset(dataset: Dataset, column_names: List[str]):
    """
    Util function to validate that dataset contains the required column names.

    :param dataset: Input ray dataset
    :param column_names: names of the columns that must be present in the dataset
    :raises: EvalAlgorithmClientError for an invalid dataset
    """
    for column_name in column_names:
        util.require(
            column_name in dataset.columns(),
            f"Missing required column: {column_name}, for evaluate() method",
        )
Util function to validate that the dataset contains the required column names.
Parameters
- dataset: Input ray dataset
- column_names: names of the columns that must be present in the dataset
Raises
- EvalAlgorithmClientError for an invalid dataset
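For example (a small sketch; the column names are arbitrary strings chosen for illustration):

import ray.data
from fmeval.eval_algorithms.util import validate_dataset

data = ray.data.from_items([{"model_input": "hello", "target_output": "world"}])

# Passes: both columns are present in the dataset.
validate_dataset(data, ["model_input", "target_output"])

# Raises EvalAlgorithmClientError: the dataset has no "category" column.
validate_dataset(data, ["category"])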
def validate_prompt_template(prompt_template: str, placeholders: List[str]):
    """
    Util function to validate that prompt_template contains the keywords.

    :param prompt_template: A template used to compose prompts. Ex: '{"Question":$question, "Answer": $answer}'
    :param placeholders: Placeholder keyword list. This keyword appears
        in `prompt_template` with a $ sign prepended. In the above example,
        the placeholders are ["question", "answer"].
    :raises: EvalAlgorithmClientError for an invalid prompt_template
    """
    for placeholder in placeholders:
        util.require(
            f"${placeholder}" in prompt_template,
            f"Unable to find placeholder ${placeholder} in prompt_template.",
        )
Util function to validate that prompt_template contains the expected placeholder keywords.

Parameters
- prompt_template: A template used to compose prompts. Ex: '{"Question":$question, "Answer": $answer}'
- placeholders: Placeholder keyword list. Each keyword appears in `prompt_template` with a $ sign prepended. In the above example, the placeholders are ["question", "answer"].
Raises
- EvalAlgorithmClientError for an invalid prompt_template
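For example, mirroring the template from the docstring:

from fmeval.eval_algorithms.util import validate_prompt_template

template = '{"Question": $question, "Answer": $answer}'

# Passes: both placeholders appear in the template with a "$" prefix.
validate_prompt_template(template, ["question", "answer"])

# Raises EvalAlgorithmClientError: "$context" does not appear in the template.
validate_prompt_template(template, ["context"])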
def aggregate_evaluation_scores(
    dataset: Dataset, score_column_names: List[str], agg_method: str
) -> Tuple[List[EvalScore], Optional[List[CategoryScore]]]:
    """
    The method aggregates scores at the dataset level and optionally at the category level if
    categories are available in the dataset.

    :param dataset: ray dataset with eval scores
    :param score_column_names: a list of column names which contain the scores to aggregate
    :param agg_method: the name of the aggregation to perform
    :return: a tuple containing 1) dataset-level scores and
        2) a list of category-level scores if categories are available, `None` otherwise
    """
    dataset_scores = [
        EvalScore(name=score_column_name, value=dataset_aggregation(dataset, score_column_name, agg_method))
        for score_column_name in score_column_names
    ]
    category_scores: Optional[Dict[str, CategoryScore]] = None
    if DatasetColumns.CATEGORY.value.name in dataset.columns():
        category_scores = {
            name: CategoryScore(name=name, scores=[]) for name in dataset.unique(DatasetColumns.CATEGORY.value.name)
        }
        for score_column_name in score_column_names:
            category_aggregate: Dataset = category_wise_aggregation(dataset, score_column_name, agg_method)
            for row in category_aggregate.iter_rows():
                category_scores[row[DatasetColumns.CATEGORY.value.name]].scores.append(
                    EvalScore(name=score_column_name, value=row[f"mean({score_column_name})"])
                )

    return dataset_scores, list(category_scores.values()) if category_scores else None
The method aggregates scores at the dataset level and optionally at the category level if categories are available in the dataset.
Parameters
- dataset: ray dataset with eval scores
- score_column_names: a list of column names which contain the scores to aggregate
- agg_method: the name of the aggregation to perform
Returns
A tuple containing 1) dataset-level scores and 2) a list of category-level scores if categories are available, `None` otherwise.
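A sketch with a toy, already-scored dataset ("rouge" is an illustrative score column name):

import ray.data
from fmeval.constants import MEAN, DatasetColumns
from fmeval.eval_algorithms.util import aggregate_evaluation_scores

data = ray.data.from_items(
    [
        {DatasetColumns.CATEGORY.value.name: "history", "rouge": 0.4},
        {DatasetColumns.CATEGORY.value.name: "science", "rouge": 0.8},
    ]
)

dataset_scores, category_scores = aggregate_evaluation_scores(
    data, score_column_names=["rouge"], agg_method=MEAN
)
print(dataset_scores)   # one dataset-level EvalScore for "rouge" (mean of 0.4 and 0.8)
print(category_scores)  # one CategoryScore per category, or None without a category column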
def dataset_aggregation(dataset: Dataset, score_column_name: str, agg_method: str) -> float:
    if agg_method == MEAN:
        aggregate = dataset.mean(on=score_column_name, ignore_nulls=True)
        assert isinstance(aggregate, float)
        return aggregate
    else:
        raise EvalAlgorithmInternalError(f"Aggregation method {agg_method} is not supported")
def category_wise_aggregation(dataset: Dataset, score_column_name: str, agg_method: str) -> Dataset:
    category_aggregate: Dataset = dataset.groupby(DatasetColumns.CATEGORY.value.name)  # type: ignore
    if agg_method == MEAN:
        category_aggregate = category_aggregate.mean(on=score_column_name, ignore_nulls=True)
    else:
        raise EvalAlgorithmInternalError(f"Aggregation method {agg_method} is not supported")
    return category_aggregate
# Moved function to util.py because it's being used by both factual knowledge and qa accuracy
def normalize_text_quac_protocol(text: str) -> str:
    """
    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py
    Given a text, normalize it using the SQUAD / QUAC protocol. That is remove punctuations, excess spaces and articles, and return the lowercased tokens.
    SQUAD (https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) and
    QuAC benchmarks (https://s3.amazonaws.com/my89public/quac/scorer.py) use this protocol to normalize text before evaluating it.
    HELM (https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L116)
    and HuggingFace evaluate (https://github.com/huggingface/evaluate/blob/775555d80af30d83dc6e9f42051840d29a34f31b/metrics/squad/compute_score.py#L11)
    also use this normalization procedure.

    :param text: The text that needs to be normalized.
    :returns: The normalized text.
    """

    text = text.lower()
    text = "".join(character for character in text if character not in ENGLISH_PUNCTUATIONS)
    return " ".join([word for word in text.split(" ") if (word != "" and word not in ENGLISH_ARTICLES)])
Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py

Given a text, normalize it using the SQUAD / QUAC protocol: remove punctuation, excess spaces, and articles, and return the lowercased tokens. SQUAD (https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) and the QuAC benchmarks (https://s3.amazonaws.com/my89public/quac/scorer.py) use this protocol to normalize text before evaluating it. HELM (https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L116) and HuggingFace evaluate (https://github.com/huggingface/evaluate/blob/775555d80af30d83dc6e9f42051840d29a34f31b/metrics/squad/compute_score.py#L11) also use this normalization procedure.

Parameters
- text: The text that needs to be normalized.

Returns
The normalized text.
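For example:

from fmeval.eval_algorithms.util import normalize_text_quac_protocol

# Punctuation is stripped, articles ("a", "an", "the") and extra spaces are dropped,
# and the result is lowercased.
print(normalize_text_quac_protocol("The  quick, brown fox!"))  # -> "quick brown fox"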
@dataclass
class EvalOutputRecord:
    """
    This class represents a single record that gets written by the `save_dataset` method.
    In other words, it represents a single row from the Ray Dataset that is being saved.

    :param scores: A list of EvalScores, where each EvalScore corresponds
        to one of the score columns in the Ray Dataset being saved.
    :param dataset_columns: Maps a column name to its contents in the current row
        (recall that an EvalOutputRecord corresponds to a single Ray Dataset row).

        Note: the keys in `dataset_columns` must belong to constants.COLUMN_NAMES,
        because constants.COLUMN_NAMES defines which (non-score) columns are allowed
        to appear in the saved output, i.e. it defines the schema for an output record.
    """

    scores: List[EvalScore]
    dataset_columns: Dict[str, Union[str, float, int]]

    def __post_init__(self):
        for col in self.dataset_columns:
            util.assert_condition(
                col in DATASET_COLUMNS,
                f"Attempting to initialize an EvalOutputRecord with invalid non-score column {col}.",
            )

    def __str__(self):
        return json.dumps(self.to_dict())

    def to_dict(self) -> OrderedDict[str, Union[str, float, int, List]]:
        """
        Returns a dictionary representation of this instance,
        to be used when writing this object to JSON Lines.

        Note that we use an OrderedDict to maintain consistency
        in the ordering of columns. The score columns always come
        at the end, and the non-score columns are ordered according
        to constants.COLUMN_NAMES.
        """
        json_obj = OrderedDict(
            (col_name, self.dataset_columns[col_name])
            for col_name in DATASET_COLUMNS
            if col_name in self.dataset_columns
        )
        json_obj["scores"] = [
            # filter out None "value" and None "error"
            {k: v for k, v in eval_score.__dict__.items() if v is not None}
            for eval_score in self.scores
        ]
        return json_obj

    @staticmethod
    def from_row(row: Dict[str, Union[str, float, int]], score_names: List[str]) -> "EvalOutputRecord":
        """
        Returns an instance of EvalOutputRecord, created from a Ray Dataset row (represented as a dict).

        Example input:
            row = {
                "model_input": "input",
                "model_output": "output",
                "column_that_wont_be_included": "hello",
                "rouge": 0.42,
                "bert": 0.162
            }

        Corresponding output:
            EvalOutputRecord(
                scores=[
                    EvalScore(name="rouge", value=0.42),
                    EvalScore(name="bert", value=0.162)
                ],
                dataset_columns={
                    "model_input": "input",
                    "model_output": "output"
                }
            )

        Note how "column_that_wont_be_included" is not included in the produced EvalOutputRecord.
        This is because only columns in constants.COLUMN_NAMES are considered to be valid columns
        in the saved output file generated by `save_dataset`. The reason why it's even possible
        for a column name that doesn't belong to constants.COLUMN_NAMES to appear in `row` is that
        the Ray Dataset that `row` belongs to can contain columns used to store intermediate computations.
        For example, ClassificationAccuracy generates a column named CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME
        that is used to compute CLASSIFICATION_ACCURACY_SCORE, which is one of the score columns.

        :param row: a Ray Dataset row represented as a dict
        :param score_names: column names included in the Ray Dataset that `row`
            is a sample of that correspond to evaluation algorithm scores
        :returns: an instance of EvalOutputRecord corresponding to `row`
        """
        dataset_columns = {}
        scores = []
        for column_name, value in row.items():
            if column_name not in score_names:  # pragma: no branch
                if column_name in DATASET_COLUMNS:  # pragma: no branch
                    dataset_columns[column_name] = value
            else:
                assert isinstance(value, float) or isinstance(value, int) or value is None  # to satisfy Mypy
                if value is None:
                    assert row.get(DatasetColumns.ERROR.value.name, None)
                    scores.append(EvalScore(name=column_name, error=row.get(DatasetColumns.ERROR.value.name)))
                else:
                    scores.append(EvalScore(name=column_name, value=value))

        return EvalOutputRecord(
            scores=scores,
            dataset_columns=dataset_columns,
        )
This class represents a single record that gets written by the `save_dataset` method. In other words, it represents a single row from the Ray Dataset that is being saved.

Parameters
- scores: A list of EvalScores, where each EvalScore corresponds to one of the score columns in the Ray Dataset being saved.
- dataset_columns: Maps a column name to its contents in the current row (recall that an EvalOutputRecord corresponds to a single Ray Dataset row).

Note: the keys in `dataset_columns` must belong to constants.COLUMN_NAMES, because constants.COLUMN_NAMES defines which (non-score) columns are allowed to appear in the saved output, i.e. it defines the schema for an output record.
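A small construction sketch, assuming "model_input" and "model_output" are among the allowed non-score column names (they appear that way in the `from_row` example below):

from fmeval.eval_algorithms import EvalScore
from fmeval.eval_algorithms.util import EvalOutputRecord

record = EvalOutputRecord(
    scores=[EvalScore(name="rouge", value=0.42)],
    dataset_columns={"model_input": "input", "model_output": "output"},
)
# str(record) serializes the record as the JSON object written by save_dataset:
# non-score columns first (in the order defined by the constants), then "scores".
print(record)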
def to_dict(self) -> OrderedDict[str, Union[str, float, int, List]]:
Returns a dictionary representation of this instance, to be used when writing this object to JSON Lines.
Note that we use an OrderedDict to maintain consistency in the ordering of columns. The score columns always come at the end, and the non-score columns are ordered according to constants.COLUMN_NAMES.
@staticmethod
def from_row(row: Dict[str, Union[str, float, int]], score_names: List[str]) -> "EvalOutputRecord":
Returns an instance of EvalOutputRecord, created from a Ray Dataset row (represented as a dict).

Example input:

    row = {
        "model_input": "input",
        "model_output": "output",
        "column_that_wont_be_included": "hello",
        "rouge": 0.42,
        "bert": 0.162
    }

Corresponding output:

    EvalOutputRecord(
        scores=[
            EvalScore(name="rouge", value=0.42),
            EvalScore(name="bert", value=0.162)
        ],
        dataset_columns={
            "model_input": "input",
            "model_output": "output"
        }
    )

Note how "column_that_wont_be_included" is not included in the produced EvalOutputRecord. This is because only columns in constants.COLUMN_NAMES are considered to be valid columns in the saved output file generated by `save_dataset`. The reason why it's even possible for a column name that doesn't belong to constants.COLUMN_NAMES to appear in `row` is that the Ray Dataset that `row` belongs to can contain columns used to store intermediate computations. For example, ClassificationAccuracy generates a column named CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME that is used to compute CLASSIFICATION_ACCURACY_SCORE, which is one of the score columns.

Parameters
- row: a Ray Dataset row represented as a dict
- score_names: column names included in the Ray Dataset that `row` is a sample of that correspond to evaluation algorithm scores

Returns
An instance of EvalOutputRecord corresponding to `row`.
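The docstring example above translates directly into code:

from fmeval.eval_algorithms.util import EvalOutputRecord

row = {
    "model_input": "input",
    "model_output": "output",
    "column_that_wont_be_included": "hello",
    "rouge": 0.42,
    "bert": 0.162,
}
record = EvalOutputRecord.from_row(row, score_names=["rouge", "bert"])
print(record.dataset_columns)  # {"model_input": "input", "model_output": "output"}
print(record.scores)           # EvalScores for "rouge" and "bert"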
def generate_output_dataset_path(path_to_parent_dir: str, eval_name: str, dataset_name) -> str:
    """
    Returns the path to be used by an EvalAlgorithm when calling `save_dataset`.

    :param path_to_parent_dir: The path to the parent directory of the file to be saved.
    :param eval_name: The evaluation name provided by the EvalAlgorithm.
    :param dataset_name: The name of the dataset.
    :returns: A path that is unique to an evaluation/dataset pair for a given job.
    """
    return os.path.join(path_to_parent_dir, f"{eval_name}_{dataset_name}.jsonl")
Returns the path to be used by an EvalAlgorithm when calling `save_dataset`.

Parameters
- path_to_parent_dir: The path to the parent directory of the file to be saved.
- eval_name: The evaluation name provided by the EvalAlgorithm.
- dataset_name: The name of the dataset.

Returns
A path that is unique to an evaluation/dataset pair for a given job.
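For example (the evaluation and dataset names are illustrative):

from fmeval.eval_algorithms.util import generate_output_dataset_path

path = generate_output_dataset_path(
    path_to_parent_dir="/tmp/eval_results",
    eval_name="factual_knowledge",
    dataset_name="trex",
)
print(path)  # /tmp/eval_results/factual_knowledge_trex.jsonl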
def generate_mean_delta_score(original_score: EvalScore, perturbed_input_scores: List[EvalScore]) -> float:
    """
    Util method to generate mean of difference between original and perturbed input scores
    :param original_score: Original score
    :param perturbed_input_scores: List of scores for model inference outputs on perturbed inputs
    :returns: mean of delta between the scores
    """
    return sum([abs(original_score.value - reference_score.value) for reference_score in perturbed_input_scores]) / len(
        perturbed_input_scores
    )
Util method to compute the mean absolute difference between the original score and the scores on perturbed inputs.

Parameters
- original_score: Original score
- perturbed_input_scores: List of scores for model inference outputs on perturbed inputs

Returns
The mean of the deltas between the scores.
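For example:

from fmeval.eval_algorithms import EvalScore
from fmeval.eval_algorithms.util import generate_mean_delta_score

original = EvalScore(name="rouge", value=0.9)
perturbed = [EvalScore(name="rouge", value=0.7), EvalScore(name="rouge", value=0.8)]

# mean(|0.9 - 0.7|, |0.9 - 0.8|) = 0.15, up to floating-point rounding
print(generate_mean_delta_score(original, perturbed))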
def verify_model_determinism(
    model: ModelRunner,
    dataset: Dataset,
    prompt_template: str,
    model_input_column_name: str = DatasetColumns.MODEL_INPUT.value.name,
) -> bool:
    """Heuristic for whether model is deterministic.

    This function invokes the provided model twice on each of the first
    NUM_ROWS_DETERMINISTIC rows in the dataset. If the two model outputs
    for each input are the same for all rows, the model is considered deterministic.

    :param model: A ModelRunner instance representing the model under investigation.
    :param dataset: A Ray Dataset that includes a model input column.
    :param prompt_template: The template used to compose the prompt from the model input.
    :param model_input_column_name: Model input column name.
    :returns: Whether the model is deterministic.
    """
    prompt_composer = PromptComposer(prompt_template)
    for row in dataset.limit(NUM_ROWS_DETERMINISTIC).iter_rows():
        prompt = prompt_composer.compose(row[model_input_column_name])
        model_output = model.predict(prompt)[0]
        if model.predict(prompt)[0] != model_output:
            return False
    return True
Heuristic for whether model is deterministic.
This function invokes the provided model twice on each of the first NUM_ROWS_DETERMINISTIC rows in the dataset. If the two model outputs for each input are the same for all rows, the model is considered deterministic.
Parameters
- model: A ModelRunner instance representing the model under investigation.
- dataset: A Ray Dataset that includes a model input column.
- prompt_template: The template used to compose the prompt from the model input.
- model_input_column_name: Model input column name.

Returns
Whether the model is deterministic.
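A sketch using a hypothetical stand-in for a deterministic ModelRunner; the `$model_input` placeholder and the default "model_input" column name are assumptions:

import ray.data
from fmeval.eval_algorithms.util import verify_model_determinism


class EchoModelRunner:
    # Hypothetical deterministic stand-in for a concrete ModelRunner.
    def predict(self, prompt):
        return prompt, None


data = ray.data.from_items([{"model_input": "hello"}, {"model_input": "world"}])

print(
    verify_model_determinism(
        model=EchoModelRunner(),
        dataset=data,
        prompt_template="$model_input",  # placeholder name assumed
    )
)  # True, since the same prompt always yields the same output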
def create_model_invocation_pipeline(model: ModelRunner, prompt_template: str) -> TransformPipeline:
    """Create a transform pipeline for performing the standard action of invoking a model on a prompt.

    :param model: The model to be invoked.
    :param prompt_template: The template used for constructing prompts (out of raw inputs)
        that will be fed to the model.
    :returns: A TransformPipeline instance containing a GeneratePrompt transform that uses `prompt_template`
        and a GetModelOutputs transform for invoking the model on the generated prompts.
    """
    gen_prompt = GeneratePrompt(
        input_keys=[DatasetColumns.MODEL_INPUT.value.name],
        output_keys=[DatasetColumns.PROMPT.value.name],
        prompt_template=prompt_template,
    )
    get_model_outputs = GetModelOutputs(
        input_to_output_keys={DatasetColumns.PROMPT.value.name: [DatasetColumns.MODEL_OUTPUT.value.name]},
        model_runner=model,
    )
    return TransformPipeline([gen_prompt, get_model_outputs])
Create a transform pipeline for performing the standard action of invoking a model on a prompt.
Parameters
- model: The model to be invoked.
- prompt_template: The template used for constructing prompts (out of raw inputs) that will be fed to the model.

Returns
A TransformPipeline instance containing a GeneratePrompt transform that uses `prompt_template` and a GetModelOutputs transform for invoking the model on the generated prompts.
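A sketch of building and applying the pipeline. The stand-in runner, the `$model_input` placeholder, and the use of the pipeline's `execute` method on a Ray dataset are assumptions to check against your fmeval version:

import ray.data
from fmeval.eval_algorithms.util import create_model_invocation_pipeline


class UpperCaseModelRunner:
    # Hypothetical stand-in for a concrete ModelRunner.
    def predict(self, prompt):
        return prompt.upper(), None


pipeline = create_model_invocation_pipeline(
    model=UpperCaseModelRunner(),
    prompt_template="Summarize the following text: $model_input",  # placeholder name assumed
)

data = ray.data.from_items([{"model_input": "fmeval evaluates foundation models."}])
result = pipeline.execute(data)  # assumed TransformPipeline entry point
print(result.take(1))  # rows now also contain "prompt" and "model_output" columns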