fmeval.eval_algorithms.qa_accuracy

  1import logging
  2
  3from functools import partial
  4from typing import Any, Callable, Dict, List, Optional, Set, Union
  5from dataclasses import dataclass
  6from nltk.metrics.scores import f_measure, precision, recall
  7
  8from fmeval.constants import (
  9    BERTSCORE_DEFAULT_MODEL,
 10    DatasetColumns,
 11    MEAN,
 12)
 13from fmeval.data_loaders.util import get_dataset
 14from fmeval.data_loaders.data_config import DataConfig
 15from fmeval.eval_algorithms.common import evaluate_dataset
 16from fmeval.eval_algorithms.helper_models.helper_model import BertscoreHelperModelTypes, BertscoreHelperModel
 17from fmeval.eval_algorithms.save_strategy import SaveStrategy
 18from fmeval.eval_algorithms.util import validate_dataset, get_dataset_configs, normalize_text_quac_protocol
 19from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig, EvalAlgorithmInterface
 20from fmeval.eval_algorithms import (
 21    EvalAlgorithm,
 22    EvalOutput,
 23    EvalScore,
 24)
 25from fmeval.transforms.common import SplitWithDelimiter
 26from fmeval.model_runners.model_runner import ModelRunner
 27from fmeval.transforms.summarization_accuracy_metrics import BertScore, BERT_SCORE
 28from fmeval.transforms.transform import Transform
 29from fmeval.transforms.transform_pipeline import TransformPipeline
 30from fmeval.transforms.util import validate_call
 31from fmeval.util import (
 32    get_eval_results_path,
 33    require,
 34    create_shared_resource,
 35    cleanup_shared_resource,
 36    assert_condition,
 37)
 38
 39F1_SCORE = "f1_score"
 40EXACT_MATCH_SCORE = "exact_match_score"
 41QUASI_EXACT_MATCH_SCORE = "quasi_exact_match_score"
 42PRECISION_OVER_WORDS = "precision_over_words"
 43RECALL_OVER_WORDS = "recall_over_words"
 44
 45# for metrics that are included in the QAAccuracyScores Transform
 46QA_ACCURACY_SCORE_NAMES = [
 47    F1_SCORE,
 48    EXACT_MATCH_SCORE,
 49    QUASI_EXACT_MATCH_SCORE,
 50    PRECISION_OVER_WORDS,
 51    RECALL_OVER_WORDS,
 52]
 53
 54# for all metrics in qa_accuracy (metrics from both the QAAccuracyScores Transform and the BertScore Transform)
 55SCORE_NAMES = QA_ACCURACY_SCORE_NAMES + [BERT_SCORE]
 56
 57POSSIBLE_TARGETS = "possible_targets"
 58logger = logging.getLogger(__name__)
 59
 60
 61def _normalize_and_strip_text(text: str, *, normalize_text: bool = False, strip_text: bool = False) -> str:
 62    """
 63    Combine two common operations -- normalization and stripping -- used by several metrics.
 64    :param normalize_text: Normalize the text. We use the QuAC protocol for normalization.
 65    :param strip_text: Strip the text, that is, remove whitespace characters from the beginning and end of the text.
 66    :returns: The text, normalized (if the normalize_text flag is set to True) and stripped (if the strip_text
 67              flag is set to True). If neither flag is set, the original text is returned.
 68    """
 69    if strip_text:
 70        text = text.strip()
 71    if normalize_text:  # pragma: no branch
 72        text = normalize_text_quac_protocol(text)
 73    return text
 74
 75
 76def _split(text: str) -> Set[str]:
 77    """
 78    Splits the text on string.whitespace characters (namely ' \t\n\r\x0b\x0c') and converts the resulting
 79    list into a set, which is used to compute the precision, recall, and F1 scores.
 80    """
 81    return set(text.split())
 82
 83
 84def _f1_score(
 85    model_output: str, target_output: str, *, normalize_text: bool = False, strip_text: bool = False
 86) -> float:
 87    """
 88    Inspired by the implementation in HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L182
 89
 90    Given the model output and the target output, compute the f1 score between the two.
 91    F1-score is the harmonic mean of precision and recall, where precision is the fraction of
 92    words in the prediction that are also found in the target output, and recall is the fraction
 93    of words in the target output that are also found in the prediction.
 94
 95    :param model_output: The output of a model that we want to evaluate.
 96    :param target_output: The reference or the "ground truth" output.
 97    :param normalize_text: Normalize the text before computing f1. We normalize the text following the QuAC protocol.
 98    :param strip_text: Strip the model_output and the target_output before computing the f1 score. Stripping amounts to removing whitespace characters from the beginning and end of the strings.
 99    :returns: The F1 score.
100    """
101    model_output = _normalize_and_strip_text(model_output, normalize_text=normalize_text, strip_text=strip_text)
102    target_output = _normalize_and_strip_text(target_output, normalize_text=normalize_text, strip_text=strip_text)
103    ret = f_measure(reference=_split(target_output), test=_split(model_output))
104    if ret is None:  # pragma: no cover
105        return 0.0
106    else:
107        return float(ret)
108
109
110def _precision(
111    model_output: str, target_output: str, *, normalize_text: bool = False, strip_text: bool = False
112) -> float:
113    """
114    Given the model output and the target output, compute the precision.
115    Precision is the fraction of words in the prediction that are also found in the target output.
116    If the normalize_text flag is set, the text is normalized following the QuAC protocol before computing precision.
117
118    :param model_output: The output of a model that we want to evaluate.
119    :param target_output: The reference or the "ground truth" output.
120    :param normalize_text: Normalize the text before computing precision.
121    :param strip_text: Strip the model_output and the target_output before computing precision. Stripping amounts to removing whitespace characters from the beginning and end of the strings.
122    :returns: Precision.
123    """
124    model_output = _normalize_and_strip_text(model_output, normalize_text=normalize_text, strip_text=strip_text)
125    target_output = _normalize_and_strip_text(target_output, normalize_text=normalize_text, strip_text=strip_text)
126    ret = precision(reference=_split(target_output), test=_split(model_output))
127    if ret is None:  # pragma: no cover
128        return 0.0
129    else:
130        return float(ret)
131
132
133def _recall(model_output: str, target_output: str, *, normalize_text: bool = False, strip_text: bool = False) -> float:
134    """
135    Given the model output and the target output, compute the recall.
136    Recall is the fraction of words in the target output that are also found in the prediction.
137    If the normalize_text flag is set, the text is normalized following the QuAC protocol before computing recall.
138
139    :param model_output: The output of a model that we want to evaluate.
140    :param target_output: The reference or the "ground truth" output.
141    :param normalize_text: Normalize the text before computing recall.
142    :param strip_text: Strip the model_output and the target_output before computing recall. Stripping amounts to removing whitespace characters from the beginning and end of the strings.
143    :returns: Recall.
144    """
145    model_output = _normalize_and_strip_text(model_output, normalize_text=normalize_text, strip_text=strip_text)
146    target_output = _normalize_and_strip_text(target_output, normalize_text=normalize_text, strip_text=strip_text)
147    ret = recall(reference=_split(target_output), test=_split(model_output))
148    if ret is None:  # pragma: no cover
149        return 0.0
150    else:
151        return float(ret)
152
153
154def _exact_match_score(model_output: str, target_output: str) -> float:
155    """
156    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L137
157    Computes if the two strings exactly match.
158
159    :param model_output: The output of a model that we want to evaluate.
160    :param target_output: The reference or the "ground truth" output.
161    :returns: 0 if the two inputs do not match, else 1.
162    """
163    return float(model_output.strip() == target_output.strip())
164
165
166def _quasi_exact_match_score(model_output: str, target_output: str) -> float:
167    """
168    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144
169    Computes if the two strings exactly match after normalizing them.
170
171    Normalization: Given a text, normalize it using the SQuAD/QuAC protocol (remove punctuation, excess spaces,
172    and articles) and return the lowercased tokens.
173    The SQuAD (https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) and
174    QuAC (https://s3.amazonaws.com/my89public/quac/scorer.py) benchmarks use this protocol to normalize text before
175    evaluating it. See fmeval/src/fmeval/eval_algorithms/util.py for details.
176
177    :param model_output: The output of a model that we want to evaluate.
178    :param target_output: The reference or the "ground truth" output.
179    :returns: 1 if the two strings match after normalization else 0.
180    """
181    return float(
182        normalize_text_quac_protocol(model_output.strip()) == normalize_text_quac_protocol(target_output.strip())
183    )
184
185
186QA_ACCURACY_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
187    F1_SCORE: partial(_f1_score, normalize_text=True, strip_text=True),
188    EXACT_MATCH_SCORE: _exact_match_score,
189    QUASI_EXACT_MATCH_SCORE: _quasi_exact_match_score,
190    PRECISION_OVER_WORDS: partial(_precision, normalize_text=True, strip_text=True),
191    RECALL_OVER_WORDS: partial(_recall, normalize_text=True, strip_text=True),
192}
193
194
195class QAAccuracyScores(Transform):
196    def __init__(
197        self,
198        target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
199        model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
200        output_keys: List[str] = QA_ACCURACY_SCORE_NAMES,
201        target_output_delimiter: Optional[str] = "<OR>",
202    ):
203        super().__init__(target_output_key, model_output_key, output_keys, target_output_delimiter)
204        self.register_input_output_keys(
205            input_keys=[target_output_key, model_output_key],
206            output_keys=output_keys,
207        )
208        self.target_output_key = target_output_key
209        self.model_output_key = model_output_key
210        self.output_keys = output_keys
211        self.target_output_delimiter = target_output_delimiter
212
213    def _get_score(
214        self,
215        target_output: str,
216        model_output: str,
217        score_fn: Callable[..., float],
218        **fn_kwargs,
219    ) -> float:
220        """Compute an accuracy score from target_output and model_output.
221
222        :param target_output: A single string potentially containing multiple
223            target output values. If there are multiple target output values,
224            they will be separated by `target_output_delimiter`.
225            For example, if valid target outputs for a question are ["UK", "England"]
226            and the delimiter is "<OR>", then `target_output` will be "UK<OR>England".
227        :param model_output: The model output.
228        :param score_fn: One of the functions in QA_ACCURACY_SCORES_TO_FUNCS.
229        :param fn_kwargs: Additional keyword arguments passed to `score_fn`.
230        :returns: The maximum score over the possible target outputs, which are obtained by splitting
231            `target_output` on `self.target_output_delimiter`.
232        """
233        possible_targets = target_output.split(self.target_output_delimiter)
234        return max([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])
235
236    @validate_call
237    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
238        target_output = record[self.target_output_key]
239        model_output = record[self.model_output_key]
240        for output_key, score_name in zip(self.output_keys, QA_ACCURACY_SCORE_NAMES):
241            record[output_key] = self._get_score(
242                target_output=target_output,
243                model_output=model_output,
244                score_fn=QA_ACCURACY_SCORES_TO_FUNCS[score_name],
245            )
246        return record
247
248
249@dataclass(frozen=True)
250class QAAccuracyConfig(EvalAlgorithmConfig):
251    """Configures the QA Accuracy evaluation algorithm.
252
253    :param target_output_delimiter: There can be multiple valid target outputs for a given question.
254        This delimiter is used to combine all possible target outputs into a single string.
255        For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the
256        target output text will be "UK<OR>England".
257    :param model_type_for_bertscore: BERT model type to use for computing BERT score.
258    """
259
260    target_output_delimiter: Optional[str] = "<OR>"
261    model_type_for_bertscore: str = BERTSCORE_DEFAULT_MODEL
262
263    def __post_init__(self):
264        require(
265            self.target_output_delimiter != "",
266            "Empty target_output_delimiter is provided. "
267            "Please either provide a non-empty string, or set it to None.",
268        )
269        require(
270            BertscoreHelperModelTypes.model_is_allowed(self.model_type_for_bertscore),
271            f"Invalid model_type_for_bertscore: {self.model_type_for_bertscore} requested in "
272            f"QAAccuracyConfig. Please choose from acceptable values: "
273            f"{BertscoreHelperModelTypes.model_list()}.",
274        )
275
276
277class QAAccuracy(EvalAlgorithmInterface):
278    """
279    This evaluation measures how well the model performs in question answering (QA) tasks. The model is queried
280    for a range of facts, and we evaluate the accuracy of its response by comparing model output to target answer under different metrics:
281
282    1. Exact match (EM): Binary score, 1 if model output and target answer match exactly.
283    2. Quasi-exact match: Binary score. Similar to exact match, but both model output and target answer are normalized first
284    by removing any articles and punctuation.
285    3. Precision over Words: The fraction of words in the prediction that are also found in the target answer. The text is normalized as before.
286    4. Recall over Words: The fraction of words in the target answer that are also found in the prediction.
287    5. F1 over Words: The harmonic mean of precision and recall, over words (normalized).
288    6. [BERTScore](https://arxiv.org/pdf/1904.09675.pdf) uses a second ML model (from the BERT family) to compute sentence embeddings and compare their cosine similarity. This score may account for additional linguistic flexibility over the other QAAccuracy metrics since semantically similar sentences should be embedded closer to each other.
289
290
291    Precision, Recall and F1 over Words are more flexible as they assign non-zero scores to
292    model answers containing parts of the ground truth. Specifically, recall measures whether the ground truth answer is _contained_ in the
293    model output, whereas precision penalizes verbosity.
294
295    All metrics are reported on average over `num_records` datapoints and per category, resulting in a number between 0
296    (worst) and 1 (best) for each metric.
297    """
298
299    eval_name = EvalAlgorithm.QA_ACCURACY.value
300
301    def __init__(self, eval_algorithm_config: QAAccuracyConfig = QAAccuracyConfig()):
302        """QAAccuracy initializer.
303
304        :param eval_algorithm_config: QA Accuracy evaluation algorithm config.
305        """
306        super().__init__(eval_algorithm_config)
307
308        self.bertscore_model = BertscoreHelperModel(eval_algorithm_config.model_type_for_bertscore)
309
310        # Store the QAAccuracyScores transform (the non-BERTScore metrics) as self.transform.
311        self.transform = QAAccuracyScores(target_output_delimiter=eval_algorithm_config.target_output_delimiter)
312
313        self.split_transform = SplitWithDelimiter(
314            input_key=DatasetColumns.TARGET_OUTPUT.value.name,
315            output_key=POSSIBLE_TARGETS,
316            target_output_delimiter=eval_algorithm_config.target_output_delimiter,
317        )
318        self.bert_scores = BertScore(
319            target_output_keys=None,
320            model_output_keys=[DatasetColumns.MODEL_OUTPUT.value.name],
321            output_keys=[BERT_SCORE],
322            allow_duplicate_input_keys=True,
323            target_output_keys_provider=POSSIBLE_TARGETS,
324            bertscore_model=self.bertscore_model,
325        )
326
327        self._eval_algorithm_config = eval_algorithm_config
328
329        self.pipeline = TransformPipeline([self.transform, self.split_transform, self.bert_scores])
330
331    def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:
332        """Compute QA accuracy metrics for a single sample.
333
334        :param target_output: The expected/desired model output.
335        :param model_output: The actual model output.
336        :returns: A list of EvalScore objects, one for each of the QA accuracy metrics.
337        """
338        sample = {
339            DatasetColumns.TARGET_OUTPUT.value.name: target_output,
340            DatasetColumns.MODEL_OUTPUT.value.name: model_output,
341        }
342        result = self.pipeline.execute_record(sample)
343        return [EvalScore(name=score_name, value=result[score_name]) for score_name in SCORE_NAMES]
344
345    def evaluate(
346        self,
347        model: Optional[ModelRunner] = None,
348        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
349        prompt_template: Optional[str] = None,
350        num_records: int = 100,
351        save: bool = False,
352        save_strategy: Optional[SaveStrategy] = None,
353    ) -> List[EvalOutput]:
354        """Compute QA accuracy metrics on one or more datasets.
355
356        :param model: An instance of ModelRunner representing the model under evaluation.
357            If this argument is None, the `dataset_config` argument must not be None,
358            and must correspond to a dataset that already contains a column with model outputs.
359        :param dataset_config: Configures a single dataset or list of datasets used for the
360            evaluation. If not provided, this method will run evaluations using all of its
361            supported built-in datasets.
362        :param prompt_template: A template used to generate prompts that are fed to the model.
363            If not provided, defaults will be used. If provided, `model` must not be None.
364        :param num_records: The number of records to be sampled randomly from the input dataset(s)
365            used to perform the evaluation(s).
366        :param save: If set to true, prompt responses and scores will be saved to a file.
367        :param save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations.
368            If not specified, outputs are saved to the path configured by the EVAL_RESULTS_PATH environment variable.
369            If that environment variable is also not configured, outputs are saved to the default path `/tmp/eval_results/`.
370
371        :return: A list of EvalOutput objects.
372        """
373        # Create a shared resource to be used during the evaluation.
374        bertscore_shared_resource = create_shared_resource(self.bertscore_model)
375
376        bert_scores = BertScore(
377            target_output_keys=None,
378            model_output_keys=[DatasetColumns.MODEL_OUTPUT.value.name],
379            output_keys=[BERT_SCORE],
380            allow_duplicate_input_keys=True,
381            target_output_keys_provider=POSSIBLE_TARGETS,
382            bertscore_model=bertscore_shared_resource,
383        )
384
385        # Create a new pipeline that uses the shared resource instead of self.bertscore_model.
386        pipeline = TransformPipeline([self.transform, self.split_transform, bert_scores])
387
388        dataset_configs = get_dataset_configs(dataset_config, self.eval_name)
389        eval_outputs = []
390        for dataset_config in dataset_configs:
391            dataset = get_dataset(dataset_config, num_records)
392            validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name])
393            eval_output = evaluate_dataset(
394                dataset=dataset,
395                pipeline=pipeline,
396                dataset_name=dataset_config.dataset_name,
397                eval_name=self.eval_name,
398                metric_names=SCORE_NAMES,
399                eval_results_path=get_eval_results_path(),
400                model=model,
401                prompt_template=prompt_template,
402                agg_method=MEAN,
403                save=save,
404                save_strategy=save_strategy,
405            )
406            eval_outputs.append(eval_output)
407        cleanup_shared_resource(bertscore_shared_resource)
408        return eval_outputs
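
To make the word-overlap arithmetic above concrete, here is a small, self-contained sketch (not part of fmeval) that mirrors what _precision, _recall, and _f1_score compute once the text has been split into word sets; plain lowercasing stands in for the full QuAC normalization here.

def word_overlap_scores(model_output: str, target_output: str) -> dict:
    # Assumption: lowercase + whitespace split approximates normalize_text_quac_protocol + _split.
    prediction = set(model_output.lower().split())
    reference = set(target_output.lower().split())
    overlap = len(prediction & reference)
    precision = overlap / len(prediction) if prediction else 0.0
    recall = overlap / len(reference) if reference else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {"precision_over_words": precision, "recall_over_words": recall, "f1_score": f1}

# "the capital of England" vs. "London is the capital of England":
# precision = 4/4 = 1.0, recall = 4/6 ≈ 0.67, f1 ≈ 0.8
print(word_overlap_scores("the capital of England", "London is the capital of England"))
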
class QAAccuracyScores(fmeval.transforms.transform.Transform):

A Transform represents a single operation that consumes a record and outputs another.

Typically, the output record is the same object as the input; the Transform simply mutates its input (usually by augmenting it with new data). However, the output record can also be a new object, independent of the input record.

The logic for creating the output record is implemented in the Transform's __call__ method, which takes a record as its sole argument. Any additional data besides this record that is required to perform the transformation logic should be stored as instance attributes in the Transform.
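
A minimal sketch of this for QAAccuracyScores (assuming the default column names 'target_output' and 'model_output' and the default '<OR>' delimiter): calling the transform reads the two input keys and augments the record with one key per QA accuracy score.

from fmeval.eval_algorithms.qa_accuracy import QAAccuracyScores

scores = QAAccuracyScores()
record = {"target_output": "UK<OR>England", "model_output": "England"}
record = scores(record)
# The record now also holds f1_score, exact_match_score, quasi_exact_match_score,
# precision_over_words and recall_over_words; each is the maximum over the possible targets.
print(record["exact_match_score"])  # 1.0, because the model output matches the target "England" exactly
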

QAAccuracyScores( target_output_key: str = 'target_output', model_output_key: str = 'model_output', output_keys: List[str] = ['f1_score', 'exact_match_score', 'quasi_exact_match_score', 'precision_over_words', 'recall_over_words'], target_output_delimiter: Optional[str] = '<OR>')

Transform initializer.

Concrete subclasses of Transform should always call super().__init__ with every argument passed to their own __init__ method. Transform.__init__ stores all positional arguments in the args instance attribute and all keyword arguments in the kwargs instance attribute. This data is passed to Ray when Ray creates copies of this Transform instance to perform parallel execution.

Note: The input_keys and output_keys attributes are initialized to None and only assigned a meaningful value if the register_input_output_keys method is called. This method is used in conjunction with the validate_call decorator to perform validations of the __call__ inputs and outputs at runtime. While it is not strictly necessary to utilize register_input_output_keys and validate_call when implementing your own transforms, these methods are used in all built-in transforms.

Parameters
  • *args: Variable length argument list.
  • **kwargs: Arbitrary keyword arguments.
target_output_key
model_output_key
output_keys
target_output_delimiter
@dataclass(frozen=True)
class QAAccuracyConfig(fmeval.eval_algorithms.eval_algorithm.EvalAlgorithmConfig):

Configures the QA Accuracy evaluation algorithm.

Parameters
  • target_output_delimiter: There can be multiple valid target outputs for a given question. This delimiter is used to combine all possible target outputs into a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the target output text will be "UK<OR>England".
  • model_type_for_bertscore: BERT model type to use for computing BERT score.
QAAccuracyConfig( target_output_delimiter: Optional[str] = '<OR>', model_type_for_bertscore: str = 'microsoft/deberta-xlarge-mnli')
target_output_delimiter: Optional[str] = '<OR>'
model_type_for_bertscore: str = 'microsoft/deberta-xlarge-mnli'
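
For example, a config sketch (the model type shown is simply the default; any value outside BertscoreHelperModelTypes.model_list() is rejected by __post_init__, as is an empty delimiter):

from fmeval.eval_algorithms.qa_accuracy import QAAccuracyConfig

config = QAAccuracyConfig(
    target_output_delimiter=";",  # hypothetical custom delimiter; an empty string raises an error
    model_type_for_bertscore="microsoft/deberta-xlarge-mnli",
)
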
class QAAccuracy(fmeval.eval_algorithms.eval_algorithm.EvalAlgorithmInterface):

This evaluation measures how well the model performs in question answering (QA) tasks. The model is queried for a range of facts, and we evaluate the accuracy of its response by comparing model output to target answer under different metrics:

  1. Exact match (EM): Binary score, 1 if model output and target answer match exactly.
  2. Quasi-exact match: Binary score. Similar to exact match, but both model output and target answer are normalized first by removing any articles and punctuation.
  3. Precision over Words: The fraction of words in the prediction that are also found in the target answer. The text is normalized as before.
  4. Recall over Words: The fraction of words in the target answer that are also found in the prediction.
  5. F1 over Words: The harmonic mean of precision and recall, over words (normalized).
  6. BERTScore uses a second ML model (from the BERT family) to compute sentence embeddings and compare their cosine similarity. This score may account for additional linguistic flexibility over the other QAAccuracy metrics since semantically similar sentences should be embedded closer to each other.

Precision, Recall and F1 over Words are more flexible as they assign non-zero scores to model answers containing parts of the ground truth. Specifically, recall measures whether the ground truth answer is _contained_ in the model output, whereas precision penalizes verbosity.

All metrics are reported on average over num_records datapoints and per category, resulting in a number between 0 (worst) and 1 (best) for each metric.
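
As a hand-worked illustration of the flexibility described above (a sketch, not fmeval code; QuAC-style normalization drops the article "The"): take the target answer "London" and the verbose model answer "The capital of England is London".

prediction = {"capital", "of", "england", "is", "london"}  # normalized words in the model answer
target = {"london"}                                        # normalized words in the target answer
recall_over_words = len(prediction & target) / len(target)          # 1/1 = 1.0: the ground truth is contained
precision_over_words = len(prediction & target) / len(prediction)   # 1/5 = 0.2: verbosity is penalized
f1_score = (2 * precision_over_words * recall_over_words
            / (precision_over_words + recall_over_words))           # ≈ 0.33
# exact_match_score and quasi_exact_match_score are both 0.0 for this answer.
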

QAAccuracy( eval_algorithm_config: QAAccuracyConfig = QAAccuracyConfig(target_output_delimiter='<OR>', model_type_for_bertscore='microsoft/deberta-xlarge-mnli'))

QAAccuracy initializer.

Parameters
  • eval_algorithm_config: QA Accuracy evaluation algorithm config.
eval_name = 'qa_accuracy'
bertscore_model
transform
split_transform
bert_scores
pipeline
def evaluate_sample( self, target_output: str, model_output: str) -> List[fmeval.eval_algorithms.EvalScore]:

Compute QA accuracy metrics for a single sample.

Parameters
  • target_output: The expected/desired model output.
  • model_output: The actual model output.
Returns

A list of EvalScore objects, one for each of the QA accuracy metrics.
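
A minimal usage sketch (note that constructing QAAccuracy loads the BERTScore helper model, and the bertscore value depends on that model):

from fmeval.eval_algorithms.qa_accuracy import QAAccuracy, QAAccuracyConfig

qa_accuracy = QAAccuracy(QAAccuracyConfig(target_output_delimiter="<OR>"))
scores = qa_accuracy.evaluate_sample(
    target_output="UK<OR>England<OR>United Kingdom",
    model_output="England",
)
for score in scores:
    print(score.name, score.value)
# Prints one line per metric: f1_score, exact_match_score, quasi_exact_match_score,
# precision_over_words, recall_over_words and bertscore.
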
def evaluate( self, model: Optional[fmeval.model_runners.model_runner.ModelRunner] = None, dataset_config: Union[fmeval.data_loaders.data_config.DataConfig, List[fmeval.data_loaders.data_config.DataConfig], NoneType] = None, prompt_template: Optional[str] = None, num_records: int = 100, save: bool = False, save_strategy: Optional[fmeval.eval_algorithms.save_strategy.SaveStrategy] = None) -> List[fmeval.eval_algorithms.EvalOutput]:

Compute QA accuracy metrics on one or more datasets.

Parameters
  • model: An instance of ModelRunner representing the model under evaluation. If this argument is None, the dataset_config argument must not be None, and must correspond to a dataset that already contains a column with model outputs.
  • dataset_config: Configures a single dataset or list of datasets used for the evaluation. If not provided, this method will run evaluations using all of its supported built-in datasets.
  • prompt_template: A template used to generate prompts that are fed to the model. If not provided, defaults will be used. If provided, model must not be None.
  • num_records: The number of records to be sampled randomly from the input dataset(s) used to perform the evaluation(s).
  • save: If set to true, prompt responses and scores will be saved to a file.
  • save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations. If not specified, outputs are saved to the path configured by the EVAL_RESULTS_PATH environment variable; if that variable is also not configured, they are saved to the default path /tmp/eval_results/.
Returns

A list of EvalOutput objects.
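
An end-to-end sketch for a dataset that already contains a model-output column, so `model` can be left as None. The dataset URI and column locations below are placeholders, and the DataConfig field names follow fmeval's data-loader API; double-check them against the DataConfig documentation.

from fmeval.constants import MIME_TYPE_JSONLINES
from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.qa_accuracy import QAAccuracy

data_config = DataConfig(
    dataset_name="my_qa_dataset",                # hypothetical name
    dataset_uri="my_qa_dataset.jsonl",           # placeholder path
    dataset_mime_type=MIME_TYPE_JSONLINES,
    model_input_location="question",             # JMESPath to the question field
    target_output_location="answer",             # JMESPath to the target answer(s), e.g. "UK<OR>England"
    model_output_location="model_answer",        # JMESPath to the pre-computed model output
)

eval_outputs = QAAccuracy().evaluate(dataset_config=data_config, num_records=100, save=True)
for eval_output in eval_outputs:
    # Assumes EvalOutput exposes dataset_name and dataset_scores.
    print(eval_output.dataset_name, eval_output.dataset_scores)
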