fmeval.eval_algorithms.qa_accuracy
import logging

from functools import partial
from typing import Any, Callable, Dict, List, Optional, Set, Union
from dataclasses import dataclass
from nltk.metrics.scores import f_measure, precision, recall

from fmeval.constants import (
    BERTSCORE_DEFAULT_MODEL,
    DatasetColumns,
    MEAN,
)
from fmeval.data_loaders.util import get_dataset
from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.common import evaluate_dataset
from fmeval.eval_algorithms.helper_models.helper_model import BertscoreHelperModelTypes, BertscoreHelperModel
from fmeval.eval_algorithms.save_strategy import SaveStrategy
from fmeval.eval_algorithms.util import validate_dataset, get_dataset_configs, normalize_text_quac_protocol
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig, EvalAlgorithmInterface
from fmeval.eval_algorithms import (
    EvalAlgorithm,
    EvalOutput,
    EvalScore,
)
from fmeval.transforms.common import SplitWithDelimiter
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.transforms.summarization_accuracy_metrics import BertScore, BERT_SCORE
from fmeval.transforms.transform import Transform
from fmeval.transforms.transform_pipeline import TransformPipeline
from fmeval.transforms.util import validate_call
from fmeval.util import (
    get_eval_results_path,
    require,
    create_shared_resource,
    cleanup_shared_resource,
    assert_condition,
)

F1_SCORE = "f1_score"
EXACT_MATCH_SCORE = "exact_match_score"
QUASI_EXACT_MATCH_SCORE = "quasi_exact_match_score"
PRECISION_OVER_WORDS = "precision_over_words"
RECALL_OVER_WORDS = "recall_over_words"

# for metrics that are included in the QAAccuracyScores Transform
QA_ACCURACY_SCORE_NAMES = [
    F1_SCORE,
    EXACT_MATCH_SCORE,
    QUASI_EXACT_MATCH_SCORE,
    PRECISION_OVER_WORDS,
    RECALL_OVER_WORDS,
]

# for all metrics in qa_accuracy (metrics from both the QAAccuracyScores Transform and the BertScore Transform)
SCORE_NAMES = QA_ACCURACY_SCORE_NAMES + [BERT_SCORE]

POSSIBLE_TARGETS = "possible_targets"
logger = logging.getLogger(__name__)


def _normalize_and_strip_text(text: str, *, normalize_text: bool = False, strip_text: bool = False) -> str:
    """
    Combine two common operations -- normalization and stripping -- used by several metrics.

    :param normalize_text: Normalize the text. We use the QuAC protocol for normalization.
    :param strip_text: Strip the text, that is, remove whitespace characters from the beginning and end of the text.
    :returns: The normalized (if the normalize_text flag was set to True) and stripped (if the strip_text flag was
        set to True) text. If neither of the flags was set, the function returns the original text.
    """
    if strip_text:
        text = text.strip()
    if normalize_text:  # pragma: no branch
        text = normalize_text_quac_protocol(text)
    return text


def _split(text: str) -> Set[str]:
    """
    Split the text on string.whitespace characters (namely ' \t\n\r\x0b\x0c') and convert the resulting list
    into a set; used to compute the precision, recall and F1 scores.
    """
    return set(text.split())


def _f1_score(
    model_output: str, target_output: str, *, normalize_text: bool = False, strip_text: bool = False
) -> float:
    """
    Inspired by the implementation in HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L182

    Given the model output and the target output, compute the f1 score between the two.
    F1-score is the harmonic mean of precision and recall where precision is the fraction of
    words in the prediction that are also found in the target output and recall is the fraction
    of words in the target output that are also found in the prediction.

    :param model_output: The output of a model that we want to evaluate.
    :param target_output: The reference or the "ground truth" output.
    :param normalize_text: Normalize the text before computing f1. We normalize the text following the QuAC protocol.
    :param strip_text: Strip the model_output and the target_output before computing the f1 score. Stripping
        amounts to removing whitespace characters from the beginning and end of the strings.
    :returns: The F1 score.
    """
    model_output = _normalize_and_strip_text(model_output, normalize_text=normalize_text, strip_text=strip_text)
    target_output = _normalize_and_strip_text(target_output, normalize_text=normalize_text, strip_text=strip_text)
    ret = f_measure(reference=_split(target_output), test=_split(model_output))
    if ret is None:  # pragma: no cover
        return 0.0
    else:
        return float(ret)


def _precision(
    model_output: str, target_output: str, *, normalize_text: bool = False, strip_text: bool = False
) -> float:
    """
    Given the model output and the target output, compute the precision.
    Precision is the fraction of words in the prediction that are also found in the target output.
    Before computing precision, we normalize the text following the QuAC protocol.

    :param model_output: The output of a model that we want to evaluate.
    :param target_output: The reference or the "ground truth" output.
    :param normalize_text: Normalize the text before computing precision.
    :param strip_text: Strip the model_output and the target_output before computing precision. Stripping
        amounts to removing whitespace characters from the beginning and end of the strings.
    :returns: Precision.
    """
    model_output = _normalize_and_strip_text(model_output, normalize_text=normalize_text, strip_text=strip_text)
    target_output = _normalize_and_strip_text(target_output, normalize_text=normalize_text, strip_text=strip_text)
    ret = precision(reference=_split(target_output), test=_split(model_output))
    if ret is None:  # pragma: no cover
        return 0.0
    else:
        return float(ret)


def _recall(model_output: str, target_output: str, *, normalize_text: bool = False, strip_text: bool = False) -> float:
    """
    Given the model output and the target output, compute the recall.
    Recall is the fraction of words in the target output that are also found in the prediction.
    Before computing recall, we normalize the text following the QuAC protocol.

    :param model_output: The output of a model that we want to evaluate.
    :param target_output: The reference or the "ground truth" output.
    :param normalize_text: Normalize the text before computing recall.
    :param strip_text: Strip the model_output and the target_output before computing recall. Stripping
        amounts to removing whitespace characters from the beginning and end of the strings.
    :returns: Recall.
    """
    model_output = _normalize_and_strip_text(model_output, normalize_text=normalize_text, strip_text=strip_text)
    target_output = _normalize_and_strip_text(target_output, normalize_text=normalize_text, strip_text=strip_text)
    ret = recall(reference=_split(target_output), test=_split(model_output))
    if ret is None:  # pragma: no cover
        return 0.0
    else:
        return float(ret)


def _exact_match_score(model_output: str, target_output: str) -> float:
    """
    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L137
    Computes if the two strings exactly match.

    :param model_output: The output of a model that we want to evaluate.
    :param target_output: The reference or the "ground truth" output.
    :returns: 0 if the two inputs do not match, else 1.
    """
    return float(model_output.strip() == target_output.strip())


def _quasi_exact_match_score(model_output: str, target_output: str) -> float:
    """
    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144
    Computes if the two strings exactly match after normalizing them.

    Normalization: Given a text, normalize it using the SQuAD/QuAC protocol (remove punctuation, excess spaces,
    and articles) and return the lowercased tokens.
    SQuAD (https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) and
    QuAC (https://s3.amazonaws.com/my89public/quac/scorer.py) use this protocol to normalize text before
    evaluating it. You can learn more in fmeval/src/fmeval/eval_algorithms/util.py.

    :param model_output: The output of a model that we want to evaluate.
    :param target_output: The reference or the "ground truth" output.
    :returns: 1 if the two strings match after normalization, else 0.
    """
    return float(
        normalize_text_quac_protocol(model_output.strip()) == normalize_text_quac_protocol(target_output.strip())
    )


QA_ACCURACY_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
    F1_SCORE: partial(_f1_score, normalize_text=True, strip_text=True),
    EXACT_MATCH_SCORE: _exact_match_score,
    QUASI_EXACT_MATCH_SCORE: _quasi_exact_match_score,
    PRECISION_OVER_WORDS: partial(_precision, normalize_text=True, strip_text=True),
    RECALL_OVER_WORDS: partial(_recall, normalize_text=True, strip_text=True),
}


class QAAccuracyScores(Transform):
    def __init__(
        self,
        target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
        model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
        output_keys: List[str] = QA_ACCURACY_SCORE_NAMES,
        target_output_delimiter: Optional[str] = "<OR>",
    ):
        super().__init__(target_output_key, model_output_key, output_keys, target_output_delimiter)
        self.register_input_output_keys(
            input_keys=[target_output_key, model_output_key],
            output_keys=output_keys,
        )
        self.target_output_key = target_output_key
        self.model_output_key = model_output_key
        self.output_keys = output_keys
        self.target_output_delimiter = target_output_delimiter

    def _get_score(
        self,
        target_output: str,
        model_output: str,
        score_fn: Callable[..., float],
        **fn_kwargs,
    ) -> float:
        """Compute an accuracy score from target_output and model_output.

        :param target_output: A single string potentially containing multiple
            target output values. If there are multiple target output values,
            they will be separated by `target_output_delimiter`.
            For example, if valid target outputs for a question are ["UK", "England"]
            and the delimiter is "<OR>", then `target_output` will be "UK<OR>England".
        :param model_output: The model output.
        :param score_fn: One of the functions in QA_ACCURACY_SCORES_TO_FUNCS.
        :returns: A computed QA accuracy score.
        """
        possible_targets = target_output.split(self.target_output_delimiter)
        return max([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        target_output = record[self.target_output_key]
        model_output = record[self.model_output_key]
        for output_key, score_name in zip(self.output_keys, QA_ACCURACY_SCORE_NAMES):
            record[output_key] = self._get_score(
                target_output=target_output,
                model_output=model_output,
                score_fn=QA_ACCURACY_SCORES_TO_FUNCS[score_name],
            )
        return record


@dataclass(frozen=True)
class QAAccuracyConfig(EvalAlgorithmConfig):
    """Configures the QA Accuracy evaluation algorithm.

    :param target_output_delimiter: There can be multiple valid target outputs for a given question.
        This delimiter is used to combine all possible target outputs into a single string.
        For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the
        target output text will be "UK<OR>England".
    :param model_type_for_bertscore: BERT model type to use for computing BERT score.
    """

    target_output_delimiter: Optional[str] = "<OR>"
    model_type_for_bertscore: str = BERTSCORE_DEFAULT_MODEL

    def __post_init__(self):
        require(
            self.target_output_delimiter != "",
            "Empty target_output_delimiter is provided. "
            "Please either provide a non-empty string, or set it to None.",
        )
        require(
            BertscoreHelperModelTypes.model_is_allowed(self.model_type_for_bertscore),
            f"Invalid model_type_for_bertscore: {self.model_type_for_bertscore} requested in "
            f"QAAccuracyConfig. Please choose from acceptable values: "
            f"{BertscoreHelperModelTypes.model_list()}.",
        )


class QAAccuracy(EvalAlgorithmInterface):
    """
    This evaluation measures how well the model performs in question answering (QA) tasks. The model is queried
    for a range of facts, and we evaluate the accuracy of its response by comparing model output to target answer
    under different metrics:

    1. Exact match (EM): Binary score, 1 if model output and target answer match exactly.
    2. Quasi-exact match: Binary score. Similar to exact match, but both model output and target answer are
       normalized first by removing any articles and punctuation.
    3. Precision over Words: The fraction of words in the prediction that are also found in the target answer.
       The text is normalized as before.
    4. Recall over Words: The fraction of words in the target answer that are also found in the prediction.
    5. F1 over Words: The harmonic mean of precision and recall, over words (normalized).
    6. [BERTScore](https://arxiv.org/pdf/1904.09675.pdf) uses a second ML model (from the BERT family) to compute
       sentence embeddings and compare their cosine similarity. This score may account for additional linguistic
       flexibility over the other QAAccuracy metrics since semantically similar sentences should be embedded
       closer to each other.

    Precision, Recall and F1 over Words are more flexible as they assign non-zero scores to
    model answers containing parts of the ground truth. Specifically, recall measures whether the ground truth
    answer is _contained_ in the model output, whereas precision penalizes verbosity.

    All metrics are reported on average over `num_records` datapoints and per category, resulting in a number
    between 0 (worst) and 1 (best) for each metric.
    """

    eval_name = EvalAlgorithm.QA_ACCURACY.value

    def __init__(self, eval_algorithm_config: QAAccuracyConfig = QAAccuracyConfig()):
        """QAAccuracy initializer.

        :param eval_algorithm_config: QA Accuracy evaluation algorithm config.
        """
        super().__init__(eval_algorithm_config)

        self.bertscore_model = BertscoreHelperModel(eval_algorithm_config.model_type_for_bertscore)

        # Saving QAAccuracyScores in the original self.transform
        self.transform = QAAccuracyScores(target_output_delimiter=eval_algorithm_config.target_output_delimiter)

        self.split_transform = SplitWithDelimiter(
            input_key=DatasetColumns.TARGET_OUTPUT.value.name,
            output_key=POSSIBLE_TARGETS,
            target_output_delimiter=eval_algorithm_config.target_output_delimiter,
        )
        self.bert_scores = BertScore(
            target_output_keys=None,
            model_output_keys=[DatasetColumns.MODEL_OUTPUT.value.name],
            output_keys=[BERT_SCORE],
            allow_duplicate_input_keys=True,
            target_output_keys_provider=POSSIBLE_TARGETS,
            bertscore_model=self.bertscore_model,
        )

        self._eval_algorithm_config = eval_algorithm_config

        self.pipeline = TransformPipeline([self.transform, self.split_transform, self.bert_scores])

    def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:
        """Compute QA accuracy metrics for a single sample.

        :param target_output: The expected/desired model output.
        :param model_output: The actual model output.
        :returns: A list of EvalScore objects, one for each of the QA accuracy metrics.
        """
        sample = {
            DatasetColumns.TARGET_OUTPUT.value.name: target_output,
            DatasetColumns.MODEL_OUTPUT.value.name: model_output,
        }
        result = self.pipeline.execute_record(sample)
        return [EvalScore(name=score_name, value=result[score_name]) for score_name in SCORE_NAMES]

    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 100,
        save: bool = False,
        save_strategy: Optional[SaveStrategy] = None,
    ) -> List[EvalOutput]:
        """Compute QA accuracy metrics on one or more datasets.

        :param model: An instance of ModelRunner representing the model under evaluation.
            If this argument is None, the `dataset_config` argument must not be None,
            and must correspond to a dataset that already contains a column with model outputs.
        :param dataset_config: Configures a single dataset or list of datasets used for the
            evaluation. If not provided, this method will run evaluations using all of its
            supported built-in datasets.
        :param prompt_template: A template used to generate prompts that are fed to the model.
            If not provided, defaults will be used. If provided, `model` must not be None.
        :param num_records: The number of records to be sampled randomly from the input dataset(s)
            used to perform the evaluation(s).
        :param save: If set to true, prompt responses and scores will be saved to a file.
        :param save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations.
            If not specified, it will save it to the path that can be configured by the EVAL_RESULTS_PATH
            environment variable. If that environment variable is also not configured, it will be saved to the
            default path `/tmp/eval_results/`.

        :return: A list of EvalOutput objects.
        """
        # Create a shared resource to be used during the evaluation.
        bertscore_shared_resource = create_shared_resource(self.bertscore_model)

        bert_scores = BertScore(
            target_output_keys=None,
            model_output_keys=[DatasetColumns.MODEL_OUTPUT.value.name],
            output_keys=[BERT_SCORE],
            allow_duplicate_input_keys=True,
            target_output_keys_provider=POSSIBLE_TARGETS,
            bertscore_model=bertscore_shared_resource,
        )

        # Create a new pipeline that uses the shared resource instead of self.bertscore_model.
        pipeline = TransformPipeline([self.transform, self.split_transform, bert_scores])

        dataset_configs = get_dataset_configs(dataset_config, self.eval_name)
        eval_outputs = []
        for dataset_config in dataset_configs:
            dataset = get_dataset(dataset_config, num_records)
            validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name])
            eval_output = evaluate_dataset(
                dataset=dataset,
                pipeline=pipeline,
                dataset_name=dataset_config.dataset_name,
                eval_name=self.eval_name,
                metric_names=SCORE_NAMES,
                eval_results_path=get_eval_results_path(),
                model=model,
                prompt_template=prompt_template,
                agg_method=MEAN,
                save=save,
                save_strategy=save_strategy,
            )
            eval_outputs.append(eval_output)
        cleanup_shared_resource(bertscore_shared_resource)
        return eval_outputs
class QAAccuracyScores(Transform):
    def __init__(
        self,
        target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
        model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
        output_keys: List[str] = QA_ACCURACY_SCORE_NAMES,
        target_output_delimiter: Optional[str] = "<OR>",
    ):
        super().__init__(target_output_key, model_output_key, output_keys, target_output_delimiter)
        self.register_input_output_keys(
            input_keys=[target_output_key, model_output_key],
            output_keys=output_keys,
        )
        self.target_output_key = target_output_key
        self.model_output_key = model_output_key
        self.output_keys = output_keys
        self.target_output_delimiter = target_output_delimiter

    def _get_score(
        self,
        target_output: str,
        model_output: str,
        score_fn: Callable[..., float],
        **fn_kwargs,
    ) -> float:
        """Compute an accuracy score from target_output and model_output.

        :param target_output: A single string potentially containing multiple
            target output values. If there are multiple target output values,
            they will be separated by `target_output_delimiter`.
            For example, if valid target outputs for a question are ["UK", "England"]
            and the delimiter is "<OR>", then `target_output` will be "UK<OR>England".
        :param model_output: The model output.
        :param score_fn: One of the functions in QA_ACCURACY_SCORES_TO_FUNCS.
        :returns: A computed QA accuracy score.
        """
        possible_targets = target_output.split(self.target_output_delimiter)
        return max([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        target_output = record[self.target_output_key]
        model_output = record[self.model_output_key]
        for output_key, score_name in zip(self.output_keys, QA_ACCURACY_SCORE_NAMES):
            record[output_key] = self._get_score(
                target_output=target_output,
                model_output=model_output,
                score_fn=QA_ACCURACY_SCORES_TO_FUNCS[score_name],
            )
        return record
A Transform represents a single operation that consumes a record and outputs another.
Typically, the output record is the same object as the input; the Transform simply mutates its input (usually by augmenting it with new data). However, the output record can also be a new object, independent of the input record.
The logic for creating the output record is implemented in the Transform's __call__ method, which takes a record as its sole argument. Any additional data besides this record that is required to perform the transformation logic should be stored as instance attributes in the Transform.
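QAAccuracyScores follows exactly this contract: it reads the target output and model output keys from the record and augments the record with one key per QA accuracy score. A minimal usage sketch (the answer strings are illustrative):

from fmeval.constants import DatasetColumns
from fmeval.eval_algorithms.qa_accuracy import QAAccuracyScores, QA_ACCURACY_SCORE_NAMES

# A record is a plain dict. The target output packs two acceptable answers
# using the default "<OR>" delimiter; the best score across the targets is kept.
record = {
    DatasetColumns.TARGET_OUTPUT.value.name: "UK<OR>England",
    DatasetColumns.MODEL_OUTPUT.value.name: "England",
}

scored = QAAccuracyScores()(record)  # mutates and returns the same record
for name in QA_ACCURACY_SCORE_NAMES:
    print(name, scored[name])  # e.g. exact_match_score is 1.0 for the matching target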
    def __init__(
        self,
        target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
        model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
        output_keys: List[str] = QA_ACCURACY_SCORE_NAMES,
        target_output_delimiter: Optional[str] = "<OR>",
    ):
        super().__init__(target_output_key, model_output_key, output_keys, target_output_delimiter)
        self.register_input_output_keys(
            input_keys=[target_output_key, model_output_key],
            output_keys=output_keys,
        )
        self.target_output_key = target_output_key
        self.model_output_key = model_output_key
        self.output_keys = output_keys
        self.target_output_delimiter = target_output_delimiter
Transform initializer.

Concrete subclasses of Transform should always call super().__init__ with every argument passed to their own __init__ method. Transform.__init__ stores all positional arguments in the args instance attribute and all keyword arguments in the kwargs instance attribute. This data is passed to Ray when Ray creates copies of this Transform instance to perform parallel execution.

Note: The input_keys and output_keys attributes are initialized to None and only assigned a meaningful value if the register_input_output_keys method is called. This method is used in conjunction with the validate_call decorator to perform validations of the __call__ inputs and outputs at runtime. While it is not strictly necessary to utilize register_input_output_keys and validate_call when implementing your own transforms, these methods are used in all built-in transforms.

Parameters
- *args: Variable length argument list.
- **kwargs: Arbitrary keyword arguments.
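To make these conventions concrete, here is a minimal sketch of a hypothetical custom transform (the name ResponseLength and its keys are invented for illustration). It mirrors the pattern used by QAAccuracyScores: forward every constructor argument to super().__init__, register the input/output keys, and decorate __call__ with validate_call.

from typing import Any, Dict

from fmeval.transforms.transform import Transform
from fmeval.transforms.util import validate_call


class ResponseLength(Transform):
    """Hypothetical transform that records the word count of the model output."""

    def __init__(self, model_output_key: str, output_key: str):
        # Forward every argument so Ray can recreate copies of this transform.
        super().__init__(model_output_key, output_key)
        self.register_input_output_keys(input_keys=[model_output_key], output_keys=[output_key])
        self.model_output_key = model_output_key
        self.output_key = output_key

    @validate_call  # checks __call__ inputs/outputs against the registered keys
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        # Augment the record with the number of whitespace-separated tokens in the model output.
        record[self.output_key] = float(len(record[self.model_output_key].split()))
        return record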
@dataclass(frozen=True)
class QAAccuracyConfig(EvalAlgorithmConfig):
    """Configures the QA Accuracy evaluation algorithm.

    :param target_output_delimiter: There can be multiple valid target outputs for a given question.
        This delimiter is used to combine all possible target outputs into a single string.
        For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the
        target output text will be "UK<OR>England".
    :param model_type_for_bertscore: BERT model type to use for computing BERT score.
    """

    target_output_delimiter: Optional[str] = "<OR>"
    model_type_for_bertscore: str = BERTSCORE_DEFAULT_MODEL

    def __post_init__(self):
        require(
            self.target_output_delimiter != "",
            "Empty target_output_delimiter is provided. "
            "Please either provide a non-empty string, or set it to None.",
        )
        require(
            BertscoreHelperModelTypes.model_is_allowed(self.model_type_for_bertscore),
            f"Invalid model_type_for_bertscore: {self.model_type_for_bertscore} requested in "
            f"QAAccuracyConfig. Please choose from acceptable values: "
            f"{BertscoreHelperModelTypes.model_list()}.",
        )
Configures the QA Accuracy evaluation algorithm.
Parameters
- target_output_delimiter: There can be multiple valid target outputs for a given question.
This delimiter is used to combine all possible target outputs into a single string.
For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the target output text will be "UK<OR>England".
- model_type_for_bertscore: BERT model type to use for computing BERT score.
class QAAccuracy(EvalAlgorithmInterface):
    """
    This evaluation measures how well the model performs in question answering (QA) tasks. The model is queried
    for a range of facts, and we evaluate the accuracy of its response by comparing model output to target answer
    under different metrics:

    1. Exact match (EM): Binary score, 1 if model output and target answer match exactly.
    2. Quasi-exact match: Binary score. Similar to exact match, but both model output and target answer are
       normalized first by removing any articles and punctuation.
    3. Precision over Words: The fraction of words in the prediction that are also found in the target answer.
       The text is normalized as before.
    4. Recall over Words: The fraction of words in the target answer that are also found in the prediction.
    5. F1 over Words: The harmonic mean of precision and recall, over words (normalized).
    6. [BERTScore](https://arxiv.org/pdf/1904.09675.pdf) uses a second ML model (from the BERT family) to compute
       sentence embeddings and compare their cosine similarity. This score may account for additional linguistic
       flexibility over the other QAAccuracy metrics since semantically similar sentences should be embedded
       closer to each other.

    Precision, Recall and F1 over Words are more flexible as they assign non-zero scores to
    model answers containing parts of the ground truth. Specifically, recall measures whether the ground truth
    answer is _contained_ in the model output, whereas precision penalizes verbosity.

    All metrics are reported on average over `num_records` datapoints and per category, resulting in a number
    between 0 (worst) and 1 (best) for each metric.
    """

    eval_name = EvalAlgorithm.QA_ACCURACY.value

    def __init__(self, eval_algorithm_config: QAAccuracyConfig = QAAccuracyConfig()):
        """QAAccuracy initializer.

        :param eval_algorithm_config: QA Accuracy evaluation algorithm config.
        """
        super().__init__(eval_algorithm_config)

        self.bertscore_model = BertscoreHelperModel(eval_algorithm_config.model_type_for_bertscore)

        # Saving QAAccuracyScores in the original self.transform
        self.transform = QAAccuracyScores(target_output_delimiter=eval_algorithm_config.target_output_delimiter)

        self.split_transform = SplitWithDelimiter(
            input_key=DatasetColumns.TARGET_OUTPUT.value.name,
            output_key=POSSIBLE_TARGETS,
            target_output_delimiter=eval_algorithm_config.target_output_delimiter,
        )
        self.bert_scores = BertScore(
            target_output_keys=None,
            model_output_keys=[DatasetColumns.MODEL_OUTPUT.value.name],
            output_keys=[BERT_SCORE],
            allow_duplicate_input_keys=True,
            target_output_keys_provider=POSSIBLE_TARGETS,
            bertscore_model=self.bertscore_model,
        )

        self._eval_algorithm_config = eval_algorithm_config

        self.pipeline = TransformPipeline([self.transform, self.split_transform, self.bert_scores])

    def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:
        """Compute QA accuracy metrics for a single sample.

        :param target_output: The expected/desired model output.
        :param model_output: The actual model output.
        :returns: A list of EvalScore objects, one for each of the QA accuracy metrics.
        """
        sample = {
            DatasetColumns.TARGET_OUTPUT.value.name: target_output,
            DatasetColumns.MODEL_OUTPUT.value.name: model_output,
        }
        result = self.pipeline.execute_record(sample)
        return [EvalScore(name=score_name, value=result[score_name]) for score_name in SCORE_NAMES]

    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 100,
        save: bool = False,
        save_strategy: Optional[SaveStrategy] = None,
    ) -> List[EvalOutput]:
        """Compute QA accuracy metrics on one or more datasets.

        :param model: An instance of ModelRunner representing the model under evaluation.
            If this argument is None, the `dataset_config` argument must not be None,
            and must correspond to a dataset that already contains a column with model outputs.
        :param dataset_config: Configures a single dataset or list of datasets used for the
            evaluation. If not provided, this method will run evaluations using all of its
            supported built-in datasets.
        :param prompt_template: A template used to generate prompts that are fed to the model.
            If not provided, defaults will be used. If provided, `model` must not be None.
        :param num_records: The number of records to be sampled randomly from the input dataset(s)
            used to perform the evaluation(s).
        :param save: If set to true, prompt responses and scores will be saved to a file.
        :param save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations.
            If not specified, it will save it to the path that can be configured by the EVAL_RESULTS_PATH
            environment variable. If that environment variable is also not configured, it will be saved to the
            default path `/tmp/eval_results/`.

        :return: A list of EvalOutput objects.
        """
        # Create a shared resource to be used during the evaluation.
        bertscore_shared_resource = create_shared_resource(self.bertscore_model)

        bert_scores = BertScore(
            target_output_keys=None,
            model_output_keys=[DatasetColumns.MODEL_OUTPUT.value.name],
            output_keys=[BERT_SCORE],
            allow_duplicate_input_keys=True,
            target_output_keys_provider=POSSIBLE_TARGETS,
            bertscore_model=bertscore_shared_resource,
        )

        # Create a new pipeline that uses the shared resource instead of self.bertscore_model.
        pipeline = TransformPipeline([self.transform, self.split_transform, bert_scores])

        dataset_configs = get_dataset_configs(dataset_config, self.eval_name)
        eval_outputs = []
        for dataset_config in dataset_configs:
            dataset = get_dataset(dataset_config, num_records)
            validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name])
            eval_output = evaluate_dataset(
                dataset=dataset,
                pipeline=pipeline,
                dataset_name=dataset_config.dataset_name,
                eval_name=self.eval_name,
                metric_names=SCORE_NAMES,
                eval_results_path=get_eval_results_path(),
                model=model,
                prompt_template=prompt_template,
                agg_method=MEAN,
                save=save,
                save_strategy=save_strategy,
            )
            eval_outputs.append(eval_output)
        cleanup_shared_resource(bertscore_shared_resource)
        return eval_outputs
This evaluation measures how well the model performs in question answering (QA) tasks. The model is queried for a range of facts, and we evaluate the accuracy of its response by comparing model output to target answer under different metrics:
- Exact match (EM): Binary score, 1 if model output and target answer match exactly.
- Quasi-exact match: Binary score. Similar to exact match, but both model output and target answer are normalized first by removing any articles and punctuation.
- Precision over Words: The fraction of words in the prediction that are also found in the target answer. The text is normalized as before.
- Recall over Words: The fraction of words in the target answer that are also found in the prediction.
- F1 over Words: The harmonic mean of precision and recall, over words (normalized).
- BERTScore uses a second ML model (from the BERT family) to compute sentence embeddings and compare their cosine similarity. This score may account for additional linguistic flexibility over the other QAAccuracy metrics since semantically similar sentences should be embedded closer to each other.
Precision, Recall and F1 over Words are more flexible as they assign non-zero scores to model answers containing parts of the ground truth. Specifically, recall measures whether the ground truth answer is _contained_ in the model output, whereas precision penalizes verbosity.
All metrics are reported on average over num_records datapoints and per category, resulting in a number between 0 (worst) and 1 (best) for each metric.
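As a worked example of the word-overlap metrics: if the target answer is "Paris" and the model outputs "Paris France", precision over words is 1/2, recall is 1/1 = 1.0, and F1 is 2 * 0.5 * 1.0 / (0.5 + 1.0) ≈ 0.67. The plain-Python sketch below restates that set-based logic for illustration only; the library additionally applies QuAC-style normalization (lowercasing, removing articles and punctuation) before comparing words.

def word_overlap_scores(model_output: str, target_output: str) -> dict:
    """Toy restatement of precision/recall/F1 over whitespace-split, lowercased words."""
    pred, ref = set(model_output.lower().split()), set(target_output.lower().split())
    overlap = pred & ref
    precision = len(overlap) / len(pred) if pred else 0.0
    recall = len(overlap) / len(ref) if ref else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return {"precision_over_words": precision, "recall_over_words": recall, "f1_score": f1}

print(word_overlap_scores("Paris France", "Paris"))
# {'precision_over_words': 0.5, 'recall_over_words': 1.0, 'f1_score': 0.666...}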
    def __init__(self, eval_algorithm_config: QAAccuracyConfig = QAAccuracyConfig()):
        """QAAccuracy initializer.

        :param eval_algorithm_config: QA Accuracy evaluation algorithm config.
        """
        super().__init__(eval_algorithm_config)

        self.bertscore_model = BertscoreHelperModel(eval_algorithm_config.model_type_for_bertscore)

        # Saving QAAccuracyScores in the original self.transform
        self.transform = QAAccuracyScores(target_output_delimiter=eval_algorithm_config.target_output_delimiter)

        self.split_transform = SplitWithDelimiter(
            input_key=DatasetColumns.TARGET_OUTPUT.value.name,
            output_key=POSSIBLE_TARGETS,
            target_output_delimiter=eval_algorithm_config.target_output_delimiter,
        )
        self.bert_scores = BertScore(
            target_output_keys=None,
            model_output_keys=[DatasetColumns.MODEL_OUTPUT.value.name],
            output_keys=[BERT_SCORE],
            allow_duplicate_input_keys=True,
            target_output_keys_provider=POSSIBLE_TARGETS,
            bertscore_model=self.bertscore_model,
        )

        self._eval_algorithm_config = eval_algorithm_config

        self.pipeline = TransformPipeline([self.transform, self.split_transform, self.bert_scores])
QAAccuracy initializer.
Parameters
- eval_algorithm_config: QA Accuracy evaluation algorithm config.
    def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:
        """Compute QA accuracy metrics for a single sample.

        :param target_output: The expected/desired model output.
        :param model_output: The actual model output.
        :returns: A list of EvalScore objects, one for each of the QA accuracy metrics.
        """
        sample = {
            DatasetColumns.TARGET_OUTPUT.value.name: target_output,
            DatasetColumns.MODEL_OUTPUT.value.name: model_output,
        }
        result = self.pipeline.execute_record(sample)
        return [EvalScore(name=score_name, value=result[score_name]) for score_name in SCORE_NAMES]
Compute QA accuracy metrics for a single sample.
Parameters
- target_output: The expected/desired model output.
- model_output: The actual model output.

Returns
A list of EvalScore objects, one for each of the QA accuracy metrics.
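A minimal single-sample sketch (answer strings are illustrative; constructing QAAccuracy loads the BERTScore helper model, so the first call can take a while):

from fmeval.eval_algorithms.qa_accuracy import QAAccuracy

eval_algo = QAAccuracy()
scores = eval_algo.evaluate_sample(
    target_output="UK<OR>England",  # two acceptable answers joined by the default "<OR>" delimiter
    model_output="England",
)
for score in scores:
    print(score.name, score.value)  # one EvalScore per metric, including the BERT score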
    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 100,
        save: bool = False,
        save_strategy: Optional[SaveStrategy] = None,
    ) -> List[EvalOutput]:
        """Compute QA accuracy metrics on one or more datasets.

        :param model: An instance of ModelRunner representing the model under evaluation.
            If this argument is None, the `dataset_config` argument must not be None,
            and must correspond to a dataset that already contains a column with model outputs.
        :param dataset_config: Configures a single dataset or list of datasets used for the
            evaluation. If not provided, this method will run evaluations using all of its
            supported built-in datasets.
        :param prompt_template: A template used to generate prompts that are fed to the model.
            If not provided, defaults will be used. If provided, `model` must not be None.
        :param num_records: The number of records to be sampled randomly from the input dataset(s)
            used to perform the evaluation(s).
        :param save: If set to true, prompt responses and scores will be saved to a file.
        :param save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations.
            If not specified, it will save it to the path that can be configured by the EVAL_RESULTS_PATH
            environment variable. If that environment variable is also not configured, it will be saved to the
            default path `/tmp/eval_results/`.

        :return: A list of EvalOutput objects.
        """
        # Create a shared resource to be used during the evaluation.
        bertscore_shared_resource = create_shared_resource(self.bertscore_model)

        bert_scores = BertScore(
            target_output_keys=None,
            model_output_keys=[DatasetColumns.MODEL_OUTPUT.value.name],
            output_keys=[BERT_SCORE],
            allow_duplicate_input_keys=True,
            target_output_keys_provider=POSSIBLE_TARGETS,
            bertscore_model=bertscore_shared_resource,
        )

        # Create a new pipeline that uses the shared resource instead of self.bertscore_model.
        pipeline = TransformPipeline([self.transform, self.split_transform, bert_scores])

        dataset_configs = get_dataset_configs(dataset_config, self.eval_name)
        eval_outputs = []
        for dataset_config in dataset_configs:
            dataset = get_dataset(dataset_config, num_records)
            validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name])
            eval_output = evaluate_dataset(
                dataset=dataset,
                pipeline=pipeline,
                dataset_name=dataset_config.dataset_name,
                eval_name=self.eval_name,
                metric_names=SCORE_NAMES,
                eval_results_path=get_eval_results_path(),
                model=model,
                prompt_template=prompt_template,
                agg_method=MEAN,
                save=save,
                save_strategy=save_strategy,
            )
            eval_outputs.append(eval_output)
        cleanup_shared_resource(bertscore_shared_resource)
        return eval_outputs
Compute QA accuracy metrics on one or more datasets.
Parameters
- model: An instance of ModelRunner representing the model under evaluation. If this argument is None, the dataset_config argument must not be None and must correspond to a dataset that already contains a column with model outputs.
- dataset_config: Configures a single dataset or list of datasets used for the evaluation. If not provided, this method will run evaluations using all of its supported built-in datasets.
- prompt_template: A template used to generate prompts that are fed to the model. If not provided, defaults will be used. If provided, model must not be None.
- num_records: The number of records to be sampled randomly from the input dataset(s) used to perform the evaluation(s).
- save: If set to true, prompt responses and scores will be saved to a file.
- save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations. If not specified, results are saved to the path configured by the EVAL_RESULTS_PATH environment variable; if that variable is also not set, they are saved to the default path /tmp/eval_results/.
Returns
A list of EvalOutput objects.
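Putting it together, a dataset-level sketch might look as follows. The DataConfig field names used here (dataset_uri, dataset_mime_type and the *_location fields) are assumptions not documented in this module, so check fmeval.data_loaders.data_config.DataConfig for the exact signature; the example also assumes the dataset already contains a model output column, which is why model is left as None.

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.qa_accuracy import QAAccuracy, QAAccuracyConfig

# Hypothetical JSON Lines dataset that already includes model outputs.
config = DataConfig(
    dataset_name="my_qa_dataset",
    dataset_uri="s3://my-bucket/qa_dataset.jsonl",
    dataset_mime_type="application/jsonlines",
    model_input_location="question",
    target_output_location="answer",
    model_output_location="model_answer",
)

eval_algo = QAAccuracy(QAAccuracyConfig(target_output_delimiter="<OR>"))
eval_outputs = eval_algo.evaluate(
    model=None,            # the dataset already has model outputs
    dataset_config=config,
    num_records=100,
    save=True,             # write per-record scores to the results path
)
print(eval_outputs[0])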