fmeval.eval_algorithms.factual_knowledge
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union, Callable

from fmeval.constants import (
    DatasetColumns,
    MEAN,
)
from fmeval.data_loaders.util import get_dataset
from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.common import evaluate_dataset
from fmeval.eval_algorithms.save_strategy import SaveStrategy
from fmeval.eval_algorithms.util import get_dataset_configs, normalize_text_quac_protocol
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmInterface, EvalAlgorithmConfig
from fmeval.eval_algorithms import (
    EvalAlgorithm,
    EvalOutput,
    EvalScore,
)
from fmeval.eval_algorithms.util import validate_dataset
from fmeval.exceptions import EvalAlgorithmClientError
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.transforms.transform import Transform
from fmeval.transforms.transform_pipeline import TransformPipeline
from fmeval.transforms.util import validate_call
from fmeval.util import get_eval_results_path

FACTUAL_KNOWLEDGE = EvalAlgorithm.FACTUAL_KNOWLEDGE.value
FACTUAL_KNOWLEDGE_QUASI_EXACT = "factual_knowledge_quasi_exact"
SCORE_NAMES = [FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT]

logger = logging.getLogger(__name__)


def _exact_inclusion_score(model_output: str, target_output: str) -> float:
    """
    Given the model output and the target output, _exact_inclusion_score checks whether the target output
    is contained in the model output after converting both strings to lowercase. If so, the function returns
    1.0. Otherwise, it returns 0.

    :param model_output: The output of a model that we want to evaluate.
    :param target_output: The reference or the "ground truth" output.
    :returns: The exact_inclusion score.
    """
    model_output_lower_case = model_output.lower()
    return float(target_output.lower() in model_output_lower_case)


def _quasi_exact_inclusion_score(model_output: str, target_output: str) -> float:
    """
    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144
    Computes whether the target_output is contained in the model_output after normalizing both strings. If so,
    the function returns 1.0. Otherwise, it returns 0.

    Normalization: Given a text, normalize it using the SQuAD/QuAC protocol (remove punctuation, excess spaces,
    and articles) and return the lowercased tokens.
    SQuAD (https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) and
    QuAC (https://s3.amazonaws.com/my89public/quac/scorer.py) use this protocol to normalize text before
    evaluating it. You can learn more in fmeval/src/fmeval/eval_algorithms/util.py.

    :param model_output: The output of a model that we want to evaluate.
    :param target_output: The reference or the "ground truth" output.
    :returns: The quasi_exact_inclusion score (1 if the target_output is contained in the model_output
        after normalization, else 0).
    """
    return float(
        normalize_text_quac_protocol(target_output.strip()) in normalize_text_quac_protocol(model_output.strip())
    )


FACTUAL_KNOWLEDGE_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
    FACTUAL_KNOWLEDGE: _exact_inclusion_score,
    FACTUAL_KNOWLEDGE_QUASI_EXACT: _quasi_exact_inclusion_score,
}


class FactualKnowledgeScores(Transform):
    """This transform augments its input record with the computed factual knowledge scores.

    See the docstring for `FactualKnowledge` for more details regarding the scores themselves.
    """

    def __init__(
        self,
        target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
        model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
        output_keys: List[str] = SCORE_NAMES,
        target_output_delimiter: Optional[str] = "<OR>",
        logical_operator: str = "OR",
    ):
        """FactualKnowledgeScores initializer.

        :param target_output_key: The record key corresponding to the target output.
        :param model_output_key: The record key corresponding to the model output.
        :param output_keys: The keys corresponding to the factual knowledge scores that
            will be added to the input record.
        :param target_output_delimiter: This delimiter is used to combine all possible target outputs into
            a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>",
            then the target output text will be "UK<OR>England". This can be useful to account for multiple
            valid target outputs or to ensure that multiple target outputs are contained in the model output
            (which can be configured using the logical_operator).
        :param logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical
            operator is "OR" (the default behavior), at least one of the possible target outputs (separated by
            the target_output_delimiter) must be contained in the model output for the answer to be correct.
            When the logical operator is "AND", ALL possible target outputs (separated by the
            target_output_delimiter) must be contained in the model output in order for the answer to be correct.
        """
        super().__init__(target_output_key, model_output_key, output_keys, target_output_delimiter, logical_operator)
        self.register_input_output_keys(
            input_keys=[target_output_key, model_output_key],
            output_keys=output_keys,
        )
        self.target_output_key = target_output_key
        self.model_output_key = model_output_key
        self.output_keys = output_keys
        self.target_output_delimiter = target_output_delimiter
        self.logical_operator = logical_operator

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed factual knowledge scores.

        :param record: The input record.
        :returns: The input record, with the factual knowledge scores added in.
        """
        target_output = record[self.target_output_key]
        model_output = record[self.model_output_key]
        for output_key, score_name in zip(self.output_keys, SCORE_NAMES):
            record[output_key] = self._get_score(
                target_output=target_output,
                model_output=model_output,
                score_fn=FACTUAL_KNOWLEDGE_SCORES_TO_FUNCS[score_name],
            )
        return record

    def _get_score(
        self,
        target_output: str,
        model_output: str,
        score_fn: Callable[..., float],
        **fn_kwargs,
    ) -> float:
        """Compute a factual knowledge score for a target output and model output pair based
        on the score function.

        :param target_output: Target output.
        :param model_output: Model output.
        :param score_fn: One of the functions in FACTUAL_KNOWLEDGE_SCORES_TO_FUNCS.
        :returns: A computed factual knowledge score (0 or 1). See the docstring for
            `FactualKnowledge` for more details on what these numerical values represent.
        """
        possible_targets = target_output.split(self.target_output_delimiter)
        if self.logical_operator == "OR":
            return max([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])
        else:  # self.logical_operator is "AND"
            # checks that every target is in model_output, otherwise returns 0.0
            return min([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])


@dataclass(frozen=True)
class FactualKnowledgeConfig(EvalAlgorithmConfig):
    """Configures the factual knowledge evaluation algorithm.

    :param target_output_delimiter: This delimiter is used to combine all possible target outputs into
        a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>",
        then the target output text will be "UK<OR>England". This can be useful to account for multiple
        valid target outputs or to ensure that multiple target outputs are contained in the model output
        (which can be configured using the logical_operator).
    :param logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical
        operator is "OR" (the default behavior), at least one of the possible target outputs (separated by
        the target_output_delimiter) must be contained in the model output for the answer to be correct.
        When the logical operator is "AND", ALL possible target outputs (separated by the
        target_output_delimiter) must be contained in the model output in order for the answer to be correct.
    """

    target_output_delimiter: Optional[str] = "<OR>"
    logical_operator: str = "OR"

    def __post_init__(self):
        if self.target_output_delimiter == "":
            raise EvalAlgorithmClientError(
                "Empty target_output_delimiter is provided. Please either provide a non-empty string, "
                "or set it to None."
            )
        if self.logical_operator not in ["OR", "AND"]:
            raise EvalAlgorithmClientError(
                'Invalid logical_operator is provided. The only valid inputs are strings "OR" and "AND".'
            )
        if self.target_output_delimiter in ["<OR>", "<AND>"] and self.target_output_delimiter != "<{}>".format(
            self.logical_operator
        ):
            logger.warning(
                f"The target_output_delimiter `{self.target_output_delimiter}` and logical_operator"
                f" `{self.logical_operator}` are not consistent."
            )


class FactualKnowledge(EvalAlgorithmInterface):
    """
    This evaluation measures the ability of language models to reproduce facts about the real world and was proposed
    by [Petroni et al.](https://arxiv.org/pdf/1909.01066.pdf). The evaluation queries the model with prompts like
    'Berlin is the capital of' and 'Tata Motors is a subsidiary of' and compares the model generation with one or more
    target answers. The prompts are divided into different knowledge categories like capitals, subsidiaries, etc.

    This evaluation outputs two binary metrics.
    The first is the "exact_inclusion" score: the metric value is 1 if the lower-cased expected answer is
    contained anywhere within the lower-cased model response. For instance, consider the prompt
    'Berlin is the capital of' with the expected answer 'Germany'.
    If the model generation is 'Germany, and is also its most populous city', then the metric evaluates to 1.

    The second metric is the "quasi_exact_inclusion" score: the metric value is 1 if the target output is contained
    in the model output after both strings are normalized.
    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144

    If there is more than one correct target answer, the `logical_operator` can be set to "OR" (default) and
    answers are separated by the `target_output_delimiter`, both of which are configured inside the
    `FactualKnowledgeConfig`. The `target_output_delimiter` defaults to `<OR>`, i.e., the target answer in this
    example could be Germany<OR>Berlin (since Berlin is its own federal state).

    If there are multiple correct target answers that must be included in the model output,
    the `logical_operator` can be set to "AND". For example, consider the prompt 'What are the three primary colors?'.
    The target answer would be "Red<AND>Yellow<AND>Blue" (note that the target_output_delimiter could be anything,
    but it is "<AND>" here for the sake of consistency with the logical_operator value). Red, yellow, and blue must
    all be contained in the model generation for the answer to be correct under this configuration.
    """

    eval_name = EvalAlgorithm.FACTUAL_KNOWLEDGE.value

    def __init__(self, eval_algorithm_config: FactualKnowledgeConfig = FactualKnowledgeConfig()):
        """FactualKnowledge initializer.

        :param eval_algorithm_config: Factual knowledge evaluation algorithm config.
        """
        super().__init__(eval_algorithm_config)
        self.pipeline = TransformPipeline(
            [
                FactualKnowledgeScores(
                    target_output_delimiter=eval_algorithm_config.target_output_delimiter,
                    logical_operator=eval_algorithm_config.logical_operator,
                )
            ]
        )

    def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:  # type: ignore[override]
        """Computes the factual knowledge metrics for a single sample.

        :param target_output: The expected responses from the model.
        :param model_output: The output of the model being evaluated.
        :return: A list of EvalScore objects, one for each of the Factual Knowledge metrics
            ("exact_inclusion" and "quasi_exact_inclusion").
        """
        sample = {
            DatasetColumns.TARGET_OUTPUT.value.name: target_output,
            DatasetColumns.MODEL_OUTPUT.value.name: model_output,
        }
        result = self.pipeline.execute_record(sample)
        return [EvalScore(name=score_name, value=result[score_name]) for score_name in SCORE_NAMES]

    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 300,
        save: bool = False,
        save_strategy: Optional[SaveStrategy] = None,
    ) -> List[EvalOutput]:
        """Compute the factual knowledge scores on one or more datasets.

        :param model: An instance of ModelRunner representing the model under evaluation.
            If this argument is None, the `dataset_config` argument must not be None,
            and must correspond to a dataset that already contains a column with model outputs.
        :param dataset_config: Configures a single dataset or list of datasets used for the
            evaluation. If not provided, this method will run evaluations using all of its
            supported built-in datasets.
        :param prompt_template: A template used to generate prompts that are fed to the model.
            If not provided, defaults will be used. If provided, `model` must not be None.
        :param num_records: The number of records to be sampled randomly from the input dataset(s)
            used to perform the evaluation(s). Note that the default value is 300, rather than
            100 as it is for the rest of the built-in algorithms. This is because there are
            15 categories for factual knowledge, and if only 100 samples are used, there will
            be categories with very few samples.
        :param save: If set to true, prompt responses and scores will be saved to a file.
        :param save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations.
            If not specified, the outputs will be saved to the path configured by the EVAL_RESULTS_PATH
            environment variable. If that environment variable is also not configured, they will be saved
            to the default path `/tmp/eval_results/`.

        :return: A list of EvalOutput objects.
        """
        dataset_configs = get_dataset_configs(dataset_config, self.eval_name)
        eval_outputs = []
        for dataset_config in dataset_configs:
            dataset = get_dataset(dataset_config, num_records)
            validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name])
            eval_output = evaluate_dataset(
                dataset=dataset,
                pipeline=self.pipeline,
                dataset_name=dataset_config.dataset_name,
                eval_name=self.eval_name,
                metric_names=SCORE_NAMES,
                eval_results_path=get_eval_results_path(),
                model=model,
                prompt_template=prompt_template,
                agg_method=MEAN,
                save=save,
                save_strategy=save_strategy,
            )
            eval_outputs.append(eval_output)
        return eval_outputs
class FactualKnowledgeScores(Transform):
This transform augments its input record with the computed factual knowledge scores.
See the docstring for FactualKnowledge for more details regarding the scores themselves.
def __init__(
    self,
    target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
    model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
    output_keys: List[str] = SCORE_NAMES,
    target_output_delimiter: Optional[str] = "<OR>",
    logical_operator: str = "OR",
):
FactualKnowledgeScores initializer.
Parameters
- target_output_key: The record key corresponding to the target output.
- model_output_key: The record key corresponding to the model output.
- output_keys: The keys corresponding to the factual knowledge scores that will be added to the input record.
- target_output_delimiter: This delimiter is used to combine all possible target outputs into a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the target output text will be "UK<OR>England". This can be useful to account for multiple valid target outputs or to ensure that multiple target outputs are contained in the model output (which can be configured using the logical_operator).
- logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical operator is "OR" (the default behavior), at least one of the possible target outputs (separated by the target_output_delimiter) must be contained in the model output for the answer to be correct. When the logical operator is "AND", ALL possible target outputs (separated by the target_output_delimiter) must be contained in the model output in order for the answer to be correct.
@dataclass(frozen=True)
class FactualKnowledgeConfig(EvalAlgorithmConfig):
Configures the factual knowledge evaluation algorithm.
Parameters
- target_output_delimiter: This delimiter is used to combine all possible target outputs into a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the target output text will be "UK<OR>England". This can be useful to account for multiple valid target outputs or to ensure that multiple target outputs are contained in the model output (which can be configured using the logical_operator).
- logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical operator is "OR" (the default behavior), at least one of the possible target outputs (separated by the target_output_delimiter) must be contained in the model output for the answer to be correct. When the logical operator is "AND", ALL possible target outputs (separated by the target_output_delimiter) must be contained in the model output in order for the answer to be correct.
class FactualKnowledge(EvalAlgorithmInterface):
This evaluation measures the ability of language models to reproduce facts about the real world and was proposed by Petroni et al. (https://arxiv.org/pdf/1909.01066.pdf). The evaluation queries the model with prompts like 'Berlin is the capital of' and 'Tata Motors is a subsidiary of' and compares the model generation with one or more target answers. The prompts are divided into different knowledge categories like capitals, subsidiaries, etc.
This evaluation outputs two binary metrics. The first is the "exact_inclusion" score: the metric value is 1 if the lower-cased expected answer is contained anywhere within the lower-cased model response. For instance, consider the prompt 'Berlin is the capital of' with the expected answer 'Germany'. If the model generation is 'Germany, and is also its most populous city', then the metric evaluates to 1.
The second metric is the "quasi_exact_inclusion" score: the metric value is 1 if the target output is contained in the model output after both strings are normalized. Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144
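To make the difference between the two metrics concrete, here is a small sketch using the module's internal scoring helpers (shown in the source above), purely for illustration. The expected values assume the QuAC/SQuAD normalization strips punctuation and articles as described.

from fmeval.eval_algorithms.factual_knowledge import (
    _exact_inclusion_score,
    _quasi_exact_inclusion_score,
)

model_output = "The U.S."
target_output = "US"

# Lower-cased substring check: "us" is not contained in "the u.s.", so this is 0.0.
print(_exact_inclusion_score(model_output, target_output))

# After normalization (lowercase, remove punctuation, articles, and extra spaces),
# the model output becomes "us", which contains the target, so this is 1.0.
print(_quasi_exact_inclusion_score(model_output, target_output))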
If there is more than one correct target answer, the logical_operator can be set to "OR" (default) and answers are separated by the target_output_delimiter, both of which are configured inside the FactualKnowledgeConfig. The target_output_delimiter defaults to "<OR>", i.e., the target answer in this example could be "Germany<OR>Berlin" (since Berlin is its own federal state).

If there are multiple correct target answers that must be included in the model output, the logical_operator can be set to "AND". For example, consider the prompt 'What are the three primary colors?'. The target answer would be "Red<AND>Yellow<AND>Blue" (note that the target_output_delimiter could be anything, but it is "<AND>" here for the sake of consistency with the logical_operator value). Red, yellow, and blue must all be contained in the model generation for the answer to be correct under this configuration.
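Here is a hedged sketch of both configurations, reusing the examples from this description; the model outputs are invented for illustration.

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

# Default configuration: "OR" operator with the "<OR>" delimiter.
fk_or = FactualKnowledge()
print(fk_or.evaluate_sample(
    target_output="Germany<OR>Berlin",
    model_output="Germany, and is also its most populous city",
))  # both scores are expected to be 1, since "Germany" is contained in the output

# "AND" configuration: every delimited target must appear in the model output.
fk_and = FactualKnowledge(
    FactualKnowledgeConfig(target_output_delimiter="<AND>", logical_operator="AND")
)
print(fk_and.evaluate_sample(
    target_output="Red<AND>Yellow<AND>Blue",
    model_output="The primary colors are red, yellow and blue.",
))  # expected to be 1; drop "blue" from the output and both scores fall to 0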
def __init__(self, eval_algorithm_config: FactualKnowledgeConfig = FactualKnowledgeConfig()):
FactualKnowledge initializer.
Parameters
- eval_algorithm_config: Factual knowledge evaluation algorithm config.
def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:
Computes the factual knowledge metrics for a single sample.
Parameters
- target_output: The expected responses from the model.
- model_output: The output of the model being evaluated.
Returns
A list of EvalScore objects, one for each of the Factual Knowledge metrics ("exact_inclusion" and "quasi_exact_inclusion").
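For example, a minimal sketch of inspecting the returned scores (the sample strings are invented):

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge

scores = FactualKnowledge().evaluate_sample(
    target_output="Germany",
    model_output="Berlin is the capital of Germany.",
)
# Each entry is an EvalScore with a `name` and a `value` attribute.
for score in scores:
    print(f"{score.name}: {score.value}")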
def evaluate(
    self,
    model: Optional[ModelRunner] = None,
    dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
    prompt_template: Optional[str] = None,
    num_records: int = 300,
    save: bool = False,
    save_strategy: Optional[SaveStrategy] = None,
) -> List[EvalOutput]:
Compute the factual knowledge scores on one or more datasets.
Parameters
- model: An instance of ModelRunner representing the model under evaluation. If this argument is None, the dataset_config argument must not be None and must correspond to a dataset that already contains a column with model outputs.
- dataset_config: Configures a single dataset or a list of datasets used for the evaluation. If not provided, this method will run evaluations using all of its supported built-in datasets.
- prompt_template: A template used to generate prompts that are fed to the model. If not provided, defaults will be used. If provided, model must not be None.
- num_records: The number of records to be sampled randomly from the input dataset(s) used to perform the evaluation(s). Note that the default value is 300 rather than 100 (the default for the rest of the built-in algorithms), because there are 15 categories for factual knowledge, and with only 100 samples some categories would contain very few samples.
- save: If set to true, prompt responses and scores will be saved to a file.
- save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations. If not specified, the outputs will be saved to the path configured by the EVAL_RESULTS_PATH environment variable. If that environment variable is also not configured, they will be saved to the default path /tmp/eval_results/.
Returns
A list of EvalOutput objects.
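As an illustrative sketch (not taken from the fmeval docs), evaluate can be run on a pre-generated dataset that already contains model outputs, in which case no ModelRunner is needed. The dataset URI, JMESPath field names, and the result attributes printed at the end are assumptions made for this example.

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge

# Hypothetical JSON Lines dataset; each line already contains the model's answer.
config = DataConfig(
    dataset_name="trex_sample_with_responses",
    dataset_uri="s3://my-bucket/trex_sample_with_responses.jsonl",
    dataset_mime_type="application/jsonlines",
    model_input_location="question",
    target_output_location="answers",
    model_output_location="model_response",
)

eval_outputs = FactualKnowledge().evaluate(
    dataset_config=config,  # model=None is allowed because model outputs are already present
    num_records=300,
    save=True,
)
for output in eval_outputs:
    print(output.dataset_name, output.dataset_scores)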