fmeval.eval_algorithms.factual_knowledge

  1import logging
  2from dataclasses import dataclass
  3from typing import Any, Dict, List, Optional, Union, Callable
  4
  5from fmeval.constants import (
  6    DatasetColumns,
  7    MEAN,
  8)
  9from fmeval.data_loaders.util import get_dataset
 10from fmeval.data_loaders.data_config import DataConfig
 11from fmeval.eval_algorithms.common import evaluate_dataset
 12from fmeval.eval_algorithms.save_strategy import SaveStrategy
 13from fmeval.eval_algorithms.util import get_dataset_configs, normalize_text_quac_protocol
 14from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmInterface, EvalAlgorithmConfig
 15from fmeval.eval_algorithms import (
 16    EvalAlgorithm,
 17    EvalOutput,
 18    EvalScore,
 19)
 20from fmeval.eval_algorithms.util import validate_dataset
 21from fmeval.exceptions import EvalAlgorithmClientError
 22from fmeval.model_runners.model_runner import ModelRunner
 23from fmeval.transforms.transform import Transform
 24from fmeval.transforms.transform_pipeline import TransformPipeline
 25from fmeval.transforms.util import validate_call
 26from fmeval.util import get_eval_results_path
 27
 28FACTUAL_KNOWLEDGE = EvalAlgorithm.FACTUAL_KNOWLEDGE.value
 29FACTUAL_KNOWLEDGE_QUASI_EXACT = "factual_knowledge_quasi_exact"
 30SCORE_NAMES = [FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT]
 31
 32logger = logging.getLogger(__name__)
 33
 34
 35def _exact_inclusion_score(model_output: str, target_output: str) -> float:
 36    """
 37    Given the model output and the target output, _exact_inclusion_score checks whether the target output
 38    is contained in the model output after converting both strings to lowercase. If so, the function returns
  39    1.0. Otherwise, it returns 0.0.
 40
 41    :param model_output: The output of a model that we want to evaluate.
 42    :param target_output: The reference or the "ground truth" output.
 43    :returns: The exact_inclusion score.
 44    """
 45    model_output_lower_case = model_output.lower()
 46    return float(target_output.lower() in model_output_lower_case)
 47
 48
 49def _quasi_exact_inclusion_score(model_output: str, target_output: str) -> float:
 50    """
 51    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144
 52    Computes if the target_output is contained in the model_output after normalizing both strings. If so, the
  53    function returns 1.0. Otherwise, it returns 0.0.
 54
 55    Normalization: Given a text, normalize it using the SQUAD/QUAC protocol (remove punctuations, excess spaces,
 56    and articles) and return the lowercased tokens.
 57    SQUAD (https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) and
 58    QuAC benchmarks (https://s3.amazonaws.com/my89public/quac/scorer.py) use this protocol to normalize text before
  59    evaluating it. See fmeval/src/fmeval/eval_algorithms/util.py for the normalization implementation.
 60
 61    :param model_output: The output of a model that we want to evaluate.
 62    :param target_output: The reference or the "ground truth" output.
 63    :returns: The quasi_exact_inclusion score (1 if the target_output is contained in model_output
 64    after normalization, else 0).
 65    """
 66    return float(
 67        normalize_text_quac_protocol(target_output.strip()) in normalize_text_quac_protocol(model_output.strip())
 68    )
 69
 70
 71FACTUAL_KNOWLEDGE_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
 72    FACTUAL_KNOWLEDGE: _exact_inclusion_score,
 73    FACTUAL_KNOWLEDGE_QUASI_EXACT: _quasi_exact_inclusion_score,
 74}
 75
 76
 77class FactualKnowledgeScores(Transform):
 78    """This transform augments its input record with the computed factual knowledge scores.
 79
 80    See the docstring for `FactualKnowledge` for more details regarding the score itself.
 81    """
 82
 83    def __init__(
 84        self,
 85        target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
 86        model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
 87        output_keys: List[str] = SCORE_NAMES,
 88        target_output_delimiter: Optional[str] = "<OR>",
 89        logical_operator: str = "OR",
 90    ):
 91        """FactualKnowledgeScores initializer.
 92
 93        :param target_output_key: The record key corresponding to the target output.
 94        :param model_output_key: The record key corresponding to the model output.
  95        :param output_keys: The keys corresponding to the factual knowledge scores that
  96            will be added to the input record.
 97        :param target_output_delimiter: This delimiter is used to combine all possible target outputs into
 98            a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>",
 99            then the target output text will be "UK<OR>England". This can be useful to account for multiple
100            valid target outputs or to ensure that multiple target outputs are contained in the model output
101            (which can be configured using the logical_operator).
102        :param logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical operator
103            is "OR" (the default behavior), at least one of the possible target outputs (separated by the
104            target_output_delimiter) must be contained in the model output for the answer to be correct. When the logical
105            operator is "AND", ALL possible target outputs (separated by the target_output_delimiter) must be contained in
106            the model output in order for the answer to be correct.
107        """
108        super().__init__(target_output_key, model_output_key, output_keys, target_output_delimiter, logical_operator)
109        self.register_input_output_keys(
110            input_keys=[target_output_key, model_output_key],
111            output_keys=output_keys,
112        )
113        self.target_output_key = target_output_key
114        self.model_output_key = model_output_key
115        self.output_keys = output_keys
116        self.target_output_delimiter = target_output_delimiter
117        self.logical_operator = logical_operator
118
119    @validate_call
120    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
121        """Augment the input record with the computed factual knowledge scores.
122
123        :param record: The input record.
124        :returns: The input record, with the factual knowledge scores added in.
125        """
126        target_output = record[self.target_output_key]
127        model_output = record[self.model_output_key]
128        for output_key, score_name in zip(self.output_keys, SCORE_NAMES):
129            record[output_key] = self._get_score(
130                target_output=target_output,
131                model_output=model_output,
132                score_fn=FACTUAL_KNOWLEDGE_SCORES_TO_FUNCS[score_name],
133            )
134        return record
135
136    def _get_score(
137        self,
138        target_output: str,
139        model_output: str,
140        score_fn: Callable[..., float],
141        **fn_kwargs,
142    ) -> float:
143        """Compute a factual knowledge score for a target output and model output pair based
144        on the score function.
145
146        :param target_output: Target output.
147        :param model_output: Model output.
148        :param score_fn: One of the functions in FACTUAL_KNOWLEDGE_SCORES_TO_FUNCS.
149        :returns: A computed factual knowledge score (0 or 1). See the docstring for
150        `FactualKnowledge` for more details on what these numerical values represent.
151        """
152        possible_targets = target_output.split(self.target_output_delimiter)
153        if self.logical_operator == "OR":
154            return max([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])
155        else:  # self.logical_operator is "AND"
156            # checks that every target is in model_output, otherwise returns 0.0
157            return min([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])
158
159
160@dataclass(frozen=True)
161class FactualKnowledgeConfig(EvalAlgorithmConfig):
162    """Configures the factual knowledge evaluation algorithm.
163
164    :param target_output_delimiter: This delimiter is used to combine all possible target outputs into
165        a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>",
166        then the target output text will be "UK<OR>England". This can be useful to account for multiple
167        valid target outputs or to ensure that multiple target outputs are contained in the model output
168        (which can be configured using the logical_operator).
169    :param logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical operator
170        is "OR" (the default behavior), at least one of the possible target outputs (separated by the
171        target_output_delimiter) must be contained in the model output for the answer to be correct. When the logical
172        operator is "AND", ALL possible target outputs (separated by the target_output_delimiter) must be contained in
173        the model output in order for the answer to be correct.
174    """
175
176    target_output_delimiter: Optional[str] = "<OR>"
177    logical_operator: str = "OR"
178
179    def __post_init__(self):
180        if self.target_output_delimiter == "":
181            raise EvalAlgorithmClientError(
182                "Empty target_output_delimiter is provided. Please either provide a non-empty string, "
183                "or set it to None."
184            )
185        if self.logical_operator not in ["OR", "AND"]:
186            raise EvalAlgorithmClientError(
187                'Invalid logical_operator is provided. The only valid inputs are strings "OR" and "AND".'
188            )
189        if self.target_output_delimiter in ["<OR>", "<AND>"] and self.target_output_delimiter != "<{}>".format(
190            self.logical_operator
191        ):
192            logger.warning(
193                f"The target_output_delimiter `{self.target_output_delimiter}` and logical_operator"
194                f" `{self.logical_operator}` are not consistent."
195            )
196
197
198class FactualKnowledge(EvalAlgorithmInterface):
199    """
200    This evaluation measures the ability of language models to reproduce facts about the real world and was proposed
201    by [Petroni et al.](https://arxiv.org/pdf/1909.01066.pdf). The evaluation queries the model with prompts like
202    'Berlin is the capital of' and 'Tata Motors is a subsidiary of' and compares the model generation with one or more
203    target answers. The prompts are divided into different knowledge categories like capitals, subsidiaries, etc.
204
205    This evaluation outputs two binary metrics.
206    The first is the "exact_inclusion" score: the metric value is 1 if the lower-cased expected answer is
207    contained anywhere within the lower-cased model response. For instance, consider the prompt
208    'Berlin is the capital of' with the expected answer 'Germany'.
209    If the model generation is 'Germany, and is also its most populous city', then the metric evaluates to 1.
210
211    The second metric is the "quasi_exact_inclusion" score: the metric value is 1 if the target output is contained
212    in the model output after both strings are normalized.
213    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144
214
215    If there is more than one correct target answer, the `logical_operator` can be set to "OR" (default) and
 216    answers are separated by the `target_output_delimiter`, both of which are configured inside the
 217    `FactualKnowledgeConfig`. The `target_output_delimiter` defaults to `<OR>`, i.e., the target answer in this
218    example could be Germany<OR>Berlin (since Berlin is its own federal state).
219
220    If there are multiple correct target answers that must be included in the model output,
221    the `logical_operator` can be set to "AND". For example, consider the prompt 'What are the three primary colors?'.
 222    The target answer would be Red<AND>Yellow<AND>Blue (note that the target_output_delimiter could be anything,
 223    but it is "<AND>" here for the sake of consistency with the logical_operator value). Red, yellow, and blue must
224    all be contained in the model generation for the answer to be correct under this configuration.
225    """
226
227    eval_name = EvalAlgorithm.FACTUAL_KNOWLEDGE.value
228
229    def __init__(self, eval_algorithm_config: FactualKnowledgeConfig = FactualKnowledgeConfig()):
230        """FactualKnowledge initializer.
231
232        :param eval_algorithm_config: Factual knowledge evaluation algorithm config.
233        """
234        super().__init__(eval_algorithm_config)
235        self.pipeline = TransformPipeline(
236            [
237                FactualKnowledgeScores(
238                    target_output_delimiter=eval_algorithm_config.target_output_delimiter,
239                    logical_operator=eval_algorithm_config.logical_operator,
240                )
241            ]
242        )
243
244    def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:  # type: ignore[override]
245        """Computes the factual knowledge metrics for a single sample.
246
247        :param target_output: The expected responses from the model.
248        :param model_output: The output of the model being evaluated.
249        :return: A list of EvalScore objects, one for each of the Factual Knowledge metrics
250        ("exact_inclusion" and "quasi_exact_inclusion").
251        """
252        sample = {
253            DatasetColumns.TARGET_OUTPUT.value.name: target_output,
254            DatasetColumns.MODEL_OUTPUT.value.name: model_output,
255        }
256        result = self.pipeline.execute_record(sample)
257        return [EvalScore(name=score_name, value=result[score_name]) for score_name in SCORE_NAMES]
258
259    def evaluate(
260        self,
261        model: Optional[ModelRunner] = None,
262        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
263        prompt_template: Optional[str] = None,
264        num_records: int = 300,
265        save: bool = False,
266        save_strategy: Optional[SaveStrategy] = None,
267    ) -> List[EvalOutput]:
268        """Compute the factual knowledge scores on one or more datasets.
269
270        :param model: An instance of ModelRunner representing the model under evaluation.
271            If this argument is None, the `dataset_config` argument must not be None,
272            and must correspond to a dataset that already contains a column with model outputs.
273        :param dataset_config: Configures a single dataset or list of datasets used for the
274            evaluation. If not provided, this method will run evaluations using all of its
275            supported built-in datasets.
276        :param prompt_template: A template used to generate prompts that are fed to the model.
277            If not provided, defaults will be used. If provided, `model` must not be None.
278        :param num_records: The number of records to be sampled randomly from the input dataset(s)
279            used to perform the evaluation(s). Note that the default value is 300, rather than
280            100, as it is for the rest of the built-in algorithms. This is because there
281            are 15 categories for factual knowledge, and if only 100 samples are used, there
282            will be categories with very few samples.
283        :param save: If set to true, prompt responses and scores will be saved to a file.
 284        :param save_strategy: Specifies the strategy used to save the localized outputs of the evaluations. If not
 285            specified, results are saved to the path configured by the EVAL_RESULTS_PATH environment variable.
 286            If that environment variable is not configured either, they are saved to the default path `/tmp/eval_results/`.
287
288        :return: A list of EvalOutput objects.
289        """
290        dataset_configs = get_dataset_configs(dataset_config, self.eval_name)
291        eval_outputs = []
292        for dataset_config in dataset_configs:
293            dataset = get_dataset(dataset_config, num_records)
294            validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name])
295            eval_output = evaluate_dataset(
296                dataset=dataset,
297                pipeline=self.pipeline,
298                dataset_name=dataset_config.dataset_name,
299                eval_name=self.eval_name,
300                metric_names=SCORE_NAMES,
301                eval_results_path=get_eval_results_path(),
302                model=model,
303                prompt_template=prompt_template,
304                agg_method=MEAN,
305                save=save,
306                save_strategy=save_strategy,
307            )
308            eval_outputs.append(eval_output)
309        return eval_outputs
FACTUAL_KNOWLEDGE = 'factual_knowledge'
FACTUAL_KNOWLEDGE_QUASI_EXACT = 'factual_knowledge_quasi_exact'
SCORE_NAMES = ['factual_knowledge', 'factual_knowledge_quasi_exact']
class FactualKnowledgeScores(fmeval.transforms.transform.Transform):

This transform augments its input record with the computed factual knowledge scores.

See the docstring for FactualKnowledge for more details regarding the score itself.

FactualKnowledgeScores( target_output_key: str = 'target_output', model_output_key: str = 'model_output', output_keys: List[str] = ['factual_knowledge', 'factual_knowledge_quasi_exact'], target_output_delimiter: Optional[str] = '<OR>', logical_operator: str = 'OR')

FactualKnowledgeScores initializer.

Parameters
  • target_output_key: The record key corresponding to the target output.
  • model_output_key: The record key corresponding to the model output.
  • output_keys: The keys corresponding to the factual knowledge scores that will be added to the input record.
  • target_output_delimiter: This delimiter is used to combine all possible target outputs into a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the target output text will be "UK<OR>England". This can be useful to account for multiple valid target outputs or to ensure that multiple target outputs are contained in the model output (which can be configured using the logical_operator).
  • logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical operator is "OR" (the default behavior), at least one of the possible target outputs (separated by the target_output_delimiter) must be contained in the model output for the answer to be correct. When the logical operator is "AND", ALL possible target outputs (separated by the target_output_delimiter) must be contained in the model output in order for the answer to be correct.
target_output_key
model_output_key
output_keys
target_output_delimiter
logical_operator
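
A minimal usage sketch of the transform on a single record, assuming the default column names ('target_output', 'model_output') and the default '<OR>' delimiter:

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledgeScores

# Build the transform with its defaults and score one record in place.
scores = FactualKnowledgeScores()
record = {
    "target_output": "Germany<OR>Berlin",
    "model_output": "Berlin is the capital of Germany.",
}
record = scores(record)
print(record["factual_knowledge"])              # 1.0: at least one target is included verbatim (lower-cased)
print(record["factual_knowledge_quasi_exact"])  # 1.0: inclusion also holds after normalization

The transform mutates and returns the same record dict, adding one key per score name.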
@dataclass(frozen=True)
class FactualKnowledgeConfig(fmeval.eval_algorithms.eval_algorithm.EvalAlgorithmConfig):

Configures the factual knowledge evaluation algorithm.

Parameters
  • target_output_delimiter: This delimiter is used to combine all possible target outputs into a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the target output text will be "UK<OR>England". This can be useful to account for multiple valid target outputs or to ensure that multiple target outputs are contained in the model output (which can be configured using the logical_operator).
  • logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical operator is "OR" (the default behavior), at least one of the possible target outputs (separated by the target_output_delimiter) must be contained in the model output for the answer to be correct. When the logical operator is "AND", ALL possible target outputs (separated by the target_output_delimiter) must be contained in the model output in order for the answer to be correct.
FactualKnowledgeConfig( target_output_delimiter: Optional[str] = '<OR>', logical_operator: str = 'OR')
target_output_delimiter: Optional[str] = '<OR>'
logical_operator: str = 'OR'
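
For example, a configuration that requires every delimited target to be present pairs the "AND" operator with a matching delimiter (a minimal sketch; note that __post_init__ logs a warning if the delimiter is "<OR>" or "<AND>" but does not match the operator):

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledgeConfig

# Require ALL delimited targets to appear in the model output.
config = FactualKnowledgeConfig(target_output_delimiter="<AND>", logical_operator="AND")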
class FactualKnowledge(fmeval.eval_algorithms.eval_algorithm.EvalAlgorithmInterface):
200    """
201    This evaluation measures the ability of language models to reproduce facts about the real world and was proposed
202    by [Petroni et al.](https://arxiv.org/pdf/1909.01066.pdf). The evaluation queries the model with prompts like
203    'Berlin is the capital of' and 'Tata Motors is a subsidiary of' and compares the model generation with one or more
204    target answers. The prompts are divided into different knowledge categories like capitals, subsidiaries, etc.
205
206    This evaluation outputs two binary metrics.
207    The first is the "exact_inclusion" score: the metric value is 1 if the lower-cased expected answer is
208    contained anywhere within the lower-cased model response. For instance, consider the prompt
209    'Berlin is the capital of' with the expected answer 'Germany'.
210    If the model generation is 'Germany, and is also its most populous city', then the metric evaluates to 1.
211
212    The second metric is the "quasi_exact_inclusion" score: the metric value is 1 if the target output is contained
213    in the model output after both strings are normalized.
214    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144
215
216    If there is more than one correct target answer, the `logical_operator` can be set to "OR" (default) and
217    answers are seperated by the `target_output_delimiter`, both of which are configured inside the
218    `FactualKnowledgeConfig`. The `target_output_delimiter` defaults to `<OR>`, i.e, the target answer in this
219    example could be Germany<OR>Berlin (since Berlin is its own federal state).
220
221    If there are multiple correct target answers that must be included in the model output,
222    the `logical_operator` can be set to "AND". For example, consider the prompt 'What are the three primary colors?'.
223    The target answer would be Red<AND>Yellow<AND>Blue" (note that the target_output_delimiter could be anything,
224    but it is "<AND>" here for the sake of consistency with the logical_operator value).Red, yellow, and blue must
225    all be contained in the model generation for the answer to be correct under this configuration.
226    """
227
228    eval_name = EvalAlgorithm.FACTUAL_KNOWLEDGE.value
229
230    def __init__(self, eval_algorithm_config: FactualKnowledgeConfig = FactualKnowledgeConfig()):
231        """FactualKnowledge initializer.
232
233        :param eval_algorithm_config: Factual knowledge evaluation algorithm config.
234        """
235        super().__init__(eval_algorithm_config)
236        self.pipeline = TransformPipeline(
237            [
238                FactualKnowledgeScores(
239                    target_output_delimiter=eval_algorithm_config.target_output_delimiter,
240                    logical_operator=eval_algorithm_config.logical_operator,
241                )
242            ]
243        )
244
245    def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:  # type: ignore[override]
246        """Computes the factual knowledge metrics for a single sample.
247
248        :param target_output: The expected responses from the model.
249        :param model_output: The output of the model being evaluated.
250        :return: A list of EvalScore objects, one for each of the Factual Knowledge metrics
251        ("exact_inclusion" and "quasi_exact_inclusion").
252        """
253        sample = {
254            DatasetColumns.TARGET_OUTPUT.value.name: target_output,
255            DatasetColumns.MODEL_OUTPUT.value.name: model_output,
256        }
257        result = self.pipeline.execute_record(sample)
258        return [EvalScore(name=score_name, value=result[score_name]) for score_name in SCORE_NAMES]
259
260    def evaluate(
261        self,
262        model: Optional[ModelRunner] = None,
263        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
264        prompt_template: Optional[str] = None,
265        num_records: int = 300,
266        save: bool = False,
267        save_strategy: Optional[SaveStrategy] = None,
268    ) -> List[EvalOutput]:
269        """Compute the factual knowledge scores on one or more datasets.
270
271        :param model: An instance of ModelRunner representing the model under evaluation.
272            If this argument is None, the `dataset_config` argument must not be None,
273            and must correspond to a dataset that already contains a column with model outputs.
274        :param dataset_config: Configures a single dataset or list of datasets used for the
275            evaluation. If not provided, this method will run evaluations using all of its
276            supported built-in datasets.
277        :param prompt_template: A template used to generate prompts that are fed to the model.
278            If not provided, defaults will be used. If provided, `model` must not be None.
279        :param num_records: The number of records to be sampled randomly from the input dataset(s)
280            used to perform the evaluation(s). Note that the default value is 300, rather than
281            100, as it is for the rest of the built-in algorithms. This is because there
282            are 15 categories for factual knowledge, and if only 100 samples are used, there
283            will be categories with very few samples.
284        :param save: If set to true, prompt responses and scores will be saved to a file.
285        :param save_strategy: Specifies the strategy to use the save the localized outputs of the evaluations. If not
286            specified, it will save it to the path that can be configured by the EVAL_RESULTS_PATH environment variable.
287            If that environment variable is also not configured, it will be saved to the default path `/tmp/eval_results/`.
288
289        :return: A list of EvalOutput objects.
290        """
291        dataset_configs = get_dataset_configs(dataset_config, self.eval_name)
292        eval_outputs = []
293        for dataset_config in dataset_configs:
294            dataset = get_dataset(dataset_config, num_records)
295            validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name])
296            eval_output = evaluate_dataset(
297                dataset=dataset,
298                pipeline=self.pipeline,
299                dataset_name=dataset_config.dataset_name,
300                eval_name=self.eval_name,
301                metric_names=SCORE_NAMES,
302                eval_results_path=get_eval_results_path(),
303                model=model,
304                prompt_template=prompt_template,
305                agg_method=MEAN,
306                save=save,
307                save_strategy=save_strategy,
308            )
309            eval_outputs.append(eval_output)
310        return eval_outputs

This evaluation measures the ability of language models to reproduce facts about the real world and was proposed by Petroni et al. (https://arxiv.org/pdf/1909.01066.pdf). The evaluation queries the model with prompts like 'Berlin is the capital of' and 'Tata Motors is a subsidiary of' and compares the model generation with one or more target answers. The prompts are divided into different knowledge categories like capitals, subsidiaries, etc.

This evaluation outputs two binary metrics. The first is the "exact_inclusion" score: the metric value is 1 if the lower-cased expected answer is contained anywhere within the lower-cased model response. For instance, consider the prompt 'Berlin is the capital of' with the expected answer 'Germany'. If the model generation is 'Germany, and is also its most populous city', then the metric evaluates to 1.

The second metric is the "quasi_exact_inclusion" score: the metric value is 1 if the target output is contained in the model output after both strings are normalized. Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144

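As a concrete illustration, the two metrics can disagree when punctuation gets in the way. This sketch calls the module's private helpers shown in the source listing above (internal functions, used here only for illustration):

from fmeval.eval_algorithms.factual_knowledge import (
    _exact_inclusion_score,
    _quasi_exact_inclusion_score,
)

_exact_inclusion_score("The capital is Berlin!", "berlin")         # 1.0: lower-cased substring match
_exact_inclusion_score("The capital is Berlin!", "Berlin.")        # 0.0: the trailing period breaks the exact match
_quasi_exact_inclusion_score("The capital is Berlin!", "Berlin.")  # 1.0: punctuation and articles are stripped by normalization
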
If there is more than one correct target answer, the logical_operator can be set to "OR" (default) and answers are separated by the target_output_delimiter, both of which are configured inside the FactualKnowledgeConfig. The target_output_delimiter defaults to <OR>, i.e., the target answer in this example could be Germany<OR>Berlin (since Berlin is its own federal state).

If there are multiple correct target answers that must be included in the model output, the logical_operator can be set to "AND". For example, consider the prompt 'What are the three primary colors?'. The target answer would be Red<AND>Yellow<AND>Blue (note that the target_output_delimiter could be anything, but it is "<AND>" here for the sake of consistency with the logical_operator value). Red, yellow, and blue must all be contained in the model generation for the answer to be correct under this configuration.

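For instance, the primary-colors example above can be checked directly with evaluate_sample (a minimal sketch; evaluate_sample is documented below):

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

fk = FactualKnowledge(FactualKnowledgeConfig(target_output_delimiter="<AND>", logical_operator="AND"))
fk.evaluate_sample(
    target_output="Red<AND>Yellow<AND>Blue",
    model_output="The primary colors are red, yellow and blue.",
)  # both scores are 1.0, since red, yellow, and blue all appear in the model output
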
FactualKnowledge( eval_algorithm_config: FactualKnowledgeConfig = FactualKnowledgeConfig(target_output_delimiter='<OR>', logical_operator='OR'))

FactualKnowledge initializer.

Parameters
  • eval_algorithm_config: Factual knowledge evaluation algorithm config.
eval_name = 'factual_knowledge'
pipeline
def evaluate_sample( self, target_output: str, model_output: str) -> List[fmeval.eval_algorithms.EvalScore]:

Computes the factual knowledge metrics for a single sample.

Parameters
  • target_output: The expected responses from the model.
  • model_output: The output of the model being evaluated.
Returns

A list of EvalScore objects, one for each of the Factual Knowledge metrics ("exact_inclusion" and "quasi_exact_inclusion").

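A minimal sketch of reading the returned EvalScore objects (the score names match SCORE_NAMES defined in this module):

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge

for score in FactualKnowledge().evaluate_sample(
    target_output="Germany<OR>Berlin",
    model_output="Berlin is the capital of Germany.",
):
    print(score.name, score.value)
# factual_knowledge 1.0
# factual_knowledge_quasi_exact 1.0
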
def evaluate( self, model: Optional[fmeval.model_runners.model_runner.ModelRunner] = None, dataset_config: Union[fmeval.data_loaders.data_config.DataConfig, List[fmeval.data_loaders.data_config.DataConfig], NoneType] = None, prompt_template: Optional[str] = None, num_records: int = 300, save: bool = False, save_strategy: Optional[fmeval.eval_algorithms.save_strategy.SaveStrategy] = None) -> List[fmeval.eval_algorithms.EvalOutput]:

Compute the factual knowledge scores on one or more datasets.

Parameters
  • model: An instance of ModelRunner representing the model under evaluation. If this argument is None, the dataset_config argument must not be None, and must correspond to a dataset that already contains a column with model outputs.
  • dataset_config: Configures a single dataset or list of datasets used for the evaluation. If not provided, this method will run evaluations using all of its supported built-in datasets.
  • prompt_template: A template used to generate prompts that are fed to the model. If not provided, defaults will be used. If provided, model must not be None.
  • num_records: The number of records to be sampled randomly from the input dataset(s) used to perform the evaluation(s). Note that the default value is 300, rather than 100, as it is for the rest of the built-in algorithms. This is because there are 15 categories for factual knowledge, and if only 100 samples are used, there will be categories with very few samples.
  • save: If set to true, prompt responses and scores will be saved to a file.
  • save_strategy: Specifies the strategy used to save the localized outputs of the evaluations. If not specified, results are saved to the path configured by the EVAL_RESULTS_PATH environment variable. If that environment variable is not configured either, they are saved to the default path /tmp/eval_results/.
Returns

A list of EvalOutput objects.
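
An end-to-end sketch with a custom dataset that already contains model outputs, so model can be None. The DataConfig field values and the S3 URI below are illustrative assumptions, not values taken from this module:

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge

# Hypothetical JSON Lines dataset with question / answer / model_answer fields.
data_config = DataConfig(
    dataset_name="my_factual_qa",
    dataset_uri="s3://my-bucket/factual_qa.jsonl",
    dataset_mime_type="application/jsonlines",
    model_input_location="question",
    target_output_location="answer",
    model_output_location="model_answer",
)

eval_outputs = FactualKnowledge().evaluate(dataset_config=data_config, save=True)
print(eval_outputs[0].dataset_scores)  # mean factual_knowledge and factual_knowledge_quasi_exact over the dataset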