fmeval.transforms.semantic_robustness_metrics

import evaluate as hf_evaluate
from typing import List, Dict, Any, Tuple

import numpy as np

from fmeval.util import require
from fmeval.transforms.common import Mean
from fmeval.transforms.transform import Transform
from fmeval.transforms.util import validate_call


class BertScoreDissimilarity(Transform):
    """This Transform augments its input record with the BERTScore Dissimilarity metric.

    BERTScore Dissimilarity is simply 1 - BERTScore
    (https://huggingface.co/spaces/evaluate-metric/bertscore).
    This Transform uses the mean of a list of BERTScore values as the BERTScore
    in the formula above.
    """

    def __init__(self, bert_score_keys: List[str], output_key: str):
        """BertScoreDissimilarity initializer.

        :param bert_score_keys: The keys corresponding to the BERTScore values.
        :param output_key: The key corresponding to the output of this transform.
        """
        super().__init__(bert_score_keys, output_key)
        self.register_input_output_keys(bert_score_keys, [output_key])
        self.bert_score_keys = bert_score_keys
        self.output_key = output_key

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed BERTScore Dissimilarity metric.

        :param record: The input record.
        :returns: The input record with the BERTScore Dissimilarity metric added in.
        """
        add_mean_bert_score = Mean(
            self.bert_score_keys,
            self.output_key,
        )
        record = add_mean_bert_score(record)
        # Override the intermediate value corresponding to self.output_key
        # (i.e. the mean BERTScore) with 1 - mean.
        record[self.output_key] = 1 - record[self.output_key]
        return record


class WER(Transform):
    """This Transform computes the Word Error Rate metric and augments its input record with the computed value.

    Word Error Rate measures syntactic differences, that is, changes in the words, whereas BERTScore Dissimilarity
    measures semantic differences. Semantic differences account for cases when the precise words in the output
    change but the meaning is the same. For example, consider the outputs "it is pouring down today" vs.
    "it is very rainy today".
    """

    def __init__(self, prediction_keys: List[str], reference_keys: List[str], output_key: str):
        """WER initializer.

        Note that the order of elements in `prediction_keys` and `reference_keys` matters;
        the kth element of `prediction_keys` should correspond to the kth element of
        `reference_keys`.

        :param prediction_keys: The record keys corresponding to model predictions.
        :param reference_keys: The record keys corresponding to reference (aka target) values.
        :param output_key: The output key to assign the computed WER value.
        """
        require(
            len(prediction_keys) == len(reference_keys),
            "prediction_keys and reference_keys should have the same number of elements. "
            f"prediction_keys has {len(prediction_keys)} elements while reference_keys has "
            f"{len(reference_keys)} elements.",
        )
        super().__init__(prediction_keys, reference_keys, output_key)
        self.register_input_output_keys(prediction_keys + reference_keys, [output_key], allow_duplicates=True)
        self.prediction_keys = prediction_keys
        self.reference_keys = reference_keys
        self.output_key = output_key
        self.wer = hf_evaluate.load("wer")

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed WER metric.

        :param record: The input record.
        :returns: The input record with the WER metric added in.
        """
        wer_metric = self.wer.compute(
            predictions=[record[prediction_key] for prediction_key in self.prediction_keys],
            references=[record[reference_key] for reference_key in self.reference_keys],
        )
        record[self.output_key] = wer_metric
        return record


class MeanDeltaScores(Transform):
    """This transform augments an input record with mean delta scores.

    Given
        1) An "original score", which is a score that was computed using
            an "original" (i.e., unperturbed) input
        2) A series of "perturbed scores", which are scores computed using
            perturbations of the original input
    the delta score for a particular perturbed score is computed using the
    formula abs(original_score - perturbed_score), and the mean delta score
    is simply the arithmetic mean of all delta scores for the series of
    perturbed scores.
    """

    def __init__(self, key_mapping: Dict[str, Tuple[List[str], str]]):
        """MeanDeltaScores initializer.

        :param key_mapping: Maps an original score key to a tuple of the form
            (perturbed_score_keys, output_key). output_key will be used
            as the output key corresponding to the mean delta score computed
            using the original score and perturbed scores.
        """
        super().__init__(key_mapping)
        original_score_keys = list(key_mapping.keys())
        perturbed_score_keys = [key for tup in key_mapping.values() for key in tup[0]]
        self.register_input_output_keys(
            input_keys=original_score_keys + perturbed_score_keys,
            output_keys=[tup[1] for tup in key_mapping.values()],
        )
        self.key_mapping = key_mapping

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed mean delta scores.

        :param record: The input record.
        :returns: The input record with the mean delta scores added in.
        """
        for original_score_key, tup in self.key_mapping.items():
            perturbed_score_keys, output_key = tup
            record[output_key] = np.mean(
                [
                    abs(record[original_score_key] - record[perturbed_score_key])
                    for perturbed_score_key in perturbed_score_keys
                ]
            )
        return record
class BertScoreDissimilarity(fmeval.transforms.transform.Transform):

This Transform augments its input record with the BERTScore Dissimilarity metric.

BERTScore Dissimilarity is simply 1 - BERTScore (https://huggingface.co/spaces/evaluate-metric/bertscore). This Transform uses the mean of a list of BERTScore values as the BERTScore in the formula above.

BertScoreDissimilarity(bert_score_keys: List[str], output_key: str)

BertScoreDissimilarity initializer.

Parameters
  • bert_score_keys: The keys corresponding to the BERTScore values.
  • output_key: The key corresponding to the output of this transform.

Instance variables
  • bert_score_keys: The keys corresponding to the BERTScore values.
  • output_key: The key corresponding to the output of this transform.
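
A minimal usage sketch (the record keys and BERTScore values below are hypothetical; in practice the BERTScore values would be computed by an upstream transform):

from fmeval.transforms.semantic_robustness_metrics import BertScoreDissimilarity

# Record holding precomputed BERTScore values under hypothetical keys.
record = {"bert_score_1": 0.95, "bert_score_2": 0.88}
transform = BertScoreDissimilarity(
    bert_score_keys=["bert_score_1", "bert_score_2"],
    output_key="bert_score_dissimilarity",
)
record = transform(record)
print(record["bert_score_dissimilarity"])  # 1 - mean(0.95, 0.88) ≈ 0.085
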
class WER(fmeval.transforms.transform.Transform):

This Transform computes the Word Error Rate metric and augments its input record with the computed value.

Word Error Rate measures syntactic differences, that is, changes in the words, whereas BERTScore Dissimilarity measures semantic differences. Semantic differences account for cases when the precise words in the output change but the meaning is the same. For example, consider the outputs "it is pouring down today" vs. "it is very rainy today".

WER(prediction_keys: List[str], reference_keys: List[str], output_key: str)

WER initializer.

Note that the order of elements in prediction_keys and reference_keys matters; the kth element of prediction_keys should correspond to the kth element of reference_keys.

Parameters
  • prediction_keys: The record keys corresponding to model predictions.
  • reference_keys: The record keys corresponding to reference (aka target) values.
  • output_key: The output key to assign the computed WER value.

Instance variables
  • prediction_keys: The record keys corresponding to model predictions.
  • reference_keys: The record keys corresponding to reference (aka target) values.
  • output_key: The output key to assign the computed WER value.
  • wer: The Word Error Rate metric object loaded via hf_evaluate.load("wer").
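
A minimal usage sketch (the record keys and strings below are hypothetical). The underlying metric follows the standard WER formula, (substitutions + deletions + insertions) / number of reference words, aggregated over all prediction/reference pairs:

from fmeval.transforms.semantic_robustness_metrics import WER

record = {
    "perturbed_output_1": "it is pouring down today",
    "perturbed_output_2": "it is very rainy today",
    "target": "it is raining hard today",
}
# Both perturbed outputs are scored against the same reference, so
# "target" appears twice in reference_keys (duplicate keys are allowed).
transform = WER(
    prediction_keys=["perturbed_output_1", "perturbed_output_2"],
    reference_keys=["target", "target"],
    output_key="word_error_rate",
)
record = transform(record)
print(record["word_error_rate"])  # 4 substitutions over 10 reference words -> 0.4
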
class MeanDeltaScores(fmeval.transforms.transform.Transform):

This transform augments an input record with mean delta scores.

Given
  1) An "original score", which is a score that was computed using an "original" (i.e., unperturbed) input
  2) A series of "perturbed scores", which are scores computed using perturbations of the original input
the delta score for a particular perturbed score is computed using the formula abs(original_score - perturbed_score), and the mean delta score is simply the arithmetic mean of all delta scores for the series of perturbed scores.

MeanDeltaScores(key_mapping: Dict[str, Tuple[List[str], str]])

MeanDeltaScores initializer.

Parameters
  • key_mapping: Maps an original score key to a tuple of the form (perturbed_score_keys, output_key). output_key will be used as the output key corresponding to the mean delta score computed using the original score and perturbed scores.

Instance variables
  • key_mapping: Maps an original score key to a tuple of the form (perturbed_score_keys, output_key).
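
A minimal usage sketch (the record keys and score values below are hypothetical):

from fmeval.transforms.semantic_robustness_metrics import MeanDeltaScores

# Record holding an original score plus scores from two perturbed inputs.
record = {
    "original_score": 0.9,
    "perturbed_score_1": 0.7,
    "perturbed_score_2": 0.8,
}
transform = MeanDeltaScores(
    key_mapping={
        "original_score": (["perturbed_score_1", "perturbed_score_2"], "mean_delta_score"),
    }
)
record = transform(record)
print(record["mean_delta_score"])  # mean(|0.9 - 0.7|, |0.9 - 0.8|) ≈ 0.15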