fmeval.transforms.semantic_robustness_metrics
import evaluate as hf_evaluate
from typing import List, Dict, Any, Tuple

import numpy as np

from fmeval.util import require
from fmeval.transforms.common import Mean
from fmeval.transforms.transform import Transform
from fmeval.transforms.util import validate_call


class BertScoreDissimilarity(Transform):
    """This Transform augments its input record with the BERTScore Dissimilarity metric.

    BERTScore Dissimilarity is simply 1 - BERTScore
    (https://huggingface.co/spaces/evaluate-metric/bertscore).
    This Transform uses the mean of a list of BERTScore values as the BERTScore
    in the formula above.
    """

    def __init__(self, bert_score_keys: List[str], output_key: str):
        """BertScoreDissimilarity initializer.

        :param bert_score_keys: The keys corresponding to the BERTScore values.
        :param output_key: The key corresponding to the output of this transform.
        """
        super().__init__(bert_score_keys, output_key)
        self.register_input_output_keys(bert_score_keys, [output_key])
        self.bert_score_keys = bert_score_keys
        self.output_key = output_key

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed BERTScore Dissimilarity metric.

        :param record: The input record.
        :returns: The input record with the BERTScore Dissimilarity metric added in.
        """
        add_mean_bert_score = Mean(
            self.bert_score_keys,
            self.output_key,
        )
        record = add_mean_bert_score(record)
        # Override the intermediate value corresponding to self.output_key
        # (i.e. the mean bert score) to 1 - mean.
        record[self.output_key] = 1 - record[self.output_key]
        return record


class WER(Transform):
    """This Transform computes the Word Error Rate metric and augments its input record with the computed value.

    Word Error Rate measures syntactic differences, that is, changes in the words, whereas BERTScore Dissimilarity
    measures semantic differences. Semantic differences account for cases when the precise words in the output
    change but the meaning is the same. For example, consider the outputs "it is pouring down today" vs.
    "it is very rainy today".
    """

    def __init__(self, prediction_keys: List[str], reference_keys: List[str], output_key: str):
        """WER initializer.

        Note that the order of elements in `prediction_keys` and `reference_keys` matters;
        the kth element of `prediction_keys` should correspond to the kth element of
        `reference_keys`.

        :param prediction_keys: The record keys corresponding to model predictions.
        :param reference_keys: The record keys corresponding to reference, aka target, values.
        :param output_key: The output key to assign the computed WER value.
        """
        require(
            len(prediction_keys) == len(reference_keys),
            "prediction_keys and reference_keys should have the same number of elements. "
            f"prediction_keys has {len(prediction_keys)} elements while reference_keys has "
            f"{len(reference_keys)} elements.",
        )
        super().__init__(prediction_keys, reference_keys, output_key)
        self.register_input_output_keys(prediction_keys + reference_keys, [output_key], allow_duplicates=True)
        self.prediction_keys = prediction_keys
        self.reference_keys = reference_keys
        self.output_key = output_key
        self.wer = hf_evaluate.load("wer")

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed WER metric.

        :param record: The input record.
        :returns: The input record with the WER metric added in.
        """
        wer_metric = self.wer.compute(
            predictions=[record[prediction_key] for prediction_key in self.prediction_keys],
            references=[record[reference_key] for reference_key in self.reference_keys],
        )
        record[self.output_key] = wer_metric
        return record


class MeanDeltaScores(Transform):
    """This transform augments an input record with mean delta scores.

    Given
    1) An "original score", which is a score that was computed using
        an "original" i.e. unperturbed input
    2) A series of "perturbed scores", which are scores computed using
        perturbations of the original input
    the delta score for a particular perturbed score is computed using the
    formula: abs(original_score - perturbed_score), and the mean delta score
    is simply the arithmetic mean of all delta scores for the series of
    perturbed scores.
    """

    def __init__(self, key_mapping: Dict[str, Tuple[List[str], str]]):
        """MeanDeltaScores initializer.

        :param key_mapping: Maps an original score key to a tuple of the form
            (perturbed_score_keys, output_key). output_key will be used
            as the output key corresponding to the mean delta score computed
            using the original score and perturbed scores.
        """
        super().__init__(key_mapping)
        original_score_keys = list(key_mapping.keys())
        perturbed_score_keys = [key for tup in key_mapping.values() for key in tup[0]]
        self.register_input_output_keys(
            input_keys=original_score_keys + perturbed_score_keys,
            output_keys=[tup[1] for tup in key_mapping.values()],
        )
        self.key_mapping = key_mapping

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed mean delta scores.

        :param record: The input record.
        :returns: The input record with the mean delta scores added in.
        """
        for original_score_key, tup in self.key_mapping.items():
            perturbed_score_keys, output_key = tup
            record[output_key] = np.mean(
                [
                    abs(record[original_score_key] - record[perturbed_score_key])
                    for perturbed_score_key in perturbed_score_keys
                ]
            )
        return record
class BertScoreDissimilarity(Transform):
    """Augment an input record with the BERTScore Dissimilarity metric.

    BERTScore Dissimilarity is defined as 1 - BERTScore
    (https://huggingface.co/spaces/evaluate-metric/bertscore), where the
    BERTScore plugged into this formula is the mean over a list of
    BERTScore values.
    """

    def __init__(self, bert_score_keys: List[str], output_key: str):
        """BertScoreDissimilarity initializer.

        :param bert_score_keys: The keys corresponding to the BERTScore values.
        :param output_key: The key corresponding to the output of this transform.
        """
        super().__init__(bert_score_keys, output_key)
        self.register_input_output_keys(bert_score_keys, [output_key])
        self.bert_score_keys = bert_score_keys
        self.output_key = output_key

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed BERTScore Dissimilarity metric.

        :param record: The input record.
        :returns: The input record with the BERTScore Dissimilarity metric added in.
        """
        compute_mean = Mean(self.bert_score_keys, self.output_key)
        updated_record = compute_mean(record)
        # The Mean transform stored the mean BERTScore under self.output_key;
        # replace it with its complement (1 - mean) to obtain the dissimilarity.
        updated_record[self.output_key] = 1 - updated_record[self.output_key]
        return updated_record
This Transform augments its input record with the BERTScore Dissimilarity metric.
BERTScore Dissimilarity is simply 1 - BERTScore (https://huggingface.co/spaces/evaluate-metric/bertscore). This Transform uses the mean of a list of BERTScore values as the BERTScore in the formula above.
def __init__(self, bert_score_keys: List[str], output_key: str):
    """BertScoreDissimilarity initializer.

    :param bert_score_keys: The keys corresponding to the BERTScore values.
    :param output_key: The key corresponding to the output of this transform.
    """
    super().__init__(bert_score_keys, output_key)
    self.register_input_output_keys(bert_score_keys, [output_key])
    # Retain the keys so __call__ can locate the BERTScore values and
    # know where to write the result.
    self.output_key = output_key
    self.bert_score_keys = bert_score_keys
BertScoreDissimilarity initializer.
Parameters
- bert_score_keys: The keys corresponding to the BERTScore values.
- output_key: The key corresponding to the output of this transform.
class WER(Transform):
    """Compute the Word Error Rate metric and augment the input record with the computed value.

    Word Error Rate captures syntactic differences (changes in the words
    themselves), in contrast to BERTScore Dissimilarity, which captures
    semantic differences — cases where the precise words change but the
    meaning does not. For example, consider the outputs
    "it is pouring down today" vs. "it is very rainy today".
    """

    def __init__(self, prediction_keys: List[str], reference_keys: List[str], output_key: str):
        """WER initializer.

        The order of elements matters: the kth element of `prediction_keys`
        must correspond to the kth element of `reference_keys`.

        :param prediction_keys: The record keys corresponding to model predictions.
        :param reference_keys: The record keys corresponding to reference, aka target, values.
        :param output_key: The output key to assign the computed WER value.
        """
        require(
            len(prediction_keys) == len(reference_keys),
            "prediction_keys and reference_keys should have the same number of elements. "
            f"prediction_keys has {len(prediction_keys)} elements while reference_keys has "
            f"{len(reference_keys)} elements.",
        )
        super().__init__(prediction_keys, reference_keys, output_key)
        self.register_input_output_keys(prediction_keys + reference_keys, [output_key], allow_duplicates=True)
        self.prediction_keys = prediction_keys
        self.reference_keys = reference_keys
        self.output_key = output_key
        self.wer = hf_evaluate.load("wer")

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed WER metric.

        :param record: The input record.
        :returns: The input record with the WER metric added in.
        """
        predictions = [record[key] for key in self.prediction_keys]
        references = [record[key] for key in self.reference_keys]
        record[self.output_key] = self.wer.compute(
            predictions=predictions,
            references=references,
        )
        return record
This Transform computes the Word Error Rate metric and augments its input record with the computed value.
Word Error Rate measures syntactic differences, that is, changes in the words, whereas BERTScore Dissimilarity measures semantic differences. Semantic differences account for cases when the precise words in the output change but the meaning is the same. For example, consider the outputs "it is pouring down today" vs. "it is very rainy today".
def __init__(self, prediction_keys: List[str], reference_keys: List[str], output_key: str):
    """WER initializer.

    The order of elements matters: the kth element of `prediction_keys`
    must correspond to the kth element of `reference_keys`.

    :param prediction_keys: The record keys corresponding to model predictions.
    :param reference_keys: The record keys corresponding to reference, aka target, values.
    :param output_key: The output key to assign the computed WER value.
    """
    # The two key lists are zipped pairwise at computation time, so a
    # length mismatch is a caller error — fail fast here.
    require(
        len(prediction_keys) == len(reference_keys),
        "prediction_keys and reference_keys should have the same number of elements. "
        f"prediction_keys has {len(prediction_keys)} elements while reference_keys has "
        f"{len(reference_keys)} elements.",
    )
    super().__init__(prediction_keys, reference_keys, output_key)
    self.register_input_output_keys(prediction_keys + reference_keys, [output_key], allow_duplicates=True)
    self.prediction_keys = prediction_keys
    self.reference_keys = reference_keys
    self.output_key = output_key
    # Hugging Face `evaluate` metric object; loaded once per transform instance.
    self.wer = hf_evaluate.load("wer")
WER initializer.
Note that the order of elements in `prediction_keys` and `reference_keys` matters; the kth element of `prediction_keys` should correspond to the kth element of `reference_keys`.
Parameters
- prediction_keys: The record keys corresponding to model predictions.
- reference_keys: The record keys corresponding to reference, aka target, values.
- output_key: The output key to assign the computed WER value.
class MeanDeltaScores(Transform):
    """Augment an input record with mean delta scores.

    Given
    1) an "original score" — a score computed on an "original",
        i.e. unperturbed, input, and
    2) a series of "perturbed scores" — scores computed on perturbations
        of that original input,
    the delta score for each perturbed score is
    abs(original_score - perturbed_score), and the mean delta score is the
    arithmetic mean of all delta scores over the series of perturbed scores.
    """

    def __init__(self, key_mapping: Dict[str, Tuple[List[str], str]]):
        """MeanDeltaScores initializer.

        :param key_mapping: Maps an original score key to a tuple of the form
            (perturbed_score_keys, output_key). output_key will be used
            as the output key corresponding to the mean delta score computed
            using the original score and perturbed scores.
        """
        super().__init__(key_mapping)
        originals = list(key_mapping)
        perturbed = [key for perturbed_keys, _ in key_mapping.values() for key in perturbed_keys]
        outputs = [output_key for _, output_key in key_mapping.values()]
        self.register_input_output_keys(
            input_keys=originals + perturbed,
            output_keys=outputs,
        )
        self.key_mapping = key_mapping

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with the computed mean delta scores.

        :param record: The input record.
        :returns: The input record with the mean delta scores added in.
        """
        for original_key, (perturbed_keys, output_key) in self.key_mapping.items():
            base_score = record[original_key]
            deltas = [abs(base_score - record[perturbed_key]) for perturbed_key in perturbed_keys]
            record[output_key] = np.mean(deltas)
        return record
This transform augments an input record with mean delta scores.
Given 1) An "original score", which is a score that was computed using an "original" i.e. unperturbed input 2) A series of "perturbed scores", which are scores computed using perturbations of the original input the delta score for a particular perturbed score is computed using the formula: abs(original_score - perturbed_score), and the mean delta score is simply the arithmetic mean of all delta scores for the series of perturbed scores.
def __init__(self, key_mapping: Dict[str, Tuple[List[str], str]]):
    """MeanDeltaScores initializer.

    :param key_mapping: Maps an original score key to a tuple of the form
        (perturbed_score_keys, output_key). output_key will be used
        as the output key corresponding to the mean delta score computed
        using the original score and perturbed scores.
    """
    super().__init__(key_mapping)
    # Inputs are every original score key plus every perturbed score key;
    # outputs are the per-mapping output keys.
    originals = list(key_mapping)
    perturbed = [key for perturbed_keys, _ in key_mapping.values() for key in perturbed_keys]
    self.register_input_output_keys(
        input_keys=originals + perturbed,
        output_keys=[output_key for _, output_key in key_mapping.values()],
    )
    self.key_mapping = key_mapping
MeanDeltaScores initializer.
Parameters
- key_mapping: Maps an original score key to a tuple of the form (perturbed_score_keys, output_key). output_key will be used as the output key corresponding to the mean delta score computed using the original score and perturbed scores.