fmeval.eval_algorithms.semantic_perturbation_utils
This module contains several semantic perturbations from the NL-Augmenter package. The goals of having this module are twofold:
- NL-Augmenter has old dependencies, which make it difficult to install in our environment. We do not need all of NL-Augmenter anyway, so we copy the perturbations we need over here.
- We might add more perturbations from other packages like `nlaug`, or even have our own custom ones, in the future, so we want to have a uniform API for these perturbations.
"""
This module contains several semantic perturbations from the NL-Augmenter package. The
goals of having this module are twofold:

1. NL-Augmenter has old dependencies which make it difficult to install in our env. We do not
need all of NL-Augmenter anyway, so we copy the perturbations we need over here.
2. We might add more perturbations from other packages like `nlaug`, or even have our own custom
ones, in the future, so we want to have a uniform API for these perturbations.
"""
from abc import ABC, abstractmethod
import random
import itertools
from typing import Dict, List, Union
from dataclasses import dataclass
import functools

import numpy as np


@dataclass(frozen=True)
class ButterFingerConfig:
    """
    Config for the Butter Finger perturbation.
    Defaults set to match those in NL-Augmenter.

    :param perturbation_prob: The probability that a given character will be perturbed.
    """

    perturbation_prob: float = 0.1


@dataclass(frozen=True)
class RandomUpperCaseConfig:
    """
    Config for the RandomUpperCase perturbation.
    Defaults set to match those in NL-Augmenter.

    :param corrupt_proportion: Fraction of characters to be changed to uppercase.
    """

    corrupt_proportion: float = 0.1


@dataclass(frozen=True)
class WhitespaceAddRemoveConfig:
    """
    Config for WhitespaceAddRemove perturbation.
    Defaults set to match those in NL-Augmenter.

    :param remove_prob: Given a whitespace, remove it with this much probability.
    :param add_prob: Given a non-whitespace, add a whitespace after it with this probability.
    """

    remove_prob: float = 0.1
    add_prob: float = 0.05


class SemanticPerturbationUtil(ABC):
    """
    The interface that each perturbation should implement.
    """

    def __init__(self, seed: int = 5):
        """
        :param seed: Seed passed to `set_seed` when the perturbation is constructed.
        """
        self.set_seed(seed)

    @abstractmethod
    def perturb(
        self,
        text: str,
        config: Union[ButterFingerConfig, RandomUpperCaseConfig, WhitespaceAddRemoveConfig],
        num_perturbations: int = 5,
    ) -> List[str]:
        """
        Given an input text, generates one or more perturbed versions of it. Some perturbations can
        only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).

        :param text: The input text that needs to be perturbed.
        :param config: The configuration containing parameters for the perturbation.
        :param num_perturbations: Number of perturbed versions to generate. Some perturbations can
            only generate a single perturbed version and will ignore this parameter.
        :returns: A list of perturbed texts.
        """

    @staticmethod
    def set_seed(seed: int):
        """
        Seed the module-level `random` and `numpy.random` generators.

        :param seed: The seed value handed to both generators.
        """
        random.seed(seed)
        np.random.seed(seed)


class ButterFinger(SemanticPerturbationUtil):
    """
    Given a text, add keyboard induced typos in randomly selected words.
    Keyboard induced typos are ones where a character is replaced by adjacent characters on the keyboard.

    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: W quick brmwn fox jumps over the lazy dig 10 times.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/butter_fingers_perturbation/transformation.py
    """

    # Setting default values from NL-Augmenter.
    # Each value starts with the key itself, so a "typo" may leave the character unchanged.
    QUERTY_KEY_APPROX: Dict[str, str] = {
        "q": "qwasedzx",
        "w": "wqesadrfcx",
        "e": "ewrsfdqazxcvgt",
        "r": "retdgfwsxcvgt",
        "t": "tryfhgedcvbnju",
        "y": "ytugjhrfvbnji",
        "u": "uyihkjtgbnmlo",
        "i": "iuojlkyhnmlp",
        "o": "oipklujm",
        "p": "plo['ik",
        "a": "aqszwxwdce",
        "s": "swxadrfv",
        "d": "decsfaqgbv",
        "f": "fdgrvwsxyhn",
        "g": "gtbfhedcyjn",
        "h": "hyngjfrvkim",
        "j": "jhknugtblom",
        "k": "kjlinyhn",
        "l": "lokmpujn",
        "z": "zaxsvde",
        "x": "xzcsdbvfrewq",
        "c": "cxvdfzswergb",
        "v": "vcfbgxdertyn",
        "b": "bvnghcftyun",
        "n": "nbmhjvgtuik",
        "m": "mnkjloik",
        " ": " ",
    }

    def perturb(
        self, text: str, config: ButterFingerConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate `num_perturbations` copies of `text` with keyboard-neighbour typos.

        :param text: The input text that needs to be perturbed.
        :param config: Supplies `perturbation_prob`, applied at whole-percent granularity.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of perturbed texts, each the same length as `text`.
        """
        # round() instead of int(): float error would otherwise truncate e.g. 0.29 * 100 to 28.
        typo_threshold = round(config.perturbation_prob * 100)
        perturbed_texts = []
        for _ in range(num_perturbations):
            butter_text = []
            for letter in text:
                lcletter = letter.lower()
                neighbours = self.QUERTY_KEY_APPROX.get(lcletter)
                # Strict "<" so that a probability of 0 never perturbs and p maps to exactly p%.
                if neighbours is None or random.choice(range(0, 100)) >= typo_threshold:
                    # Keep the original character verbatim; a lower()/upper() round trip is
                    # not the identity for every Unicode character.
                    butter_text.append(letter)
                    continue
                new_letter = random.choice(neighbours)
                # go back to original case
                if lcletter != letter:
                    new_letter = new_letter.upper()
                butter_text.append(new_letter)
            perturbed_texts.append("".join(butter_text))
        return perturbed_texts


class RandomUpperCase(SemanticPerturbationUtil):
    """
    Convert random characters in the text to uppercase.
    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: A qUick brOwn fox jumps over the lazY dog 10 timEs.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/random_upper_transformation/transformation.py#L1
    """

    def perturb(
        self, text: str, config: RandomUpperCaseConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate `num_perturbations` independently upper-cased copies of `text`.

        :param text: The input text that needs to be perturbed.
        :param config: Supplies `corrupt_proportion`.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of perturbed texts.
        """
        return [self.random_upper(text, config=config) for _ in range(num_perturbations)]

    @staticmethod
    def random_upper(text: str, config: RandomUpperCaseConfig) -> str:
        """
        Upper-case `int(len(text) * config.corrupt_proportion)` distinct positions of `text`.

        :param text: The input text.
        :param config: Supplies `corrupt_proportion`.
        :returns: The text with the sampled positions upper-cased.
        """
        positions = np.random.choice(
            range(len(text)),
            int(len(text) * config.corrupt_proportion),
            False,  # sample without replacement
        )
        # A set gives constant-time membership tests instead of scanning the array per character.
        chosen = set(positions.tolist())
        return "".join(letter.upper() if index in chosen else letter for index, letter in enumerate(text))


class WhitespaceAddRemove(SemanticPerturbationUtil):
    """
    Add and remove whitespaces at random.
    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: A q uick bro wn fox ju mps overthe lazy dog 10 times.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/whitespace_perturbation/transformation.py
    """

    def perturb(
        self, text: str, config: WhitespaceAddRemoveConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate `num_perturbations` copies of `text` with whitespace randomly removed and inserted.

        :param text: The input text that needs to be perturbed.
        :param config: Supplies `remove_prob` and `add_prob`.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of perturbed texts.
        """
        perturbed_texts = []
        for _ in range(num_perturbations):
            perturbed_text: List[str] = []
            for char in text:
                # One draw per character decides both removal and insertion.
                random_num = random.random()
                perturbed_text += WhitespaceAddRemove.whitespace(char, random_num, config.remove_prob, config.add_prob)
            perturbed_texts.append("".join(perturbed_text))
        return perturbed_texts

    @staticmethod
    def whitespace(char: str, random_num: float, remove_prob: float, add_prob: float) -> List[str]:
        """
        Decide what a single character becomes.

        :param char: The character being processed.
        :param random_num: The random draw compared against the two probabilities.
        :param remove_prob: Threshold below which a whitespace character is dropped.
        :param add_prob: Threshold below which a space is appended after a non-whitespace character.
        :returns: `[]`, `[char]` or `[char, " "]`.
        """
        if char.isspace() and random_num < remove_prob:
            return []
        perturbed_char = [char]
        if (not char.isspace()) and random_num < add_prob:
            perturbed_char.append(" ")

        return perturbed_char
@dataclass(frozen=True)
class ButterFingerConfig:
    """
    Immutable settings for the Butter Finger perturbation.
    The default mirrors the value used by NL-Augmenter.

    :param perturbation_prob: The probability that a given character will be perturbed.
    """

    perturbation_prob: float = 0.1
Config for the Butter Finger perturbation. Defaults set to match those in NL-Augmenter.
Parameters
- perturbation_prob: The probability that a given character will be perturbed.
@dataclass(frozen=True)
class RandomUpperCaseConfig:
    """
    Immutable settings for the RandomUpperCase perturbation.
    The default mirrors the value used by NL-Augmenter.

    :param corrupt_proportion: Fraction of characters to be changed to uppercase.
    """

    corrupt_proportion: float = 0.1
Config for the RandomUpperCase perturbation. Defaults set to match those in NL-Augmenter.
Parameters
- corrupt_proportion: Fraction of characters to be changed to uppercase.
@dataclass(frozen=True)
class WhitespaceAddRemoveConfig:
    """
    Immutable settings for the WhitespaceAddRemove perturbation.
    The defaults mirror the values used by NL-Augmenter.

    :param remove_prob: Given a whitespace, remove it with this much probability.
    :param add_prob: Given a non-whitespace, add a whitespace after it with this probability.
    """

    remove_prob: float = 0.1
    add_prob: float = 0.05
Config for WhitespaceAddRemove perturbation. Defaults set to match those in NL-Augmenter.
Parameters
- remove_prob: Given a whitespace, remove it with this much probability.
- add_prob: Given a non-whitespace character, add a whitespace after it with this probability.
class SemanticPerturbationUtil(ABC):
    """
    Common interface implemented by every perturbation.
    """

    def __init__(self, seed: int = 5):
        # Seeding happens once, at construction time.
        self.set_seed(seed)

    @abstractmethod
    def perturb(
        self,
        text: str,
        config: Union[ButterFingerConfig, RandomUpperCaseConfig, WhitespaceAddRemoveConfig],
        num_perturbations: int = 5,
    ) -> List[str]:
        """
        Produce one or more perturbed versions of the input text. Some perturbations can
        only produce a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).

        :param text: The input text that needs to be perturbed.
        :param config: The configuration containing parameters for the perturbation.
        :param num_perturbations: Number of perturbed versions to generate. Some perturbations can
            only generate a single perturbed version and will ignore this parameter.
        :returns: A list of perturbed texts.
        """

    @staticmethod
    def set_seed(seed: int):
        # Seeds the module-level generators of both `random` and `numpy.random`.
        random.seed(seed)
        np.random.seed(seed)
The interface that each perturbation should implement.
@abstractmethod
def perturb(
    self,
    text: str,
    config: Union[ButterFingerConfig, RandomUpperCaseConfig, WhitespaceAddRemoveConfig],
    num_perturbations: int = 5,
) -> List[str]:
    """
    Produce one or more perturbed versions of the input text. Some perturbations can
    only produce a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).

    :param text: The input text that needs to be perturbed.
    :param config: The configuration containing parameters for the perturbation.
    :param num_perturbations: Number of perturbed versions to generate. Some perturbations can
        only generate a single perturbed version and will ignore this parameter.
    :returns: A list of perturbed texts.
    """
Given an input text, generates one or more perturbed versions of it. Some perturbations can only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).
Parameters
- text: The input text that needs to be perturbed.
- config: The configuration containing parameters for the perturbation.
- num_perturbations: Number of perturbed versions to generate. Some perturbations can only generate a single perturbed version and will ignore this parameter.
Returns: A list of perturbed texts.
class ButterFinger(SemanticPerturbationUtil):
    """
    Given a text, add keyboard induced typos in randomly selected words.
    Keyboard induced typos are ones where a character is replaced by adjacent characters on the keyboard.

    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: W quick brmwn fox jumps over the lazy dig 10 times.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/butter_fingers_perturbation/transformation.py
    """

    # Setting default values from NL-Augmenter.
    # Each value starts with the key itself, so a "typo" may leave the character unchanged.
    QUERTY_KEY_APPROX: Dict[str, str] = {
        "q": "qwasedzx",
        "w": "wqesadrfcx",
        "e": "ewrsfdqazxcvgt",
        "r": "retdgfwsxcvgt",
        "t": "tryfhgedcvbnju",
        "y": "ytugjhrfvbnji",
        "u": "uyihkjtgbnmlo",
        "i": "iuojlkyhnmlp",
        "o": "oipklujm",
        "p": "plo['ik",
        "a": "aqszwxwdce",
        "s": "swxadrfv",
        "d": "decsfaqgbv",
        "f": "fdgrvwsxyhn",
        "g": "gtbfhedcyjn",
        "h": "hyngjfrvkim",
        "j": "jhknugtblom",
        "k": "kjlinyhn",
        "l": "lokmpujn",
        "z": "zaxsvde",
        "x": "xzcsdbvfrewq",
        "c": "cxvdfzswergb",
        "v": "vcfbgxdertyn",
        "b": "bvnghcftyun",
        "n": "nbmhjvgtuik",
        "m": "mnkjloik",
        " ": " ",
    }

    def perturb(
        self, text: str, config: ButterFingerConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate `num_perturbations` copies of `text` with keyboard-neighbour typos.

        :param text: The input text that needs to be perturbed.
        :param config: Supplies `perturbation_prob`, applied at whole-percent granularity.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of perturbed texts, each the same length as `text`.
        """
        # round() instead of int(): float error would otherwise truncate e.g. 0.29 * 100 to 28.
        typo_threshold = round(config.perturbation_prob * 100)
        perturbed_texts = []
        for _ in range(num_perturbations):
            butter_text = []
            for letter in text:
                lcletter = letter.lower()
                neighbours = self.QUERTY_KEY_APPROX.get(lcletter)
                # Strict "<" so that a probability of 0 never perturbs and p maps to exactly p%.
                if neighbours is None or random.choice(range(0, 100)) >= typo_threshold:
                    # Keep the original character verbatim; a lower()/upper() round trip is
                    # not the identity for every Unicode character.
                    butter_text.append(letter)
                    continue
                new_letter = random.choice(neighbours)
                # go back to original case
                if lcletter != letter:
                    new_letter = new_letter.upper()
                butter_text.append(new_letter)
            perturbed_texts.append("".join(butter_text))
        return perturbed_texts
Given a text, add keyboard induced typos in randomly selected words. Keyboard induced typos are ones where a character is replaced by adjacent characters on the keyboard.
Example: Original: A quick brown fox jumps over the lazy dog 10 times. Perturbed: W quick brmwn fox jumps over the lazy dig 10 times.
def perturb(
    self, text: str, config: ButterFingerConfig, num_perturbations: int = 5  # type: ignore[override]
) -> List[str]:
    """
    Generate `num_perturbations` copies of `text` with keyboard-neighbour typos.

    :param text: The input text that needs to be perturbed.
    :param config: Supplies `perturbation_prob`, applied at whole-percent granularity.
    :param num_perturbations: Number of perturbed versions to generate.
    :returns: A list of perturbed texts, each the same length as `text`.
    """
    # round() instead of int(): float error would otherwise truncate e.g. 0.29 * 100 to 28.
    typo_threshold = round(config.perturbation_prob * 100)
    perturbed_texts = []
    for _ in range(num_perturbations):
        butter_text = []
        for letter in text:
            lcletter = letter.lower()
            neighbours = self.QUERTY_KEY_APPROX.get(lcletter)
            # Strict "<" so that a probability of 0 never perturbs and p maps to exactly p%.
            if neighbours is None or random.choice(range(0, 100)) >= typo_threshold:
                # Keep the original character verbatim; a lower()/upper() round trip is
                # not the identity for every Unicode character.
                butter_text.append(letter)
                continue
            new_letter = random.choice(neighbours)
            # go back to original case
            if lcletter != letter:
                new_letter = new_letter.upper()
            butter_text.append(new_letter)
        perturbed_texts.append("".join(butter_text))
    return perturbed_texts
Given an input text, generates one or more perturbed versions of it. Some perturbations can only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).
Parameters
- text: The input text that needs to be perturbed.
- config: The configuration containing parameters for the perturbation.
- num_perturbations: Number of perturbed versions to generate. Some perturbations can only generate a single perturbed version and will ignore this parameter.
Returns: A list of perturbed texts.
Inherited Members
class RandomUpperCase(SemanticPerturbationUtil):
    """
    Convert random characters in the text to uppercase.
    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: A qUick brOwn fox jumps over the lazY dog 10 timEs.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/random_upper_transformation/transformation.py#L1
    """

    def perturb(
        self, text: str, config: RandomUpperCaseConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate `num_perturbations` independently upper-cased copies of `text`.

        :param text: The input text that needs to be perturbed.
        :param config: Supplies `corrupt_proportion`.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of perturbed texts.
        """
        return [self.random_upper(text, config=config) for _ in range(num_perturbations)]

    @staticmethod
    def random_upper(text: str, config: RandomUpperCaseConfig) -> str:
        """
        Upper-case `int(len(text) * config.corrupt_proportion)` distinct positions of `text`.

        :param text: The input text.
        :param config: Supplies `corrupt_proportion`.
        :returns: The text with the sampled positions upper-cased.
        """
        positions = np.random.choice(
            range(len(text)),
            int(len(text) * config.corrupt_proportion),
            False,  # sample without replacement
        )
        # A set gives constant-time membership tests instead of scanning the array per character.
        chosen = set(positions.tolist())
        return "".join(letter.upper() if index in chosen else letter for index, letter in enumerate(text))
Convert random characters in the text to uppercase. Example: Original: A quick brown fox jumps over the lazy dog 10 times. Perturbed: A qUick brOwn fox jumps over the lazY dog 10 timEs.
def perturb(
    self, text: str, config: RandomUpperCaseConfig, num_perturbations: int = 5  # type: ignore[override]
) -> List[str]:
    """
    Generate `num_perturbations` independently upper-cased copies of `text`.

    :param text: The input text that needs to be perturbed.
    :param config: Supplies `corrupt_proportion`.
    :param num_perturbations: Number of perturbed versions to generate.
    :returns: A list of perturbed texts.
    """
    variants = []
    for _ in range(num_perturbations):
        # Each call draws a fresh set of positions from the shared numpy generator.
        variants.append(self.random_upper(text, config=config))
    return variants
Given an input text, generates one or more perturbed versions of it. Some perturbations can only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).
Parameters
- text: The input text that needs to be perturbed.
- config: The configuration containing parameters for the perturbation.
- num_perturbations: Number of perturbed versions to generate. Some perturbations can only generate a single perturbed version and will ignore this parameter.
Returns: A list of perturbed texts.
@staticmethod
def random_upper(text: str, config: RandomUpperCaseConfig) -> str:
    """
    Upper-case `int(len(text) * config.corrupt_proportion)` distinct positions of `text`.

    :param text: The input text.
    :param config: Supplies `corrupt_proportion`.
    :returns: The text with the sampled positions upper-cased.
    """
    positions = np.random.choice(
        range(len(text)),
        int(len(text) * config.corrupt_proportion),
        False,  # sample without replacement
    )
    # A set gives constant-time membership tests instead of scanning the array per character.
    chosen = set(positions.tolist())
    return "".join(letter.upper() if index in chosen else letter for index, letter in enumerate(text))
Inherited Members
class WhitespaceAddRemove(SemanticPerturbationUtil):
    """
    Randomly drop existing whitespace and insert new spaces.
    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: A q uick bro wn fox ju mps overthe lazy dog 10 times.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/whitespace_perturbation/transformation.py
    """

    def perturb(
        self, text: str, config: WhitespaceAddRemoveConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate `num_perturbations` copies of `text` with whitespace randomly removed and inserted.

        :param text: The input text that needs to be perturbed.
        :param config: Supplies `remove_prob` and `add_prob`.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of perturbed texts.
        """
        results = []
        for _ in range(num_perturbations):
            pieces: List[str] = []
            for char in text:
                # One draw per character decides both removal and insertion.
                draw = random.random()
                pieces.extend(WhitespaceAddRemove.whitespace(char, draw, config.remove_prob, config.add_prob))
            results.append("".join(pieces))
        return results

    @staticmethod
    def whitespace(char, random_num, remove_prob, add_prob):
        """
        Decide what a single character becomes.

        :param char: The character being processed.
        :param random_num: The random draw compared against the two probabilities.
        :param remove_prob: Threshold below which a whitespace character is dropped.
        :param add_prob: Threshold below which a space is appended after a non-whitespace character.
        :returns: `[]`, `[char]` or `[char, " "]`.
        """
        if char.isspace():
            return [] if random_num < remove_prob else [char]
        return [char, " "] if random_num < add_prob else [char]
Add and remove whitespaces at random. Example: Original: A quick brown fox jumps over the lazy dog 10 times. Perturbed: A q uick bro wn fox ju mps overthe lazy dog 10 times.
def perturb(
    self, text: str, config: WhitespaceAddRemoveConfig, num_perturbations: int = 5  # type: ignore[override]
) -> List[str]:
    """
    Generate `num_perturbations` copies of `text` with whitespace randomly removed and inserted.

    :param text: The input text that needs to be perturbed.
    :param config: Supplies `remove_prob` and `add_prob`.
    :param num_perturbations: Number of perturbed versions to generate.
    :returns: A list of perturbed texts.
    """
    results = []
    for _ in range(num_perturbations):
        pieces: List[str] = []
        for char in text:
            # One draw per character decides both removal and insertion.
            draw = random.random()
            pieces.extend(WhitespaceAddRemove.whitespace(char, draw, config.remove_prob, config.add_prob))
        results.append("".join(pieces))
    return results
Given an input text, generates one or more perturbed versions of it. Some perturbations can only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).
Parameters
- text: The input text that needs to be perturbed.
- config: The configuration containing parameters for the perturbation.
- num_perturbations: Number of perturbed versions to generate. Some perturbations can only generate a single perturbed version and will ignore this parameter.
Returns: A list of perturbed texts.