fmeval.eval_algorithms.semantic_perturbation_utils

This module contains several semantic perturbations from the NL-Augmenter package. The goals of having this module are twofold:

  1. NL-Augmenter has old dependencies, which make it difficult to install in our env. We do not need all of NL-Augmenter anyway, so we copy the perturbations we need over here.
  2. We might add more perturbations from other packages like nlaug, or even have our own custom ones, in the future, so we want to have a uniform API for these perturbations.
  1"""
  2This module contains several semantic perturbations from the NL-Augmenter package. The
  3goals of having this module are twofold:
  4
  51. NL-Augmenter has old dependencies, which make it difficult to install in our env. We do not
  6need all of NL-Augmenter anyway, so we copy the perturbations we need over here.
  72. We might add more perturbations from other packages like `nlaug`, or even have our own custom
  8ones, in the future, so we want to have a uniform API for these perturbations.
  9"""
 10from abc import ABC, abstractmethod
 11import random
 12import itertools
 13from typing import Dict, List, Union
 14from dataclasses import dataclass
 15import functools
 16
 17import numpy as np
 18
 19
 20@dataclass(frozen=True)
 21class ButterFingerConfig:
 22    """
 23    Config for the Butter Finger perturbation.
 24    Defaults set to match those in NL-Augmenter.
 25
 26    :param perturbation_prob: The probability that a given character will be perturbed.
 27    """
 28
 29    perturbation_prob: float = 0.1
 30
 31
 32@dataclass(frozen=True)
 33class RandomUpperCaseConfig:
 34    """
 35    Config for the RandomUpperCase perturbation.
 36    Defaults set to match those in NL-Augmenter.
 37
 38    :param corrupt_proportion: Fraction of characters to be changed to uppercase.
 39    """
 40
 41    corrupt_proportion: float = 0.1
 42
 43
 44@dataclass(frozen=True)
 45class WhitespaceAddRemoveConfig:
 46    """
 47    Config for WhitespaceAddRemove perturbation.
 48    Defaults set to match those in NL-Augmenter.
 49
 50    :param remove_prob: Given a whitespace, remove it with this much probability.
 51    :param add_prob: Given a non-whitespace, add a whitespace before it with this probability.
 52    """
 53
 54    remove_prob: float = 0.1
 55    add_prob: float = 0.05
 56
 57
 58class SemanticPerturbationUtil(ABC):
 59    """
 60    The interface that each perturbation should implement.
 61    """
 62
 63    def __init__(self, seed: int = 5):
 64        self.set_seed(seed)
 65
 66    @abstractmethod
 67    def perturb(
 68        self,
 69        text: str,
 70        config: Union[ButterFingerConfig, RandomUpperCaseConfig, WhitespaceAddRemoveConfig],
 71        num_perturbations: int = 5,
 72    ) -> List[str]:
 73        """
 74        Given an input text, generates one or more perturbed versions of it. Some perturbations can
 75        only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).
 76
 77        :param text: The input text that needs to be perturbed.
 78        :param config: The configuration containing parameters for the perturbation.
 79        :param num_perturbations: Number of perturbed versions to generate. Some perturbations can
 80        only generate a single perturbed versions and will ignore this parameter.
 81        :returns: A list of perturbed texts.
 82        """
 83
 84    @staticmethod
 85    def set_seed(seed: int):
 86        random.seed(seed)
 87        np.random.seed(seed)
 88
 89
 90class ButterFinger(SemanticPerturbationUtil):
 91    """
 92    Given a text, add keyboard induced typos in randomly selected words.
 93    Keyboard induced typos are ones where a character is replaced by adjacent characters on the keyboard.
 94
 95    Example:
 96        Original: A quick brown fox jumps over the lazy dog 10 times.
 97        Perturbed: W quick brmwn fox jumps over the lazy dig 10 times.
 98
 99    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/butter_fingers_perturbation/transformation.py
100    """
101
102    # Setting default values from NL-Augmenter
103    QUERTY_KEY_APPROX: Dict[str, str] = dict()
104    QUERTY_KEY_APPROX["q"] = "qwasedzx"
105    QUERTY_KEY_APPROX["w"] = "wqesadrfcx"
106    QUERTY_KEY_APPROX["e"] = "ewrsfdqazxcvgt"
107    QUERTY_KEY_APPROX["r"] = "retdgfwsxcvgt"
108    QUERTY_KEY_APPROX["t"] = "tryfhgedcvbnju"
109    QUERTY_KEY_APPROX["y"] = "ytugjhrfvbnji"
110    QUERTY_KEY_APPROX["u"] = "uyihkjtgbnmlo"
111    QUERTY_KEY_APPROX["i"] = "iuojlkyhnmlp"
112    QUERTY_KEY_APPROX["o"] = "oipklujm"
113    QUERTY_KEY_APPROX["p"] = "plo['ik"
114
115    QUERTY_KEY_APPROX["a"] = "aqszwxwdce"
116    QUERTY_KEY_APPROX["s"] = "swxadrfv"
117    QUERTY_KEY_APPROX["d"] = "decsfaqgbv"
118    QUERTY_KEY_APPROX["f"] = "fdgrvwsxyhn"
119    QUERTY_KEY_APPROX["g"] = "gtbfhedcyjn"
120    QUERTY_KEY_APPROX["h"] = "hyngjfrvkim"
121    QUERTY_KEY_APPROX["j"] = "jhknugtblom"
122    QUERTY_KEY_APPROX["k"] = "kjlinyhn"
123    QUERTY_KEY_APPROX["l"] = "lokmpujn"
124
125    QUERTY_KEY_APPROX["z"] = "zaxsvde"
126    QUERTY_KEY_APPROX["x"] = "xzcsdbvfrewq"
127    QUERTY_KEY_APPROX["c"] = "cxvdfzswergb"
128    QUERTY_KEY_APPROX["v"] = "vcfbgxdertyn"
129    QUERTY_KEY_APPROX["b"] = "bvnghcftyun"
130    QUERTY_KEY_APPROX["n"] = "nbmhjvgtuik"
131    QUERTY_KEY_APPROX["m"] = "mnkjloik"
132    QUERTY_KEY_APPROX[" "] = " "
133
134    def perturb(
135        self, text: str, config: ButterFingerConfig, num_perturbations: int = 5  # type: ignore[override]
136    ) -> List[str]:
137        prob_of_typo = int(config.perturbation_prob * 100)
138        perturbed_texts = []
139        for _ in itertools.repeat(None, num_perturbations):
140            butter_text = []
141            for letter in text:
142                lcletter = letter.lower()
143                if lcletter not in self.QUERTY_KEY_APPROX.keys():
144                    new_letter = lcletter
145                else:
146                    if random.choice(range(0, 100)) <= prob_of_typo:
147                        new_letter = random.choice(self.QUERTY_KEY_APPROX[lcletter])
148                    else:
149                        new_letter = lcletter
150                # go back to original case
151                if not lcletter == letter:
152                    new_letter = new_letter.upper()
153                butter_text.append(new_letter)
154            perturbed_texts.append("".join(butter_text))
155        return perturbed_texts
156
157
class RandomUpperCase(SemanticPerturbationUtil):
    """
    Convert random characters in the text to uppercase.
    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: A qUick brOwn fox jumps over the lazY dog 10 timEs.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/random_upper_transformation/transformation.py#L1
    """

    def perturb(
        self, text: str, config: RandomUpperCaseConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate ``num_perturbations`` randomly upper-cased copies of ``text``.

        :param text: The input text that needs to be perturbed.
        :param config: The configuration holding ``corrupt_proportion``.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of ``num_perturbations`` perturbed texts.
        """
        return [self.random_upper(text, config=config) for _ in range(num_perturbations)]

    @staticmethod
    def random_upper(text: str, config: RandomUpperCaseConfig):
        """
        Upper-case ``int(len(text) * config.corrupt_proportion)`` distinct, randomly
        chosen positions of ``text``.

        :param text: The input text.
        :param config: The configuration holding ``corrupt_proportion``.
        :returns: The text with the sampled positions converted to uppercase.
        """
        if not text:
            # Nothing to sample from; avoid asking NumPy to draw from an empty population.
            return text
        sampled = np.random.choice(
            range(len(text)),
            int(len(text) * config.corrupt_proportion),
            False,
        )
        # A set gives constant-time membership tests instead of scanning the array per character.
        positions = set(sampled.tolist())
        return "".join(letter.upper() if index in positions else letter for index, letter in enumerate(text))
183
184
class WhitespaceAddRemove(SemanticPerturbationUtil):
    """
    Randomly insert and delete whitespace characters.
    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: A q uick bro wn fox ju mps overthe lazy dog 10 times.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/whitespace_perturbation/transformation.py
    """

    def perturb(
        self, text: str, config: WhitespaceAddRemoveConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate ``num_perturbations`` copies of ``text`` with whitespace added and removed.

        One ``random.random()`` draw is made per input character and handed to ``whitespace``.

        :param text: The input text that needs to be perturbed.
        :param config: The configuration holding ``remove_prob`` and ``add_prob``.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of ``num_perturbations`` perturbed texts.
        """
        results = []
        for _ in range(num_perturbations):
            pieces = itertools.chain.from_iterable(
                WhitespaceAddRemove.whitespace(char, random.random(), config.remove_prob, config.add_prob)
                for char in text
            )
            results.append("".join(pieces))
        return results

    @staticmethod
    def whitespace(char, random_num, remove_prob, add_prob):
        """
        Decide what a single character becomes in the perturbed text.

        :param char: The character being processed.
        :param random_num: The random draw compared against the probabilities.
        :param remove_prob: Threshold below which a whitespace character is dropped.
        :param add_prob: Threshold below which a space is appended after a non-whitespace character.
        :returns: ``[]`` when the character is removed, ``[char, " "]`` when a space is
            added after it, otherwise ``[char]``.
        """
        if char.isspace():
            return [] if random_num < remove_prob else [char]
        if random_num < add_prob:
            return [char, " "]
        return [char]
@dataclass(frozen=True)
class ButterFingerConfig:
@dataclass(frozen=True)
class ButterFingerConfig:
    """
    Immutable settings for the ButterFinger (keyboard typo) perturbation.
    The default value mirrors the one used by NL-Augmenter.

    :param perturbation_prob: The probability that a given character will be perturbed.
    """

    perturbation_prob: float = 0.1

Config for the Butter Finger perturbation. Defaults set to match those in NL-Augmenter.

Parameters
  • perturbation_prob: The probability that a given character will be perturbed.
ButterFingerConfig(perturbation_prob: float = 0.1)
perturbation_prob: float = 0.1
@dataclass(frozen=True)
class RandomUpperCaseConfig:
@dataclass(frozen=True)
class RandomUpperCaseConfig:
    """
    Immutable settings for the RandomUpperCase perturbation.
    The default value mirrors the one used by NL-Augmenter.

    :param corrupt_proportion: Fraction of characters to be changed to uppercase.
    """

    corrupt_proportion: float = 0.1

Config for the RandomUpperCase perturbation. Defaults set to match those in NL-Augmenter.

Parameters
  • corrupt_proportion: Fraction of characters to be changed to uppercase.
RandomUpperCaseConfig(corrupt_proportion: float = 0.1)
corrupt_proportion: float = 0.1
@dataclass(frozen=True)
class WhitespaceAddRemoveConfig:
@dataclass(frozen=True)
class WhitespaceAddRemoveConfig:
    """
    Immutable settings for the WhitespaceAddRemove perturbation.
    The default values mirror the ones used by NL-Augmenter.

    :param remove_prob: Given a whitespace character, remove it with this probability.
    :param add_prob: Given a non-whitespace character, add a space after it with this
        probability.
    """

    remove_prob: float = 0.1
    add_prob: float = 0.05

Config for WhitespaceAddRemove perturbation. Defaults set to match those in NL-Augmenter.

Parameters
  • remove_prob: Given a whitespace, remove it with this much probability.
  • add_prob: Given a non-whitespace character, add a whitespace after it with this probability.
WhitespaceAddRemoveConfig(remove_prob: float = 0.1, add_prob: float = 0.05)
remove_prob: float = 0.1
add_prob: float = 0.05
class SemanticPerturbationUtil(abc.ABC):
class SemanticPerturbationUtil(ABC):
    """
    Abstract base class defining the API that every perturbation implements.

    Creating an instance seeds the global ``random`` and ``numpy.random``
    generators through ``set_seed``.
    """

    def __init__(self, seed: int = 5):
        """
        :param seed: Value used to seed the random number generators.
        """
        self.set_seed(seed)

    @staticmethod
    def set_seed(seed: int):
        """
        Seed the global ``random`` and ``numpy.random`` generators.

        :param seed: Seed passed to ``random.seed`` and ``np.random.seed``.
        """
        np.random.seed(seed)
        random.seed(seed)

    @abstractmethod
    def perturb(
        self,
        text: str,
        config: Union[ButterFingerConfig, RandomUpperCaseConfig, WhitespaceAddRemoveConfig],
        num_perturbations: int = 5,
    ) -> List[str]:
        """
        Produce one or more perturbed variants of the input text. Some perturbations can
        only produce a single variant, e.g., converting all numbers to numerics (eight -> 8).

        :param text: The input text that needs to be perturbed.
        :param config: The configuration containing parameters for the perturbation.
        :param num_perturbations: Number of perturbed versions to generate. Some perturbations
            can only generate a single perturbed version and will ignore this parameter.
        :returns: A list of perturbed texts.
        """

The interface that each perturbation should implement.

@abstractmethod
def perturb( self, text: str, config: Union[ButterFingerConfig, RandomUpperCaseConfig, WhitespaceAddRemoveConfig], num_perturbations: int = 5) -> List[str]:
67    @abstractmethod
68    def perturb(
69        self,
70        text: str,
71        config: Union[ButterFingerConfig, RandomUpperCaseConfig, WhitespaceAddRemoveConfig],
72        num_perturbations: int = 5,
73    ) -> List[str]:
74        """
75        Given an input text, generates one or more perturbed versions of it. Some perturbations can
76        only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).
77
78        :param text: The input text that needs to be perturbed.
79        :param config: The configuration containing parameters for the perturbation.
80        :param num_perturbations: Number of perturbed versions to generate. Some perturbations can
81        only generate a single perturbed versions and will ignore this parameter.
82        :returns: A list of perturbed texts.
83        """

Given an input text, generates one or more perturbed versions of it. Some perturbations can only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).

Parameters
  • text: The input text that needs to be perturbed.
  • config: The configuration containing parameters for the perturbation.
  • num_perturbations: Number of perturbed versions to generate. Some perturbations can only generate a single perturbed version and will ignore this parameter.

Returns
  • A list of perturbed texts.
@staticmethod
def set_seed(seed: int):
85    @staticmethod
86    def set_seed(seed: int):
87        random.seed(seed)
88        np.random.seed(seed)
class ButterFinger(SemanticPerturbationUtil):
 91class ButterFinger(SemanticPerturbationUtil):
 92    """
 93    Given a text, add keyboard induced typos in randomly selected words.
 94    Keyboard induced typos are ones where a character is replaced by adjacent characters on the keyboard.
 95
 96    Example:
 97        Original: A quick brown fox jumps over the lazy dog 10 times.
 98        Perturbed: W quick brmwn fox jumps over the lazy dig 10 times.
 99
100    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/butter_fingers_perturbation/transformation.py
101    """
102
103    # Setting default values from NL-Augmenter
104    QUERTY_KEY_APPROX: Dict[str, str] = dict()
105    QUERTY_KEY_APPROX["q"] = "qwasedzx"
106    QUERTY_KEY_APPROX["w"] = "wqesadrfcx"
107    QUERTY_KEY_APPROX["e"] = "ewrsfdqazxcvgt"
108    QUERTY_KEY_APPROX["r"] = "retdgfwsxcvgt"
109    QUERTY_KEY_APPROX["t"] = "tryfhgedcvbnju"
110    QUERTY_KEY_APPROX["y"] = "ytugjhrfvbnji"
111    QUERTY_KEY_APPROX["u"] = "uyihkjtgbnmlo"
112    QUERTY_KEY_APPROX["i"] = "iuojlkyhnmlp"
113    QUERTY_KEY_APPROX["o"] = "oipklujm"
114    QUERTY_KEY_APPROX["p"] = "plo['ik"
115
116    QUERTY_KEY_APPROX["a"] = "aqszwxwdce"
117    QUERTY_KEY_APPROX["s"] = "swxadrfv"
118    QUERTY_KEY_APPROX["d"] = "decsfaqgbv"
119    QUERTY_KEY_APPROX["f"] = "fdgrvwsxyhn"
120    QUERTY_KEY_APPROX["g"] = "gtbfhedcyjn"
121    QUERTY_KEY_APPROX["h"] = "hyngjfrvkim"
122    QUERTY_KEY_APPROX["j"] = "jhknugtblom"
123    QUERTY_KEY_APPROX["k"] = "kjlinyhn"
124    QUERTY_KEY_APPROX["l"] = "lokmpujn"
125
126    QUERTY_KEY_APPROX["z"] = "zaxsvde"
127    QUERTY_KEY_APPROX["x"] = "xzcsdbvfrewq"
128    QUERTY_KEY_APPROX["c"] = "cxvdfzswergb"
129    QUERTY_KEY_APPROX["v"] = "vcfbgxdertyn"
130    QUERTY_KEY_APPROX["b"] = "bvnghcftyun"
131    QUERTY_KEY_APPROX["n"] = "nbmhjvgtuik"
132    QUERTY_KEY_APPROX["m"] = "mnkjloik"
133    QUERTY_KEY_APPROX[" "] = " "
134
135    def perturb(
136        self, text: str, config: ButterFingerConfig, num_perturbations: int = 5  # type: ignore[override]
137    ) -> List[str]:
138        prob_of_typo = int(config.perturbation_prob * 100)
139        perturbed_texts = []
140        for _ in itertools.repeat(None, num_perturbations):
141            butter_text = []
142            for letter in text:
143                lcletter = letter.lower()
144                if lcletter not in self.QUERTY_KEY_APPROX.keys():
145                    new_letter = lcletter
146                else:
147                    if random.choice(range(0, 100)) <= prob_of_typo:
148                        new_letter = random.choice(self.QUERTY_KEY_APPROX[lcletter])
149                    else:
150                        new_letter = lcletter
151                # go back to original case
152                if not lcletter == letter:
153                    new_letter = new_letter.upper()
154                butter_text.append(new_letter)
155            perturbed_texts.append("".join(butter_text))
156        return perturbed_texts

Given a text, add keyboard induced typos in randomly selected words. Keyboard induced typos are ones where a character is replaced by adjacent characters on the keyboard.

Example: Original: A quick brown fox jumps over the lazy dog 10 times. Perturbed: W quick brmwn fox jumps over the lazy dig 10 times.

Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/butter_fingers_perturbation/transformation.py

QUERTY_KEY_APPROX: Dict[str, str] = {'q': 'qwasedzx', 'w': 'wqesadrfcx', 'e': 'ewrsfdqazxcvgt', 'r': 'retdgfwsxcvgt', 't': 'tryfhgedcvbnju', 'y': 'ytugjhrfvbnji', 'u': 'uyihkjtgbnmlo', 'i': 'iuojlkyhnmlp', 'o': 'oipklujm', 'p': "plo['ik", 'a': 'aqszwxwdce', 's': 'swxadrfv', 'd': 'decsfaqgbv', 'f': 'fdgrvwsxyhn', 'g': 'gtbfhedcyjn', 'h': 'hyngjfrvkim', 'j': 'jhknugtblom', 'k': 'kjlinyhn', 'l': 'lokmpujn', 'z': 'zaxsvde', 'x': 'xzcsdbvfrewq', 'c': 'cxvdfzswergb', 'v': 'vcfbgxdertyn', 'b': 'bvnghcftyun', 'n': 'nbmhjvgtuik', 'm': 'mnkjloik', ' ': ' '}
def perturb( self, text: str, config: ButterFingerConfig, num_perturbations: int = 5) -> List[str]:
135    def perturb(
136        self, text: str, config: ButterFingerConfig, num_perturbations: int = 5  # type: ignore[override]
137    ) -> List[str]:
138        prob_of_typo = int(config.perturbation_prob * 100)
139        perturbed_texts = []
140        for _ in itertools.repeat(None, num_perturbations):
141            butter_text = []
142            for letter in text:
143                lcletter = letter.lower()
144                if lcletter not in self.QUERTY_KEY_APPROX.keys():
145                    new_letter = lcletter
146                else:
147                    if random.choice(range(0, 100)) <= prob_of_typo:
148                        new_letter = random.choice(self.QUERTY_KEY_APPROX[lcletter])
149                    else:
150                        new_letter = lcletter
151                # go back to original case
152                if not lcletter == letter:
153                    new_letter = new_letter.upper()
154                butter_text.append(new_letter)
155            perturbed_texts.append("".join(butter_text))
156        return perturbed_texts

Given an input text, generates one or more perturbed versions of it. Some perturbations can only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).

Parameters
  • text: The input text that needs to be perturbed.
  • config: The configuration containing parameters for the perturbation.
  • num_perturbations: Number of perturbed versions to generate. Some perturbations can only generate a single perturbed version and will ignore this parameter.

Returns
  • A list of perturbed texts.
class RandomUpperCase(SemanticPerturbationUtil):
class RandomUpperCase(SemanticPerturbationUtil):
    """
    Convert random characters in the text to uppercase.
    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: A qUick brOwn fox jumps over the lazY dog 10 timEs.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/random_upper_transformation/transformation.py#L1
    """

    def perturb(
        self, text: str, config: RandomUpperCaseConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate ``num_perturbations`` randomly upper-cased copies of ``text``.

        :param text: The input text that needs to be perturbed.
        :param config: The configuration holding ``corrupt_proportion``.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of ``num_perturbations`` perturbed texts.
        """
        return [self.random_upper(text, config=config) for _ in range(num_perturbations)]

    @staticmethod
    def random_upper(text: str, config: RandomUpperCaseConfig):
        """
        Upper-case ``int(len(text) * config.corrupt_proportion)`` distinct, randomly
        chosen positions of ``text``.

        :param text: The input text.
        :param config: The configuration holding ``corrupt_proportion``.
        :returns: The text with the sampled positions converted to uppercase.
        """
        if not text:
            # Nothing to sample from; avoid asking NumPy to draw from an empty population.
            return text
        sampled = np.random.choice(
            range(len(text)),
            int(len(text) * config.corrupt_proportion),
            False,
        )
        # A set gives constant-time membership tests instead of scanning the array per character.
        positions = set(sampled.tolist())
        return "".join(letter.upper() if index in positions else letter for index, letter in enumerate(text))

Convert random characters in the text to uppercase. Example: Original: A quick brown fox jumps over the lazy dog 10 times. Perturbed: A qUick brOwn fox jumps over the lazY dog 10 timEs.

Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/random_upper_transformation/transformation.py#L1

def perturb( self, text: str, config: RandomUpperCaseConfig, num_perturbations: int = 5) -> List[str]:
169    def perturb(
170        self, text: str, config: RandomUpperCaseConfig, num_perturbations: int = 5  # type: ignore[override]
171    ) -> List[str]:
172        return list(map(functools.partial(self.random_upper, config=config), itertools.repeat(text, num_perturbations)))

Given an input text, generates one or more perturbed versions of it. Some perturbations can only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).

Parameters
  • text: The input text that needs to be perturbed.
  • config: The configuration containing parameters for the perturbation.
  • num_perturbations: Number of perturbed versions to generate. Some perturbations can only generate a single perturbed version and will ignore this parameter.

Returns
  • A list of perturbed texts.
@staticmethod
def random_upper( text: str, config: RandomUpperCaseConfig):
174    @staticmethod
175    def random_upper(text: str, config: RandomUpperCaseConfig):
176        positions = np.random.choice(
177            range(len(text)),
178            int(len(text) * config.corrupt_proportion),
179            False,
180        )
181
182        new_sentence = [letter if index not in positions else letter.upper() for index, letter in enumerate(text)]
183        return "".join(new_sentence)
class WhitespaceAddRemove(SemanticPerturbationUtil):
class WhitespaceAddRemove(SemanticPerturbationUtil):
    """
    Randomly insert and delete whitespace characters.
    Example:
        Original: A quick brown fox jumps over the lazy dog 10 times.
        Perturbed: A q uick bro wn fox ju mps overthe lazy dog 10 times.

    Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/whitespace_perturbation/transformation.py
    """

    def perturb(
        self, text: str, config: WhitespaceAddRemoveConfig, num_perturbations: int = 5  # type: ignore[override]
    ) -> List[str]:
        """
        Generate ``num_perturbations`` copies of ``text`` with whitespace added and removed.

        One ``random.random()`` draw is made per input character and handed to ``whitespace``.

        :param text: The input text that needs to be perturbed.
        :param config: The configuration holding ``remove_prob`` and ``add_prob``.
        :param num_perturbations: Number of perturbed versions to generate.
        :returns: A list of ``num_perturbations`` perturbed texts.
        """
        results = []
        for _ in range(num_perturbations):
            pieces = itertools.chain.from_iterable(
                WhitespaceAddRemove.whitespace(char, random.random(), config.remove_prob, config.add_prob)
                for char in text
            )
            results.append("".join(pieces))
        return results

    @staticmethod
    def whitespace(char, random_num, remove_prob, add_prob):
        """
        Decide what a single character becomes in the perturbed text.

        :param char: The character being processed.
        :param random_num: The random draw compared against the probabilities.
        :param remove_prob: Threshold below which a whitespace character is dropped.
        :param add_prob: Threshold below which a space is appended after a non-whitespace character.
        :returns: ``[]`` when the character is removed, ``[char, " "]`` when a space is
            added after it, otherwise ``[char]``.
        """
        if char.isspace():
            return [] if random_num < remove_prob else [char]
        if random_num < add_prob:
            return [char, " "]
        return [char]

Add and remove whitespaces at random. Example: Original: A quick brown fox jumps over the lazy dog 10 times. Perturbed: A q uick bro wn fox ju mps overthe lazy dog 10 times.

Adopted from: https://github.com/GEM-benchmark/NL-Augmenter/blob/c591130760b453b3ad09516849dfc26e721eeb24/nlaugmenter/transformations/whitespace_perturbation/transformation.py

def perturb( self, text: str, config: WhitespaceAddRemoveConfig, num_perturbations: int = 5) -> List[str]:
196    def perturb(
197        self, text: str, config: WhitespaceAddRemoveConfig, num_perturbations: int = 5  # type: ignore[override]
198    ) -> List[str]:
199        perturbed_texts = []
200        for _ in range(num_perturbations):
201            perturbed_text = []
202            for char in text:
203                random_num = random.random()
204                perturbed_text += WhitespaceAddRemove.whitespace(char, random_num, config.remove_prob, config.add_prob)
205            perturbed_texts.append("".join(perturbed_text))
206        return perturbed_texts

Given an input text, generates one or more perturbed versions of it. Some perturbations can only generate a single perturbed version, e.g., converting all numbers to numerics (eight -> 8).

Parameters
  • text: The input text that needs to be perturbed.
  • config: The configuration containing parameters for the perturbation.
  • num_perturbations: Number of perturbed versions to generate. Some perturbations can only generate a single perturbed version and will ignore this parameter.

Returns
  • A list of perturbed texts.
@staticmethod
def whitespace(char, random_num, remove_prob, add_prob):
208    @staticmethod
209    def whitespace(char, random_num, remove_prob, add_prob):
210        if char.isspace() and random_num < remove_prob:
211            return []
212        perturbed_char = [char]
213        if (not char.isspace()) and random_num < add_prob:
214            perturbed_char.append(" ")
215
216        return perturbed_char