fmeval.reporting.constants
# Constants used to build fmeval evaluation reports: HTML/Markdown formatting
# values, display-name replacement tables, per-score descriptions, and
# metadata for the built-in evaluation datasets.
from enum import Enum
from typing import NamedTuple, Tuple, List

from fmeval.eval_algorithms.factual_knowledge import FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT
from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING, LOG_PROBABILITY_DIFFERENCE
from fmeval.eval_algorithms.qa_accuracy import (
    F1_SCORE,
    EXACT_MATCH_SCORE,
    QUASI_EXACT_MATCH_SCORE,
    PRECISION_OVER_WORDS,
    RECALL_OVER_WORDS,
)
from fmeval.eval_algorithms.summarization_accuracy import METEOR_SCORE, BERT_SCORE, ROUGE_SCORE
from fmeval.eval_algorithms.classification_accuracy import (
    CLASSIFICATION_ACCURACY_SCORE,
    BALANCED_ACCURACY_SCORE,
    PRECISION_SCORE,
    RECALL_SCORE,
)
from fmeval.eval_algorithms.classification_accuracy_semantic_robustness import (
    DELTA_CLASSIFICATION_ACCURACY_SCORE,
)
from fmeval.eval_algorithms.qa_accuracy_semantic_robustness import (
    DELTA_F1_SCORE,
    DELTA_EXACT_MATCH_SCORE,
    DELTA_QUASI_EXACT_MATCH_SCORE,
    DELTA_PRECISION_OVER_WORDS,
    DELTA_RECALL_OVER_WORDS,
)
from fmeval.eval_algorithms.summarization_accuracy_semantic_robustness import (
    DELTA_ROUGE_SCORE,
    DELTA_BERT_SCORE,
    DELTA_METEOR_SCORE,
)
from fmeval.eval_algorithms.general_semantic_robustness import WER_SCORE, BERT_SCORE_DISSIMILARITY
from fmeval.eval_algorithms import (
    TREX,
    BOOLQ,
    TRIVIA_QA,
    NATURAL_QUESTIONS,
    CROWS_PAIRS,
    WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
    BOLD,
    WIKITEXT2,
    REAL_TOXICITY_PROMPTS,
    REAL_TOXICITY_PROMPTS_CHALLENGING,
    GIGAWORD,
    GOV_REPORT,
)
from fmeval.eval_algorithms import EvalAlgorithm
from fmeval.eval_algorithms.helper_models.helper_model import (
    TOXIGEN_SCORE_NAME,
    DETOXIFY_SCORE_TOXICITY,
    DETOXIFY_SCORE_SEVERE_TOXICITY,
    DETOXIFY_SCORE_OBSCENE,
    DETOXIFY_SCORE_IDENTITY_ATTACK,
    DETOXIFY_SCORE_INSULT,
    DETOXIFY_SCORE_THREAT,
    DETOXIFY_SCORE_SEXUAL_EXPLICIT,
)

# For general HTML alignment
CENTER = "center"
LEFT = "left"
RIGHT = "right"


class ListType(str, Enum):
    """Rendering styles for Markdown lists."""

    BULLETED = "bulleted"
    NUMBERED = "numbered"


# For general use in Markdown-related code.
# NOTE: the leading spaces are intentional — two trailing spaces before a
# newline force a hard line break in Markdown.
SINGLE_NEWLINE = " \n"
DOUBLE_NEWLINE = " \n\n"

# For tables and bar plots
NUM_SAMPLES_TO_DISPLAY_IN_TABLE = 5
CATEGORY_BAR_COLOR = "steelblue"
OVERALL_BAR_COLOR = "coral"
MAX_CHAR = 200  # max characters of sample text shown before truncation

# Extensions used by the markdown library to convert markdown to HTML
MARKDOWN_EXTENSIONS = ["tables", "md_in_html"]

# Dataset score label used in category bar plot
DATASET_SCORE_LABEL = "Overall"

# Scores that are not per sample (aggregate-only; no per-row value exists)
AGGREGATE_ONLY_SCORES = [BALANCED_ACCURACY_SCORE, PRECISION_SCORE, RECALL_SCORE]

# For string formatting in eval names/score names.
# Each list is a sequence of (old, new) substring replacements.
GENERAL_STRING_REPLACEMENTS: List[Tuple[str, str]] = [("qa", "Q&A"), ("f1", "F1"), ("sagemaker", "SageMaker")]
SCORE_STRING_REPLACEMENTS: List[Tuple[str, str]] = [
    ("prompt stereotyping", "is_biased"),
    ("meteor", "METEOR"),
    ("bertscore", "BERTScore"),
    ("rouge", "ROUGE"),
    ("F1 score", "F1 over words"),
    ("obscene", "Obscenity"),
    ("sexual explicit", "Sexual Explicitness"),
]
EVAL_NAME_STRING_REPLACEMENTS: List[Tuple[str, str]] = [
    (EvalAlgorithm.QA_ACCURACY.value, EvalAlgorithm.ACCURACY.value),
    (EvalAlgorithm.SUMMARIZATION_ACCURACY.value, EvalAlgorithm.ACCURACY.value),
    (EvalAlgorithm.CLASSIFICATION_ACCURACY.value, EvalAlgorithm.ACCURACY.value),
    (EvalAlgorithm.GENERAL_SEMANTIC_ROBUSTNESS.value, "semantic_robustness"),
    ("accuracy_semantic_robustness", "semantic_robustness"),
    # NOTE(review): the "qa_accuracy" and "classification_accuracy" source keys
    # below duplicate keys already mapped to "accuracy" above, so these entries
    # are dead if replacements are applied in order. They look like they were
    # meant to be the QA/classification *toxicity* eval names — confirm against
    # the report-rendering code before changing (kept as-is to preserve
    # current behavior).
    (EvalAlgorithm.QA_ACCURACY.value, EvalAlgorithm.TOXICITY.value),
    (EvalAlgorithm.SUMMARIZATION_TOXICITY.value, EvalAlgorithm.TOXICITY.value),
    (EvalAlgorithm.CLASSIFICATION_ACCURACY.value, EvalAlgorithm.TOXICITY.value),
]
PLOT_TITLE_STRING_REPLACEMENTS: List[Tuple[str, str]] = [("prompt_stereotyping", "is_biased score")]
COLUMN_NAME_STRING_REPLACEMENTS: List[Tuple[str, str]] = [
    ("sent_more", "s_more"),
    ("s_more_input", "<math>S<sub>more</sub></math>"),
    ("sent_less", "s_less"),
    ("s_less_input", "<math>S<sub>less</sub></math>"),
    ("prob_", "probability_"),
    ("word_error_rate", "Average WER"),
    ("classification_accuracy", "accuracy"),
    ("f1_score", "f1 over words"),
    ("meteor", "METEOR"),
    ("bertscore", "BERTScore"),
    ("rouge", "ROUGE"),
]
# Names whose underscores must be kept verbatim when prettifying labels.
AVOID_REMOVE_UNDERSCORE = ["sent_more_input", "sent_less_input", "is_biased"]
# Fix: the QA and classification entries previously held raw EvalAlgorithm
# members while the summarization entry held the plain string value, so
# membership checks against eval-name strings silently failed for two of the
# three algorithms. All three entries now use the string value.
ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS = [
    EvalAlgorithm.SUMMARIZATION_ACCURACY_SEMANTIC_ROBUSTNESS.value,
    EvalAlgorithm.QA_ACCURACY_SEMANTIC_ROBUSTNESS.value,
    EvalAlgorithm.CLASSIFICATION_ACCURACY_SEMANTIC_ROBUSTNESS.value,
]
ACCURACY_SEMANTIC_ROBUSTNESS_SCORES = [
    CLASSIFICATION_ACCURACY_SCORE,
    METEOR_SCORE,
    BERT_SCORE,
    ROUGE_SCORE,
    F1_SCORE,
    EXACT_MATCH_SCORE,
    QUASI_EXACT_MATCH_SCORE,
]

# Dataset types
BUILT_IN_DATASET = "Built-in Dataset"
CUSTOM_DATASET = "Custom Dataset"

TOXICITY_EVAL_NAMES = [
    EvalAlgorithm.TOXICITY.value,
    EvalAlgorithm.QA_TOXICITY.value,
    EvalAlgorithm.SUMMARIZATION_TOXICITY.value,
]

# Prompt stereotyping table column name
PROBABILITY_RATIO = "<math><box>p(S<sub>more</sub>)/p(S<sub>less</sub>)</box></math>"
IS_BIASED = "is_biased"

# Toxicity detector names
TOXIGEN_NAME = "Toxigen-roberta"
DETOXIFY_NAME = "UnitaryAI Detoxify-unbiased"
TOXIGEN_URI = "https://github.com/microsoft/TOXIGEN"
DETOXIFY_URI = "https://github.com/unitaryai/detoxify"

# Example table descriptions
TABLE_DESCRIPTION = "Below are a few examples of the highest and lowest-scoring examples across all categories. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job. "
WER_TABLE_DESCRIPTION = "Below are a few examples of the highest and lowest-scoring examples across all categories. The lower the word error rate, the better the model performs. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job."
STEREOTYPING_TABLE_DESCRIPTION = "**Probability ratio:** \n\nFor each sentence pair, we additionally report by how much the model stereotypes. This is given by the ratio <math><box>p(S<sub>more</sub>)/p(S<sub>less</sub>)</box></math>. This ratio will be between 0 and ∞. A ratio of 200, for example, means that the model assigns 200x the probability to the more stereotypical sentence than the less stereotypical sentence."
FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION = "Below are a few examples of correct and incorrect model responses. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job. "

# Score descriptions for the report, keyed by score name.
SCORE_DESCRIPTIONS = {
    # Factual knowledge
    FACTUAL_KNOWLEDGE: "Exact inclusion score is a binary score where 1 indicates that the target output is contained in the model output, and 0 indicates otherwise. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts.",
    FACTUAL_KNOWLEDGE_QUASI_EXACT: "Quasi-exact inclusion is a binary score that represents whether a slightly modified (removing excess whitespace and punctuation) version of the target output is contained in the model output. If yes, the score is 1, otherwise 0. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts after removing excess whitespace and punctuation.",
    # Prompt stereotyping
    PROMPT_STEREOTYPING: "The is_biased prompt stereotyping score measures whether the model assigns higher probability to the more stereotypical sentence than the less stereotypical one, i.e., <math><box>p(S<sub>more</sub>) > p(S<sub>less</sub>)</box></math>. It is a value between 0 and 1, where 1 indicates that the model always prefers the more stereotypical sentence while 0 means that it never prefers the more stereotypical sentence. An unbiased model prefers more and less stereotypical sentences at equal rates, corresponding to a score of 0.5",
    LOG_PROBABILITY_DIFFERENCE: "For each sentence pair, we report the log probability difference, a value ranging -∞ to ∞, indicating how much the model stereotypes. ",
    # QA accuracy
    F1_SCORE: "Numerical score between 0 (worst) and 1 (best). F1-score is the harmonic mean of precision and recall. It is computed as follows: precision = true positives / (true positives + false positives) and recall = true positives / (true positives + false negatives). Then F1 = 2 (precision * recall)/(precision + recall) .",
    EXACT_MATCH_SCORE: "An exact match score is a binary score where 1 indicates the model output and answer match exactly and 0 indicates otherwise.",
    QUASI_EXACT_MATCH_SCORE: "Similar as above, but both model output and answer are normalised first by removing any articles and punctuation. E.g., 1 also for predicted answers “Antarctica.” or “the Antarctica” .",
    PRECISION_OVER_WORDS: "The precision score is the fraction of words in the model output that are also found in the target output.",
    # Fix: removed a stray trailing backtick that leaked into the rendered report.
    RECALL_OVER_WORDS: "The recall score is the fraction of words in the target output that are also found in the model output.",
    # Summarization accuracy
    ROUGE_SCORE: "A ROUGE-N score computes the N-gram (sequences of n words) word overlaps between the reference and model summary, with the value ranging between 0 (no match) to 1 (perfect match).",
    METEOR_SCORE: "Meteor is similar to ROUGE-N, but it also accounts for rephrasing by using traditional NLP techniques such as stemming (e.g. matching “singing” to “sing”,“sings” etc.) and synonym lists.",
    BERT_SCORE: "BERTScore uses a second ML model (from the BERT family) to compute sentence embeddings and compare their similarity.",
    # Classification accuracy
    CLASSIFICATION_ACCURACY_SCORE: "The classification accuracy is `predicted_label == true_label`, reported as the mean accuracy over all datapoints.",
    PRECISION_SCORE: "The precision score is computed as `true positives / (true positives + false positives)`. ",
    RECALL_SCORE: "The recall score is computed as `true positives / (true positives + false negatives)`",
    BALANCED_ACCURACY_SCORE: "The balanced accuracy score is the same as accuracy in the binary case, otherwise averaged recall per class.",
    # General semantic robustness
    WER_SCORE: "Word error rate (WER) is a value between 0 and 1, and measures the difference between the model output on the unperturbed input and the output(s) on one or more perturbed versions of the same input. For more details on how word error rate is computed, see the [HuggingFace Article on Word Error Rate](https://huggingface.co/spaces/evaluate-metric/wer).",
    BERT_SCORE_DISSIMILARITY: "BERTScore Dissimilarity is computed as 1 - BERTScore and measures semantic differences between the original and perturbed versions of the same input.",
    # Summarization semantic robustness
    DELTA_ROUGE_SCORE: "Delta ROUGE-N score measures the change in Rouge between the original and perturbed versions of the same input.",
    DELTA_METEOR_SCORE: "Delta Meteor score measures the change in Meteor between the original and perturbed versions of the same input.",
    DELTA_BERT_SCORE: "Delta BERTscore measures the change in BERTscore between the original and perturbed versions of the same input.",
    # QA semantic robustness
    DELTA_EXACT_MATCH_SCORE: "Delta Exact Match score measures the change in Exact Match score between the original and perturbed versions of the same input.",
    DELTA_QUASI_EXACT_MATCH_SCORE: "Delta Quasi Exact Match score measures the change in Quasi Exact Match score between the original and perturbed versions of the same input.",
    DELTA_F1_SCORE: "Delta F1 score measures the change in F1 score between the original and perturbed versions of the same input.",
    DELTA_PRECISION_OVER_WORDS: "Delta Precision measures the change in Precision between the original and perturbed versions of the same input.",
    DELTA_RECALL_OVER_WORDS: "Delta Recall measures the change in Recall between the original and perturbed versions of the same input.",
    # Classification semantic robustness
    # NOTE(review): this text describes plain classification accuracy, not a
    # delta — it likely should read "measures the change in classification
    # accuracy between the original and perturbed versions of the same input".
    # Kept as-is pending confirmation.
    DELTA_CLASSIFICATION_ACCURACY_SCORE: "The score is the binary indicator on whether or not the model answer is correct.",
    # Toxicity
    TOXIGEN_SCORE_NAME: "A binary score where 0 indicates no toxicity detected and 1 indicates toxicity detected.",
    DETOXIFY_SCORE_TOXICITY: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **toxicity**",
    DETOXIFY_SCORE_SEVERE_TOXICITY: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **severe_toxicity**",
    DETOXIFY_SCORE_OBSCENE: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **obscene**",
    DETOXIFY_SCORE_IDENTITY_ATTACK: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class **identity_attack**",
    DETOXIFY_SCORE_INSULT: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **insult**",
    DETOXIFY_SCORE_THREAT: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **threat**",
    DETOXIFY_SCORE_SEXUAL_EXPLICIT: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **sexual_explicit**",
}


class DatasetDetails(NamedTuple):
    """Display metadata for a built-in evaluation dataset.

    Fields:
        name: formatted display name shown in the report.
        url: homepage or source location of the dataset.
        description: short human-readable summary.
        size: number of records in the dataset.
    """

    name: str
    url: str
    description: str
    size: int


# Dataset details with the formatted names, URLs, descriptions and size
DATASET_DETAILS = {
    TREX: DatasetDetails(
        name="T-REx",
        url="https://hadyelsahar.github.io/t-rex/",
        description="A dataset which consists of knowledge triplets extracted from Wikipedia. The triplets take the form (subject, predicate, object), for instance, (Berlin, capital of, Germany) or (Tata Motors, subsidiary of, Tata Group). ",
        size=32260,
    ),
    BOOLQ: DatasetDetails(
        name="BoolQ",
        url="https://github.com/google-research-datasets/boolean-questions",
        description="A dataset consisting of question-passage-answer triplets. The question can be answered with yes/no, and the answer is contained in the passage. The questions are provided anonymously and unsolicited by users of the Google search engine, and afterwards paired with a paragraph from a Wikipedia article containing the answer.",
        size=12697,
    ),
    TRIVIA_QA: DatasetDetails(
        name="TriviaQA",
        url="http://nlp.cs.washington.edu/triviaqa/",
        description="A dataset consisting of 95K question-answer pairs with with on average six supporting evidence documents per question, leading to ~650K question-passage-answer triplets. The questions are authored by trivia enthusiasts and the evidence documents are independently gathered. ",
        size=156328,
    ),
    NATURAL_QUESTIONS: DatasetDetails(
        name="Natural Questions",
        url="https://github.com/google-research-datasets/natural-questions",
        description="A dataset consisting of ~320K question-passage-answer triplets. The questions are factual naturally-occurring questions. The passages are extracts from wikipedia articles (referred to as “long answers” in the original dataset). As before, providing the passage is optional depending on whether the open-book or closed-book case should be evaluated.",
        size=4289,
    ),
    CROWS_PAIRS: DatasetDetails(
        name="CrowS-Pairs",
        url="https://github.com/nyu-mll/crows-pairs",
        description="This dataset provides crowdsourced sentence pairs for the different categories along which stereotyping is to be measured.",
        size=1508,
    ),
    WOMENS_CLOTHING_ECOMMERCE_REVIEWS: DatasetDetails(
        name="Women's E-commerce Clothing Reviews",
        url="https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews",
        description="This dataset consists of clothing reviews, both as a text and numerical scores.",
        size=23486,
    ),
    BOLD: DatasetDetails(
        name="BOLD",
        url="https://github.com/amazon-science/bold",
        description="A large-scale dataset that consists of English prompts aimed at testing bias and toxicity generation across five domains: profession, gender, race, religion, and political ideology.",
        size=23679,
    ),
    WIKITEXT2: DatasetDetails(
        name="WikiText2",
        url="https://huggingface.co/datasets/wikitext",
        description="A dataset which consists of Good and Featured articles from Wikipedia. To create prompts, we broke each article down into sentences and extracted first 6 tokens from each sentence as the prompt.",
        size=86007,
    ),
    REAL_TOXICITY_PROMPTS: DatasetDetails(
        name="Real Toxicity Prompts",
        url="https://github.com/allenai/real-toxicity-prompts",
        description="A dataset of truncated sentence snippets from the web. ",
        size=98243,
    ),
    REAL_TOXICITY_PROMPTS_CHALLENGING: DatasetDetails(
        name="Real Toxicity Prompts Challenging",
        url="https://github.com/allenai/real-toxicity-prompts",
        description="A dataset of truncated sentence snippets from the web. Prompts marked as “challenging” have been found by the authors to consistently lead to generation of toxic continuation by tested models (i.e., GPT-1, GPT-2, GPT-3, CTRL, CTRL-WIKI).",
        size=1199,
    ),
    GIGAWORD: DatasetDetails(
        name="Gigaword",
        url="https://huggingface.co/datasets/gigaword",
        description="A dataset with around 4 million news article headlines. Our dataset consists of 190k entries from its validation set.",
        size=189651,
    ),
    GOV_REPORT: DatasetDetails(
        name="Government Report",
        url="https://gov-report-data.github.io/",
        description="A dataset including a long-form summarization benchmark. It contains significantly longer documents (9.4k words) and summaries (553 words) than most existing datasets.",
        size=7238,
    ),
}

TREX_DESCRIPTION_EXAMPLES = "We convert these predicates to prompts, e.g., Berlin is the capital of ___ (expected answer: Germany) and Tata Motors is a subsidiary of ___ (expected answer: Tata Group)."

CROWS_PAIRS_DISCLAIMER = "**Disclaimer**: 1) The crowdsourced CrowS dataset is noisy. While it gives a good indication of overall model performance, individual pairs may be invalid. 2) CrowS measures U.S.-typical stereotypes. Specifically, the bias categories are taken from the US Equal Employment Opportunities Commission’s list of protected categories and the sentence pairs are produced by Amazon Mechanical Turk workers in the United States."
CENTER =
'center'
LEFT =
'left'
RIGHT =
'right'
class
ListType(builtins.str, enum.Enum):
An enumeration.
BULLETED =
<ListType.BULLETED: 'bulleted'>
NUMBERED =
<ListType.NUMBERED: 'numbered'>
SINGLE_NEWLINE =
' \n'
DOUBLE_NEWLINE =
' \n\n'
NUM_SAMPLES_TO_DISPLAY_IN_TABLE =
5
CATEGORY_BAR_COLOR =
'steelblue'
OVERALL_BAR_COLOR =
'coral'
MAX_CHAR =
200
MARKDOWN_EXTENSIONS =
['tables', 'md_in_html']
DATASET_SCORE_LABEL =
'Overall'
AGGREGATE_ONLY_SCORES =
['balanced_accuracy_score', 'precision_score', 'recall_score']
GENERAL_STRING_REPLACEMENTS: List[Tuple[str, str]] =
[('qa', 'Q&A'), ('f1', 'F1'), ('sagemaker', 'SageMaker')]
SCORE_STRING_REPLACEMENTS: List[Tuple[str, str]] =
[('prompt stereotyping', 'is_biased'), ('meteor', 'METEOR'), ('bertscore', 'BERTScore'), ('rouge', 'ROUGE'), ('F1 score', 'F1 over words'), ('obscene', 'Obscenity'), ('sexual explicit', 'Sexual Explicitness')]
EVAL_NAME_STRING_REPLACEMENTS: List[Tuple[str, str]] =
[('qa_accuracy', 'accuracy'), ('summarization_accuracy', 'accuracy'), ('classification_accuracy', 'accuracy'), ('general_semantic_robustness', 'semantic_robustness'), ('accuracy_semantic_robustness', 'semantic_robustness'), ('qa_accuracy', 'toxicity'), ('summarization_toxicity', 'toxicity'), ('classification_accuracy', 'toxicity')]
PLOT_TITLE_STRING_REPLACEMENTS: List[Tuple[str, str]] =
[('prompt_stereotyping', 'is_biased score')]
COLUMN_NAME_STRING_REPLACEMENTS: List[Tuple[str, str]] =
[('sent_more', 's_more'), ('s_more_input', '<math>S<sub>more</sub></math>'), ('sent_less', 's_less'), ('s_less_input', '<math>S<sub>less</sub></math>'), ('prob_', 'probability_'), ('word_error_rate', 'Average WER'), ('classification_accuracy', 'accuracy'), ('f1_score', 'f1 over words'), ('meteor', 'METEOR'), ('bertscore', 'BERTScore'), ('rouge', 'ROUGE')]
AVOID_REMOVE_UNDERSCORE =
['sent_more_input', 'sent_less_input', 'is_biased']
ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS =
['summarization_accuracy_semantic_robustness', <EvalAlgorithm.QA_ACCURACY_SEMANTIC_ROBUSTNESS: 'qa_accuracy_semantic_robustness'>, <EvalAlgorithm.CLASSIFICATION_ACCURACY_SEMANTIC_ROBUSTNESS: 'classification_accuracy_semantic_robustness'>]
ACCURACY_SEMANTIC_ROBUSTNESS_SCORES =
['classification_accuracy_score', 'meteor', 'bertscore', 'rouge', 'f1_score', 'exact_match_score', 'quasi_exact_match_score']
BUILT_IN_DATASET =
'Built-in Dataset'
CUSTOM_DATASET =
'Custom Dataset'
TOXICITY_EVAL_NAMES =
['toxicity', 'qa_toxicity', 'summarization_toxicity']
PROBABILITY_RATIO =
'<math><box>p(S<sub>more</sub>)/p(S<sub>less</sub>)</box></math>'
IS_BIASED =
'is_biased'
TOXIGEN_NAME =
'Toxigen-roberta'
DETOXIFY_NAME =
'UnitaryAI Detoxify-unbiased'
TOXIGEN_URI =
'https://github.com/microsoft/TOXIGEN'
DETOXIFY_URI =
'https://github.com/unitaryai/detoxify'
TABLE_DESCRIPTION =
'Below are a few examples of the highest and lowest-scoring examples across all categories. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job. '
WER_TABLE_DESCRIPTION =
'Below are a few examples of the highest and lowest-scoring examples across all categories. The lower the word error rate, the better the model performs. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job.'
STEREOTYPING_TABLE_DESCRIPTION =
'**Probability ratio:** \n\nFor each sentence pair, we additionally report by how much the model stereotypes. This is given by the ratio <math><box>p(S<sub>more</sub>)/p(S<sub>less</sub>)</box></math>. This ratio will be between 0 and ∞. A ratio of 200, for example, means that the model assigns 200x the probability to the more stereotypical sentence than the less stereotypical sentence.'
FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION =
'Below are a few examples of correct and incorrect model responses. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job. '
SCORE_DESCRIPTIONS =
{'factual_knowledge': 'Exact inclusion score is a binary score where 1 indicates that the target output is contained in the model output, and 0 indicates otherwise. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts.', 'factual_knowledge_quasi_exact': 'Quasi-exact inclusion is a binary score that represents whether a slightly modified (removing excess whitespace and punctuation) version of the target output is contained in the model output. If yes, the score is 1, otherwise 0. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts after removing excess whitespace and punctuation.', 'prompt_stereotyping': 'The is_biased prompt stereotyping score measures whether the model assigns higher probability to the more stereotypical sentence than the less stereotypical one, i.e., <math><box>p(S<sub>more</sub>) > p(S<sub>less</sub>)</box></math>. It is a value between 0 and 1, where 1 indicates that the model always prefers the more stereotypical sentence while 0 means that it never prefers the more stereotypical sentence. An unbiased model prefers more and less stereotypical sentences at equal rates, corresponding to a score of 0.5', 'log_probability_difference': 'For each sentence pair, we report the log probability difference, a value ranging -∞ to ∞, indicating how much the model stereotypes. ', 'f1_score': 'Numerical score between 0 (worst) and 1 (best). F1-score is the harmonic mean of precision and recall. It is computed as follows: precision = true positives / (true positives + false positives) and recall = true positives / (true positives + false negatives). 
Then F1 = 2 (precision * recall)/(precision + recall) .', 'exact_match_score': 'An exact match score is a binary score where 1 indicates the model output and answer match exactly and 0 indicates otherwise.', 'quasi_exact_match_score': 'Similar as above, but both model output and answer are normalised first by removing any articles and punctuation. E.g., 1 also for predicted answers “Antarctica.” or “the Antarctica” .', 'precision_over_words': 'The precision score is the fraction of words in the model output that are also found in the target output.', 'recall_over_words': 'The recall score is the fraction of words in the target output that are also found in the model output.`', 'rouge': 'A ROUGE-N score computes the N-gram (sequences of n words) word overlaps between the reference and model summary, with the value ranging between 0 (no match) to 1 (perfect match).', 'meteor': 'Meteor is similar to ROUGE-N, but it also accounts for rephrasing by using traditional NLP techniques such as stemming (e.g. matching “singing” to “sing”,“sings” etc.) and synonym lists.', 'bertscore': 'BERTScore uses a second ML model (from the BERT family) to compute sentence embeddings and compare their similarity.', 'classification_accuracy_score': 'The classification accuracy is `predicted_label == true_label`, reported as the mean accuracy over all datapoints.', 'precision_score': 'The precision score is computed as `true positives / (true positives + false positives)`. ', 'recall_score': 'The recall score is computed as `true positives / (true positives + false negatives)`', 'balanced_accuracy_score': 'The balanced accuracy score is the same as accuracy in the binary case, otherwise averaged recall per class.', 'word_error_rate': 'Word error rate (WER) is a value between 0 and 1, and measures the difference between the model output on the unperturbed input and the output(s) on one or more perturbed versions of the same input. 
For more details on how word error rate is computed, see the [HuggingFace Article on Word Error Rate](https://huggingface.co/spaces/evaluate-metric/wer).', 'bertscore_dissimilarity': 'BERTScore Dissimilarity is computed as 1 - BERTScore and measures semantic differences between the original and perturbed versions of the same input.', 'delta_rouge': 'Delta ROUGE-N score measures the change in Rouge between the original and perturbed versions of the same input.', 'delta_meteor': 'Delta Meteor score measures the change in Meteor between the original and perturbed versions of the same input.', 'delta_bertscore': 'Delta BERTscore measures the change in BERTscore between the original and perturbed versions of the same input.', 'delta_exact_match_score': 'Delta Exact Match score measures the change in Exact Match score between the original and perturbed versions of the same input.', 'delta_quasi_exact_match_score': 'Delta Quasi Exact Match score measures the change in Quasi Exact Match score between the original and perturbed versions of the same input.', 'delta_f1_score': 'Delta F1 score measures the change in F1 score between the original and perturbed versions of the same input.', 'delta_precision_over_words': 'Delta Precision measures the change in Precision between the original and perturbed versions of the same input.', 'delta_recall_over_words': 'Delta Recall measures the change in Recall between the original and perturbed versions of the same input.', 'delta_classification_accuracy_score': 'The score is the binary indicator on whether or not the model answer is correct.', 'toxicity': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **toxicity**', 'severe_toxicity': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **severe_toxicity**', 'obscene': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **obscene**', 'identity_attack': 'A binary score from 0 (no 
toxicity detected) to 1 (toxicity detected) for the class **identity_attack**', 'insult': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **insult**', 'threat': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **threat**', 'sexual_explicit': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **sexual_explicit**'}
class
DatasetDetails(typing.NamedTuple):
DatasetDetails(name, url, description, size)
DATASET_DETAILS =
{'trex': DatasetDetails(name='T-REx', url='https://hadyelsahar.github.io/t-rex/', description='A dataset which consists of knowledge triplets extracted from Wikipedia. The triplets take the form (subject, predicate, object), for instance, (Berlin, capital of, Germany) or (Tata Motors, subsidiary of, Tata Group). ', size=32260), 'boolq': DatasetDetails(name='BoolQ', url='https://github.com/google-research-datasets/boolean-questions', description='A dataset consisting of question-passage-answer triplets. The question can be answered with yes/no, and the answer is contained in the passage. The questions are provided anonymously and unsolicited by users of the Google search engine, and afterwards paired with a paragraph from a Wikipedia article containing the answer.', size=12697), 'trivia_qa': DatasetDetails(name='TriviaQA', url='http://nlp.cs.washington.edu/triviaqa/', description='A dataset consisting of 95K question-answer pairs with with on average six supporting evidence documents per question, leading to ~650K question-passage-answer triplets. The questions are authored by trivia enthusiasts and the evidence documents are independently gathered. ', size=156328), 'natural_questions': DatasetDetails(name='Natural Questions', url='https://github.com/google-research-datasets/natural-questions', description='A dataset consisting of ~320K question-passage-answer triplets. The questions are factual naturally-occurring questions. The passages are extracts from wikipedia articles (referred to as “long answers” in the original dataset). 
As before, providing the passage is optional depending on whether the open-book or closed-book case should be evaluated.', size=4289), 'crows-pairs': DatasetDetails(name='CrowS-Pairs', url='https://github.com/nyu-mll/crows-pairs', description='This dataset provides crowdsourced sentence pairs for the different categories along which stereotyping is to be measured.', size=1508), 'womens_clothing_ecommerce_reviews': DatasetDetails(name="Women's E-commerce Clothing Reviews", url='https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews', description='This dataset consists of clothing reviews, both as a text and numerical scores.', size=23486), 'bold': DatasetDetails(name='BOLD', url='https://github.com/amazon-science/bold', description='A large-scale dataset that consists of English prompts aimed at testing bias and toxicity generation across five domains: profession, gender, race, religion, and political ideology.', size=23679), 'wikitext2': DatasetDetails(name='WikiText2', url='https://huggingface.co/datasets/wikitext', description='A dataset which consists of Good and Featured articles from Wikipedia. To create prompts, we broke each article down into sentences and extracted first 6 tokens from each sentence as the prompt.', size=86007), 'real_toxicity_prompts': DatasetDetails(name='Real Toxicity Prompts', url='https://github.com/allenai/real-toxicity-prompts', description='A dataset of truncated sentence snippets from the web. ', size=98243), 'real_toxicity_prompts_challenging': DatasetDetails(name='Real Toxicity Prompts Challenging', url='https://github.com/allenai/real-toxicity-prompts', description='A dataset of truncated sentence snippets from the web. 
Prompts marked as “challenging” have been found by the authors to consistently lead to generation of toxic continuation by tested models (i.e., GPT-1, GPT-2, GPT-3, CTRL, CTRL-WIKI).', size=1199), 'gigaword': DatasetDetails(name='Gigaword', url='https://huggingface.co/datasets/gigaword', description='A dataset with around 4 million news article headlines. Our dataset consists of 190k entries from its validation set.', size=189651), 'gov_report': DatasetDetails(name='Government Report', url='https://gov-report-data.github.io/', description='A dataset including a long-form summarization benchmark. It contains significantly longer documents (9.4k words) and summaries (553 words) than most existing datasets.', size=7238)}
TREX_DESCRIPTION_EXAMPLES =
'We convert these predicates to prompts, e.g., Berlin is the capital of ___ (expected answer: Germany) and Tata Motors is a subsidiary of ___ (expected answer: Tata Group).'
CROWS_PAIRS_DISCLAIMER =
'**Disclaimer**: 1) The crowdsourced CrowS dataset is noisy. While it gives a good indication of overall model performance, individual pairs may be invalid. 2) CrowS measures U.S.-typical stereotypes. Specifically, the bias categories are taken from the US Equal Employment Opportunities Commission’s list of protected categories and the sentence pairs are produced by Amazon Mechanical Turk workers in the United States.'