fmeval.reporting.constants

  1from enum import Enum
  2from typing import NamedTuple, Tuple, List
  3
  4from fmeval.eval_algorithms.factual_knowledge import FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT
  5from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING, LOG_PROBABILITY_DIFFERENCE
  6from fmeval.eval_algorithms.qa_accuracy import (
  7    F1_SCORE,
  8    EXACT_MATCH_SCORE,
  9    QUASI_EXACT_MATCH_SCORE,
 10    PRECISION_OVER_WORDS,
 11    RECALL_OVER_WORDS,
 12)
 13from fmeval.eval_algorithms.summarization_accuracy import METEOR_SCORE, BERT_SCORE, ROUGE_SCORE
 14from fmeval.eval_algorithms.classification_accuracy import (
 15    CLASSIFICATION_ACCURACY_SCORE,
 16    BALANCED_ACCURACY_SCORE,
 17    PRECISION_SCORE,
 18    RECALL_SCORE,
 19)
 20from fmeval.eval_algorithms.classification_accuracy_semantic_robustness import (
 21    DELTA_CLASSIFICATION_ACCURACY_SCORE,
 22)
 23from fmeval.eval_algorithms.qa_accuracy_semantic_robustness import (
 24    DELTA_F1_SCORE,
 25    DELTA_EXACT_MATCH_SCORE,
 26    DELTA_QUASI_EXACT_MATCH_SCORE,
 27    DELTA_PRECISION_OVER_WORDS,
 28    DELTA_RECALL_OVER_WORDS,
 29)
 30from fmeval.eval_algorithms.summarization_accuracy_semantic_robustness import (
 31    DELTA_ROUGE_SCORE,
 32    DELTA_BERT_SCORE,
 33    DELTA_METEOR_SCORE,
 34)
 35from fmeval.eval_algorithms.general_semantic_robustness import WER_SCORE, BERT_SCORE_DISSIMILARITY
 36from fmeval.eval_algorithms import (
 37    TREX,
 38    BOOLQ,
 39    TRIVIA_QA,
 40    NATURAL_QUESTIONS,
 41    CROWS_PAIRS,
 42    WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
 43    BOLD,
 44    WIKITEXT2,
 45    REAL_TOXICITY_PROMPTS,
 46    REAL_TOXICITY_PROMPTS_CHALLENGING,
 47    GIGAWORD,
 48    GOV_REPORT,
 49)
 50from fmeval.eval_algorithms import EvalAlgorithm
 51from fmeval.eval_algorithms.helper_models.helper_model import (
 52    TOXIGEN_SCORE_NAME,
 53    DETOXIFY_SCORE_TOXICITY,
 54    DETOXIFY_SCORE_SEVERE_TOXICITY,
 55    DETOXIFY_SCORE_OBSCENE,
 56    DETOXIFY_SCORE_IDENTITY_ATTACK,
 57    DETOXIFY_SCORE_INSULT,
 58    DETOXIFY_SCORE_THREAT,
 59    DETOXIFY_SCORE_SEXUAL_EXPLICIT,
 60)
 61
# For general HTML alignment: the three valid values of the HTML
# text-alignment attribute, used when emitting report markup.
CENTER = "center"
LEFT = "left"
RIGHT = "right"
 67
 68class ListType(str, Enum):
 69    BULLETED = "bulleted"
 70    NUMBERED = "numbered"
 71
 72
# For general use in Markdown-related code.
# The two leading spaces force a hard line break in Markdown.
SINGLE_NEWLINE = "  \n"
DOUBLE_NEWLINE = "  \n\n"

# For tables and bar plots
NUM_SAMPLES_TO_DISPLAY_IN_TABLE = 5
CATEGORY_BAR_COLOR = "steelblue"
OVERALL_BAR_COLOR = "coral"
# Presumably the character limit at which displayed text is truncated
# (the table descriptions below mention truncation) — TODO confirm at use site.
MAX_CHAR = 200

# Extensions used by the markdown library to convert markdown to HTML
MARKDOWN_EXTENSIONS = ["tables", "md_in_html"]

# Dataset score label used in category bar plot
DATASET_SCORE_LABEL = "Overall"

# Scores that are not per sample (only reported as dataset-level aggregates)
AGGREGATE_ONLY_SCORES = [BALANCED_ACCURACY_SCORE, PRECISION_SCORE, RECALL_SCORE]

# For string formatting in eval names/score names: (old, new) substring pairs
# that fix casing/branding when rendering names for display.
GENERAL_STRING_REPLACEMENTS: List[Tuple[str, str]] = [("qa", "Q&A"), ("f1", "F1"), ("sagemaker", "SageMaker")]
# (old, new) substring pairs applied when formatting score names for display
# (e.g. "meteor" -> "METEOR"); presumably applied in listing order — TODO confirm.
SCORE_STRING_REPLACEMENTS: List[Tuple[str, str]] = [
    ("prompt stereotyping", "is_biased"),
    ("meteor", "METEOR"),
    ("bertscore", "BERTScore"),
    ("rouge", "ROUGE"),
    ("F1 score", "F1 over words"),
    ("obscene", "Obscenity"),
    ("sexual explicit", "Sexual Explicitness"),
]
# (old, new) substring pairs applied when formatting eval algorithm names for
# display. Presumably applied in listing order, so an earlier pair that
# rewrites a substring prevents later pairs for that same substring from ever
# matching — TODO confirm against the formatting code.
EVAL_NAME_STRING_REPLACEMENTS: List[Tuple[str, str]] = [
    (EvalAlgorithm.QA_ACCURACY.value, EvalAlgorithm.ACCURACY.value),
    (EvalAlgorithm.SUMMARIZATION_ACCURACY.value, EvalAlgorithm.ACCURACY.value),
    (EvalAlgorithm.CLASSIFICATION_ACCURACY.value, EvalAlgorithm.ACCURACY.value),
    (EvalAlgorithm.GENERAL_SEMANTIC_ROBUSTNESS.value, "semantic_robustness"),
    ("accuracy_semantic_robustness", "semantic_robustness"),
    # Fixed: was (QA_ACCURACY, TOXICITY), which conflicted with the first pair
    # (the same source string mapped to both "accuracy" and "toxicity") and
    # could never fire if pairs are applied in order. QA_TOXICITY parallels
    # the SUMMARIZATION_TOXICITY pair below and is the evident intent.
    (EvalAlgorithm.QA_TOXICITY.value, EvalAlgorithm.TOXICITY.value),
    (EvalAlgorithm.SUMMARIZATION_TOXICITY.value, EvalAlgorithm.TOXICITY.value),
    # NOTE(review): "classification_accuracy" is already rewritten to
    # "accuracy" by the third pair, so this entry looks dead — likely another
    # copy-paste slip, but no CLASSIFICATION_TOXICITY algorithm is visible
    # here, so it is left unchanged pending confirmation of the intended name.
    (EvalAlgorithm.CLASSIFICATION_ACCURACY.value, EvalAlgorithm.TOXICITY.value),
]
# (old, new) substring pairs applied to plot titles.
PLOT_TITLE_STRING_REPLACEMENTS: List[Tuple[str, str]] = [("prompt_stereotyping", "is_biased score")]
# (old, new) substring pairs applied to table column names. Note the pairs
# chain: "sent_more" is first shortened to "s_more", after which the
# "s_more_input" pair can match the rewritten column name (assumes pairs are
# applied in listing order — TODO confirm).
COLUMN_NAME_STRING_REPLACEMENTS: List[Tuple[str, str]] = [
    ("sent_more", "s_more"),
    ("s_more_input", "<math>S<sub>more</sub></math>"),
    ("sent_less", "s_less"),
    ("s_less_input", "<math>S<sub>less</sub></math>"),
    ("prob_", "probability_"),
    ("word_error_rate", "Average WER"),
    ("classification_accuracy", "accuracy"),
    ("f1_score", "f1 over words"),
    ("meteor", "METEOR"),
    ("bertscore", "BERTScore"),
    ("rouge", "ROUGE"),
]
# Names whose underscores must be preserved when prettifying column names
# (presumably other names have underscores replaced — TODO confirm at use site).
AVOID_REMOVE_UNDERSCORE = ["sent_more_input", "sent_less_input", "is_biased"]
# Eval algorithm names that combine accuracy with semantic robustness.
# Fixed: the last two entries previously lacked ".value", leaving raw
# EvalAlgorithm members mixed with plain strings in the same list; all three
# are now normalized to their string values, consistent with the first entry.
ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS = [
    EvalAlgorithm.SUMMARIZATION_ACCURACY_SEMANTIC_ROBUSTNESS.value,
    EvalAlgorithm.QA_ACCURACY_SEMANTIC_ROBUSTNESS.value,
    EvalAlgorithm.CLASSIFICATION_ACCURACY_SEMANTIC_ROBUSTNESS.value,
]
# Accuracy score names reported by the semantic robustness algorithms above.
ACCURACY_SEMANTIC_ROBUSTNESS_SCORES = [
    CLASSIFICATION_ACCURACY_SCORE,
    METEOR_SCORE,
    BERT_SCORE,
    ROUGE_SCORE,
    F1_SCORE,
    EXACT_MATCH_SCORE,
    QUASI_EXACT_MATCH_SCORE,
]
# Dataset types (labels distinguishing shipped datasets from user-supplied ones)
BUILT_IN_DATASET = "Built-in Dataset"
CUSTOM_DATASET = "Custom Dataset"

# Names of all toxicity eval algorithms
TOXICITY_EVAL_NAMES = [
    EvalAlgorithm.TOXICITY.value,
    EvalAlgorithm.QA_TOXICITY.value,
    EvalAlgorithm.SUMMARIZATION_TOXICITY.value,
]

# Prompt stereotyping table column name (rendered as HTML/MathML markup)
PROBABILITY_RATIO = "<math><box>p(S<sub>more</sub>)/p(S<sub>less</sub>)</box></math>"
IS_BIASED = "is_biased"
155
# Toxicity detector names (display names and source URLs of the helper models)
TOXIGEN_NAME = "Toxigen-roberta"
DETOXIFY_NAME = "UnitaryAI Detoxify-unbiased"
TOXIGEN_URI = "https://github.com/microsoft/TOXIGEN"
DETOXIFY_URI = "https://github.com/unitaryai/detoxify"
# Example table descriptions: prose shown above the example tables in reports.
TABLE_DESCRIPTION = "Below are a few examples of the highest and lowest-scoring examples across all categories. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job. "
WER_TABLE_DESCRIPTION = "Below are a few examples of the highest and lowest-scoring examples across all categories. The lower the word error rate, the better the model performs. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job."
STEREOTYPING_TABLE_DESCRIPTION = "**Probability ratio:** \n\nFor each sentence pair, we additionally report by how much the model stereotypes. This is given by the ratio <math><box>p(S<sub>more</sub>)/p(S<sub>less</sub>)</box></math>. This ratio will be between 0 and ∞. A ratio of 200, for example, means that the model assigns 200x the probability to the more stereotypical sentence than the less stereotypical sentence."
FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION = "Below are a few examples of correct and incorrect model responses. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job. "
166
# Score descriptions for the report: maps each score name to the prose shown
# next to that score in the generated report.
# Fixes: removed a stray trailing backtick from the RECALL_OVER_WORDS text and
# added the missing colon in the identity_attack text (consistent with the
# other six Detoxify class descriptions).
SCORE_DESCRIPTIONS = {
    # Factual knowledge
    FACTUAL_KNOWLEDGE: "Exact inclusion score is a binary score where 1 indicates that the target output is contained in the model output, and 0 indicates otherwise. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts.",
    FACTUAL_KNOWLEDGE_QUASI_EXACT: "Quasi-exact inclusion is a binary score that represents whether a slightly modified (removing excess whitespace and punctuation) version of the target output is contained in the model output. If yes, the score is 1, otherwise 0. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts after removing excess whitespace and punctuation.",
    # Prompt stereotyping
    PROMPT_STEREOTYPING: "The is_biased prompt stereotyping score measures whether the model assigns higher probability to the more stereotypical sentence than the less stereotypical one, i.e., <math><box>p(S<sub>more</sub>) > p(S<sub>less</sub>)</box></math>. It is a value between 0 and 1, where 1 indicates that the model always prefers the more stereotypical sentence while 0 means that it never prefers the more stereotypical sentence. An unbiased model prefers more and less stereotypical sentences at equal rates, corresponding to a score of 0.5",
    LOG_PROBABILITY_DIFFERENCE: "For each sentence pair, we report the log probability difference, a value ranging -&#8734; to &#8734;, indicating how much the model stereotypes. ",
    # QA accuracy
    F1_SCORE: "Numerical score between 0 (worst) and 1 (best). F1-score is the harmonic mean of precision and recall. It is computed as follows:  precision = true positives / (true positives + false positives) and recall = true positives / (true positives + false negatives). Then F1 = 2 (precision * recall)/(precision + recall) .",
    EXACT_MATCH_SCORE: "An exact match score is a binary score where 1 indicates the model output and answer match exactly and 0 indicates otherwise.",
    QUASI_EXACT_MATCH_SCORE: "Similar as above, but both model output and answer are normalised first by removing any articles and punctuation. E.g., 1 also for predicted answers “Antarctica.” or “the Antarctica” .",
    PRECISION_OVER_WORDS: "The precision score is the fraction of words in the model output that are also found in the target output.",
    RECALL_OVER_WORDS: "The recall score is the fraction of words in the target output that are also found in the model output.",
    # Summarization accuracy
    ROUGE_SCORE: "A ROUGE-N score computes the N-gram (sequences of n words) word overlaps between the reference and model summary, with the value ranging between 0 (no match) to 1 (perfect match).",
    METEOR_SCORE: "Meteor is similar to ROUGE-N, but it also accounts for rephrasing by using traditional NLP techniques such as stemming (e.g. matching “singing” to “sing”,“sings” etc.) and synonym lists.",
    BERT_SCORE: "BERTScore uses a second ML model (from the BERT family) to compute sentence embeddings and compare their similarity.",
    # Classification accuracy
    CLASSIFICATION_ACCURACY_SCORE: "The classification accuracy is `predicted_label == true_label`, reported as the mean accuracy over all datapoints.",
    PRECISION_SCORE: "The precision score is computed as `true positives / (true positives + false positives)`. ",
    RECALL_SCORE: "The recall score is computed as `true positives / (true positives + false negatives)`",
    BALANCED_ACCURACY_SCORE: "The balanced accuracy score is the same as accuracy in the binary case, otherwise averaged recall per class.",
    # General semantic robustness
    WER_SCORE: "Word error rate (WER) is a value between 0 and 1, and measures the difference between the model output on the unperturbed input and the output(s) on one or more perturbed versions of the same input. For more details on how word error rate is computed, see the [HuggingFace Article on Word Error Rate](https://huggingface.co/spaces/evaluate-metric/wer).",
    BERT_SCORE_DISSIMILARITY: "BERTScore Dissimilarity is computed as 1 - BERTScore and measures semantic differences between the original and perturbed versions of the same input.",
    # Summarization semantic robustness
    DELTA_ROUGE_SCORE: "Delta ROUGE-N score measures the change in Rouge between the original and perturbed versions of the same input.",
    DELTA_METEOR_SCORE: "Delta Meteor score measures the change in Meteor between the original and perturbed versions of the same input.",
    DELTA_BERT_SCORE: "Delta BERTscore measures the change in BERTscore between the original and perturbed versions of the same input.",
    # QA semantic robustness
    DELTA_EXACT_MATCH_SCORE: "Delta Exact Match score measures the change in Exact Match score between the original and perturbed versions of the same input.",
    DELTA_QUASI_EXACT_MATCH_SCORE: "Delta Quasi Exact Match score measures the change in Quasi Exact Match score between the original and perturbed versions of the same input.",
    DELTA_F1_SCORE: "Delta F1 score measures the change in F1 score between the original and perturbed versions of the same input.",
    DELTA_PRECISION_OVER_WORDS: "Delta Precision measures the change in Precision between the original and perturbed versions of the same input.",
    DELTA_RECALL_OVER_WORDS: "Delta Recall measures the change in Recall between the original and perturbed versions of the same input.",
    # Classification semantic robustness
    DELTA_CLASSIFICATION_ACCURACY_SCORE: "The score is the binary indicator on whether or not the model answer is correct.",
    # Toxicity
    # NOTE(review): the Detoxify texts say "binary score from 0 ... to 1",
    # which is self-contradictory (a binary score has no range); the detector
    # may emit continuous probabilities. User-facing wording left unchanged
    # pending confirmation of the detector's actual output.
    TOXIGEN_SCORE_NAME: "A binary score where 0 indicates no toxicity detected and 1 indicates toxicity detected.",
    DETOXIFY_SCORE_TOXICITY: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **toxicity**",
    DETOXIFY_SCORE_SEVERE_TOXICITY: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **severe_toxicity**",
    DETOXIFY_SCORE_OBSCENE: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **obscene**",
    DETOXIFY_SCORE_IDENTITY_ATTACK: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **identity_attack**",
    DETOXIFY_SCORE_INSULT: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **insult**",
    DETOXIFY_SCORE_THREAT: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **threat**",
    DETOXIFY_SCORE_SEXUAL_EXPLICIT: "A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **sexual_explicit**",
}
215
216
class DatasetDetails(NamedTuple):
    """Display metadata for one built-in evaluation dataset.

    Instances populate ``DATASET_DETAILS`` below; the fields carry the
    formatted name, source URL, prose description, and record count.
    """

    name: str  # Human-readable dataset name.
    url: str  # Link to the dataset's homepage or source repository.
    description: str  # Short prose description of the dataset.
    size: int  # Number of records in the dataset.
222
223
# Dataset details with the formatted names, URLs, descriptions and size for
# each built-in dataset, keyed by the dataset-name constants imported above.
# Fix: removed the duplicated word ("pairs with with") in the TriviaQA
# description, which is user-facing report text.
DATASET_DETAILS = {
    TREX: DatasetDetails(
        name="T-REx",
        url="https://hadyelsahar.github.io/t-rex/",
        description="A dataset which consists of knowledge triplets extracted from Wikipedia. The triplets take the form (subject, predicate, object), for instance, (Berlin, capital of, Germany) or (Tata Motors, subsidiary of, Tata Group). ",
        size=32260,
    ),
    BOOLQ: DatasetDetails(
        name="BoolQ",
        url="https://github.com/google-research-datasets/boolean-questions",
        description="A dataset consisting of question-passage-answer triplets. The question can be answered with yes/no, and the answer is contained in the passage. The questions are provided anonymously and unsolicited by users of the Google search engine, and afterwards paired with a paragraph from a Wikipedia article containing the answer.",
        size=12697,
    ),
    TRIVIA_QA: DatasetDetails(
        name="TriviaQA",
        url="http://nlp.cs.washington.edu/triviaqa/",
        description="A dataset consisting of 95K question-answer pairs with on average six supporting evidence documents per question, leading to ~650K question-passage-answer triplets. The questions are authored by trivia enthusiasts and the evidence documents are independently gathered. ",
        size=156328,
    ),
    NATURAL_QUESTIONS: DatasetDetails(
        name="Natural Questions",
        url="https://github.com/google-research-datasets/natural-questions",
        description="A dataset consisting of ~320K question-passage-answer triplets. The questions are factual naturally-occurring questions. The passages are extracts from wikipedia articles (referred to as “long answers” in the original dataset). As before, providing the passage is optional depending on whether the open-book or closed-book case should be evaluated.",
        size=4289,
    ),
    CROWS_PAIRS: DatasetDetails(
        name="CrowS-Pairs",
        url="https://github.com/nyu-mll/crows-pairs",
        description="This dataset provides crowdsourced sentence pairs for the different categories along which stereotyping is to be measured.",
        size=1508,
    ),
    WOMENS_CLOTHING_ECOMMERCE_REVIEWS: DatasetDetails(
        name="Women's E-commerce Clothing Reviews",
        url="https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews",
        description="This dataset consists of clothing reviews, both as a text and numerical scores.",
        size=23486,
    ),
    BOLD: DatasetDetails(
        name="BOLD",
        url="https://github.com/amazon-science/bold",
        description="A large-scale dataset that consists of English prompts aimed at testing bias and toxicity generation across five domains: profession, gender, race, religion, and political ideology.",
        size=23679,
    ),
    WIKITEXT2: DatasetDetails(
        name="WikiText2",
        url="https://huggingface.co/datasets/wikitext",
        description="A dataset which consists of Good and Featured articles from Wikipedia. To create prompts, we broke each article down into sentences and extracted first 6 tokens from each sentence as the prompt.",
        size=86007,
    ),
    REAL_TOXICITY_PROMPTS: DatasetDetails(
        name="Real Toxicity Prompts",
        url="https://github.com/allenai/real-toxicity-prompts",
        description="A dataset of truncated sentence snippets from the web. ",
        size=98243,
    ),
    REAL_TOXICITY_PROMPTS_CHALLENGING: DatasetDetails(
        name="Real Toxicity Prompts Challenging",
        url="https://github.com/allenai/real-toxicity-prompts",
        description="A dataset of truncated sentence snippets from the web. Prompts marked as “challenging” have been found by the authors to consistently lead to generation of toxic continuation by tested models (i.e., GPT-1, GPT-2, GPT-3, CTRL, CTRL-WIKI).",
        size=1199,
    ),
    GIGAWORD: DatasetDetails(
        name="Gigaword",
        url="https://huggingface.co/datasets/gigaword",
        description="A dataset with around 4 million news article headlines. Our dataset consists of 190k entries from its validation set.",
        size=189651,
    ),
    GOV_REPORT: DatasetDetails(
        name="Government Report",
        url="https://gov-report-data.github.io/",
        description="A dataset including a long-form summarization benchmark. It contains significantly longer documents (9.4k words) and summaries (553 words) than most existing datasets.",
        size=7238,
    ),
}
299
# Extra prose appended to the T-REx dataset description in reports.
TREX_DESCRIPTION_EXAMPLES = "We convert these predicates to prompts, e.g., Berlin is the capital of ___ (expected answer: Germany) and Tata Motors is a subsidiary of ___ (expected answer: Tata Group)."

# Disclaimer shown with CrowS-Pairs prompt-stereotyping results.
CROWS_PAIRS_DISCLAIMER = "**Disclaimer**: 1) The crowdsourced CrowS dataset is noisy. While it gives a good indication of overall model performance, individual pairs may be invalid. 2) CrowS measures U.S.-typical stereotypes. Specifically, the bias categories are taken from the US Equal Employment Opportunities Commission’s list of protected categories and the sentence pairs are produced by Amazon Mechanical Turk workers in the United States."
CENTER = 'center'
LEFT = 'left'
class ListType(builtins.str, enum.Enum):
69class ListType(str, Enum):
70    BULLETED = "bulleted"
71    NUMBERED = "numbered"

An enumeration.

BULLETED = <ListType.BULLETED: 'bulleted'>
NUMBERED = <ListType.NUMBERED: 'numbered'>
SINGLE_NEWLINE = ' \n'
DOUBLE_NEWLINE = ' \n\n'
NUM_SAMPLES_TO_DISPLAY_IN_TABLE = 5
CATEGORY_BAR_COLOR = 'steelblue'
OVERALL_BAR_COLOR = 'coral'
MAX_CHAR = 200
MARKDOWN_EXTENSIONS = ['tables', 'md_in_html']
DATASET_SCORE_LABEL = 'Overall'
AGGREGATE_ONLY_SCORES = ['balanced_accuracy_score', 'precision_score', 'recall_score']
GENERAL_STRING_REPLACEMENTS: List[Tuple[str, str]] = [('qa', 'Q&A'), ('f1', 'F1'), ('sagemaker', 'SageMaker')]
SCORE_STRING_REPLACEMENTS: List[Tuple[str, str]] = [('prompt stereotyping', 'is_biased'), ('meteor', 'METEOR'), ('bertscore', 'BERTScore'), ('rouge', 'ROUGE'), ('F1 score', 'F1 over words'), ('obscene', 'Obscenity'), ('sexual explicit', 'Sexual Explicitness')]
EVAL_NAME_STRING_REPLACEMENTS: List[Tuple[str, str]] = [('qa_accuracy', 'accuracy'), ('summarization_accuracy', 'accuracy'), ('classification_accuracy', 'accuracy'), ('general_semantic_robustness', 'semantic_robustness'), ('accuracy_semantic_robustness', 'semantic_robustness'), ('qa_accuracy', 'toxicity'), ('summarization_toxicity', 'toxicity'), ('classification_accuracy', 'toxicity')]
PLOT_TITLE_STRING_REPLACEMENTS: List[Tuple[str, str]] = [('prompt_stereotyping', 'is_biased score')]
COLUMN_NAME_STRING_REPLACEMENTS: List[Tuple[str, str]] = [('sent_more', 's_more'), ('s_more_input', '<math>S<sub>more</sub></math>'), ('sent_less', 's_less'), ('s_less_input', '<math>S<sub>less</sub></math>'), ('prob_', 'probability_'), ('word_error_rate', 'Average WER'), ('classification_accuracy', 'accuracy'), ('f1_score', 'f1 over words'), ('meteor', 'METEOR'), ('bertscore', 'BERTScore'), ('rouge', 'ROUGE')]
AVOID_REMOVE_UNDERSCORE = ['sent_more_input', 'sent_less_input', 'is_biased']
ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS = ['summarization_accuracy_semantic_robustness', <EvalAlgorithm.QA_ACCURACY_SEMANTIC_ROBUSTNESS: 'qa_accuracy_semantic_robustness'>, <EvalAlgorithm.CLASSIFICATION_ACCURACY_SEMANTIC_ROBUSTNESS: 'classification_accuracy_semantic_robustness'>]
ACCURACY_SEMANTIC_ROBUSTNESS_SCORES = ['classification_accuracy_score', 'meteor', 'bertscore', 'rouge', 'f1_score', 'exact_match_score', 'quasi_exact_match_score']
BUILT_IN_DATASET = 'Built-in Dataset'
CUSTOM_DATASET = 'Custom Dataset'
TOXICITY_EVAL_NAMES = ['toxicity', 'qa_toxicity', 'summarization_toxicity']
PROBABILITY_RATIO = '<math><box>p(S<sub>more</sub>)/p(S<sub>less</sub>)</box></math>'
IS_BIASED = 'is_biased'
TOXIGEN_NAME = 'Toxigen-roberta'
DETOXIFY_NAME = 'UnitaryAI Detoxify-unbiased'
TOXIGEN_URI = 'https://github.com/microsoft/TOXIGEN'
DETOXIFY_URI = 'https://github.com/unitaryai/detoxify'
TABLE_DESCRIPTION = 'Below are a few examples of the highest and lowest-scoring examples across all categories. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job. '
WER_TABLE_DESCRIPTION = 'Below are a few examples of the highest and lowest-scoring examples across all categories. The lower the word error rate, the better the model performs. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job.'
STEREOTYPING_TABLE_DESCRIPTION = '**Probability ratio:** \n\nFor each sentence pair, we additionally report by how much the model stereotypes. This is given by the ratio <math><box>p(S<sub>more</sub>)/p(S<sub>less</sub>)</box></math>. This ratio will be between 0 and ∞. A ratio of 200, for example, means that the model assigns 200x the probability to the more stereotypical sentence than the less stereotypical sentence.'
FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION = 'Below are a few examples of correct and incorrect model responses. Some text may be truncated due to length constraints. To view the full prompts, please go to the S3 job output location that you specified when configuring the job. '
SCORE_DESCRIPTIONS = {'factual_knowledge': 'Exact inclusion score is a binary score where 1 indicates that the target output is contained in the model output, and 0 indicates otherwise. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts.', 'factual_knowledge_quasi_exact': 'Quasi-exact inclusion is a binary score that represents whether a slightly modified (removing excess whitespace and punctuation) version of the target output is contained in the model output. If yes, the score is 1, otherwise 0. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts after removing excess whitespace and punctuation.', 'prompt_stereotyping': 'The is_biased prompt stereotyping score measures whether the model assigns higher probability to the more stereotypical sentence than the less stereotypical one, i.e., <math><box>p(S<sub>more</sub>) > p(S<sub>less</sub>)</box></math>. It is a value between 0 and 1, where 1 indicates that the model always prefers the more stereotypical sentence while 0 means that it never prefers the more stereotypical sentence. An unbiased model prefers more and less stereotypical sentences at equal rates, corresponding to a score of 0.5', 'log_probability_difference': 'For each sentence pair, we report the log probability difference, a value ranging -&#8734; to &#8734;, indicating how much the model stereotypes. ', 'f1_score': 'Numerical score between 0 (worst) and 1 (best). F1-score is the harmonic mean of precision and recall. It is computed as follows: precision = true positives / (true positives + false positives) and recall = true positives / (true positives + false negatives). 
Then F1 = 2 (precision * recall)/(precision + recall) .', 'exact_match_score': 'An exact match score is a binary score where 1 indicates the model output and answer match exactly and 0 indicates otherwise.', 'quasi_exact_match_score': 'Similar as above, but both model output and answer are normalised first by removing any articles and punctuation. E.g., 1 also for predicted answers “Antarctica.” or “the Antarctica” .', 'precision_over_words': 'The precision score is the fraction of words in the model output that are also found in the target output.', 'recall_over_words': 'The recall score is the fraction of words in the target output that are also found in the model output.`', 'rouge': 'A ROUGE-N score computes the N-gram (sequences of n words) word overlaps between the reference and model summary, with the value ranging between 0 (no match) to 1 (perfect match).', 'meteor': 'Meteor is similar to ROUGE-N, but it also accounts for rephrasing by using traditional NLP techniques such as stemming (e.g. matching “singing” to “sing”,“sings” etc.) and synonym lists.', 'bertscore': 'BERTScore uses a second ML model (from the BERT family) to compute sentence embeddings and compare their similarity.', 'classification_accuracy_score': 'The classification accuracy is `predicted_label == true_label`, reported as the mean accuracy over all datapoints.', 'precision_score': 'The precision score is computed as `true positives / (true positives + false positives)`. ', 'recall_score': 'The recall score is computed as `true positives / (true positives + false negatives)`', 'balanced_accuracy_score': 'The balanced accuracy score is the same as accuracy in the binary case, otherwise averaged recall per class.', 'word_error_rate': 'Word error rate (WER) is a value between 0 and 1, and measures the difference between the model output on the unperturbed input and the output(s) on one or more perturbed versions of the same input. 
For more details on how word error rate is computed, see the [HuggingFace Article on Word Error Rate](https://huggingface.co/spaces/evaluate-metric/wer).', 'bertscore_dissimilarity': 'BERTScore Dissimilarity is computed as 1 - BERTScore and measures semantic differences between the original and perturbed versions of the same input.', 'delta_rouge': 'Delta ROUGE-N score measures the change in Rouge between the original and perturbed versions of the same input.', 'delta_meteor': 'Delta Meteor score measures the change in Meteor between the original and perturbed versions of the same input.', 'delta_bertscore': 'Delta BERTscore measures the change in BERTscore between the original and perturbed versions of the same input.', 'delta_exact_match_score': 'Delta Exact Match score measures the change in Exact Match score between the original and perturbed versions of the same input.', 'delta_quasi_exact_match_score': 'Delta Quasi Exact Match score measures the change in Quasi Exact Match score between the original and perturbed versions of the same input.', 'delta_f1_score': 'Delta F1 score measures the change in F1 score between the original and perturbed versions of the same input.', 'delta_precision_over_words': 'Delta Precision measures the change in Precision between the original and perturbed versions of the same input.', 'delta_recall_over_words': 'Delta Recall measures the change in Recall between the original and perturbed versions of the same input.', 'delta_classification_accuracy_score': 'The score is the binary indicator on whether or not the model answer is correct.', 'toxicity': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **toxicity**', 'severe_toxicity': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **severe_toxicity**', 'obscene': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **obscene**', 'identity_attack': 'A binary score from 0 (no 
toxicity detected) to 1 (toxicity detected) for the class **identity_attack**', 'insult': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **insult**', 'threat': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **threat**', 'sexual_explicit': 'A binary score from 0 (no toxicity detected) to 1 (toxicity detected) for the class: **sexual_explicit**'}
class DatasetDetails(typing.NamedTuple):
218class DatasetDetails(NamedTuple):
219    name: str
220    url: str
221    description: str
222    size: int

DatasetDetails(name, url, description, size)

DatasetDetails(name: str, url: str, description: str, size: int)

Create new instance of DatasetDetails(name, url, description, size)

name: str

Alias for field number 0

url: str

Alias for field number 1

description: str

Alias for field number 2

size: int

Alias for field number 3

DATASET_DETAILS = {'trex': DatasetDetails(name='T-REx', url='https://hadyelsahar.github.io/t-rex/', description='A dataset which consists of knowledge triplets extracted from Wikipedia. The triplets take the form (subject, predicate, object), for instance, (Berlin, capital of, Germany) or (Tata Motors, subsidiary of, Tata Group). ', size=32260), 'boolq': DatasetDetails(name='BoolQ', url='https://github.com/google-research-datasets/boolean-questions', description='A dataset consisting of question-passage-answer triplets. The question can be answered with yes/no, and the answer is contained in the passage. The questions are provided anonymously and unsolicited by users of the Google search engine, and afterwards paired with a paragraph from a Wikipedia article containing the answer.', size=12697), 'trivia_qa': DatasetDetails(name='TriviaQA', url='http://nlp.cs.washington.edu/triviaqa/', description='A dataset consisting of 95K question-answer pairs with with on average six supporting evidence documents per question, leading to ~650K question-passage-answer triplets. The questions are authored by trivia enthusiasts and the evidence documents are independently gathered. ', size=156328), 'natural_questions': DatasetDetails(name='Natural Questions', url='https://github.com/google-research-datasets/natural-questions', description='A dataset consisting of ~320K question-passage-answer triplets. The questions are factual naturally-occurring questions. The passages are extracts from wikipedia articles (referred to as “long answers” in the original dataset). 
As before, providing the passage is optional depending on whether the open-book or closed-book case should be evaluated.', size=4289), 'crows-pairs': DatasetDetails(name='CrowS-Pairs', url='https://github.com/nyu-mll/crows-pairs', description='This dataset provides crowdsourced sentence pairs for the different categories along which stereotyping is to be measured.', size=1508), 'womens_clothing_ecommerce_reviews': DatasetDetails(name="Women's E-commerce Clothing Reviews", url='https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews', description='This dataset consists of clothing reviews, both as a text and numerical scores.', size=23486), 'bold': DatasetDetails(name='BOLD', url='https://github.com/amazon-science/bold', description='A large-scale dataset that consists of English prompts aimed at testing bias and toxicity generation across five domains: profession, gender, race, religion, and political ideology.', size=23679), 'wikitext2': DatasetDetails(name='WikiText2', url='https://huggingface.co/datasets/wikitext', description='A dataset which consists of Good and Featured articles from Wikipedia. To create prompts, we broke each article down into sentences and extracted first 6 tokens from each sentence as the prompt.', size=86007), 'real_toxicity_prompts': DatasetDetails(name='Real Toxicity Prompts', url='https://github.com/allenai/real-toxicity-prompts', description='A dataset of truncated sentence snippets from the web. ', size=98243), 'real_toxicity_prompts_challenging': DatasetDetails(name='Real Toxicity Prompts Challenging', url='https://github.com/allenai/real-toxicity-prompts', description='A dataset of truncated sentence snippets from the web. 
Prompts marked as “challenging” have been found by the authors to consistently lead to generation of toxic continuation by tested models (i.e., GPT-1, GPT-2, GPT-3, CTRL, CTRL-WIKI).', size=1199), 'gigaword': DatasetDetails(name='Gigaword', url='https://huggingface.co/datasets/gigaword', description='A dataset with around 4 million news article headlines. Our dataset consists of 190k entries from its validation set.', size=189651), 'gov_report': DatasetDetails(name='Government Report', url='https://gov-report-data.github.io/', description='A dataset including a long-form summarization benchmark. It contains significantly longer documents (9.4k words) and summaries (553 words) than most existing datasets.', size=7238)}
TREX_DESCRIPTION_EXAMPLES = 'We convert these predicates to prompts, e.g., Berlin is the capital of ___ (expected answer: Germany) and Tata Motors is a subsidiary of ___ (expected answer: Tata Group).'
CROWS_PAIRS_DISCLAIMER = '**Disclaimer**: 1) The crowdsourced CrowS dataset is noisy. While it gives a good indication of overall model performance, individual pairs may be invalid. 2) CrowS measures U.S.-typical stereotypes. Specifically, the bias categories are taken from the US Equal Employment Opportunities Commission’s list of protected categories and the sentence pairs are produced by Amazon Mechanical Turk workers in the United States.'