fmeval.reporting.eval_output_cells

  1from typing import List, Optional, Any
  2import ray.data
  3from textwrap import shorten
  4import numpy as np
  5from fmeval.eval_algorithms import (
  6    EvalOutput,
  7    DATASET_CONFIGS,
  8    EvalAlgorithm,
  9    TREX,
 10    CROWS_PAIRS,
 11    get_default_prompt_template,
 12)
 13from fmeval.eval_algorithms.classification_accuracy import CLASSIFICATION_ACCURACY_SCORE
 14from fmeval.eval_algorithms.factual_knowledge import FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT
 15from fmeval.eval_algorithms.general_semantic_robustness import WER_SCORE
 16from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING
 17from fmeval.constants import DatasetColumns, DATASET_COLUMNS
 18from fmeval.reporting.cells import MarkdownCell, BarPlotCell, TableCell, BoldCell, HeadingCell
 19from fmeval.reporting.constants import (
 20    LEFT,
 21    CATEGORY_BAR_COLOR,
 22    OVERALL_BAR_COLOR,
 23    NUM_SAMPLES_TO_DISPLAY_IN_TABLE,
 24    DATASET_SCORE_LABEL,
 25    SCORE_DESCRIPTIONS,
 26    DATASET_DETAILS,
 27    TABLE_DESCRIPTION,
 28    WER_TABLE_DESCRIPTION,
 29    STEREOTYPING_TABLE_DESCRIPTION,
 30    FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION,
 31    TREX_DESCRIPTION_EXAMPLES,
 32    BUILT_IN_DATASET,
 33    CUSTOM_DATASET,
 34    AGGREGATE_ONLY_SCORES,
 35    MAX_CHAR,
 36    TOXICITY_EVAL_NAMES,
 37    TOXIGEN_NAME,
 38    DETOXIFY_NAME,
 39    CROWS_PAIRS_DISCLAIMER,
 40    PROBABILITY_RATIO,
 41    IS_BIASED,
 42    ACCURACY_SEMANTIC_ROBUSTNESS_SCORES,
 43    ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS,
 44    DETOXIFY_URI,
 45    TOXIGEN_URI,
 46)
 47from fmeval.reporting.util import format_dataset_name, format_string, add_hyperlink
 48from html import escape
 49
 50TABLE_COLUMNS = list(set(DATASET_COLUMNS)) + list(set(SCORE_DESCRIPTIONS.keys())) + [PROBABILITY_RATIO, IS_BIASED]
 51
 52
 53class CategoryBarPlotCell(BarPlotCell):
 54    """
 55    This class represents a bar plot that displays category-level and overall evaluation scores.
 56    """
 57
 58    def __init__(
 59        self,
 60        categories: List[str],
 61        scores: List[float],
 62        score_name: str,
 63        dataset_score: float,
 64        height: Optional[str] = None,
 65        width: Optional[str] = None,
 66        center: bool = True,
 67        origin: float = 0,
 68    ):
 69        """
 70        :param categories: The names of the categories.
 71        :param scores: The values of the category scores.
 72        :param score_name: The name of the score that was computed in the evaluation.
 73        :param dataset_score: The overall score for the dataset.
 74        :param height: Height of the plot as a string
 75        :param width: Width the plot as a string
 76        :param center: Boolean indicating if the plot should be center aligned in the page
 77        """
 78        labels = categories + [DATASET_SCORE_LABEL]
 79        heights = scores + [dataset_score]
 80        super().__init__(
 81            labels=labels,
 82            heights=heights,
 83            color=CategoryBarPlotCell._create_bar_plot_colors(labels),
 84            title=CategoryBarPlotCell._create_bar_plot_title(score_name),
 85            plot_height=height,
 86            plot_width=width,
 87            center=center,
 88            origin=origin,
 89        )
 90
 91    @staticmethod
 92    def _create_bar_plot_colors(category_names: List[str]) -> List[str]:
 93        """
 94        Returns a list of colors corresponding to the bars for each of the categories.
 95
 96        :param category_names: Includes "Overall" as the last category name
 97        :returns: A list of colors, where the kth element is the color
 98            of the bar corresponding to category_names[k]
 99        """
100        return [CATEGORY_BAR_COLOR for _ in range(len(category_names) - 1)] + [OVERALL_BAR_COLOR]
101
102    @staticmethod
103    def _create_bar_plot_title(evaluation_type: str) -> str:
104        """
105        Generates a bar plot title from the evaluation type.
106
107        :param evaluation_type: Ex - "Stereotyping"
108        :returns: A title to be used in the bar plot for category scores
109        """
110        return format_string(f"{evaluation_type}", as_title=True, as_score=True, as_plot_title=True)
111
112
class RayDatasetTableCell(TableCell):
    """
    A table cell rendering rows from a Ray Dataset, optionally sorted and truncated.
    """

    def __init__(
        self,
        dataset: ray.data.Dataset,
        col_to_sort: Optional[str] = None,
        k: Optional[int] = None,
        descending: bool = False,
        abs_val: bool = False,
        caption: Optional[str] = None,
        cell_align: str = LEFT,
    ):
        """
        :param dataset: The Ray Dataset that we create a TableCell out of.
        :param col_to_sort: The name of the column in the dataset to sort by.
        :param k: The number of samples from the dataset to display in the table.
        :param descending: Whether to sort in descending order.
        :param abs_val: Whether to sort by absolute value when sorting is enabled.
        :param caption: The caption text before the table.
        :param cell_align: The text alignment within cells.
        """
        if col_to_sort:
            assert (
                col_to_sort in dataset.columns()
            ), f"Column to be sorted `{col_to_sort}` is not present in dataset columns: {dataset.columns()}"
            if abs_val:
                # Ray's native sort cannot sort by absolute value, so round-trip through pandas.
                frame = dataset.to_pandas()
                frame = frame.sort_values(by=col_to_sort, key=abs, ascending=not descending)
                dataset = ray.data.from_pandas(frame)
            else:
                dataset = dataset.sort(col_to_sort, descending=descending)
        samples = dataset.take(k) if k else dataset.take_all()  # take() uses min(k, num samples in dataset)
        table_data = [RayDatasetTableCell.truncate_samples(list(row.values())) for row in samples]
        headers = dataset.columns()
        category_column = DatasetColumns.CATEGORY.value.name
        if category_column in headers:  # pragma: no branch
            # Move the category column to the front of the table for readability.
            idx = headers.index(category_column)
            table_data = [[row[idx], *row[:idx], *row[idx + 1 :]] for row in table_data]
            headers = [headers[idx], *headers[:idx], *headers[idx + 1 :]]
        headers = [format_string(header, as_column_name=True, as_title=True) for header in headers]
        super().__init__(data=table_data, headers=headers, cell_align=cell_align, caption=caption)

    @staticmethod
    def truncate_samples(samples: List[Any]) -> List[Any]:
        """
        Shorten long strings and round floats within a single table row.

        :param samples: List of items representing one row in the table.
        :return: Table row with strings longer than MAX_CHAR truncated and floats rounded.
        """

        def _truncate(item: Any) -> Any:
            # Long strings get shortened; floats get rounded; everything else passes through.
            if isinstance(item, str) and len(item) > MAX_CHAR:
                return shorten(item, MAX_CHAR)
            if isinstance(item, float):
                return np.round(item, decimals=6)
            return item

        return [_truncate(item) for item in samples]
172
173
class CategoryScoreCell(MarkdownCell):
    """
    Displays a bar plot of per-category scores and calls out the worst-performing category
    (for prompt stereotyping, the category with the score furthest from the neutral 0.5).
    """

    def __init__(self, categories: List[str], scores: List[float], score_name: str, dataset_score: float, n: int = 10):
        """
        :param categories: The names of the categories.
        :param scores: The values of the category scores.
        :param score_name: The name of the score that was computed in the evaluation.
        :param dataset_score: The overall score for the dataset.
        :param n: Max number of categories to display.
        """
        if len(categories) > n:
            note = f"The top {n} categories are displayed here. To view the remaining category scores, see the `output.json` file at your S3 output location."
        else:
            note = ""

        # Rank categories by score (descending) so the plot shows the top n.
        ranked = sorted(zip(scores, categories), reverse=True)
        sorted_scores = [score for score, _ in ranked]
        sorted_categories = [category for _, category in ranked]

        is_stereotyping = score_name == PROMPT_STEREOTYPING
        # Stereotyping scores are centered at 0.5, so the bars grow from that origin.
        bar_plot = CategoryBarPlotCell(
            sorted_categories[:n],
            sorted_scores[:n],
            score_name,
            dataset_score,
            height="70%",
            width="70%",
            origin=0.5 if is_stereotyping else 0,
        )

        if is_stereotyping:
            lowest_category = CategoryScoreCell._get_kth_category_score(
                categories, scores, reverse=True, origin=0.5, k=0
            )
            lowest_score_description = "The model stereotypes the most in the category"
        else:
            lowest_category = CategoryScoreCell._get_kth_category_score(categories, scores, k=0)
            lowest_score_description = "The model scores lowest in the category"
        lowest_category = escape(lowest_category)
        super().__init__(
            "The plot shows the score breakdown into individual categories.",
            note,
            bar_plot,
            f"{lowest_score_description} **{lowest_category}**. ",
        )

    @staticmethod
    def _get_kth_category_score(
        categories: List[str], scores: List[float], k: int = 0, reverse: bool = False, origin: float = 0
    ) -> str:
        """
        Return the name of the kth category when categories are ranked by score.

        :param categories: The names of the categories.
        :param scores: The values of the category scores.
        :param k: The index of the ranked category to return.
        :param reverse: Whether to sort in descending order.
        :param origin: The origin of the score values; scores are ranked by distance from it.
        """
        assert 0 <= k < len(categories), "The provided `k` argument is outside of the valid range"
        if origin != 0:
            # Rank by distance from the origin (e.g. 0.5 for stereotyping).
            scores = [abs(score - origin) for score in scores]
        ranked_categories = [category for _, category in sorted(zip(scores, categories), reverse=reverse)]
        return ranked_categories[k]
241
242
class ScoreTableCell(MarkdownCell):
    """
    This class generates two tables displaying the highest and lowest-scoring examples from a particular score.
    """

    def __init__(self, dataset: ray.data.Dataset, score_column_name: str, binary: Optional[bool] = False):
        """
        :param dataset: The Ray Dataset used in the evaluation task.
        :param score_column_name: The name of the score column in the dataset.
        :param binary: Boolean indicating if the score is binary.
        """
        # Pick the wording appropriate to the score type. (Restructured from nested
        # conditional expressions into explicit branches for readability.)
        if score_column_name == WER_SCORE:
            description = WER_TABLE_DESCRIPTION
        elif score_column_name == PROBABILITY_RATIO:
            description = STEREOTYPING_TABLE_DESCRIPTION
        elif binary:
            description = FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION
        else:
            description = TABLE_DESCRIPTION

        n_samples = min(NUM_SAMPLES_TO_DISPLAY_IN_TABLE, dataset.count())
        if score_column_name == PROBABILITY_RATIO:
            top_description = f"Top {n_samples} most stereotypical examples:"
            bottom_description = f"Top {n_samples} least stereotypical examples:"
        elif binary:
            top_description = f"{n_samples} correct examples:"
            bottom_description = f"{n_samples} incorrect examples:"
        else:
            top_description = f"Top {n_samples} examples with highest scores:"
            bottom_description = f"Bottom {n_samples} examples with lowest scores:"

        # Stereotyping probability ratios are compared by magnitude, so sort by absolute value.
        # (Was the redundant `True if ... else False`.)
        abs_val = score_column_name == PROBABILITY_RATIO

        cells = [
            MarkdownCell(description),
            RayDatasetTableCell(
                dataset,
                score_column_name,
                k=n_samples,
                descending=True,
                abs_val=abs_val,
                caption=top_description,
            ),
            RayDatasetTableCell(
                dataset,
                score_column_name,
                k=n_samples,
                descending=False,
                abs_val=abs_val,
                caption=bottom_description,
            ),
        ]
        super().__init__(*cells)
301
302
class ScoreCell(MarkdownCell):
    """
    This class generates visualizations for an evaluation score, including the overall dataset score, a bar plot
        displaying category-level scores if provided, and tables displaying highest and lowest scoring examples.
    """

    def __init__(
        self,
        dataset: Optional[ray.data.Dataset],
        score_name: str,
        score_column_name: str,
        dataset_score: float,
        categories: Optional[List[str]],
        category_scores: Optional[List[float]],
    ):
        """
        :param dataset: The Ray Dataset used in the evaluation task.
        :param score_name: The name of the score that was computed in the evaluation.
        :param score_column_name: The name of the score column in the dataset.
        :param dataset_score: The aggregated score computed across the whole dataset.
        :param categories: The names of the categories.
        :param category_scores: The values of the category scores.
        """
        # Word error rate is displayed without the " Score" suffix; all other scores get it.
        if score_name == WER_SCORE:
            score_name_display = format_string(score_name, as_title=True)
        else:
            score_name_display = format_string(score_name, as_title=True, as_score=True)

        cells = [
            HeadingCell(text=score_name_display, level=5),
            MarkdownCell(SCORE_DESCRIPTIONS[score_name]),
            BoldCell(f"Average Score: {dataset_score}"),
        ]
        if categories and category_scores:  # pragma: no branch
            cells.append(CategoryScoreCell(categories, category_scores, score_name, dataset_score))
        if dataset:  # pragma: no cover
            # WER tables omit the target output column.
            columns = [i for i in TABLE_COLUMNS if i != "target_output"] if score_name == WER_SCORE else TABLE_COLUMNS
            present_columns = [col for col in dataset.columns() if col in columns]
            dataset = dataset.select_columns(present_columns)
            # Was the redundant `True if ... else False`; the membership test is the boolean.
            is_binary_score = score_name in (
                FACTUAL_KNOWLEDGE,
                FACTUAL_KNOWLEDGE_QUASI_EXACT,
                CLASSIFICATION_ACCURACY_SCORE,
            )
            cells.append(ScoreTableCell(dataset, score_column_name, binary=is_binary_score))  # type: ignore
        super().__init__(*cells)
349
350
class EvalOutputCell(MarkdownCell):
    """
    Renders the full report section for one evaluation run: dataset heading, dataset
    description, prompt template, an optional toxicity detector note, and either an
    error message or one ScoreCell per reported score.
    """

    def __init__(
        self,
        eval_output: EvalOutput,
        dataset: Optional[ray.data.Dataset] = None,
        score_column_names: Optional[dict] = None,
    ):
        """
        :param eval_output: A EvalOutput object from an evaluation.
        :param dataset: The Ray dataset containing the evaluation scores.
        :param score_column_names: A dict mapping the score names and score column names for the evaluation.
        """
        dataset_type = BUILT_IN_DATASET if eval_output.dataset_name in DATASET_CONFIGS else CUSTOM_DATASET
        dataset_description = EvalOutputCell.get_dataset_description(
            dataset_name=eval_output.dataset_name,
            dataset_type=dataset_type,
            dataset=dataset,
            eval_name=eval_output.eval_name,
        )
        prompt_template = EvalOutputCell.format_prompt_template(
            dataset_type, eval_output.dataset_name, eval_output.prompt_template
        )
        # For toxicity evaluations, name the detector model: more than one dataset score
        # indicates Detoxify, exactly one indicates Toxigen. (Restructured from a nested
        # conditional expression for readability.)
        if eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) > 1:
            toxicity_detector_name = f"**Toxicity detector model**: {add_hyperlink(DETOXIFY_NAME, DETOXIFY_URI)}"
        elif eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) == 1:
            toxicity_detector_name = f"**Toxicity detector model**: {add_hyperlink(TOXIGEN_NAME, TOXIGEN_URI)}"
        else:
            toxicity_detector_name = ""

        eval_cells = [
            HeadingCell(f"{dataset_type}: {format_dataset_name(eval_output.dataset_name, hyperlink=True)}", level=4),
            MarkdownCell(dataset_description),
            MarkdownCell(prompt_template),
            MarkdownCell(toxicity_detector_name),
        ]
        if eval_output.error:
            error_cell = BoldCell(f"This evaluation failed with the error message: {eval_output.error}")
            eval_cells.append(error_cell)
        else:
            dataset_scores = {dataset_score.name: dataset_score.value for dataset_score in eval_output.dataset_scores}
            for score_name, dataset_score_value in dataset_scores.items():  # pragma: no cover
                # Scores listed in ACCURACY_SEMANTIC_ROBUSTNESS_SCORES are not reported for
                # the semantic-robustness accuracy algorithms.
                # (The previous `else` after `continue` was unnecessary and has been removed.)
                if (
                    eval_output.eval_name in ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS
                    and score_name in ACCURACY_SEMANTIC_ROBUSTNESS_SCORES
                ):
                    continue
                # Map each category name to its value for this particular score, when
                # category-level results exist.
                categories = (
                    {
                        category_score.name: score.value
                        for category_score in eval_output.category_scores
                        for score in category_score.scores
                        if score.name == score_name
                    }
                    if eval_output.category_scores
                    else None
                )
                # Prompt stereotyping tables are keyed on the probability-ratio column
                # rather than the score name itself.
                score_column_name = (
                    PROBABILITY_RATIO if score_name == EvalAlgorithm.PROMPT_STEREOTYPING.value else score_name
                )
                if score_name not in AGGREGATE_ONLY_SCORES:  # pragma: no branch
                    score_cell = ScoreCell(
                        dataset=dataset,
                        score_name=score_name,
                        score_column_name=score_column_name,
                        dataset_score=dataset_score_value,
                        categories=list(categories.keys()) if categories else None,
                        category_scores=list(categories.values()) if categories else None,
                    )
                    eval_cells.append(score_cell)

        super().__init__(*eval_cells)

    @staticmethod
    def get_dataset_sampling_description(dataset_name: str, dataset: ray.data.Dataset) -> str:
        """
        :param dataset_name: The name of the Ray dataset.
        :param dataset: The Ray dataset containing the evaluation scores.
        :return: String describing the number of samples used in the evaluation.
        """
        num_records = dataset.count()
        # Built-in datasets have a known full size; for other datasets, assume every record was used.
        total_records = DATASET_DETAILS[dataset_name].size if dataset_name in DATASET_DETAILS else num_records

        return f"We sampled {num_records} records out of {total_records} in the full dataset."

    @staticmethod
    def get_dataset_description(
        dataset_name: str, dataset_type: str, dataset: Optional[ray.data.Dataset], eval_name: Optional[str] = None
    ) -> str:
        """
        :param dataset_name: The name of the Ray dataset.
        :param dataset_type: Whether the dataset is a built-in or custom dataset.
        :param dataset: The Ray dataset containing the evaluation scores.
        :param eval_name: The name of the selected evaluation.
        :return: The description of the dataset, including the number of samples used in the evaluation.
        """
        dataset_sampling_description = (
            EvalOutputCell.get_dataset_sampling_description(dataset_name, dataset) if dataset else ""
        )
        # Custom datasets have no registered description; report only the sampling info.
        if dataset_type == CUSTOM_DATASET:
            return dataset_sampling_description

        base_description = DATASET_DETAILS[dataset_name].description
        # Restructured from a deeply nested conditional expression into explicit branches.
        if dataset_name == TREX and eval_name == EvalAlgorithm.FACTUAL_KNOWLEDGE.value:
            return base_description + TREX_DESCRIPTION_EXAMPLES + dataset_sampling_description
        if dataset_name == CROWS_PAIRS:
            return base_description + dataset_sampling_description + "\n\n" + CROWS_PAIRS_DISCLAIMER
        return base_description + " " + dataset_sampling_description

    @staticmethod
    def format_prompt_template(dataset_type: str, dataset_name: str, prompt_template: Optional[str] = None) -> str:
        """
        :param dataset_type: string indicating if dataset is a built-in or custom dataset.
        :param dataset_name: the name of the dataset.
        :param prompt_template: optional prompt template used in the evaluation.
        :return: prompt template string formatted for the report.
        """
        prompt_template_str = "**Prompt Template:** "
        if prompt_template:
            # HTML-escape user-supplied templates so any markup in them renders literally.
            return prompt_template_str + escape(prompt_template)
        if dataset_type == BUILT_IN_DATASET:
            return prompt_template_str + get_default_prompt_template(dataset_name)
        return prompt_template_str + "No prompt template was provided for this dataset."
TABLE_COLUMNS = ['sent_more_prompt', 'model_log_probability', 'sent_more_log_prob', 'category', 'context', 'error', 'model_input', 'sent_less_log_prob', 'sent_more_input', 'sent_less_prompt', 'model_output', 'prompt', 'sent_less_input', 'target_output', 'precision_score', 'delta_meteor', 'sexual_explicit', 'f1_score', 'bertscore', 'obscene', 'quasi_exact_match_score', 'factual_knowledge', 'prompt_stereotyping', 'classification_accuracy_score', 'recall_over_words', 'delta_exact_match_score', 'exact_match_score', 'delta_f1_score', 'toxicity', 'threat', 'word_error_rate', 'meteor', 'delta_rouge', 'delta_quasi_exact_match_score', 'factual_knowledge_quasi_exact', 'delta_bertscore', 'recall_score', 'bertscore_dissimilarity', 'delta_precision_over_words', 'delta_recall_over_words', 'rouge', 'severe_toxicity', 'balanced_accuracy_score', 'delta_classification_accuracy_score', 'identity_attack', 'precision_over_words', 'log_probability_difference', 'insult', '<math><box>p(S<sub>more</sub>)/p(S<sub>less</sub>)</box></math>', 'is_biased']
class CategoryBarPlotCell(fmeval.reporting.cells.BarPlotCell):
 54class CategoryBarPlotCell(BarPlotCell):
 55    """
 56    This class represents a bar plot that displays category-level and overall evaluation scores.
 57    """
 58
 59    def __init__(
 60        self,
 61        categories: List[str],
 62        scores: List[float],
 63        score_name: str,
 64        dataset_score: float,
 65        height: Optional[str] = None,
 66        width: Optional[str] = None,
 67        center: bool = True,
 68        origin: float = 0,
 69    ):
 70        """
 71        :param categories: The names of the categories.
 72        :param scores: The values of the category scores.
 73        :param score_name: The name of the score that was computed in the evaluation.
 74        :param dataset_score: The overall score for the dataset.
 75        :param height: Height of the plot as a string
 76        :param width: Width the plot as a string
 77        :param center: Boolean indicating if the plot should be center aligned in the page
 78        """
 79        labels = categories + [DATASET_SCORE_LABEL]
 80        heights = scores + [dataset_score]
 81        super().__init__(
 82            labels=labels,
 83            heights=heights,
 84            color=CategoryBarPlotCell._create_bar_plot_colors(labels),
 85            title=CategoryBarPlotCell._create_bar_plot_title(score_name),
 86            plot_height=height,
 87            plot_width=width,
 88            center=center,
 89            origin=origin,
 90        )
 91
 92    @staticmethod
 93    def _create_bar_plot_colors(category_names: List[str]) -> List[str]:
 94        """
 95        Returns a list of colors corresponding to the bars for each of the categories.
 96
 97        :param category_names: Includes "Overall" as the last category name
 98        :returns: A list of colors, where the kth element is the color
 99            of the bar corresponding to category_names[k]
100        """
101        return [CATEGORY_BAR_COLOR for _ in range(len(category_names) - 1)] + [OVERALL_BAR_COLOR]
102
103    @staticmethod
104    def _create_bar_plot_title(evaluation_type: str) -> str:
105        """
106        Generates a bar plot title from the evaluation type.
107
108        :param evaluation_type: Ex - "Stereotyping"
109        :returns: A title to be used in the bar plot for category scores
110        """
111        return format_string(f"{evaluation_type}", as_title=True, as_score=True, as_plot_title=True)

This class represents a bar plot that displays category-level and overall evaluation scores.

CategoryBarPlotCell( categories: List[str], scores: List[float], score_name: str, dataset_score: float, height: Optional[str] = None, width: Optional[str] = None, center: bool = True, origin: float = 0)
59    def __init__(
60        self,
61        categories: List[str],
62        scores: List[float],
63        score_name: str,
64        dataset_score: float,
65        height: Optional[str] = None,
66        width: Optional[str] = None,
67        center: bool = True,
68        origin: float = 0,
69    ):
70        """
71        :param categories: The names of the categories.
72        :param scores: The values of the category scores.
73        :param score_name: The name of the score that was computed in the evaluation.
74        :param dataset_score: The overall score for the dataset.
75        :param height: Height of the plot as a string
76        :param width: Width the plot as a string
77        :param center: Boolean indicating if the plot should be center aligned in the page
78        """
79        labels = categories + [DATASET_SCORE_LABEL]
80        heights = scores + [dataset_score]
81        super().__init__(
82            labels=labels,
83            heights=heights,
84            color=CategoryBarPlotCell._create_bar_plot_colors(labels),
85            title=CategoryBarPlotCell._create_bar_plot_title(score_name),
86            plot_height=height,
87            plot_width=width,
88            center=center,
89            origin=origin,
90        )
Parameters
  • categories: The names of the categories.
  • scores: The values of the category scores.
  • score_name: The name of the score that was computed in the evaluation.
  • dataset_score: The overall score for the dataset.
  • height: Height of the plot as a string
  • width: Width the plot as a string
  • center: Boolean indicating if the plot should be center aligned in the page
class RayDatasetTableCell(fmeval.reporting.cells.TableCell):
114class RayDatasetTableCell(TableCell):
115    """
116    This class represents a table that displays data from a Ray Dataset object.
117    """
118
119    def __init__(
120        self,
121        dataset: ray.data.Dataset,
122        col_to_sort: Optional[str] = None,
123        k: Optional[int] = None,
124        descending: bool = False,
125        abs_val: bool = False,
126        caption: Optional[str] = None,
127        cell_align: str = LEFT,
128    ):
129        """
130        :param dataset: The Ray Dataset that we create a TableCell out of
131        :param col_to_sort: The name of the column in the dataset to sort by
132        :param k: The number of samples from the dataset to display in the table
133        :param descending: Whether to sort in descending order.
134        :param abs_val: Whether to sort by absolute value when sorting is enabled.
135        :param caption: The caption text before the table.
136        :param cell_align: The text alignment within cells.
137        """
138        if col_to_sort:
139            assert (
140                col_to_sort in dataset.columns()
141            ), f"Column to be sorted `{col_to_sort}` is not present in dataset columns: {dataset.columns()}"
142            if abs_val:
143                pd_dataset = dataset.to_pandas()
144                pd_dataset = pd_dataset.sort_values(by=col_to_sort, key=abs, ascending=not descending)
145                dataset = ray.data.from_pandas(pd_dataset)
146            else:
147                dataset = dataset.sort(col_to_sort, descending=descending)
148        samples = dataset.take(k) if k else dataset.take_all()  # take() uses min(k, num samples in dataset)
149        table_data = [RayDatasetTableCell.truncate_samples(list(sample.values())) for sample in samples]
150        headers = dataset.columns()
151        if DatasetColumns.CATEGORY.value.name in headers:  # pragma: no branch
152            category_idx = headers.index(DatasetColumns.CATEGORY.value.name)
153            table_data = [[row[category_idx]] + row[:category_idx] + row[category_idx + 1 :] for row in table_data]
154            headers = [headers[category_idx]] + headers[:category_idx] + headers[category_idx + 1 :]
155        headers = [format_string(header, as_column_name=True, as_title=True) for header in headers]
156        super().__init__(data=table_data, headers=headers, cell_align=cell_align, caption=caption)
157
158    @staticmethod
159    def truncate_samples(samples: List[Any]) -> List[Any]:
160        """
161        :param samples: List of items representing one row in the table.
162        :return: Table row with strings longer than MAX_CHAR truncated.
163        """
164        truncated_samples = [
165            shorten(sample, MAX_CHAR)
166            if isinstance(sample, str) and len(sample) > MAX_CHAR
167            else np.round(sample, decimals=6)
168            if isinstance(sample, float)
169            else sample
170            for sample in samples
171        ]
172        return truncated_samples

This class represents a table that displays data from a Ray Dataset object.

RayDatasetTableCell( dataset: ray.data.dataset.Dataset, col_to_sort: Optional[str] = None, k: Optional[int] = None, descending: bool = False, abs_val: bool = False, caption: Optional[str] = None, cell_align: str = 'left')
119    def __init__(
120        self,
121        dataset: ray.data.Dataset,
122        col_to_sort: Optional[str] = None,
123        k: Optional[int] = None,
124        descending: bool = False,
125        abs_val: bool = False,
126        caption: Optional[str] = None,
127        cell_align: str = LEFT,
128    ):
129        """
130        :param dataset: The Ray Dataset that we create a TableCell out of
131        :param col_to_sort: The name of the column in the dataset to sort by
132        :param k: The number of samples from the dataset to display in the table
133        :param descending: Whether to sort in descending order.
134        :param abs_val: Whether to sort by absolute value when sorting is enabled.
135        :param caption: The caption text before the table.
136        :param cell_align: The text alignment within cells.
137        """
138        if col_to_sort:
139            assert (
140                col_to_sort in dataset.columns()
141            ), f"Column to be sorted `{col_to_sort}` is not present in dataset columns: {dataset.columns()}"
142            if abs_val:
143                pd_dataset = dataset.to_pandas()
144                pd_dataset = pd_dataset.sort_values(by=col_to_sort, key=abs, ascending=not descending)
145                dataset = ray.data.from_pandas(pd_dataset)
146            else:
147                dataset = dataset.sort(col_to_sort, descending=descending)
148        samples = dataset.take(k) if k else dataset.take_all()  # take() uses min(k, num samples in dataset)
149        table_data = [RayDatasetTableCell.truncate_samples(list(sample.values())) for sample in samples]
150        headers = dataset.columns()
151        if DatasetColumns.CATEGORY.value.name in headers:  # pragma: no branch
152            category_idx = headers.index(DatasetColumns.CATEGORY.value.name)
153            table_data = [[row[category_idx]] + row[:category_idx] + row[category_idx + 1 :] for row in table_data]
154            headers = [headers[category_idx]] + headers[:category_idx] + headers[category_idx + 1 :]
155        headers = [format_string(header, as_column_name=True, as_title=True) for header in headers]
156        super().__init__(data=table_data, headers=headers, cell_align=cell_align, caption=caption)
Parameters
  • dataset: The Ray Dataset that we create a TableCell out of
  • col_to_sort: The name of the column in the dataset to sort by
  • k: The number of samples from the dataset to display in the table
  • descending: Whether to sort in descending order.
  • abs_val: Whether to sort by absolute value when sorting is enabled.
  • caption: The caption text before the table.
  • cell_align: The text alignment within cells.
@staticmethod
def truncate_samples(samples: List[Any]) -> List[Any]:
158    @staticmethod
159    def truncate_samples(samples: List[Any]) -> List[Any]:
160        """
161        :param samples: List of items representing one row in the table.
162        :return: Table row with strings longer than MAX_CHAR truncated.
163        """
164        truncated_samples = [
165            shorten(sample, MAX_CHAR)
166            if isinstance(sample, str) and len(sample) > MAX_CHAR
167            else np.round(sample, decimals=6)
168            if isinstance(sample, float)
169            else sample
170            for sample in samples
171        ]
172        return truncated_samples
Parameters
  • samples: List of items representing one row in the table.
Returns

Table row with strings longer than MAX_CHAR truncated.

class CategoryScoreCell(fmeval.reporting.cells.MarkdownCell):
class CategoryScoreCell(MarkdownCell):
    """
    This class displays a bar plot for the different category scores from an evaluation, and outlines the lowest
        scoring category.
    """

    def __init__(self, categories: List[str], scores: List[float], score_name: str, dataset_score: float, n: int = 10):
        """
        :param categories: The names of the categories.
        :param scores: The values of the category scores.
        :param score_name: The name of the score that was computed in the evaluation.
        :param dataset_score: The overall score for the dataset.
        :param n: Max number of categories to display.
        """
        note = ""
        if len(categories) > n:
            note = f"The top {n} categories are displayed here. To view the remaining category scores, see the `output.json` file at your S3 output location."

        # Sort categories by score, highest first, keeping scores and names aligned.
        sorted_pairs = sorted(zip(scores, categories), reverse=True)
        sorted_scores, sorted_categories = (list(column) for column in zip(*sorted_pairs))

        # Prompt stereotyping scores are plotted around 0.5 rather than 0.
        is_stereotyping = score_name == PROMPT_STEREOTYPING
        bar_plot = CategoryBarPlotCell(
            sorted_categories[:n],
            sorted_scores[:n],
            score_name,
            dataset_score,
            height="70%",
            width="70%",
            origin=0.5 if is_stereotyping else 0,
        )

        if is_stereotyping:
            lowest_category = CategoryScoreCell._get_kth_category_score(
                categories, scores, reverse=True, origin=0.5, k=0
            )
            lowest_score_description = "The model stereotypes the most in the category"
        else:
            lowest_category = CategoryScoreCell._get_kth_category_score(categories, scores, k=0)
            lowest_score_description = "The model scores lowest in the category"
        lowest_category = escape(lowest_category)

        super().__init__(
            "The plot shows the score breakdown into individual categories.",
            note,
            bar_plot,
            f"{lowest_score_description} **{lowest_category}**. ",
        )

    @staticmethod
    def _get_kth_category_score(
        categories: List[str], scores: List[float], k: int = 0, reverse: bool = False, origin: float = 0
    ) -> str:
        """
        Sorts the categories by their scores and returns the name of the kth category in sorted order.

        :param categories: The names of the categories.
        :param scores: The values of the category scores.
        :param k: The index of the CategoryScore to return
        :param reverse: Whether to sort in descending order
        :param origin: The origin of the score values.
        """
        assert 0 <= k < len(categories), "The provided `k` argument is outside of the valid range"
        if origin != 0:
            # Rank by distance from the origin instead of the raw value.
            scores = [abs(score - origin) for score in scores]
        ranked = sorted(zip(scores, categories), reverse=reverse)
        return ranked[k][1]

This class displays a bar plot for the different category scores from an evaluation, and outlines the lowest scoring category.

CategoryScoreCell( categories: List[str], scores: List[float], score_name: str, dataset_score: float, n: int = 10)
181    def __init__(self, categories: List[str], scores: List[float], score_name: str, dataset_score: float, n: int = 10):
182        """
183        :param categories: The names of the categories.
184        :param scores: The values of the category scores.
185        :param score_name: The name of the score that was computed in the evaluation.
186        :param dataset_score: The overall score for the dataset.
187        :param n: Max number of categories to display.
188        """
189
190        note = (
191            f"The top {n} categories are displayed here. To view the remaining category scores, see the `output.json` file at your S3 output location."
192            if len(categories) > n
193            else ""
194        )
195        sorted_scores, sorted_categories = (list(l) for l in zip(*sorted(zip(scores, categories), reverse=True)))
196        bar_plot_origin = 0.5 if score_name == PROMPT_STEREOTYPING else 0
197        bar_plot = CategoryBarPlotCell(
198            sorted_categories[:n],
199            sorted_scores[:n],
200            score_name,
201            dataset_score,
202            height="70%",
203            width="70%",
204            origin=bar_plot_origin,
205        )
206
207        lowest_category = (
208            CategoryScoreCell._get_kth_category_score(categories, scores, reverse=True, origin=0.5, k=0)
209            if score_name == PROMPT_STEREOTYPING
210            else CategoryScoreCell._get_kth_category_score(categories, scores, k=0)
211        )
212        lowest_category = escape(lowest_category)
213        lowest_score_description = (
214            "The model stereotypes the most in the category"
215            if score_name == PROMPT_STEREOTYPING
216            else "The model scores lowest in the category"
217        )
218        super().__init__(
219            f"The plot shows the score breakdown into individual categories.",
220            note,
221            bar_plot,
222            f"{lowest_score_description} **{lowest_category}**. ",
223        )
Parameters
  • categories: The names of the categories.
  • scores: The values of the category scores.
  • score_name: The name of the score that was computed in the evaluation.
  • dataset_score: The overall score for the dataset.
  • n: Max number of categories to display.
class ScoreTableCell(fmeval.reporting.cells.MarkdownCell):
class ScoreTableCell(MarkdownCell):
    """
    This class generates two tables displaying the highest and lowest-scoring examples from a particular score.
    """

    def __init__(self, dataset: ray.data.Dataset, score_column_name: str, binary: Optional[bool] = False):
        """
        :param dataset: The Ray Dataset used in the evaluation task.
        :param score_column_name: The name of the score column in the dataset.
        :param binary: Boolean indicating if the score is binary.
        """
        # Choose the description matching the score type. Flat if/elif chains
        # replace the original triple-nested conditional expressions.
        if score_column_name == WER_SCORE:
            description = WER_TABLE_DESCRIPTION
        elif score_column_name == PROBABILITY_RATIO:
            description = STEREOTYPING_TABLE_DESCRIPTION
        elif binary:
            description = FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION
        else:
            description = TABLE_DESCRIPTION

        n_samples = min(NUM_SAMPLES_TO_DISPLAY_IN_TABLE, dataset.count())
        if score_column_name == PROBABILITY_RATIO:
            top_description = f"Top {n_samples} most stereotypical examples:"
            bottom_description = f"Top {n_samples} least stereotypical examples:"
        elif binary:
            top_description = f"{n_samples} correct examples:"
            bottom_description = f"{n_samples} incorrect examples:"
        else:
            top_description = f"Top {n_samples} examples with highest scores:"
            bottom_description = f"Bottom {n_samples} examples with lowest scores:"

        # Probability-ratio scores are ranked by absolute value.
        # (Idiomatic boolean expression instead of `True if ... else False`.)
        abs_val = score_column_name == PROBABILITY_RATIO

        cells = [
            MarkdownCell(description),
            RayDatasetTableCell(
                dataset,
                score_column_name,
                k=n_samples,
                descending=True,
                abs_val=abs_val,
                caption=top_description,
            ),
            RayDatasetTableCell(
                dataset,
                score_column_name,
                k=n_samples,
                descending=False,
                abs_val=abs_val,
                caption=bottom_description,
            ),
        ]
        super().__init__(*cells)

This class generates two tables displaying the highest and lowest-scoring examples from a particular score.

ScoreTableCell( dataset: ray.data.dataset.Dataset, score_column_name: str, binary: Optional[bool] = False)
249    def __init__(self, dataset: ray.data.Dataset, score_column_name: str, binary: Optional[bool] = False):
250        """
251        :param dataset: The Ray Dataset used in the evaluation task.
252        :param score_column_name: The name of the score column in the dataset.
253        :param binary: Boolean indicating if the score is binary.
254        """
255        description = (
256            WER_TABLE_DESCRIPTION
257            if score_column_name == WER_SCORE
258            else STEREOTYPING_TABLE_DESCRIPTION
259            if score_column_name == PROBABILITY_RATIO
260            else FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION
261            if binary
262            else TABLE_DESCRIPTION
263        )
264
265        n_samples = min(NUM_SAMPLES_TO_DISPLAY_IN_TABLE, dataset.count())
266        top_description = (
267            (f"Top {n_samples} most stereotypical examples:")
268            if score_column_name == PROBABILITY_RATIO
269            else f"{n_samples} correct examples:"
270            if binary
271            else f"Top {n_samples} examples with highest scores:"
272        )
273        bottom_description = (
274            (f"Top {n_samples} least stereotypical examples:")
275            if score_column_name == PROBABILITY_RATIO
276            else f"{n_samples} incorrect examples:"
277            if binary
278            else f"Bottom {n_samples} examples with lowest scores:"
279        )
280        abs_val = True if score_column_name == PROBABILITY_RATIO else False
281
282        cells = [
283            MarkdownCell(description),
284            RayDatasetTableCell(
285                dataset,
286                score_column_name,
287                k=n_samples,
288                descending=True,
289                abs_val=abs_val,
290                caption=top_description,
291            ),
292            RayDatasetTableCell(
293                dataset,
294                score_column_name,
295                k=n_samples,
296                descending=False,
297                abs_val=abs_val,
298                caption=bottom_description,
299            ),
300        ]
301        super().__init__(*cells)
Parameters
  • dataset: The Ray Dataset used in the evaluation task.
  • score_column_name: The name of the score column in the dataset.
  • binary: Boolean indicating if the score is binary.
class ScoreCell(fmeval.reporting.cells.MarkdownCell):
class ScoreCell(MarkdownCell):
    """
    This class generates visualizations for an evaluation score, including the overall dataset score, a bar plot
        displaying category-level scores if provided, and tables displaying highest and lowest scoring examples.
    """

    def __init__(
        self,
        dataset: Optional[ray.data.Dataset],
        score_name: str,
        score_column_name: str,
        dataset_score: float,
        categories: Optional[List[str]],
        category_scores: Optional[List[float]],
    ):
        """
        :param dataset: The Ray Dataset used in the evaluation task.
        :param score_name: The name of the score that was computed in the evaluation.
        :param score_column_name: The name of the score column in the dataset.
        :param dataset_score: The aggregated score computed across the whole dataset.
        :param categories: The names of the categories.
        :param category_scores: The values of the category scores.
        """
        # WER gets only title formatting; all other scores also get score formatting.
        score_name_display = (
            format_string(score_name, as_title=True)
            if score_name == WER_SCORE
            else format_string(score_name, as_title=True, as_score=True)
        )
        cells = [
            HeadingCell(text=score_name_display, level=5),
            MarkdownCell(SCORE_DESCRIPTIONS[score_name]),
            BoldCell(f"Average Score: {dataset_score}"),
        ]
        if categories and category_scores:  # pragma: no branch
            cells.append(CategoryScoreCell(categories, category_scores, score_name, dataset_score))
        if dataset:  # pragma: no cover
            # The target output column is excluded from WER tables.
            columns = [i for i in TABLE_COLUMNS if i != "target_output"] if score_name == WER_SCORE else TABLE_COLUMNS
            present_columns = [col for col in dataset.columns() if col in columns]
            dataset = dataset.select_columns(present_columns)
            # Idiomatic membership test on a tuple instead of `True if ... in [...] else False`.
            is_binary_score = score_name in (
                FACTUAL_KNOWLEDGE,
                FACTUAL_KNOWLEDGE_QUASI_EXACT,
                CLASSIFICATION_ACCURACY_SCORE,
            )
            cells.append(ScoreTableCell(dataset, score_column_name, binary=is_binary_score))  # type: ignore
        super().__init__(*cells)

This class generates visualizations for an evaluation score, including the overall dataset score, a bar plot displaying category-level scores if provided, and tables displaying highest and lowest scoring examples.

ScoreCell( dataset: Optional[ray.data.dataset.Dataset], score_name: str, score_column_name: str, dataset_score: float, categories: Optional[List[str]], category_scores: Optional[List[float]])
310    def __init__(
311        self,
312        dataset: Optional[ray.data.Dataset],
313        score_name: str,
314        score_column_name: str,
315        dataset_score: float,
316        categories: Optional[List[str]],
317        category_scores: Optional[List[float]],
318    ):
319        """
320        :param dataset: The Ray Dataset used in the evaluation task.
321        :param score_name: The name of the score that was computed in the evaluation.
322        :param score_column_name: The name of the score column in the dataset.
323        :param dataset_score: The aggregated score computed across the whole dataset.
324        :param categories: The names of the categories.
325        :param category_scores: The values of the category scores.
326        """
327        score_name_display = (
328            format_string(score_name, as_title=True)
329            if score_name == WER_SCORE
330            else format_string(score_name, as_title=True, as_score=True)
331        )
332        cells = [
333            HeadingCell(text=score_name_display, level=5),
334            MarkdownCell(SCORE_DESCRIPTIONS[score_name]),
335            BoldCell(f"Average Score: {dataset_score}"),
336        ]
337        if categories and category_scores:  # pragma: no branch
338            cells.append(CategoryScoreCell(categories, category_scores, score_name, dataset_score))
339        if dataset:  # pragma: no cover
340            columns = [i for i in TABLE_COLUMNS if i != "target_output"] if score_name == WER_SCORE else TABLE_COLUMNS
341            present_columns = [col for col in dataset.columns() if col in columns]
342            dataset = dataset.select_columns(present_columns)
343            is_binary_score = (
344                True
345                if score_name in [FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT, CLASSIFICATION_ACCURACY_SCORE]
346                else False
347            )
348            cells.append(ScoreTableCell(dataset, score_column_name, binary=is_binary_score))  # type: ignore
349        super().__init__(*cells)
Parameters
  • dataset: The Ray Dataset used in the evaluation task.
  • score_name: The name of the score that was computed in the evaluation.
  • score_column_name: The name of the score column in the dataset.
  • dataset_score: The aggregated score computed across the whole dataset.
  • categories: The names of the categories.
  • category_scores: The values of the category scores.
class EvalOutputCell(fmeval.reporting.cells.MarkdownCell):
class EvalOutputCell(MarkdownCell):
    def __init__(
        self,
        eval_output: EvalOutput,
        dataset: Optional[ray.data.Dataset] = None,
        score_column_names: Optional[dict] = None,
    ):
        """
        :param eval_output: An EvalOutput object from an evaluation.
        :param dataset: The Ray dataset containing the evaluation scores.
        :param score_column_names: A dict mapping the score names and score column names for the evaluation.
        """
        dataset_type = BUILT_IN_DATASET if eval_output.dataset_name in DATASET_CONFIGS else CUSTOM_DATASET
        dataset_description = EvalOutputCell.get_dataset_description(
            dataset_name=eval_output.dataset_name,
            dataset_type=dataset_type,
            dataset=dataset,
            eval_name=eval_output.eval_name,
        )
        prompt_template = EvalOutputCell.format_prompt_template(
            dataset_type, eval_output.dataset_name, eval_output.prompt_template
        )
        # Toxicity evaluations name the detector model that produced the scores:
        # multiple dataset scores -> Detoxify, exactly one -> Toxigen.
        toxicity_detector_name = ""
        if eval_output.eval_name in TOXICITY_EVAL_NAMES:
            if len(eval_output.dataset_scores) > 1:
                toxicity_detector_name = f"**Toxicity detector model**: {add_hyperlink(DETOXIFY_NAME, DETOXIFY_URI)}"
            elif len(eval_output.dataset_scores) == 1:
                toxicity_detector_name = f"**Toxicity detector model**: {add_hyperlink(TOXIGEN_NAME, TOXIGEN_URI)}"

        eval_cells = [
            HeadingCell(f"{dataset_type}: {format_dataset_name(eval_output.dataset_name, hyperlink=True)}", level=4),
            MarkdownCell(dataset_description),
            MarkdownCell(prompt_template),
            MarkdownCell(toxicity_detector_name),
        ]
        if eval_output.error:
            eval_cells.append(BoldCell(f"This evaluation failed with the error message: {eval_output.error}"))
        else:
            dataset_scores = {dataset_score.name: dataset_score.value for dataset_score in eval_output.dataset_scores}
            for score_name, dataset_score_value in dataset_scores.items():  # pragma: no cover
                # Skip accuracy scores reported by the semantic robustness algorithms.
                if (
                    eval_output.eval_name in ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS
                    and score_name in ACCURACY_SEMANTIC_ROBUSTNESS_SCORES
                ):
                    continue
                categories = None
                if eval_output.category_scores:
                    categories = {
                        category_score.name: score.value
                        for category_score in eval_output.category_scores
                        for score in category_score.scores
                        if score.name == score_name
                    }
                score_column_name = (
                    PROBABILITY_RATIO if score_name == EvalAlgorithm.PROMPT_STEREOTYPING.value else score_name
                )
                if score_name not in AGGREGATE_ONLY_SCORES:  # pragma: no branch
                    eval_cells.append(
                        ScoreCell(
                            dataset=dataset,
                            score_name=score_name,
                            score_column_name=score_column_name,
                            dataset_score=dataset_score_value,
                            categories=list(categories.keys()) if categories else None,
                            category_scores=list(categories.values()) if categories else None,
                        )
                    )

        super().__init__(*eval_cells)

    @staticmethod
    def get_dataset_sampling_description(dataset_name: str, dataset: ray.data.Dataset) -> str:
        """
        :param dataset_name: The name of the Ray dataset.
        :param dataset: The Ray dataset containing the evaluation scores.
        :return: String describing the number of samples used in the evaluation.
        """
        sampled_count = dataset.count()
        # For unknown (custom) datasets, fall back to the sampled count as the total.
        if dataset_name in DATASET_DETAILS:
            full_size = DATASET_DETAILS[dataset_name].size
        else:
            full_size = sampled_count
        return f"We sampled {sampled_count} records out of {full_size} in the full dataset."

    @staticmethod
    def get_dataset_description(
        dataset_name: str, dataset_type: str, dataset: Optional[ray.data.Dataset], eval_name: Optional[str] = None
    ) -> str:
        """
        :param dataset_name: The name of the Ray dataset.
        :param dataset_type: Whether the dataset is a built-in or custom dataset.
        :param dataset: The Ray dataset containing the evaluation scores.
        :param eval_name: The name of the selected evaluation.
        :return: The description of the dataset, including the number of samples used in the evaluation.
        """
        sampling_description = (
            EvalOutputCell.get_dataset_sampling_description(dataset_name, dataset) if dataset else ""
        )
        # Custom datasets have no built-in description to prepend.
        if dataset_type == CUSTOM_DATASET:
            return sampling_description
        base_description = DATASET_DETAILS[dataset_name].description
        if dataset_name == TREX and eval_name == EvalAlgorithm.FACTUAL_KNOWLEDGE.value:
            return base_description + TREX_DESCRIPTION_EXAMPLES + sampling_description
        if dataset_name == CROWS_PAIRS:
            return base_description + sampling_description + "\n\n" + CROWS_PAIRS_DISCLAIMER
        return base_description + " " + sampling_description

    @staticmethod
    def format_prompt_template(dataset_type: str, dataset_name: str, prompt_template: Optional[str] = None) -> str:
        """
        :param dataset_type: string indicating if dataset is a built-in or custom dataset.
        :param dataset_name: the name of the dataset.
        :param prompt_template: optional prompt template used in the evaluation.
        :return: prompt template string formatted for the report.
        """
        prefix = "**Prompt Template:** "
        # HTML-escape user-supplied templates so they render literally in the report.
        if prompt_template:
            return prefix + escape(prompt_template)
        if dataset_type == BUILT_IN_DATASET:
            return prefix + get_default_prompt_template(dataset_name)
        return prefix + "No prompt template was provided for this dataset."

Base class representing a markdown cell.

EvalOutputCell( eval_output: fmeval.eval_algorithms.EvalOutput, dataset: Optional[ray.data.dataset.Dataset] = None, score_column_names: Optional[dict] = None)
353    def __init__(
354        self,
355        eval_output: EvalOutput,
356        dataset: Optional[ray.data.Dataset] = None,
357        score_column_names: Optional[dict] = None,
358    ):
359        """
360        :param eval_output: An EvalOutput object from an evaluation.
361        :param dataset: The Ray dataset containing the evaluation scores.
362        :param score_column_names: A dict mapping the score names and score column names for the evaluation.
363        """
364        dataset_type = BUILT_IN_DATASET if eval_output.dataset_name in DATASET_CONFIGS else CUSTOM_DATASET
365        dataset_description = EvalOutputCell.get_dataset_description(
366            dataset_name=eval_output.dataset_name,
367            dataset_type=dataset_type,
368            dataset=dataset,
369            eval_name=eval_output.eval_name,
370        )
371        prompt_template = EvalOutputCell.format_prompt_template(
372            dataset_type, eval_output.dataset_name, eval_output.prompt_template
373        )
374        toxicity_detector_name = (
375            f"**Toxicity detector model**: {add_hyperlink(DETOXIFY_NAME, DETOXIFY_URI)}"
376            if eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) > 1
377            else f"**Toxicity detector model**: {add_hyperlink(TOXIGEN_NAME, TOXIGEN_URI)}"
378            if eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) == 1
379            else ""
380        )
381
382        eval_cells = [
383            HeadingCell(f"{dataset_type}: {format_dataset_name(eval_output.dataset_name, hyperlink=True)}", level=4),
384            MarkdownCell(dataset_description),
385            MarkdownCell(prompt_template),
386            MarkdownCell(toxicity_detector_name),
387        ]
388        if eval_output.error:
389            error_cell = BoldCell(f"This evaluation failed with the error message: {eval_output.error}")
390            eval_cells.append(error_cell)
391        else:
392            dataset_scores = {dataset_score.name: dataset_score.value for dataset_score in eval_output.dataset_scores}
393            for score_name, dataset_score_value in dataset_scores.items():  # pragma: no cover
394                if (
395                    eval_output.eval_name in ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS
396                    and score_name in ACCURACY_SEMANTIC_ROBUSTNESS_SCORES
397                ):
398                    continue
399                else:
400                    categories = (
401                        {
402                            category_score.name: score.value
403                            for category_score in eval_output.category_scores
404                            for score in category_score.scores
405                            if score.name == score_name
406                        }
407                        if eval_output.category_scores
408                        else None
409                    )
410                    score_column_name = (
411                        PROBABILITY_RATIO if score_name == EvalAlgorithm.PROMPT_STEREOTYPING.value else score_name
412                    )
413                    if score_name not in AGGREGATE_ONLY_SCORES:  # pragma: no branch
414                        score_cell = ScoreCell(
415                            dataset=dataset,
416                            score_name=score_name,
417                            score_column_name=score_column_name,
418                            dataset_score=dataset_score_value,
419                            categories=list(categories.keys()) if categories else None,
420                            category_scores=list(categories.values()) if categories else None,
421                        )
422                        eval_cells.append(score_cell)
423
424        super().__init__(*eval_cells)
Parameters
  • eval_output: An EvalOutput object from an evaluation.
  • dataset: The Ray dataset containing the evaluation scores.
  • score_column_names: A dict mapping the score names and score column names for the evaluation.
@staticmethod
def get_dataset_sampling_description(dataset_name: str, dataset: ray.data.dataset.Dataset) -> str:
426    @staticmethod
427    def get_dataset_sampling_description(dataset_name: str, dataset: ray.data.Dataset) -> str:
428        """
429        :param dataset_name: The name of the Ray dataset.
430        :param dataset: The Ray dataset containing the evaluation scores.
431        :return: String describing the number of samples used in the evaluation.
432        """
433        num_records = dataset.count()
434        total_records = DATASET_DETAILS[dataset_name].size if dataset_name in DATASET_DETAILS else num_records
435
436        return f"We sampled {num_records} records out of {total_records} in the full dataset."
Parameters
  • dataset_name: The name of the Ray dataset.
  • dataset: The Ray dataset containing the evaluation scores.
Returns

String describing the number of samples used in the evaluation.

@staticmethod
def get_dataset_description( dataset_name: str, dataset_type: str, dataset: Optional[ray.data.dataset.Dataset], eval_name: Optional[str] = None) -> str:
438    @staticmethod
439    def get_dataset_description(
440        dataset_name: str, dataset_type: str, dataset: Optional[ray.data.Dataset], eval_name: Optional[str] = None
441    ) -> str:
442        """
443        :param dataset_name: The name of the Ray dataset.
444        :param dataset_type: Whether the dataset is a built-in or custom dataset.
445        :param dataset: The Ray dataset containing the evaluation scores.
446        :param eval_name: The name of the selected evaluation.
447        :return: The description of the dataset, including the number of samples used in the evaluation.
448        """
449
450        dataset_sampling_description = (
451            EvalOutputCell.get_dataset_sampling_description(dataset_name, dataset) if dataset else ""
452        )
453        if dataset_type == CUSTOM_DATASET:
454            return dataset_sampling_description
455        else:
456
457            dataset_description = (
458                DATASET_DETAILS[dataset_name].description + TREX_DESCRIPTION_EXAMPLES + dataset_sampling_description
459                if dataset_name == TREX and eval_name == EvalAlgorithm.FACTUAL_KNOWLEDGE.value
460                else DATASET_DETAILS[dataset_name].description
461                + dataset_sampling_description
462                + "\n\n"
463                + CROWS_PAIRS_DISCLAIMER
464                if dataset_name == CROWS_PAIRS
465                else DATASET_DETAILS[dataset_name].description + " " + dataset_sampling_description
466            )
467            return dataset_description
Parameters
  • dataset_name: The name of the Ray dataset.
  • dataset_type: Whether the dataset is a built-in or custom dataset.
  • dataset: The Ray dataset containing the evaluation scores.
  • eval_name: The name of the selected evaluation.
Returns

The description of the dataset, including the number of samples used in the evaluation.

@staticmethod
def format_prompt_template( dataset_type: str, dataset_name: str, prompt_template: Optional[str] = None) -> str:
469    @staticmethod
470    def format_prompt_template(dataset_type: str, dataset_name: str, prompt_template: Optional[str] = None) -> str:
471        """
472        :param dataset_type: string indicating if dataset is a built-in or custom dataset.
473        :param dataset_name: the name of the dataset.
474        :param prompt_template: optional prompt template used in the evaluation.
475        :return: prompt template string formatted for the report.
476        """
477        prompt_template_str = "**Prompt Template:** "
478        if prompt_template:
479            return prompt_template_str + escape(prompt_template)
480        elif dataset_type == BUILT_IN_DATASET:
481            return prompt_template_str + get_default_prompt_template(dataset_name)
482        else:
483            return prompt_template_str + "No prompt template was provided for this dataset."
Parameters
  • dataset_type: string indicating if dataset is a built-in or custom dataset.
  • dataset_name: the name of the dataset.
  • prompt_template: optional prompt template used in the evaluation.
Returns

prompt template string formatted for the report.