fmeval.reporting.eval_output_cells
"""
Report cells for rendering evaluation outputs: bar plots of category scores,
tables of highest/lowest scoring examples, and the top-level per-dataset cell.
"""
from typing import List, Optional, Any
import ray.data
from textwrap import shorten
import numpy as np
from fmeval.eval_algorithms import (
    EvalOutput,
    DATASET_CONFIGS,
    EvalAlgorithm,
    TREX,
    CROWS_PAIRS,
    get_default_prompt_template,
)
from fmeval.eval_algorithms.classification_accuracy import CLASSIFICATION_ACCURACY_SCORE
from fmeval.eval_algorithms.factual_knowledge import FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT
from fmeval.eval_algorithms.general_semantic_robustness import WER_SCORE
from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING
from fmeval.constants import DatasetColumns, DATASET_COLUMNS
from fmeval.reporting.cells import MarkdownCell, BarPlotCell, TableCell, BoldCell, HeadingCell
from fmeval.reporting.constants import (
    LEFT,
    CATEGORY_BAR_COLOR,
    OVERALL_BAR_COLOR,
    NUM_SAMPLES_TO_DISPLAY_IN_TABLE,
    DATASET_SCORE_LABEL,
    SCORE_DESCRIPTIONS,
    DATASET_DETAILS,
    TABLE_DESCRIPTION,
    WER_TABLE_DESCRIPTION,
    STEREOTYPING_TABLE_DESCRIPTION,
    FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION,
    TREX_DESCRIPTION_EXAMPLES,
    BUILT_IN_DATASET,
    CUSTOM_DATASET,
    AGGREGATE_ONLY_SCORES,
    MAX_CHAR,
    TOXICITY_EVAL_NAMES,
    TOXIGEN_NAME,
    DETOXIFY_NAME,
    CROWS_PAIRS_DISCLAIMER,
    PROBABILITY_RATIO,
    IS_BIASED,
    ACCURACY_SEMANTIC_ROBUSTNESS_SCORES,
    ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS,
    DETOXIFY_URI,
    TOXIGEN_URI,
)
from fmeval.reporting.util import format_dataset_name, format_string, add_hyperlink
from html import escape

# Columns eligible for display in example tables: dataset columns, score columns,
# plus the prompt-stereotyping-specific columns.
# NOTE(review): list(set(...)) does not guarantee a stable column order across runs — confirm
# downstream rendering does not depend on ordering of TABLE_COLUMNS.
TABLE_COLUMNS = list(set(DATASET_COLUMNS)) + list(set(SCORE_DESCRIPTIONS.keys())) + [PROBABILITY_RATIO, IS_BIASED]


class CategoryBarPlotCell(BarPlotCell):
    """
    This class represents a bar plot that displays category-level and overall evaluation scores.
    """

    def __init__(
        self,
        categories: List[str],
        scores: List[float],
        score_name: str,
        dataset_score: float,
        height: Optional[str] = None,
        width: Optional[str] = None,
        center: bool = True,
        origin: float = 0,
    ):
        """
        :param categories: The names of the categories.
        :param scores: The values of the category scores.
        :param score_name: The name of the score that was computed in the evaluation.
        :param dataset_score: The overall score for the dataset.
        :param height: Height of the plot as a string
        :param width: Width of the plot as a string
        :param center: Boolean indicating if the plot should be center aligned in the page
        :param origin: Baseline value that bars are drawn from (e.g. 0.5 is used for
            prompt stereotyping scores, where 0.5 means "no stereotyping").
        """
        # The aggregate dataset score is appended as one extra bar after the category bars.
        labels = categories + [DATASET_SCORE_LABEL]
        heights = scores + [dataset_score]
        super().__init__(
            labels=labels,
            heights=heights,
            color=CategoryBarPlotCell._create_bar_plot_colors(labels),
            title=CategoryBarPlotCell._create_bar_plot_title(score_name),
            plot_height=height,
            plot_width=width,
            center=center,
            origin=origin,
        )

    @staticmethod
    def _create_bar_plot_colors(category_names: List[str]) -> List[str]:
        """
        Returns a list of colors corresponding to the bars for each of the categories.

        :param category_names: Includes "Overall" as the last category name
        :returns: A list of colors, where the kth element is the color
            of the bar corresponding to category_names[k]
        """
        # All category bars share one color; the final (overall) bar gets a distinct color.
        return [CATEGORY_BAR_COLOR for _ in range(len(category_names) - 1)] + [OVERALL_BAR_COLOR]

    @staticmethod
    def _create_bar_plot_title(evaluation_type: str) -> str:
        """
        Generates a bar plot title from the evaluation type.

        :param evaluation_type: Ex - "Stereotyping"
        :returns: A title to be used in the bar plot for category scores
        """
        return format_string(f"{evaluation_type}", as_title=True, as_score=True, as_plot_title=True)


class RayDatasetTableCell(TableCell):
    """
    This class represents a table that displays data from a Ray Dataset object.
    """

    def __init__(
        self,
        dataset: ray.data.Dataset,
        col_to_sort: Optional[str] = None,
        k: Optional[int] = None,
        descending: bool = False,
        abs_val: bool = False,
        caption: Optional[str] = None,
        cell_align: str = LEFT,
    ):
        """
        :param dataset: The Ray Dataset that we create a TableCell out of
        :param col_to_sort: The name of the column in the dataset to sort by
        :param k: The number of samples from the dataset to display in the table
        :param descending: Whether to sort in descending order.
        :param abs_val: Whether to sort by absolute value when sorting is enabled.
        :param caption: The caption text before the table.
        :param cell_align: The text alignment within cells.
        """
        if col_to_sort:
            assert (
                col_to_sort in dataset.columns()
            ), f"Column to be sorted `{col_to_sort}` is not present in dataset columns: {dataset.columns()}"
            if abs_val:
                # Ray's sort has no key function, so round-trip through pandas to sort
                # by the absolute value of the column.
                pd_dataset = dataset.to_pandas()
                pd_dataset = pd_dataset.sort_values(by=col_to_sort, key=abs, ascending=not descending)
                dataset = ray.data.from_pandas(pd_dataset)
            else:
                dataset = dataset.sort(col_to_sort, descending=descending)
        samples = dataset.take(k) if k else dataset.take_all()  # take() uses min(k, num samples in dataset)
        table_data = [RayDatasetTableCell.truncate_samples(list(sample.values())) for sample in samples]
        headers = dataset.columns()
        if DatasetColumns.CATEGORY.value.name in headers:  # pragma: no branch
            # Move the category column to the front of the table for readability.
            category_idx = headers.index(DatasetColumns.CATEGORY.value.name)
            table_data = [[row[category_idx]] + row[:category_idx] + row[category_idx + 1 :] for row in table_data]
            headers = [headers[category_idx]] + headers[:category_idx] + headers[category_idx + 1 :]
        headers = [format_string(header, as_column_name=True, as_title=True) for header in headers]
        super().__init__(data=table_data, headers=headers, cell_align=cell_align, caption=caption)

    @staticmethod
    def truncate_samples(samples: List[Any]) -> List[Any]:
        """
        :param samples: List of items representing one row in the table.
        :return: Table row with strings longer than MAX_CHAR truncated and floats
            rounded to 6 decimal places; all other values are passed through unchanged.
        """
        truncated_samples = [
            shorten(sample, MAX_CHAR)
            if isinstance(sample, str) and len(sample) > MAX_CHAR
            else np.round(sample, decimals=6)
            if isinstance(sample, float)
            else sample
            for sample in samples
        ]
        return truncated_samples


class CategoryScoreCell(MarkdownCell):
    """
    This class displays a bar plot for the different category scores from an evaluation, and outlines the lowest
    scoring category.
    """

    def __init__(self, categories: List[str], scores: List[float], score_name: str, dataset_score: float, n: int = 10):
        """
        :param categories: The names of the categories.
        :param scores: The values of the category scores.
        :param score_name: The name of the score that was computed in the evaluation.
        :param dataset_score: The overall score for the dataset.
        :param n: Max number of categories to display.
        """

        note = (
            f"The top {n} categories are displayed here. To view the remaining category scores, see the `output.json` file at your S3 output location."
            if len(categories) > n
            else ""
        )
        # Sort categories by score (descending) so the plot shows the top n scoring categories.
        sorted_scores, sorted_categories = (list(l) for l in zip(*sorted(zip(scores, categories), reverse=True)))
        # Prompt stereotyping scores are centered on 0.5 (no stereotyping), so bars
        # are drawn from 0.5 rather than from 0.
        bar_plot_origin = 0.5 if score_name == PROMPT_STEREOTYPING else 0
        bar_plot = CategoryBarPlotCell(
            sorted_categories[:n],
            sorted_scores[:n],
            score_name,
            dataset_score,
            height="70%",
            width="70%",
            origin=bar_plot_origin,
        )

        # For prompt stereotyping, the "worst" category is the one farthest from 0.5;
        # for all other scores it is simply the lowest-scoring category.
        lowest_category = (
            CategoryScoreCell._get_kth_category_score(categories, scores, reverse=True, origin=0.5, k=0)
            if score_name == PROMPT_STEREOTYPING
            else CategoryScoreCell._get_kth_category_score(categories, scores, k=0)
        )
        lowest_category = escape(lowest_category)
        lowest_score_description = (
            "The model stereotypes the most in the category"
            if score_name == PROMPT_STEREOTYPING
            else "The model scores lowest in the category"
        )
        super().__init__(
            f"The plot shows the score breakdown into individual categories.",
            note,
            bar_plot,
            f"{lowest_score_description} **{lowest_category}**. ",
        )

    @staticmethod
    def _get_kth_category_score(
        categories: List[str], scores: List[float], k: int = 0, reverse: bool = False, origin: float = 0
    ) -> str:
        """
        Sorts `category_scores` by their `score` attribute and returns the kth element in the sorted list.

        :param categories: The names of the categories.
        :param scores: The values of the category scores.
        :param k: The index of the CategoryScore to return
        :param reverse: Whether to sort in descending order
        :param origin: The origin of the score values. When non-zero, categories are
            ranked by distance from the origin rather than by raw score.
        """
        assert 0 <= k < len(categories), "The provided `k` argument is outside of the valid range"
        scores = [abs(score - origin) for score in scores] if origin != 0 else scores
        sorted_categories = [cat for score, cat in sorted(zip(scores, categories), reverse=reverse)]
        return sorted_categories[k]


class ScoreTableCell(MarkdownCell):
    """
    This class generates two tables displaying the highest and lowest-scoring examples from a particular score.
    """

    def __init__(self, dataset: ray.data.Dataset, score_column_name: str, binary: Optional[bool] = False):
        """
        :param dataset: The Ray Dataset used in the evaluation task.
        :param score_column_name: The name of the score column in the dataset.
        :param binary: Boolean indicating if the score is binary.
        """
        # Pick the table description matching the score type:
        # WER, stereotyping probability ratio, binary (correct/incorrect), or generic.
        description = (
            WER_TABLE_DESCRIPTION
            if score_column_name == WER_SCORE
            else STEREOTYPING_TABLE_DESCRIPTION
            if score_column_name == PROBABILITY_RATIO
            else FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION
            if binary
            else TABLE_DESCRIPTION
        )

        n_samples = min(NUM_SAMPLES_TO_DISPLAY_IN_TABLE, dataset.count())
        top_description = (
            (f"Top {n_samples} most stereotypical examples:")
            if score_column_name == PROBABILITY_RATIO
            else f"{n_samples} correct examples:"
            if binary
            else f"Top {n_samples} examples with highest scores:"
        )
        bottom_description = (
            (f"Top {n_samples} least stereotypical examples:")
            if score_column_name == PROBABILITY_RATIO
            else f"{n_samples} incorrect examples:"
            if binary
            else f"Bottom {n_samples} examples with lowest scores:"
        )
        # Probability ratios are ranked by distance from the origin, so sort by absolute value.
        abs_val = True if score_column_name == PROBABILITY_RATIO else False

        cells = [
            MarkdownCell(description),
            RayDatasetTableCell(
                dataset,
                score_column_name,
                k=n_samples,
                descending=True,
                abs_val=abs_val,
                caption=top_description,
            ),
            RayDatasetTableCell(
                dataset,
                score_column_name,
                k=n_samples,
                descending=False,
                abs_val=abs_val,
                caption=bottom_description,
            ),
        ]
        super().__init__(*cells)


class ScoreCell(MarkdownCell):
    """
    This class generates visualizations for an evaluation score, including the overall dataset score, a bar plot
    displaying category-level scores if provided, and tables displaying highest and lowest scoring examples.
    """

    def __init__(
        self,
        dataset: Optional[ray.data.Dataset],
        score_name: str,
        score_column_name: str,
        dataset_score: float,
        categories: Optional[List[str]],
        category_scores: Optional[List[float]],
    ):
        """
        :param dataset: The Ray Dataset used in the evaluation task.
        :param score_name: The name of the score that was computed in the evaluation.
        :param score_column_name: The name of the score column in the dataset.
        :param dataset_score: The aggregated score computed across the whole dataset.
        :param categories: The names of the categories.
        :param category_scores: The values of the category scores.
        """
        score_name_display = (
            format_string(score_name, as_title=True)
            if score_name == WER_SCORE
            else format_string(score_name, as_title=True, as_score=True)
        )
        cells = [
            HeadingCell(text=score_name_display, level=5),
            MarkdownCell(SCORE_DESCRIPTIONS[score_name]),
            BoldCell(f"Average Score: {dataset_score}"),
        ]
        if categories and category_scores:  # pragma: no branch
            cells.append(CategoryScoreCell(categories, category_scores, score_name, dataset_score))
        if dataset:  # pragma: no cover
            # WER tables omit the target_output column; all other scores show every
            # table-eligible column present in the dataset.
            columns = [i for i in TABLE_COLUMNS if i != "target_output"] if score_name == WER_SCORE else TABLE_COLUMNS
            present_columns = [col for col in dataset.columns() if col in columns]
            dataset = dataset.select_columns(present_columns)
            is_binary_score = (
                True
                if score_name in [FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT, CLASSIFICATION_ACCURACY_SCORE]
                else False
            )
            cells.append(ScoreTableCell(dataset, score_column_name, binary=is_binary_score))  # type: ignore
        super().__init__(*cells)


class EvalOutputCell(MarkdownCell):
    """
    Top-level report cell for one evaluation run: dataset heading and description,
    prompt template, optional toxicity-detector note, and one ScoreCell per score.
    """

    def __init__(
        self,
        eval_output: EvalOutput,
        dataset: Optional[ray.data.Dataset] = None,
        score_column_names: Optional[dict] = None,
    ):
        """
        :param eval_output: An EvalOutput object from an evaluation.
        :param dataset: The Ray dataset containing the evaluation scores.
        :param score_column_names: A dict mapping the score names and score column names for the evaluation.
        """
        dataset_type = BUILT_IN_DATASET if eval_output.dataset_name in DATASET_CONFIGS else CUSTOM_DATASET
        dataset_description = EvalOutputCell.get_dataset_description(
            dataset_name=eval_output.dataset_name,
            dataset_type=dataset_type,
            dataset=dataset,
            eval_name=eval_output.eval_name,
        )
        prompt_template = EvalOutputCell.format_prompt_template(
            dataset_type, eval_output.dataset_name, eval_output.prompt_template
        )
        # Toxicity evals with multiple scores use Detoxify; with a single score, ToxiGen.
        toxicity_detector_name = (
            f"**Toxicity detector model**: {add_hyperlink(DETOXIFY_NAME, DETOXIFY_URI)}"
            if eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) > 1
            else f"**Toxicity detector model**: {add_hyperlink(TOXIGEN_NAME, TOXIGEN_URI)}"
            if eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) == 1
            else ""
        )

        eval_cells = [
            HeadingCell(f"{dataset_type}: {format_dataset_name(eval_output.dataset_name, hyperlink=True)}", level=4),
            MarkdownCell(dataset_description),
            MarkdownCell(prompt_template),
            MarkdownCell(toxicity_detector_name),
        ]
        if eval_output.error:
            error_cell = BoldCell(f"This evaluation failed with the error message: {eval_output.error}")
            eval_cells.append(error_cell)
        else:
            dataset_scores = {dataset_score.name: dataset_score.value for dataset_score in eval_output.dataset_scores}
            for score_name, dataset_score_value in dataset_scores.items():  # pragma: no cover
                # Accuracy scores of semantic-robustness algorithms are skipped here;
                # only the robustness (delta) scores are reported.
                if (
                    eval_output.eval_name in ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS
                    and score_name in ACCURACY_SEMANTIC_ROBUSTNESS_SCORES
                ):
                    continue
                else:
                    # Collect this score's value per category, if category scores exist.
                    categories = (
                        {
                            category_score.name: score.value
                            for category_score in eval_output.category_scores
                            for score in category_score.scores
                            if score.name == score_name
                        }
                        if eval_output.category_scores
                        else None
                    )
                    # Prompt stereotyping examples are sorted/displayed by probability
                    # ratio rather than the aggregate score column.
                    score_column_name = (
                        PROBABILITY_RATIO if score_name == EvalAlgorithm.PROMPT_STEREOTYPING.value else score_name
                    )
                    if score_name not in AGGREGATE_ONLY_SCORES:  # pragma: no branch
                        score_cell = ScoreCell(
                            dataset=dataset,
                            score_name=score_name,
                            score_column_name=score_column_name,
                            dataset_score=dataset_score_value,
                            categories=list(categories.keys()) if categories else None,
                            category_scores=list(categories.values()) if categories else None,
                        )
                        eval_cells.append(score_cell)

        super().__init__(*eval_cells)

    @staticmethod
    def get_dataset_sampling_description(dataset_name: str, dataset: ray.data.Dataset) -> str:
        """
        :param dataset_name: The name of the Ray dataset.
        :param dataset: The Ray dataset containing the evaluation scores.
        :return: String describing the number of samples used in the evaluation.
        """
        num_records = dataset.count()
        # Custom datasets have no registered size; fall back to the sampled count.
        total_records = DATASET_DETAILS[dataset_name].size if dataset_name in DATASET_DETAILS else num_records

        return f"We sampled {num_records} records out of {total_records} in the full dataset."

    @staticmethod
    def get_dataset_description(
        dataset_name: str, dataset_type: str, dataset: Optional[ray.data.Dataset], eval_name: Optional[str] = None
    ) -> str:
        """
        :param dataset_name: The name of the Ray dataset.
        :param dataset_type: Whether the dataset is a built-in or custom dataset.
        :param dataset: The Ray dataset containing the evaluation scores.
        :param eval_name: The name of the selected evaluation.
        :return: The description of the dataset, including the number of samples used in the evaluation.
        """

        dataset_sampling_description = (
            EvalOutputCell.get_dataset_sampling_description(dataset_name, dataset) if dataset else ""
        )
        if dataset_type == CUSTOM_DATASET:
            # Custom datasets have no registered description; only the sampling note applies.
            return dataset_sampling_description
        else:

            # TREX + factual knowledge gets extra examples; CROWS-Pairs gets a disclaimer.
            dataset_description = (
                DATASET_DETAILS[dataset_name].description + TREX_DESCRIPTION_EXAMPLES + dataset_sampling_description
                if dataset_name == TREX and eval_name == EvalAlgorithm.FACTUAL_KNOWLEDGE.value
                else DATASET_DETAILS[dataset_name].description
                + dataset_sampling_description
                + "\n\n"
                + CROWS_PAIRS_DISCLAIMER
                if dataset_name == CROWS_PAIRS
                else DATASET_DETAILS[dataset_name].description + " " + dataset_sampling_description
            )
            return dataset_description

    @staticmethod
    def format_prompt_template(dataset_type: str, dataset_name: str, prompt_template: Optional[str] = None) -> str:
        """
        :param dataset_type: string indicating if dataset is a built-in or custom dataset.
        :param dataset_name: the name of the dataset.
        :param prompt_template: optional prompt template used in the evaluation.
        :return: prompt template string formatted for the report.
        """
        prompt_template_str = "**Prompt Template:** "
        if prompt_template:
            # escape() prevents user-supplied template text from being rendered as HTML.
            return prompt_template_str + escape(prompt_template)
        elif dataset_type == BUILT_IN_DATASET:
            return prompt_template_str + get_default_prompt_template(dataset_name)
        else:
            return prompt_template_str + "No prompt template was provided for this dataset."
54class CategoryBarPlotCell(BarPlotCell): 55 """ 56 This class represents a bar plot that displays category-level and overall evaluation scores. 57 """ 58 59 def __init__( 60 self, 61 categories: List[str], 62 scores: List[float], 63 score_name: str, 64 dataset_score: float, 65 height: Optional[str] = None, 66 width: Optional[str] = None, 67 center: bool = True, 68 origin: float = 0, 69 ): 70 """ 71 :param categories: The names of the categories. 72 :param scores: The values of the category scores. 73 :param score_name: The name of the score that was computed in the evaluation. 74 :param dataset_score: The overall score for the dataset. 75 :param height: Height of the plot as a string 76 :param width: Width the plot as a string 77 :param center: Boolean indicating if the plot should be center aligned in the page 78 """ 79 labels = categories + [DATASET_SCORE_LABEL] 80 heights = scores + [dataset_score] 81 super().__init__( 82 labels=labels, 83 heights=heights, 84 color=CategoryBarPlotCell._create_bar_plot_colors(labels), 85 title=CategoryBarPlotCell._create_bar_plot_title(score_name), 86 plot_height=height, 87 plot_width=width, 88 center=center, 89 origin=origin, 90 ) 91 92 @staticmethod 93 def _create_bar_plot_colors(category_names: List[str]) -> List[str]: 94 """ 95 Returns a list of colors corresponding to the bars for each of the categories. 96 97 :param category_names: Includes "Overall" as the last category name 98 :returns: A list of colors, where the kth element is the color 99 of the bar corresponding to category_names[k] 100 """ 101 return [CATEGORY_BAR_COLOR for _ in range(len(category_names) - 1)] + [OVERALL_BAR_COLOR] 102 103 @staticmethod 104 def _create_bar_plot_title(evaluation_type: str) -> str: 105 """ 106 Generates a bar plot title from the evaluation type. 
107 108 :param evaluation_type: Ex - "Stereotyping" 109 :returns: A title to be used in the bar plot for category scores 110 """ 111 return format_string(f"{evaluation_type}", as_title=True, as_score=True, as_plot_title=True)
This class represents a bar plot that displays category-level and overall evaluation scores.
59 def __init__( 60 self, 61 categories: List[str], 62 scores: List[float], 63 score_name: str, 64 dataset_score: float, 65 height: Optional[str] = None, 66 width: Optional[str] = None, 67 center: bool = True, 68 origin: float = 0, 69 ): 70 """ 71 :param categories: The names of the categories. 72 :param scores: The values of the category scores. 73 :param score_name: The name of the score that was computed in the evaluation. 74 :param dataset_score: The overall score for the dataset. 75 :param height: Height of the plot as a string 76 :param width: Width the plot as a string 77 :param center: Boolean indicating if the plot should be center aligned in the page 78 """ 79 labels = categories + [DATASET_SCORE_LABEL] 80 heights = scores + [dataset_score] 81 super().__init__( 82 labels=labels, 83 heights=heights, 84 color=CategoryBarPlotCell._create_bar_plot_colors(labels), 85 title=CategoryBarPlotCell._create_bar_plot_title(score_name), 86 plot_height=height, 87 plot_width=width, 88 center=center, 89 origin=origin, 90 )
Parameters
- categories: The names of the categories.
- scores: The values of the category scores.
- score_name: The name of the score that was computed in the evaluation.
- dataset_score: The overall score for the dataset.
- height: Height of the plot as a string
- width: Width of the plot as a string
- center: Boolean indicating if the plot should be center aligned in the page
Inherited Members
114class RayDatasetTableCell(TableCell): 115 """ 116 This class represents a table that displays data from a Ray Dataset object. 117 """ 118 119 def __init__( 120 self, 121 dataset: ray.data.Dataset, 122 col_to_sort: Optional[str] = None, 123 k: Optional[int] = None, 124 descending: bool = False, 125 abs_val: bool = False, 126 caption: Optional[str] = None, 127 cell_align: str = LEFT, 128 ): 129 """ 130 :param dataset: The Ray Dataset that we create a TableCell out of 131 :param col_to_sort: The name of the column in the dataset to sort by 132 :param k: The number of samples from the dataset to display in the table 133 :param descending: Whether to sort in descending order. 134 :param abs_val: Whether to sort by absolute value when sorting is enabled. 135 :param caption: The caption text before the table. 136 :param cell_align: The text alignment within cells. 137 """ 138 if col_to_sort: 139 assert ( 140 col_to_sort in dataset.columns() 141 ), f"Column to be sorted `{col_to_sort}` is not present in dataset columns: {dataset.columns()}" 142 if abs_val: 143 pd_dataset = dataset.to_pandas() 144 pd_dataset = pd_dataset.sort_values(by=col_to_sort, key=abs, ascending=not descending) 145 dataset = ray.data.from_pandas(pd_dataset) 146 else: 147 dataset = dataset.sort(col_to_sort, descending=descending) 148 samples = dataset.take(k) if k else dataset.take_all() # take() uses min(k, num samples in dataset) 149 table_data = [RayDatasetTableCell.truncate_samples(list(sample.values())) for sample in samples] 150 headers = dataset.columns() 151 if DatasetColumns.CATEGORY.value.name in headers: # pragma: no branch 152 category_idx = headers.index(DatasetColumns.CATEGORY.value.name) 153 table_data = [[row[category_idx]] + row[:category_idx] + row[category_idx + 1 :] for row in table_data] 154 headers = [headers[category_idx]] + headers[:category_idx] + headers[category_idx + 1 :] 155 headers = [format_string(header, as_column_name=True, as_title=True) for header in headers] 156 
super().__init__(data=table_data, headers=headers, cell_align=cell_align, caption=caption) 157 158 @staticmethod 159 def truncate_samples(samples: List[Any]) -> List[Any]: 160 """ 161 :param samples: List of items representing one row in the table. 162 :return: Table row with strings longer than MAX_CHAR truncated. 163 """ 164 truncated_samples = [ 165 shorten(sample, MAX_CHAR) 166 if isinstance(sample, str) and len(sample) > MAX_CHAR 167 else np.round(sample, decimals=6) 168 if isinstance(sample, float) 169 else sample 170 for sample in samples 171 ] 172 return truncated_samples
This class represents a table that displays data from a Ray Dataset object.
119 def __init__( 120 self, 121 dataset: ray.data.Dataset, 122 col_to_sort: Optional[str] = None, 123 k: Optional[int] = None, 124 descending: bool = False, 125 abs_val: bool = False, 126 caption: Optional[str] = None, 127 cell_align: str = LEFT, 128 ): 129 """ 130 :param dataset: The Ray Dataset that we create a TableCell out of 131 :param col_to_sort: The name of the column in the dataset to sort by 132 :param k: The number of samples from the dataset to display in the table 133 :param descending: Whether to sort in descending order. 134 :param abs_val: Whether to sort by absolute value when sorting is enabled. 135 :param caption: The caption text before the table. 136 :param cell_align: The text alignment within cells. 137 """ 138 if col_to_sort: 139 assert ( 140 col_to_sort in dataset.columns() 141 ), f"Column to be sorted `{col_to_sort}` is not present in dataset columns: {dataset.columns()}" 142 if abs_val: 143 pd_dataset = dataset.to_pandas() 144 pd_dataset = pd_dataset.sort_values(by=col_to_sort, key=abs, ascending=not descending) 145 dataset = ray.data.from_pandas(pd_dataset) 146 else: 147 dataset = dataset.sort(col_to_sort, descending=descending) 148 samples = dataset.take(k) if k else dataset.take_all() # take() uses min(k, num samples in dataset) 149 table_data = [RayDatasetTableCell.truncate_samples(list(sample.values())) for sample in samples] 150 headers = dataset.columns() 151 if DatasetColumns.CATEGORY.value.name in headers: # pragma: no branch 152 category_idx = headers.index(DatasetColumns.CATEGORY.value.name) 153 table_data = [[row[category_idx]] + row[:category_idx] + row[category_idx + 1 :] for row in table_data] 154 headers = [headers[category_idx]] + headers[:category_idx] + headers[category_idx + 1 :] 155 headers = [format_string(header, as_column_name=True, as_title=True) for header in headers] 156 super().__init__(data=table_data, headers=headers, cell_align=cell_align, caption=caption)
Parameters
- dataset: The Ray Dataset that we create a TableCell out of
- col_to_sort: The name of the column in the dataset to sort by
- k: The number of samples from the dataset to display in the table
- descending: Whether to sort in descending order.
- abs_val: Whether to sort by absolute value when sorting is enabled.
- caption: The caption text before the table.
- cell_align: The text alignment within cells.
158 @staticmethod 159 def truncate_samples(samples: List[Any]) -> List[Any]: 160 """ 161 :param samples: List of items representing one row in the table. 162 :return: Table row with strings longer than MAX_CHAR truncated. 163 """ 164 truncated_samples = [ 165 shorten(sample, MAX_CHAR) 166 if isinstance(sample, str) and len(sample) > MAX_CHAR 167 else np.round(sample, decimals=6) 168 if isinstance(sample, float) 169 else sample 170 for sample in samples 171 ] 172 return truncated_samples
Parameters
- samples: List of items representing one row in the table.
Returns
Table row with strings longer than MAX_CHAR truncated.
Inherited Members
175class CategoryScoreCell(MarkdownCell): 176 """ 177 This class displays a bar plot for the different category scores from an evaluation, and outlines the lowest 178 scoring category. 179 """ 180 181 def __init__(self, categories: List[str], scores: List[float], score_name: str, dataset_score: float, n: int = 10): 182 """ 183 :param categories: The names of the categories. 184 :param scores: The values of the category scores. 185 :param score_name: The name of the score that was computed in the evaluation. 186 :param dataset_score: The overall score for the dataset. 187 :param n: Max number of categories to display. 188 """ 189 190 note = ( 191 f"The top {n} categories are displayed here. To view the remaining category scores, see the `output.json` file at your S3 output location." 192 if len(categories) > n 193 else "" 194 ) 195 sorted_scores, sorted_categories = (list(l) for l in zip(*sorted(zip(scores, categories), reverse=True))) 196 bar_plot_origin = 0.5 if score_name == PROMPT_STEREOTYPING else 0 197 bar_plot = CategoryBarPlotCell( 198 sorted_categories[:n], 199 sorted_scores[:n], 200 score_name, 201 dataset_score, 202 height="70%", 203 width="70%", 204 origin=bar_plot_origin, 205 ) 206 207 lowest_category = ( 208 CategoryScoreCell._get_kth_category_score(categories, scores, reverse=True, origin=0.5, k=0) 209 if score_name == PROMPT_STEREOTYPING 210 else CategoryScoreCell._get_kth_category_score(categories, scores, k=0) 211 ) 212 lowest_category = escape(lowest_category) 213 lowest_score_description = ( 214 "The model stereotypes the most in the category" 215 if score_name == PROMPT_STEREOTYPING 216 else "The model scores lowest in the category" 217 ) 218 super().__init__( 219 f"The plot shows the score breakdown into individual categories.", 220 note, 221 bar_plot, 222 f"{lowest_score_description} **{lowest_category}**. 
", 223 ) 224 225 @staticmethod 226 def _get_kth_category_score( 227 categories: List[str], scores: List[float], k: int = 0, reverse: bool = False, origin: float = 0 228 ) -> str: 229 """ 230 Sorts `category_scores` by their `score` attribute and returns the kth element in the sorted list. 231 232 :param categories: The names of the categories. 233 :param scores: The values of the category scores. 234 :param k: The index of the CategoryScore to return 235 :param reverse: Whether to sort in descending order 236 :param origin: The origin of the score values. 237 """ 238 assert 0 <= k < len(categories), "The provided `k` argument is outside of the valid range" 239 scores = [abs(score - origin) for score in scores] if origin != 0 else scores 240 sorted_categories = [cat for score, cat in sorted(zip(scores, categories), reverse=reverse)] 241 return sorted_categories[k]
This class displays a bar plot for the different category scores from an evaluation, and outlines the lowest scoring category.
181 def __init__(self, categories: List[str], scores: List[float], score_name: str, dataset_score: float, n: int = 10): 182 """ 183 :param categories: The names of the categories. 184 :param scores: The values of the category scores. 185 :param score_name: The name of the score that was computed in the evaluation. 186 :param dataset_score: The overall score for the dataset. 187 :param n: Max number of categories to display. 188 """ 189 190 note = ( 191 f"The top {n} categories are displayed here. To view the remaining category scores, see the `output.json` file at your S3 output location." 192 if len(categories) > n 193 else "" 194 ) 195 sorted_scores, sorted_categories = (list(l) for l in zip(*sorted(zip(scores, categories), reverse=True))) 196 bar_plot_origin = 0.5 if score_name == PROMPT_STEREOTYPING else 0 197 bar_plot = CategoryBarPlotCell( 198 sorted_categories[:n], 199 sorted_scores[:n], 200 score_name, 201 dataset_score, 202 height="70%", 203 width="70%", 204 origin=bar_plot_origin, 205 ) 206 207 lowest_category = ( 208 CategoryScoreCell._get_kth_category_score(categories, scores, reverse=True, origin=0.5, k=0) 209 if score_name == PROMPT_STEREOTYPING 210 else CategoryScoreCell._get_kth_category_score(categories, scores, k=0) 211 ) 212 lowest_category = escape(lowest_category) 213 lowest_score_description = ( 214 "The model stereotypes the most in the category" 215 if score_name == PROMPT_STEREOTYPING 216 else "The model scores lowest in the category" 217 ) 218 super().__init__( 219 f"The plot shows the score breakdown into individual categories.", 220 note, 221 bar_plot, 222 f"{lowest_score_description} **{lowest_category}**. ", 223 )
Parameters
- categories: The names of the categories.
- scores: The values of the category scores.
- score_name: The name of the score that was computed in the evaluation.
- dataset_score: The overall score for the dataset.
- n: Max number of categories to display.
Inherited Members
class ScoreTableCell(MarkdownCell):
    """
    This class generates two tables displaying the highest and lowest-scoring examples from a particular score.
    """

    def __init__(self, dataset: ray.data.Dataset, score_column_name: str, binary: Optional[bool] = False):
        """
        :param dataset: The Ray Dataset used in the evaluation task.
        :param score_column_name: The name of the score column in the dataset.
        :param binary: Boolean indicating if the score is binary.
        """
        # Pick the table blurb matching the kind of score being displayed.
        if score_column_name == WER_SCORE:
            description = WER_TABLE_DESCRIPTION
        elif score_column_name == PROBABILITY_RATIO:
            description = STEREOTYPING_TABLE_DESCRIPTION
        elif binary:
            description = FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION
        else:
            description = TABLE_DESCRIPTION

        n_samples = min(NUM_SAMPLES_TO_DISPLAY_IN_TABLE, dataset.count())
        # Captions for the high-scoring and low-scoring example tables.
        if score_column_name == PROBABILITY_RATIO:
            top_description = f"Top {n_samples} most stereotypical examples:"
            bottom_description = f"Top {n_samples} least stereotypical examples:"
        elif binary:
            top_description = f"{n_samples} correct examples:"
            bottom_description = f"{n_samples} incorrect examples:"
        else:
            top_description = f"Top {n_samples} examples with highest scores:"
            bottom_description = f"Bottom {n_samples} examples with lowest scores:"
        # Stereotyping examples are ranked by magnitude, hence absolute values.
        abs_val = score_column_name == PROBABILITY_RATIO

        super().__init__(
            MarkdownCell(description),
            RayDatasetTableCell(
                dataset,
                score_column_name,
                k=n_samples,
                descending=True,
                abs_val=abs_val,
                caption=top_description,
            ),
            RayDatasetTableCell(
                dataset,
                score_column_name,
                k=n_samples,
                descending=False,
                abs_val=abs_val,
                caption=bottom_description,
            ),
        )
This class generates two tables displaying the highest and lowest-scoring examples from a particular score.
249 def __init__(self, dataset: ray.data.Dataset, score_column_name: str, binary: Optional[bool] = False): 250 """ 251 :param dataset: The Ray Dataset used in the evaluation task. 252 :param score_column_name: The name of the score column in the dataset. 253 :param binary: Boolean indicating if the score is binary. 254 """ 255 description = ( 256 WER_TABLE_DESCRIPTION 257 if score_column_name == WER_SCORE 258 else STEREOTYPING_TABLE_DESCRIPTION 259 if score_column_name == PROBABILITY_RATIO 260 else FACTUAL_KNOWLEDGE_TABLE_DESCRIPTION 261 if binary 262 else TABLE_DESCRIPTION 263 ) 264 265 n_samples = min(NUM_SAMPLES_TO_DISPLAY_IN_TABLE, dataset.count()) 266 top_description = ( 267 (f"Top {n_samples} most stereotypical examples:") 268 if score_column_name == PROBABILITY_RATIO 269 else f"{n_samples} correct examples:" 270 if binary 271 else f"Top {n_samples} examples with highest scores:" 272 ) 273 bottom_description = ( 274 (f"Top {n_samples} least stereotypical examples:") 275 if score_column_name == PROBABILITY_RATIO 276 else f"{n_samples} incorrect examples:" 277 if binary 278 else f"Bottom {n_samples} examples with lowest scores:" 279 ) 280 abs_val = True if score_column_name == PROBABILITY_RATIO else False 281 282 cells = [ 283 MarkdownCell(description), 284 RayDatasetTableCell( 285 dataset, 286 score_column_name, 287 k=n_samples, 288 descending=True, 289 abs_val=abs_val, 290 caption=top_description, 291 ), 292 RayDatasetTableCell( 293 dataset, 294 score_column_name, 295 k=n_samples, 296 descending=False, 297 abs_val=abs_val, 298 caption=bottom_description, 299 ), 300 ] 301 super().__init__(*cells)
Parameters
- dataset: The Ray Dataset used in the evaluation task.
- score_column_name: The name of the score column in the dataset.
- binary: Boolean indicating if the score is binary.
Inherited Members
class ScoreCell(MarkdownCell):
    """
    This class generates visualizations for an evaluation score, including the overall dataset score, a bar plot
    displaying category-level scores if provided, and tables displaying highest and lowest scoring examples.
    """

    def __init__(
        self,
        dataset: Optional[ray.data.Dataset],
        score_name: str,
        score_column_name: str,
        dataset_score: float,
        categories: Optional[List[str]],
        category_scores: Optional[List[float]],
    ):
        """
        :param dataset: The Ray Dataset used in the evaluation task.
        :param score_name: The name of the score that was computed in the evaluation.
        :param score_column_name: The name of the score column in the dataset.
        :param dataset_score: The aggregated score computed across the whole dataset.
        :param categories: The names of the categories.
        :param category_scores: The values of the category scores.
        """
        # WER is displayed as a plain title; other scores also get "score" formatting.
        if score_name == WER_SCORE:
            score_name_display = format_string(score_name, as_title=True)
        else:
            score_name_display = format_string(score_name, as_title=True, as_score=True)
        cells = [
            HeadingCell(text=score_name_display, level=5),
            MarkdownCell(SCORE_DESCRIPTIONS[score_name]),
            BoldCell(f"Average Score: {dataset_score}"),
        ]
        if categories and category_scores:  # pragma: no branch
            cells.append(CategoryScoreCell(categories, category_scores, score_name, dataset_score))
        if dataset:  # pragma: no cover
            # WER has no target output, so drop that column from the example tables.
            if score_name == WER_SCORE:
                columns = [col for col in TABLE_COLUMNS if col != "target_output"]
            else:
                columns = TABLE_COLUMNS
            present_columns = [col for col in dataset.columns() if col in columns]
            dataset = dataset.select_columns(present_columns)
            is_binary_score = score_name in [
                FACTUAL_KNOWLEDGE,
                FACTUAL_KNOWLEDGE_QUASI_EXACT,
                CLASSIFICATION_ACCURACY_SCORE,
            ]
            cells.append(ScoreTableCell(dataset, score_column_name, binary=is_binary_score))  # type: ignore
        super().__init__(*cells)
This class generates visualizations for an evaluation score, including the overall dataset score, a bar plot displaying category-level scores if provided, and tables displaying highest and lowest scoring examples.
310 def __init__( 311 self, 312 dataset: Optional[ray.data.Dataset], 313 score_name: str, 314 score_column_name: str, 315 dataset_score: float, 316 categories: Optional[List[str]], 317 category_scores: Optional[List[float]], 318 ): 319 """ 320 :param dataset: The Ray Dataset used in the evaluation task. 321 :param score_name: The name of the score that was computed in the evaluation. 322 :param score_column_name: The name of the score column in the dataset. 323 :param dataset_score: The aggregated score computed across the whole dataset. 324 :param categories: The names of the categories. 325 :param category_scores: The values of the category scores. 326 """ 327 score_name_display = ( 328 format_string(score_name, as_title=True) 329 if score_name == WER_SCORE 330 else format_string(score_name, as_title=True, as_score=True) 331 ) 332 cells = [ 333 HeadingCell(text=score_name_display, level=5), 334 MarkdownCell(SCORE_DESCRIPTIONS[score_name]), 335 BoldCell(f"Average Score: {dataset_score}"), 336 ] 337 if categories and category_scores: # pragma: no branch 338 cells.append(CategoryScoreCell(categories, category_scores, score_name, dataset_score)) 339 if dataset: # pragma: no cover 340 columns = [i for i in TABLE_COLUMNS if i != "target_output"] if score_name == WER_SCORE else TABLE_COLUMNS 341 present_columns = [col for col in dataset.columns() if col in columns] 342 dataset = dataset.select_columns(present_columns) 343 is_binary_score = ( 344 True 345 if score_name in [FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT, CLASSIFICATION_ACCURACY_SCORE] 346 else False 347 ) 348 cells.append(ScoreTableCell(dataset, score_column_name, binary=is_binary_score)) # type: ignore 349 super().__init__(*cells)
Parameters
- dataset: The Ray Dataset used in the evaluation task.
- score_name: The name of the score that was computed in the evaluation.
- score_column_name: The name of the score column in the dataset.
- dataset_score: The aggregated score computed across the whole dataset.
- categories: The names of the categories.
- category_scores: The values of the category scores.
Inherited Members
class EvalOutputCell(MarkdownCell):
    """
    This class renders the full report section for one evaluation output: dataset heading and
    description, prompt template, optional toxicity detector note, and one ScoreCell per score.
    """

    def __init__(
        self,
        eval_output: EvalOutput,
        dataset: Optional[ray.data.Dataset] = None,
        score_column_names: Optional[dict] = None,
    ):
        """
        :param eval_output: An EvalOutput object from an evaluation.
        :param dataset: The Ray dataset containing the evaluation scores.
        :param score_column_names: A dict mapping the score names and score column names for the evaluation.
        """
        dataset_type = BUILT_IN_DATASET if eval_output.dataset_name in DATASET_CONFIGS else CUSTOM_DATASET
        dataset_description = EvalOutputCell.get_dataset_description(
            dataset_name=eval_output.dataset_name,
            dataset_type=dataset_type,
            dataset=dataset,
            eval_name=eval_output.eval_name,
        )
        prompt_template = EvalOutputCell.format_prompt_template(
            dataset_type, eval_output.dataset_name, eval_output.prompt_template
        )
        # Toxicity evals name the detector that produced the scores: multiple dataset scores
        # indicate Detoxify, a single score indicates Toxigen; other evals show no note.
        is_toxicity_eval = eval_output.eval_name in TOXICITY_EVAL_NAMES
        if is_toxicity_eval and len(eval_output.dataset_scores) > 1:
            toxicity_detector_name = f"**Toxicity detector model**: {add_hyperlink(DETOXIFY_NAME, DETOXIFY_URI)}"
        elif is_toxicity_eval and len(eval_output.dataset_scores) == 1:
            toxicity_detector_name = f"**Toxicity detector model**: {add_hyperlink(TOXIGEN_NAME, TOXIGEN_URI)}"
        else:
            toxicity_detector_name = ""

        eval_cells = [
            HeadingCell(f"{dataset_type}: {format_dataset_name(eval_output.dataset_name, hyperlink=True)}", level=4),
            MarkdownCell(dataset_description),
            MarkdownCell(prompt_template),
            MarkdownCell(toxicity_detector_name),
        ]
        if eval_output.error:
            eval_cells.append(BoldCell(f"This evaluation failed with the error message: {eval_output.error}"))
        else:
            dataset_scores = {dataset_score.name: dataset_score.value for dataset_score in eval_output.dataset_scores}
            for score_name, dataset_score_value in dataset_scores.items():  # pragma: no cover
                # Underlying accuracy scores are omitted from semantic-robustness reports.
                if (
                    eval_output.eval_name in ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS
                    and score_name in ACCURACY_SEMANTIC_ROBUSTNESS_SCORES
                ):
                    continue
                # Map each category to its value for this particular score, when categories exist.
                categories = None
                if eval_output.category_scores:
                    categories = {
                        category_score.name: score.value
                        for category_score in eval_output.category_scores
                        for score in category_score.scores
                        if score.name == score_name
                    }
                # Prompt stereotyping examples are ranked by the probability-ratio column.
                if score_name == EvalAlgorithm.PROMPT_STEREOTYPING.value:
                    score_column_name = PROBABILITY_RATIO
                else:
                    score_column_name = score_name
                if score_name not in AGGREGATE_ONLY_SCORES:  # pragma: no branch
                    eval_cells.append(
                        ScoreCell(
                            dataset=dataset,
                            score_name=score_name,
                            score_column_name=score_column_name,
                            dataset_score=dataset_score_value,
                            categories=list(categories.keys()) if categories else None,
                            category_scores=list(categories.values()) if categories else None,
                        )
                    )

        super().__init__(*eval_cells)

    @staticmethod
    def get_dataset_sampling_description(dataset_name: str, dataset: ray.data.Dataset) -> str:
        """
        :param dataset_name: The name of the Ray dataset.
        :param dataset: The Ray dataset containing the evaluation scores.
        :return: String describing the number of samples used in the evaluation.
        """
        num_records = dataset.count()
        # Built-in datasets report their known full size; otherwise fall back to the sampled count.
        if dataset_name in DATASET_DETAILS:
            total_records = DATASET_DETAILS[dataset_name].size
        else:
            total_records = num_records
        return f"We sampled {num_records} records out of {total_records} in the full dataset."

    @staticmethod
    def get_dataset_description(
        dataset_name: str, dataset_type: str, dataset: Optional[ray.data.Dataset], eval_name: Optional[str] = None
    ) -> str:
        """
        :param dataset_name: The name of the Ray dataset.
        :param dataset_type: Whether the dataset is a built-in or custom dataset.
        :param dataset: The Ray dataset containing the evaluation scores.
        :param eval_name: The name of the selected evaluation.
        :return: The description of the dataset, including the number of samples used in the evaluation.
        """
        sampling_blurb = EvalOutputCell.get_dataset_sampling_description(dataset_name, dataset) if dataset else ""
        # Custom datasets have no built-in description; report only the sampling information.
        if dataset_type == CUSTOM_DATASET:
            return sampling_blurb
        base_description = DATASET_DETAILS[dataset_name].description
        if dataset_name == TREX and eval_name == EvalAlgorithm.FACTUAL_KNOWLEDGE.value:
            return base_description + TREX_DESCRIPTION_EXAMPLES + sampling_blurb
        if dataset_name == CROWS_PAIRS:
            return base_description + sampling_blurb + "\n\n" + CROWS_PAIRS_DISCLAIMER
        return base_description + " " + sampling_blurb

    @staticmethod
    def format_prompt_template(dataset_type: str, dataset_name: str, prompt_template: Optional[str] = None) -> str:
        """
        :param dataset_type: string indicating if dataset is a built-in or custom dataset.
        :param dataset_name: the name of the dataset.
        :param prompt_template: optional prompt template used in the evaluation.
        :return: prompt template string formatted for the report.
        """
        prefix = "**Prompt Template:** "
        # Escape user-supplied templates so raw HTML cannot leak into the rendered report.
        if prompt_template:
            return prefix + escape(prompt_template)
        if dataset_type == BUILT_IN_DATASET:
            return prefix + get_default_prompt_template(dataset_name)
        return prefix + "No prompt template was provided for this dataset."
Base class representing a markdown cell.
353 def __init__( 354 self, 355 eval_output: EvalOutput, 356 dataset: Optional[ray.data.Dataset] = None, 357 score_column_names: Optional[dict] = None, 358 ): 359 """ 360 :param eval_output: A EvalOutput object from an evaluation. 361 :param dataset: The Ray dataset containing the evaluation scores. 362 :param score_column_names: A dict mapping the score names and score column names for the evaluation. 363 """ 364 dataset_type = BUILT_IN_DATASET if eval_output.dataset_name in DATASET_CONFIGS else CUSTOM_DATASET 365 dataset_description = EvalOutputCell.get_dataset_description( 366 dataset_name=eval_output.dataset_name, 367 dataset_type=dataset_type, 368 dataset=dataset, 369 eval_name=eval_output.eval_name, 370 ) 371 prompt_template = EvalOutputCell.format_prompt_template( 372 dataset_type, eval_output.dataset_name, eval_output.prompt_template 373 ) 374 toxicity_detector_name = ( 375 f"**Toxicity detector model**: {add_hyperlink(DETOXIFY_NAME, DETOXIFY_URI)}" 376 if eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) > 1 377 else f"**Toxicity detector model**: {add_hyperlink(TOXIGEN_NAME, TOXIGEN_URI)}" 378 if eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) == 1 379 else "" 380 ) 381 382 eval_cells = [ 383 HeadingCell(f"{dataset_type}: {format_dataset_name(eval_output.dataset_name, hyperlink=True)}", level=4), 384 MarkdownCell(dataset_description), 385 MarkdownCell(prompt_template), 386 MarkdownCell(toxicity_detector_name), 387 ] 388 if eval_output.error: 389 error_cell = BoldCell(f"This evaluation failed with the error message: {eval_output.error}") 390 eval_cells.append(error_cell) 391 else: 392 dataset_scores = {dataset_score.name: dataset_score.value for dataset_score in eval_output.dataset_scores} 393 for score_name, dataset_score_value in dataset_scores.items(): # pragma: no cover 394 if ( 395 eval_output.eval_name in ACCURACY_SEMANTIC_ROBUSTNESS_ALGOS 396 and score_name in 
ACCURACY_SEMANTIC_ROBUSTNESS_SCORES 397 ): 398 continue 399 else: 400 categories = ( 401 { 402 category_score.name: score.value 403 for category_score in eval_output.category_scores 404 for score in category_score.scores 405 if score.name == score_name 406 } 407 if eval_output.category_scores 408 else None 409 ) 410 score_column_name = ( 411 PROBABILITY_RATIO if score_name == EvalAlgorithm.PROMPT_STEREOTYPING.value else score_name 412 ) 413 if score_name not in AGGREGATE_ONLY_SCORES: # pragma: no branch 414 score_cell = ScoreCell( 415 dataset=dataset, 416 score_name=score_name, 417 score_column_name=score_column_name, 418 dataset_score=dataset_score_value, 419 categories=list(categories.keys()) if categories else None, 420 category_scores=list(categories.values()) if categories else None, 421 ) 422 eval_cells.append(score_cell) 423 424 super().__init__(*eval_cells)
Parameters
- eval_output: An EvalOutput object from an evaluation.
- dataset: The Ray dataset containing the evaluation scores.
- score_column_names: A dict mapping the score names and score column names for the evaluation.
426 @staticmethod 427 def get_dataset_sampling_description(dataset_name: str, dataset: ray.data.Dataset) -> str: 428 """ 429 :param dataset_name: The name of the Ray dataset. 430 :param dataset: The Ray dataset containing the evaluation scores. 431 :return: String describing the number of samples used in the evaluation. 432 """ 433 num_records = dataset.count() 434 total_records = DATASET_DETAILS[dataset_name].size if dataset_name in DATASET_DETAILS else num_records 435 436 return f"We sampled {num_records} records out of {total_records} in the full dataset."
Parameters
- dataset_name: The name of the Ray dataset.
- dataset: The Ray dataset containing the evaluation scores.
Returns
String describing the number of samples used in the evaluation.
438 @staticmethod 439 def get_dataset_description( 440 dataset_name: str, dataset_type: str, dataset: Optional[ray.data.Dataset], eval_name: Optional[str] = None 441 ) -> str: 442 """ 443 :param dataset_name: The name of the Ray dataset. 444 :param dataset_type: Whether the dataset is a built-in or custom dataset. 445 :param dataset: The Ray dataset containing the evaluation scores. 446 :param eval_name: The name of the selected evaluation. 447 :return: The description of the dataset, including the number of samples used in the evaluation. 448 """ 449 450 dataset_sampling_description = ( 451 EvalOutputCell.get_dataset_sampling_description(dataset_name, dataset) if dataset else "" 452 ) 453 if dataset_type == CUSTOM_DATASET: 454 return dataset_sampling_description 455 else: 456 457 dataset_description = ( 458 DATASET_DETAILS[dataset_name].description + TREX_DESCRIPTION_EXAMPLES + dataset_sampling_description 459 if dataset_name == TREX and eval_name == EvalAlgorithm.FACTUAL_KNOWLEDGE.value 460 else DATASET_DETAILS[dataset_name].description 461 + dataset_sampling_description 462 + "\n\n" 463 + CROWS_PAIRS_DISCLAIMER 464 if dataset_name == CROWS_PAIRS 465 else DATASET_DETAILS[dataset_name].description + " " + dataset_sampling_description 466 ) 467 return dataset_description
Parameters
- dataset_name: The name of the Ray dataset.
- dataset_type: Whether the dataset is a built-in or custom dataset.
- dataset: The Ray dataset containing the evaluation scores.
- eval_name: The name of the selected evaluation.
Returns
The description of the dataset, including the number of samples used in the evaluation.
469 @staticmethod 470 def format_prompt_template(dataset_type: str, dataset_name: str, prompt_template: Optional[str] = None) -> str: 471 """ 472 :param dataset_type: string indicating if dataset is a built-in or custom dataset. 473 :param dataset_name: the name of the dataset. 474 :param prompt_template: optional prompt template used in the evaluation. 475 :return: prompt template string formatted for the report. 476 """ 477 prompt_template_str = "**Prompt Template:** " 478 if prompt_template: 479 return prompt_template_str + escape(prompt_template) 480 elif dataset_type == BUILT_IN_DATASET: 481 return prompt_template_str + get_default_prompt_template(dataset_name) 482 else: 483 return prompt_template_str + "No prompt template was provided for this dataset."
Parameters
- dataset_type: string indicating if dataset is a built-in or custom dataset.
- dataset_name: the name of the dataset.
- prompt_template: optional prompt template used in the evaluation.
Returns
prompt template string formatted for the report.