fmeval.eval_algorithms.util
import json
import logging
import os
import ray.data
import string

import fmeval.util as util

from ray.data import Dataset
from collections import OrderedDict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from fmeval.constants import (
    DatasetColumns,
    EVAL_OUTPUT_RECORDS_BATCH_SIZE,
    MEAN,
    NUM_ROWS_DETERMINISTIC,
    DATASET_COLUMNS,
)
from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms import (
    EvalScore,
    CategoryScore,
    DATASET_CONFIGS,
    EVAL_DATASETS,
    EvalOutput,
    get_default_prompt_template,
)
from fmeval.exceptions import EvalAlgorithmInternalError
from fmeval.model_runners.composers.composers import PromptComposer
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.perf_util import timed_block
from fmeval.transforms.common import GeneratePrompt, GetModelOutputs
from fmeval.transforms.transform_pipeline import TransformPipeline
from fmeval.util import get_num_actors

# punctuation and articles for the normalize function
ENGLISH_ARTICLES = ["a", "an", "the"]
ENGLISH_PUNCTUATIONS = string.punctuation

logger = logging.getLogger(__name__)
def get_dataset_configs(data_config: Optional[Union[DataConfig, List[DataConfig]]], eval_name: str) -> List[DataConfig]:
    if not data_config:
        return [DATASET_CONFIGS[dataset_name] for dataset_name in EVAL_DATASETS[eval_name]]
    elif isinstance(data_config, list):
        return data_config
    elif isinstance(data_config, tuple):
        return [cfg for cfg in data_config]
    else:
        return [data_config]
def generate_model_predict_response_for_dataset(
    model: ModelRunner,
    data: Dataset,
    model_input_column_name: str,
    model_output_column_name: Optional[str] = None,
    model_log_probability_column_name: Optional[str] = None,
) -> Dataset:
    """
    Runs the model on the given data. Output will be written to the
    `model_output_column_name` column, and log_probability will be
    written to the `model_log_probability_column_name` column.

    :param model: ModelRunner to get predictions from.
    :param data: The dataset containing model inputs to feed to `model`.
    :param model_input_column_name: The name of the column containing the model input.
    :param model_output_column_name: The name of the column to write the model output to.
    :param model_log_probability_column_name: The name of the column to write the model log probability to.
    :return: The dataset with a model output column and model log probability column added.
        Note that both columns are optional, i.e. it is possible that a model output
        column is added, but a log probability column is not added (and vice versa).
    """
    with timed_block(f"Performing inference on dataset on {model}", logger):

        class ModelRunnerWrapper:  # pragma: no cover
            """
            This class represents the Ray Actor that gets model predictions
            by feeding model inputs from the dataset to the model runner.

            We use Ray Actors instead of Tasks because the Actor approach minimizes
            the number of times that the ModelRunner `model` gets deserialized.
            With Tasks, Ray will serialize and deserialize `model` for every single
            prediction. With Actors, `model` gets deserialized once per Actor when
            the Actor gets initialized.
            """

            def __init__(self):
                self.model_runner = model
                logger.setLevel(logging.DEBUG)

            def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]:
                predict_output = self.model_runner.predict(row[model_input_column_name])
                if model_output_column_name:
                    row[model_output_column_name] = predict_output[0]
                if model_log_probability_column_name:
                    row[model_log_probability_column_name] = predict_output[1]
                return row

        data = data.map(
            ModelRunnerWrapper, compute=ray.data.ActorPoolStrategy(size=get_num_actors())  # type: ignore[arg-type]
        ).materialize()
    return data
Runs the model on the given data. Output will be written to the `model_output_column_name` column, and log_probability will be written to the `model_log_probability_column_name` column.

Parameters
- model: ModelRunner to get predictions from.
- data: The dataset containing model inputs to feed to `model`.
- model_input_column_name: The name of the column containing the model input.
- model_output_column_name: The name of the column to write the model output to.
- model_log_probability_column_name: The name of the column to write the model log probability to.
Returns
The dataset with a model output column and model log probability column added. Note that both columns are optional, i.e. it is possible that a model output column is added, but a log probability column is not added (and vice versa).
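A minimal usage sketch is shown below. `DummyModelRunner` is a hypothetical stand-in for a concrete ModelRunner implementation, and the column names are illustrative.

import ray.data
from fmeval.eval_algorithms.util import generate_model_predict_response_for_dataset


class DummyModelRunner:
    # Hypothetical stand-in for a concrete fmeval ModelRunner; a real runner
    # (e.g. one backed by a hosted endpoint) is used the same way.
    def predict(self, prompt):
        # A ModelRunner's predict returns a (model_output, log_probability) pair.
        return prompt.upper(), None


data = ray.data.from_items(
    [
        {"model_input": "What is the capital of France?"},
        {"model_input": "Name a prime number."},
    ]
)

data_with_outputs = generate_model_predict_response_for_dataset(
    model=DummyModelRunner(),
    data=data,
    model_input_column_name="model_input",
    model_output_column_name="model_output",
)
print(data_with_outputs.take(2))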
def generate_prompt_column_for_dataset(
    prompt_template: str, data: Dataset, model_input_column_name: str, prompt_column_name: str
) -> Dataset:
    """
    Generates prompts column for a given input dataset and prompt_template
    :param prompt_template: Prompt template
    :param data: the dataset where each instance is a row in the dataset.
    :param model_input_column_name: the name of the column containing the model input.
    :param prompt_column_name: Output column name to which composed prompts are added
    :return: the dataset with the composed prompts added.
    """
    with timed_block(f"Generating prompt column", logger):
        prompt_composer = PromptComposer(prompt_template)

        def _generate_prompt_column(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
            """
            Map function for generating the prompt column value given a dataset row.
            """
            row[prompt_column_name] = prompt_composer.compose(row[model_input_column_name])
            return row

        data = data.map(_generate_prompt_column).materialize()
    return data
Generates a prompt column for a given input dataset and prompt_template.
Parameters
- prompt_template: Prompt template
- data: the dataset where each instance is a row in the dataset.
- model_input_column_name: the name of the column containing the model input.
- prompt_column_name: Output column name to which composed prompts are added
Returns
the dataset with the composed prompts added.
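A short sketch. The `$model_input` placeholder is an assumption about the template syntax expected by PromptComposer; use whatever placeholder your templates rely on.

import ray.data
from fmeval.eval_algorithms.util import generate_prompt_column_for_dataset

data = ray.data.from_items([{"model_input": "What is 2 + 2?"}])

data_with_prompts = generate_prompt_column_for_dataset(
    prompt_template="Answer the following question: $model_input",  # placeholder name assumed
    data=data,
    model_input_column_name="model_input",
    prompt_column_name="prompt",
)
print(data_with_prompts.take(1))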
def validate_dataset(dataset: Dataset, column_names: List[str]):
    """
    Util function to validate that dataset contains the required column names.

    :param dataset: Input ray dataset
    :param column_names: names of the columns that must be present in the dataset
    :raises: EvalAlgorithmClientError for an invalid dataset
    """
    for column_name in column_names:
        util.require(
            column_name in dataset.columns(),
            f"Missing required column: {column_name}, for evaluate() method",
        )
Util function to validate that the dataset contains the required column names.
Parameters
- dataset: Input ray dataset
- column_names: names of the columns that must be present in the dataset
Raises
- EvalAlgorithmClientError for an invalid dataset
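For example (a small sketch; the column names are arbitrary strings chosen for illustration):

import ray.data
from fmeval.eval_algorithms.util import validate_dataset

data = ray.data.from_items([{"model_input": "hello", "target_output": "world"}])

# Passes: both columns are present in the dataset.
validate_dataset(data, ["model_input", "target_output"])

# Raises EvalAlgorithmClientError: the dataset has no "category" column.
validate_dataset(data, ["category"])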
def validate_prompt_template(prompt_template: str, placeholders: List[str]):
    """
    Util function to validate that prompt_template contains the keywords.

    :param prompt_template: A template used to compose prompts. Ex: '{"Question":$question, "Answer": $answer}'
    :param placeholders: Placeholder keyword list. This keyword appears
        in `prompt_template` with a $ sign prepended. In the above example,
        the placeholders are ["question", "answer"].
    :raises: EvalAlgorithmClientError for an invalid prompt_template
    """
    for placeholder in placeholders:
        util.require(
            f"${placeholder}" in prompt_template,
            f"Unable to find placeholder ${placeholder} in prompt_template.",
        )
Util function to validate that prompt_template contains the expected placeholder keywords.

Parameters
- prompt_template: A template used to compose prompts. Ex: '{"Question":$question, "Answer": $answer}'
- placeholders: Placeholder keyword list. Each keyword appears in `prompt_template` with a $ sign prepended. In the above example, the placeholders are ["question", "answer"].
Raises
- EvalAlgorithmClientError for an invalid prompt_template
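For example, mirroring the template from the docstring:

from fmeval.eval_algorithms.util import validate_prompt_template

template = '{"Question": $question, "Answer": $answer}'

# Passes: both placeholders appear in the template with a "$" prefix.
validate_prompt_template(template, ["question", "answer"])

# Raises EvalAlgorithmClientError: "$context" does not appear in the template.
validate_prompt_template(template, ["context"])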
def aggregate_evaluation_scores(
    dataset: Dataset, score_column_names: List[str], agg_method: str
) -> Tuple[List[EvalScore], Optional[List[CategoryScore]]]:
    """
    The method aggregates scores at the dataset level and optionally at the category level if
    categories are available in the dataset.

    :param dataset: ray dataset with eval scores
    :param score_column_names: a list of column names which contain the scores to aggregate
    :param agg_method: the name of the aggregation to perform
    :return: a tuple containing 1) dataset-level scores and
        2) a list of category-level scores if categories are available, `None` otherwise
    """
    dataset_scores = [
        EvalScore(name=score_column_name, value=dataset_aggregation(dataset, score_column_name, agg_method))
        for score_column_name in score_column_names
    ]
    category_scores: Optional[Dict[str, CategoryScore]] = None
    if DatasetColumns.CATEGORY.value.name in dataset.columns():
        category_scores = {
            name: CategoryScore(name=name, scores=[]) for name in dataset.unique(DatasetColumns.CATEGORY.value.name)
        }
        for score_column_name in score_column_names:
            category_aggregate: Dataset = category_wise_aggregation(dataset, score_column_name, agg_method)
            for row in category_aggregate.iter_rows():
                category_scores[row[DatasetColumns.CATEGORY.value.name]].scores.append(
                    EvalScore(name=score_column_name, value=row[f"mean({score_column_name})"])
                )

    return dataset_scores, list(category_scores.values()) if category_scores else None
The method aggregates scores at the dataset level and optionally at the category level if categories are available in the dataset.
Parameters
- dataset: ray dataset with eval scores
- score_column_names: a list of column names which contain the scores to aggregate
- agg_method: the name of the aggregation to perform
Returns
A tuple containing 1) dataset-level scores and 2) a list of category-level scores if categories are available, `None` otherwise.
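A sketch with a toy, already-scored dataset ("rouge" is an illustrative score column name):

import ray.data
from fmeval.constants import MEAN, DatasetColumns
from fmeval.eval_algorithms.util import aggregate_evaluation_scores

data = ray.data.from_items(
    [
        {DatasetColumns.CATEGORY.value.name: "history", "rouge": 0.4},
        {DatasetColumns.CATEGORY.value.name: "science", "rouge": 0.8},
    ]
)

dataset_scores, category_scores = aggregate_evaluation_scores(
    data, score_column_names=["rouge"], agg_method=MEAN
)
print(dataset_scores)   # one dataset-level EvalScore for "rouge" (mean of 0.4 and 0.8)
print(category_scores)  # one CategoryScore per category, or None without a category column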
def dataset_aggregation(dataset: Dataset, score_column_name: str, agg_method: str) -> float:
    if agg_method == MEAN:
        aggregate = dataset.mean(on=score_column_name, ignore_nulls=True)
        assert isinstance(aggregate, float)
        return aggregate
    else:
        raise EvalAlgorithmInternalError(f"Aggregation method {agg_method} is not supported")
def category_wise_aggregation(dataset: Dataset, score_column_name: str, agg_method: str) -> Dataset:
    category_aggregate: Dataset = dataset.groupby(DatasetColumns.CATEGORY.value.name)  # type: ignore
    if agg_method == MEAN:
        category_aggregate = category_aggregate.mean(on=score_column_name, ignore_nulls=True)
    else:
        raise EvalAlgorithmInternalError(f"Aggregation method {agg_method} is not supported")
    return category_aggregate
# Moved function to util.py because it's being used by both factual knowledge and qa accuracy
def normalize_text_quac_protocol(text: str) -> str:
    """
    Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py
    Given a text, normalize it using the SQUAD / QUAC protocol. That is remove punctuations, excess spaces and articles, and return the lowercased tokens.
    SQUAD (https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) and
    QuAC benchmarks (https://s3.amazonaws.com/my89public/quac/scorer.py) use this protocol to normalize text before evaluating it.
    HELM (https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L116)
    and HuggingFace evaluate (https://github.com/huggingface/evaluate/blob/775555d80af30d83dc6e9f42051840d29a34f31b/metrics/squad/compute_score.py#L11)
    also use this normalization procedure.

    :param text: The text that needs to be normalized.
    :returns: The normalized text.
    """

    text = text.lower()
    text = "".join(character for character in text if character not in ENGLISH_PUNCTUATIONS)
    return " ".join([word for word in text.split(" ") if (word != "" and word not in ENGLISH_ARTICLES)])
Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py

Given a text, normalize it using the SQUAD / QUAC protocol: remove punctuation, excess spaces, and articles, and return the lowercased tokens. SQUAD (https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) and the QuAC benchmarks (https://s3.amazonaws.com/my89public/quac/scorer.py) use this protocol to normalize text before evaluating it. HELM (https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L116) and HuggingFace evaluate (https://github.com/huggingface/evaluate/blob/775555d80af30d83dc6e9f42051840d29a34f31b/metrics/squad/compute_score.py#L11) also use this normalization procedure.

Parameters
- text: The text that needs to be normalized.

Returns
The normalized text.
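For example:

from fmeval.eval_algorithms.util import normalize_text_quac_protocol

# Punctuation is stripped, articles ("a", "an", "the") and extra spaces are dropped,
# and the result is lowercased.
print(normalize_text_quac_protocol("The  quick, brown fox!"))  # -> "quick brown fox"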
@dataclass
class EvalOutputRecord:
    """
    This class represents a single record that gets written by the `save_dataset` method.
    In other words, it represents a single row from the Ray Dataset that is being saved.

    :param scores: A list of EvalScores, where each EvalScore corresponds
        to one of the score columns in the Ray Dataset being saved.
    :param dataset_columns: Maps a column name to its contents in the current row
        (recall that an EvalOutputRecord corresponds to a single Ray Dataset row).

        Note: the keys in `dataset_columns` must belong to constants.COLUMN_NAMES,
        because constants.COLUMN_NAMES defines which (non-score) columns are allowed
        to appear in the saved output, i.e. it defines the schema for an output record.
    """

    scores: List[EvalScore]
    dataset_columns: Dict[str, Union[str, float, int]]

    def __post_init__(self):
        for col in self.dataset_columns:
            util.assert_condition(
                col in DATASET_COLUMNS,
                f"Attempting to initialize an EvalOutputRecord with invalid non-score column {col}.",
            )

    def __str__(self):
        return json.dumps(self.to_dict())

    def to_dict(self) -> OrderedDict[str, Union[str, float, int, List]]:
        """
        Returns a dictionary representation of this instance,
        to be used when writing this object to JSON Lines.

        Note that we use an OrderedDict to maintain consistency
        in the ordering of columns. The score columns always come
        at the end, and the non-score columns are ordered according
        to constants.COLUMN_NAMES.
        """
        json_obj = OrderedDict(
            (col_name, self.dataset_columns[col_name])
            for col_name in DATASET_COLUMNS
            if col_name in self.dataset_columns
        )
        json_obj["scores"] = [
            # filter out None "value" and None "error"
            {k: v for k, v in eval_score.__dict__.items() if v is not None}
            for eval_score in self.scores
        ]
        return json_obj

    @staticmethod
    def from_row(row: Dict[str, Union[str, float, int]], score_names: List[str]) -> "EvalOutputRecord":
        """
        Returns an instance of EvalOutputRecord, created from a Ray Dataset row (represented as a dict).

        Example input:
            row = {
                "model_input": "input",
                "model_output": "output",
                "column_that_wont_be_included": "hello",
                "rouge": 0.42,
                "bert": 0.162
            }

        Corresponding output:
            EvalOutputRecord(
                scores=[
                    EvalScore(name="rouge", value=0.42),
                    EvalScore(name="bert", value=0.162)
                ],
                dataset_columns={
                    "model_input": "input",
                    "model_output": "output"
                }
            )

        Note how "column_that_wont_be_included" is not included in the produced EvalOutputRecord.
        This is because only columns in constants.COLUMN_NAMES are considered to be valid columns
        in the saved output file generated by `save_dataset`. The reason why it's even possible
        for a column name that doesn't belong to constants.COLUMN_NAMES to appear in `row` is that
        the Ray Dataset that `row` belongs to can contain columns used to store intermediate computations.
        For example, ClassificationAccuracy generates a column named CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME
        that is used to compute CLASSIFICATION_ACCURACY_SCORE, which is one of the score columns.

        :param row: a Ray Dataset row represented as a dict
        :param score_names: column names included in the Ray Dataset that `row`
            is a sample of that correspond to evaluation algorithm scores
        :returns: an instance of EvalOutputRecord corresponding to `row`
        """
        dataset_columns = {}
        scores = []
        for column_name, value in row.items():
            if column_name not in score_names:  # pragma: no branch
                if column_name in DATASET_COLUMNS:  # pragma: no branch
                    dataset_columns[column_name] = value
            else:
                assert isinstance(value, float) or isinstance(value, int) or value is None  # to satisfy Mypy
                if value is None:
                    assert row.get(DatasetColumns.ERROR.value.name, None)
                    scores.append(EvalScore(name=column_name, error=row.get(DatasetColumns.ERROR.value.name)))
                else:
                    scores.append(EvalScore(name=column_name, value=value))

        return EvalOutputRecord(
            scores=scores,
            dataset_columns=dataset_columns,
        )
This class represents a single record that gets written by the `save_dataset` method. In other words, it represents a single row from the Ray Dataset that is being saved.

Parameters
- scores: A list of EvalScores, where each EvalScore corresponds to one of the score columns in the Ray Dataset being saved.
- dataset_columns: Maps a column name to its contents in the current row (recall that an EvalOutputRecord corresponds to a single Ray Dataset row).

Note: the keys in `dataset_columns` must belong to constants.COLUMN_NAMES, because constants.COLUMN_NAMES defines which (non-score) columns are allowed to appear in the saved output, i.e. it defines the schema for an output record.
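A small construction sketch, assuming "model_input" and "model_output" are among the allowed non-score column names (they appear that way in the `from_row` example below):

from fmeval.eval_algorithms import EvalScore
from fmeval.eval_algorithms.util import EvalOutputRecord

record = EvalOutputRecord(
    scores=[EvalScore(name="rouge", value=0.42)],
    dataset_columns={"model_input": "input", "model_output": "output"},
)
# str(record) serializes the record as the JSON object written by save_dataset:
# non-score columns first (in the order defined by the constants), then "scores".
print(record)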
def to_dict(self) -> OrderedDict[str, Union[str, float, int, List]]:
Returns a dictionary representation of this instance, to be used when writing this object to JSON Lines.
Note that we use an OrderedDict to maintain consistency in the ordering of columns. The score columns always come at the end, and the non-score columns are ordered according to constants.COLUMN_NAMES.
@staticmethod
def from_row(row: Dict[str, Union[str, float, int]], score_names: List[str]) -> "EvalOutputRecord":
Returns an instance of EvalOutputRecord, created from a Ray Dataset row (represented as a dict).

Example input:

    row = {
        "model_input": "input",
        "model_output": "output",
        "column_that_wont_be_included": "hello",
        "rouge": 0.42,
        "bert": 0.162
    }

Corresponding output:

    EvalOutputRecord(
        scores=[
            EvalScore(name="rouge", value=0.42),
            EvalScore(name="bert", value=0.162)
        ],
        dataset_columns={
            "model_input": "input",
            "model_output": "output"
        }
    )

Note how "column_that_wont_be_included" is not included in the produced EvalOutputRecord. This is because only columns in constants.COLUMN_NAMES are considered to be valid columns in the saved output file generated by `save_dataset`. The reason why it's even possible for a column name that doesn't belong to constants.COLUMN_NAMES to appear in `row` is that the Ray Dataset that `row` belongs to can contain columns used to store intermediate computations. For example, ClassificationAccuracy generates a column named CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME that is used to compute CLASSIFICATION_ACCURACY_SCORE, which is one of the score columns.

Parameters
- row: a Ray Dataset row represented as a dict
- score_names: column names included in the Ray Dataset that `row` is a sample of that correspond to evaluation algorithm scores

Returns
An instance of EvalOutputRecord corresponding to `row`.
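The docstring example above translates directly into code:

from fmeval.eval_algorithms.util import EvalOutputRecord

row = {
    "model_input": "input",
    "model_output": "output",
    "column_that_wont_be_included": "hello",
    "rouge": 0.42,
    "bert": 0.162,
}
record = EvalOutputRecord.from_row(row, score_names=["rouge", "bert"])
print(record.dataset_columns)  # {"model_input": "input", "model_output": "output"}
print(record.scores)           # EvalScores for "rouge" and "bert"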
def generate_output_dataset_path(path_to_parent_dir: str, eval_name: str, dataset_name) -> str:
    """
    Returns the path to be used by an EvalAlgorithm when calling `save_dataset`.

    :param path_to_parent_dir: The path to the parent directory of the file to be saved.
    :param eval_name: The evaluation name provided by the EvalAlgorithm.
    :param dataset_name: The name of the dataset.
    :returns: A path that is unique to an evaluation/dataset pair for a given job.
    """
    return os.path.join(path_to_parent_dir, f"{eval_name}_{dataset_name}.jsonl")
Returns the path to be used by an EvalAlgorithm when calling `save_dataset`.

Parameters
- path_to_parent_dir: The path to the parent directory of the file to be saved.
- eval_name: The evaluation name provided by the EvalAlgorithm.
- dataset_name: The name of the dataset.

Returns
A path that is unique to an evaluation/dataset pair for a given job.
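For example (the evaluation and dataset names are illustrative):

from fmeval.eval_algorithms.util import generate_output_dataset_path

path = generate_output_dataset_path(
    path_to_parent_dir="/tmp/eval_results",
    eval_name="factual_knowledge",
    dataset_name="trex",
)
print(path)  # /tmp/eval_results/factual_knowledge_trex.jsonl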
def generate_mean_delta_score(original_score: EvalScore, perturbed_input_scores: List[EvalScore]) -> float:
    """
    Util method to generate mean of difference between original and perturbed input scores
    :param original_score: Original score
    :param perturbed_input_scores: List of scores for model inference outputs on perturbed inputs
    :returns: mean of delta between the scores
    """
    return sum([abs(original_score.value - reference_score.value) for reference_score in perturbed_input_scores]) / len(
        perturbed_input_scores
    )
Util method to compute the mean absolute difference between the original score and the scores on perturbed inputs.

Parameters
- original_score: Original score
- perturbed_input_scores: List of scores for model inference outputs on perturbed inputs

Returns
The mean of the deltas between the scores.
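For example:

from fmeval.eval_algorithms import EvalScore
from fmeval.eval_algorithms.util import generate_mean_delta_score

original = EvalScore(name="rouge", value=0.9)
perturbed = [EvalScore(name="rouge", value=0.7), EvalScore(name="rouge", value=0.8)]

# mean(|0.9 - 0.7|, |0.9 - 0.8|) = 0.15, up to floating-point rounding
print(generate_mean_delta_score(original, perturbed))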
def verify_model_determinism(
    model: ModelRunner,
    dataset: Dataset,
    prompt_template: str,
    model_input_column_name: str = DatasetColumns.MODEL_INPUT.value.name,
) -> bool:
    """Heuristic for whether model is deterministic.

    This function invokes the provided model twice on each of the first
    NUM_ROWS_DETERMINISTIC rows in the dataset. If the two model outputs
    for each input are the same for all rows, the model is considered deterministic.

    :param model: A ModelRunner instance representing the model under investigation.
    :param dataset: A Ray Dataset that includes a model input column.
    :param prompt_template: The template used to compose the prompt from the model input.
    :param model_input_column_name: Model input column name.
    :returns: Whether the model is deterministic.
    """
    prompt_composer = PromptComposer(prompt_template)
    for row in dataset.limit(NUM_ROWS_DETERMINISTIC).iter_rows():
        prompt = prompt_composer.compose(row[model_input_column_name])
        model_output = model.predict(prompt)[0]
        if model.predict(prompt)[0] != model_output:
            return False
    return True
Heuristic for whether model is deterministic.
This function invokes the provided model twice on each of the first NUM_ROWS_DETERMINISTIC rows in the dataset. If the two model outputs for each input are the same for all rows, the model is considered deterministic.
Parameters
- model: A ModelRunner instance representing the model under investigation.
- dataset: A Ray Dataset that includes a model input column.
- prompt_template: The template used to compose the prompt from the model input.
- model_input_column_name: Model input column name.

Returns
Whether the model is deterministic.
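A sketch using a hypothetical stand-in for a deterministic ModelRunner; the `$model_input` placeholder and the default "model_input" column name are assumptions:

import ray.data
from fmeval.eval_algorithms.util import verify_model_determinism


class EchoModelRunner:
    # Hypothetical deterministic stand-in for a concrete ModelRunner.
    def predict(self, prompt):
        return prompt, None


data = ray.data.from_items([{"model_input": "hello"}, {"model_input": "world"}])

print(
    verify_model_determinism(
        model=EchoModelRunner(),
        dataset=data,
        prompt_template="$model_input",  # placeholder name assumed
    )
)  # True, since the same prompt always yields the same output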
def create_model_invocation_pipeline(model: ModelRunner, prompt_template: str) -> TransformPipeline:
    """Create a transform pipeline for performing the standard action of invoking a model on a prompt.

    :param model: The model to be invoked.
    :param prompt_template: The template used for constructing prompts (out of raw inputs)
        that will be fed to the model.
    :returns: A TransformPipeline instance containing a GeneratePrompt transform that uses `prompt_template`
        and a GetModelOutputs transform for invoking the model on the generated prompts.
    """
    gen_prompt = GeneratePrompt(
        input_keys=[DatasetColumns.MODEL_INPUT.value.name],
        output_keys=[DatasetColumns.PROMPT.value.name],
        prompt_template=prompt_template,
    )
    get_model_outputs = GetModelOutputs(
        input_to_output_keys={DatasetColumns.PROMPT.value.name: [DatasetColumns.MODEL_OUTPUT.value.name]},
        model_runner=model,
    )
    return TransformPipeline([gen_prompt, get_model_outputs])
Create a transform pipeline for performing the standard action of invoking a model on a prompt.
Parameters
- model: The model to be invoked.
- prompt_template: The template used for constructing prompts (out of raw inputs) that will be fed to the model.

Returns
A TransformPipeline instance containing a GeneratePrompt transform that uses `prompt_template` and a GetModelOutputs transform for invoking the model on the generated prompts.
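A sketch of building and applying the pipeline. The stand-in runner, the `$model_input` placeholder, and the use of the pipeline's `execute` method on a Ray dataset are assumptions to check against your fmeval version:

import ray.data
from fmeval.eval_algorithms.util import create_model_invocation_pipeline


class UpperCaseModelRunner:
    # Hypothetical stand-in for a concrete ModelRunner.
    def predict(self, prompt):
        return prompt.upper(), None


pipeline = create_model_invocation_pipeline(
    model=UpperCaseModelRunner(),
    prompt_template="Summarize the following text: $model_input",  # placeholder name assumed
)

data = ray.data.from_items([{"model_input": "fmeval evaluates foundation models."}])
result = pipeline.execute(data)  # assumed TransformPipeline entry point
print(result.take(1))  # rows now also contain "prompt" and "model_output" columns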