fmeval.eval_algorithms.classification_accuracy
```python
import logging
import warnings
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from ray.data import Dataset
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score

import fmeval.util as util
from fmeval.constants import (
    DatasetColumns,
    MEAN,
)
from fmeval.data_loaders.util import get_dataset
from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.common import save_dataset
from fmeval.eval_algorithms.eval_algorithm import (
    EvalAlgorithmInterface,
    EvalAlgorithmConfig,
)
from fmeval.eval_algorithms import (
    EvalAlgorithm,
    EvalOutput,
    EvalScore,
    CategoryScore,
    get_default_prompt_template,
)
from fmeval.eval_algorithms.save_strategy import SaveStrategy, FileSaveStrategy
from fmeval.eval_algorithms.util import (
    validate_dataset,
    category_wise_aggregation,
    generate_output_dataset_path,
    get_dataset_configs,
    create_model_invocation_pipeline,
)
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.perf_util import timed_block
from fmeval.transforms.transform import Transform
from fmeval.transforms.transform_pipeline import TransformPipeline
from fmeval.transforms.util import validate_call

CLASSIFICATION_ACCURACY_SCORE = "classification_accuracy_score"
BALANCED_ACCURACY_SCORE = "balanced_accuracy_score"
PRECISION_SCORE = "precision_score"
RECALL_SCORE = "recall_score"
UNKNOWN_LABEL = "unknown"
CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME = "classified_model_output"
OUTPUT_KEYS = [CLASSIFICATION_ACCURACY_SCORE, CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME]
CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
    BALANCED_ACCURACY_SCORE: balanced_accuracy_score,
    PRECISION_SCORE: precision_score,
    RECALL_SCORE: recall_score,
}
UNIQUENESS_FACTOR = 0.05

logger = logging.getLogger(__name__)


def convert_model_output_to_label(model_output: str, valid_labels: List[str]) -> str:
    """Convert model output to string class label. The model is expected to return a label directly (if it has a
    classification head), or a string containing a label (if it has a language modelling head). In the latter case we
    strip any additional text (e.g. "The answer is 2." --> "2"). If no valid label is contained in the
    `model_output`, an "unknown" label is returned. Users can define other `converter_fn`s, e.g. to translate a text
    label to string ("NEGATIVE" --> "0").

    :param model_output: Value returned by the model.
    :param valid_labels: Valid labels.
    :return: `model_output` transformed into a label
    """
    # normalise to lowercase & strip
    valid_labels = [label.lower().strip() for label in valid_labels]

    response_words = model_output.split(" ")
    predicted_labels = [word.lower().strip() for word in response_words if word.lower().strip() in valid_labels]
    # if there is more than one label in the model output we pick the first
    string_label = predicted_labels[0] if predicted_labels else UNKNOWN_LABEL

    return string_label


class ClassificationAccuracyScores(Transform):
    """This transform augments its input record with computed classification accuracy scores."""

    def __init__(
        self,
        target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
        model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
        classified_model_output_key: str = CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME,
        classification_accuracy_score_key: str = CLASSIFICATION_ACCURACY_SCORE,
        valid_labels: Optional[List[str]] = None,
        converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label,
    ):
        """ClassificationAccuracyScores initializer.

        :param target_output_key: The record key corresponding to the target output.
        :param model_output_key: The record key corresponding to the model output.
        :param classified_model_output_key: The key to use for the classified model output
            that will be added to the record.
        :param classification_accuracy_score_key: The key to use for the classification accuracy
            score that will be added to the record.
        :param valid_labels: See corresponding parameter in ClassificationAccuracyConfig.
        :param converter_fn: See corresponding parameter in ClassificationAccuracyConfig.
        """
        super().__init__(
            target_output_key,
            model_output_key,
            classified_model_output_key,
            classification_accuracy_score_key,
            valid_labels,
            converter_fn,
        )
        self.register_input_output_keys(
            input_keys=[target_output_key, model_output_key],
            output_keys=[classified_model_output_key, classification_accuracy_score_key],
        )
        self.target_output_key = target_output_key
        self.model_output_key = model_output_key
        self.classified_model_output_key = classified_model_output_key
        self.classification_accuracy_score_key = classification_accuracy_score_key
        self.valid_labels = valid_labels
        self.converter_fn = converter_fn

    @validate_call
    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Augment the input record with computed classification accuracy scores.

        :param record: The input record.
        :returns: The input record with the classification accuracy score
            and the classified model output added in.
        """
        target_output = record[self.target_output_key]
        model_output = record[self.model_output_key]
        record[self.classified_model_output_key] = self.converter_fn(model_output, self.valid_labels)  # type: ignore
        record[self.classification_accuracy_score_key] = int(
            record[self.classified_model_output_key] == str(target_output)
        )
        return record


@dataclass(frozen=True)
class ClassificationAccuracyConfig(EvalAlgorithmConfig):
    """Configures the Classification Accuracy evaluation algorithm.

    :param valid_labels: The labels of the classes predicted from the model.
    :param converter_fn: Function to process model output to labels, defaults to simple integer conversion.
    :param multiclass_average_strategy: `average` to be passed to sklearn's precision and recall scores.
        This determines how scores are aggregated in the multiclass classification setting
        (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
        Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='micro'.
    """

    valid_labels: Optional[List[str]] = None
    converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label
    multiclass_average_strategy: Optional[str] = "micro"

    def __post_init__(self):
        if self.valid_labels:
            for i, label in enumerate(self.valid_labels):
                if not isinstance(label, str):
                    warnings.warn("Valid labels should be strings, casting.")
                    self.valid_labels[i] = str(label)


class ClassificationAccuracy(EvalAlgorithmInterface):
    """This evaluation measures how accurately a model performs in text classification tasks. Our built-in example
    task is sentiment classification where the model predicts whether a user review is positive or negative.
    The accuracy of its response is measured by comparing model output to target answer under different metrics:

    1. Classification accuracy: Is `model_output == target_answer`? This metric is computed for each datapoint as well as on average over the whole dataset.
    2. Precision: true positives / (true positives + false positives), computed once for the whole dataset. Its parameter `multiclass_average_strategy` can be set in the `ClassificationAccuracyConfig`.
    3. Recall: true positives / (true positives + false negatives), computed once for the whole dataset. Its parameter `multiclass_average_strategy` can be set in the `ClassificationAccuracyConfig`.
    4. Balanced classification accuracy: Same as accuracy in the binary case, otherwise averaged recall per class. This metric is computed once for the whole dataset.

    All metrics are reported on average over `num_records` datapoints and per category, resulting in a number between 0
    (worst) and 1 (best) for each metric.
    """

    eval_name = EvalAlgorithm.CLASSIFICATION_ACCURACY.value

    def __init__(self, eval_algorithm_config: ClassificationAccuracyConfig = ClassificationAccuracyConfig()):
        """Default constructor

        :param eval_algorithm_config: Classification Accuracy eval algorithm config.
        """
        super().__init__(eval_algorithm_config)
        self.valid_labels = eval_algorithm_config.valid_labels
        self.converter_fn = eval_algorithm_config.converter_fn
        self.multiclass_average_strategy = eval_algorithm_config.multiclass_average_strategy

    def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:
        """Compute classification accuracy metrics for a single sample.

        :param target_output: The expected/desired model output.
        :param model_output: The actual model output.
        :returns: A single-element list with an EvalScore for the classification accuracy score.
        """
        util.require(
            self.valid_labels,
            "ClassificationAccuracy evaluate_sample method requires the `valid_labels` "
            "attribute of the ClassificationAccuracy instance to be set.",
        )
        sample = {
            DatasetColumns.TARGET_OUTPUT.value.name: target_output,
            DatasetColumns.MODEL_OUTPUT.value.name: model_output,
        }
        pipeline = self._build_pipeline(self.valid_labels)
        result = pipeline.execute_record(sample)
        return [
            EvalScore(
                name=CLASSIFICATION_ACCURACY_SCORE,
                value=result[CLASSIFICATION_ACCURACY_SCORE],  # type: ignore
            )
        ]

    def _build_pipeline(self, valid_labels: Optional[List[str]]) -> TransformPipeline:
        return TransformPipeline(
            [ClassificationAccuracyScores(valid_labels=valid_labels, converter_fn=self.converter_fn)]
        )

    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 100,
        save: bool = False,
        save_strategy: Optional[SaveStrategy] = None,
    ) -> List[EvalOutput]:
        """Compute classification accuracy metrics on one or more datasets.

        :param model: An instance of ModelRunner representing the model under evaluation.
            If this argument is None, the `dataset_config` argument must not be None,
            and must correspond to a dataset that already contains a column with model outputs.
        :param dataset_config: Configures a single dataset or list of datasets used for the
            evaluation. If not provided, this method will run evaluations using all of its
            supported built-in datasets.
        :param prompt_template: A template used to generate prompts that are fed to the model.
            If not provided, defaults will be used. If provided, `model` must not be None.
        :param num_records: The number of records to be sampled randomly from the input dataset(s)
            used to perform the evaluation(s).
        :param save: If set to true, prompt responses and scores will be saved to a file.
        :param save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations. If not
            specified, it will save it to the path that can be configured by the EVAL_RESULTS_PATH environment variable.
            If that environment variable is also not configured, it will be saved to the default path `/tmp/eval_results/`.

        :return: A list of EvalOutput objects.
        """
        dataset_configs = get_dataset_configs(dataset_config, self.eval_name)
        eval_outputs: List[EvalOutput] = []
        for dataset_config in dataset_configs:
            dataset = get_dataset(dataset_config, num_records)

            validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name])
            valid_labels = (
                self.valid_labels
                if self.valid_labels
                else dataset.unique(column=DatasetColumns.TARGET_OUTPUT.value.name)
            )
            row_count = dataset.count()
            if len(valid_labels) / (row_count + 1) < UNIQUENESS_FACTOR:  # pragma: no cover
                logger.warning(
                    f"The number of classes: {len(valid_labels)} in the dataset is too large "
                    f"for the number of rows in the dataset: {row_count}",
                )

            pipeline = self._build_pipeline(valid_labels)
            dataset_prompt_template = None
            if DatasetColumns.MODEL_OUTPUT.value.name not in dataset.columns():
                util.require(model, "No ModelRunner provided. ModelRunner is required for inference on model_inputs")
                validate_dataset(dataset, [DatasetColumns.MODEL_INPUT.value.name])
                dataset_prompt_template = (
                    get_default_prompt_template(dataset_config.dataset_name) if not prompt_template else prompt_template
                )
                model_invocation_pipeline = create_model_invocation_pipeline(model, dataset_prompt_template)
                pipeline = TransformPipeline([model_invocation_pipeline, pipeline])

            output_path = generate_output_dataset_path(
                path_to_parent_dir=util.get_eval_results_path(),
                eval_name=self.eval_name,
                dataset_name=dataset_config.dataset_name,
            )
            with timed_block(f"Computing score and aggregation on dataset {dataset_config.dataset_name}", logger):
                dataset = pipeline.execute(dataset)
                dataset_scores, category_scores = self._generate_dataset_and_category_level_scores(dataset)
                eval_outputs.append(
                    EvalOutput(
                        eval_name=self.eval_name,
                        dataset_name=dataset_config.dataset_name,
                        prompt_template=dataset_prompt_template,
                        dataset_scores=dataset_scores,
                        category_scores=category_scores,
                        output_path=output_path,
                    )
                )

            if save:
                save_dataset(
                    dataset=dataset,
                    score_names=[CLASSIFICATION_ACCURACY_SCORE],
                    save_strategy=save_strategy if save_strategy else FileSaveStrategy(output_path),
                )

        return eval_outputs

    def _get_score(self, y_true, y_pred, score_fn: Callable[..., float]) -> float:
        """
        Method to generate accuracy score
        :param y_true: Ground truth (correct) target values.
        :param y_pred: Estimated targets as returned by a classifier.
        :param score_fn: Function for computing one of the classification accuracy scores.
        :returns: Computed score
        """
        if score_fn == recall_score or score_fn == precision_score:
            return score_fn(y_true, y_pred, average=self.multiclass_average_strategy)
        return score_fn(y_true, y_pred)

    def _generate_dataset_and_category_level_scores(
        self, dataset: Dataset
    ) -> Tuple[List[EvalScore], Optional[List[CategoryScore]]]:
        df = dataset.to_pandas()
        dataset_scores = [
            EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=dataset.mean(CLASSIFICATION_ACCURACY_SCORE))
        ]

        for eval_score, score_fn in CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS.items():
            dataset_scores.append(
                EvalScore(
                    name=eval_score,
                    value=self._get_score(
                        # TODO dataloader should ensure target output is string
                        y_true=df[DatasetColumns.TARGET_OUTPUT.value.name],
                        y_pred=df[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME],
                        score_fn=score_fn,
                    ),
                )
            )

        category_scores: Optional[Dict[str, CategoryScore]] = None
        if DatasetColumns.CATEGORY.value.name in dataset.columns():
            category_scores = {
                name: CategoryScore(name=name, scores=[]) for name in dataset.unique(DatasetColumns.CATEGORY.value.name)
            }
            category_aggregate: Dataset = category_wise_aggregation(dataset, CLASSIFICATION_ACCURACY_SCORE, MEAN)
            for row in category_aggregate.iter_rows():
                category_scores[row[DatasetColumns.CATEGORY.value.name]].scores.append(
                    EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=row[f"mean({CLASSIFICATION_ACCURACY_SCORE})"])
                )
                categorical_y_true = df.loc[
                    df[DatasetColumns.CATEGORY.value.name] == row[DatasetColumns.CATEGORY.value.name],
                    DatasetColumns.TARGET_OUTPUT.value.name,
                ]
                categorical_y_pred = df.loc[
                    df[DatasetColumns.CATEGORY.value.name] == row[DatasetColumns.CATEGORY.value.name],
                    CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME,
                ]
                for eval_score, score_fn in CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS.items():
                    category_scores[row[DatasetColumns.CATEGORY.value.name]].scores.append(
                        EvalScore(
                            name=eval_score,
                            value=self._get_score(
                                y_true=categorical_y_true, y_pred=categorical_y_pred, score_fn=score_fn
                            ),
                        )
                    )

        return dataset_scores, list(category_scores.values()) if category_scores else None
```
```python
def convert_model_output_to_label(model_output: str, valid_labels: List[str]) -> str:
```
Convert model output to a string class label. The model is expected to return a label directly (if it has a classification head), or a string containing a label (if it has a language modelling head). In the latter case we strip any additional text (e.g. "The answer is 2." --> "2"). If no valid label is contained in the `model_output`, an "unknown" label is returned. Users can define other `converter_fn`s, e.g. to translate a text label to a string label ("NEGATIVE" --> "0").

Parameters
- model_output: Value returned by the model.
- valid_labels: Valid labels.

Returns
`model_output` transformed into a label.
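For illustration, here is a minimal sketch of how the default converter behaves (the outputs and labels below are made up for this example). Note that matching is done on whitespace-separated tokens, so a custom `converter_fn` may be needed when labels appear with punctuation attached.

```python
from fmeval.eval_algorithms.classification_accuracy import convert_model_output_to_label

# A valid label appearing anywhere in the whitespace-split output is picked up.
convert_model_output_to_label("The answer is 1", valid_labels=["0", "1"])  # -> "1"

# If no valid label occurs in the output, the "unknown" sentinel is returned.
convert_model_output_to_label("I cannot tell", valid_labels=["0", "1"])    # -> "unknown"
```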
```python
class ClassificationAccuracyScores(Transform):
```
This transform augments its input record with computed classification accuracy scores.
```python
def __init__(
    self,
    target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
    model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
    classified_model_output_key: str = CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME,
    classification_accuracy_score_key: str = CLASSIFICATION_ACCURACY_SCORE,
    valid_labels: Optional[List[str]] = None,
    converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label,
):
```
ClassificationAccuracyScores initializer.
Parameters
- target_output_key: The record key corresponding to the target output.
- model_output_key: The record key corresponding to the model output.
- classified_model_output_key: The key to use for the classified model output that will be added to the record.
- classification_accuracy_score_key: The key to use for the classification accuracy score that will be added to the record.
- valid_labels: See corresponding parameter in ClassificationAccuracyConfig.
- converter_fn: See corresponding parameter in ClassificationAccuracyConfig.
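As a rough sketch of applying this transform to a single record (assuming the default fmeval column names resolved via `DatasetColumns`; the record values are invented for this example):

```python
from fmeval.constants import DatasetColumns
from fmeval.eval_algorithms.classification_accuracy import ClassificationAccuracyScores

scores = ClassificationAccuracyScores(valid_labels=["0", "1"])
record = {
    DatasetColumns.TARGET_OUTPUT.value.name: "1",
    DatasetColumns.MODEL_OUTPUT.value.name: "This review is a 1",
}
record = scores(record)
# The record now also contains the classified model output ("1") under
# "classified_model_output" and a binary "classification_accuracy_score" (1).
```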
```python
@dataclass(frozen=True)
class ClassificationAccuracyConfig(EvalAlgorithmConfig):
    valid_labels: Optional[List[str]] = None
    converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label
    multiclass_average_strategy: Optional[str] = "micro"
```
Configures the Classification Accuracy evaluation algorithm.
Parameters
- valid_labels: The labels of the classes predicted from the model.
- converter_fn: Function used to convert model output to a class label; defaults to `convert_model_output_to_label`, which extracts the first valid label contained in the output.
- multiclass_average_strategy: The `average` argument passed to sklearn's precision and recall scores. This determines how scores are aggregated in the multiclass classification setting (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html). Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='micro'.
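A short example of building a config with explicit labels and a different sklearn averaging strategy (the label values here are illustrative):

```python
from fmeval.eval_algorithms.classification_accuracy import (
    ClassificationAccuracyConfig,
    convert_model_output_to_label,
)

config = ClassificationAccuracyConfig(
    valid_labels=["0", "1"],                     # e.g. negative / positive
    converter_fn=convert_model_output_to_label,  # the default converter
    multiclass_average_strategy="macro",         # passed to sklearn precision/recall
)
```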
```python
class ClassificationAccuracy(EvalAlgorithmInterface):
    eval_name = EvalAlgorithm.CLASSIFICATION_ACCURACY.value
```
This evaluation measures how accurately a model performs in text classification tasks. Our built-in example task is sentiment classification, where the model predicts whether a user review is positive or negative. The accuracy of its response is measured by comparing model output to target answer under different metrics:

1. Classification accuracy: Is `model_output == target_answer`? This metric is computed for each datapoint as well as on average over the whole dataset.
2. Precision: true positives / (true positives + false positives), computed once for the whole dataset. Its parameter `multiclass_average_strategy` can be set in the `ClassificationAccuracyConfig`.
3. Recall: true positives / (true positives + false negatives), computed once for the whole dataset. Its parameter `multiclass_average_strategy` can be set in the `ClassificationAccuracyConfig`.
4. Balanced classification accuracy: Same as accuracy in the binary case, otherwise averaged recall per class. This metric is computed once for the whole dataset.

All metrics are reported on average over `num_records` datapoints and per category, resulting in a number between 0 (worst) and 1 (best) for each metric.
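A minimal usage sketch of the algorithm on a single sample (the labels and strings are made up; see `evaluate` below for dataset-level evaluation):

```python
from fmeval.eval_algorithms.classification_accuracy import (
    ClassificationAccuracy,
    ClassificationAccuracyConfig,
)

algo = ClassificationAccuracy(ClassificationAccuracyConfig(valid_labels=["0", "1"]))
[score] = algo.evaluate_sample(target_output="1", model_output="1")
# score.name == "classification_accuracy_score", score.value == 1
```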
```python
def __init__(self, eval_algorithm_config: ClassificationAccuracyConfig = ClassificationAccuracyConfig()):
```
Default constructor
Parameters
- eval_algorithm_config: Classification Accuracy eval algorithm config.
```python
def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:
```
Compute classification accuracy metrics for a single sample.
Parameters
- target_output: The expected/desired model output.
- model_output: The actual model output.

Returns
A single-element list with an EvalScore for the classification accuracy score.
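For example (assuming the instance was constructed with `valid_labels` set, as this method requires; the strings are illustrative):

```python
algo = ClassificationAccuracy(ClassificationAccuracyConfig(valid_labels=["0", "1"]))
[score] = algo.evaluate_sample(target_output="1", model_output="I would rate this a 0")
# The model output is converted to "0", which does not match the target "1",
# so score.value is 0.
```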
```python
def evaluate(
    self,
    model: Optional[ModelRunner] = None,
    dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
    prompt_template: Optional[str] = None,
    num_records: int = 100,
    save: bool = False,
    save_strategy: Optional[SaveStrategy] = None,
) -> List[EvalOutput]:
```
Compute classification accuracy metrics on one or more datasets.
Parameters
- model: An instance of ModelRunner representing the model under evaluation. If this argument is None, the `dataset_config` argument must not be None and must correspond to a dataset that already contains a column with model outputs.
- dataset_config: Configures a single dataset or a list of datasets used for the evaluation. If not provided, this method will run evaluations using all of its supported built-in datasets.
- prompt_template: A template used to generate prompts that are fed to the model. If not provided, defaults will be used. If provided, `model` must not be None.
- num_records: The number of records to be sampled randomly from the input dataset(s) used to perform the evaluation(s).
- save: If set to True, prompt responses and scores will be saved to a file.
- save_strategy: Specifies the strategy used to save the localized outputs of the evaluations. If not specified, outputs are saved to the path configured by the EVAL_RESULTS_PATH environment variable; if that environment variable is also not set, they are saved to the default path `/tmp/eval_results/`.
Returns
A list of EvalOutput objects.
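A sketch of a dataset-level run on a JSON Lines dataset that already contains model outputs, so no ModelRunner is needed (the file name and column names are placeholders, and the DataConfig fields shown are assumptions based on the fmeval data loaders):

```python
from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.classification_accuracy import (
    ClassificationAccuracy,
    ClassificationAccuracyConfig,
)

data_config = DataConfig(
    dataset_name="reviews_sample",
    dataset_uri="reviews_with_outputs.jsonl",
    dataset_mime_type="application/jsonlines",
    model_input_location="review_text",
    target_output_location="label",
    model_output_location="model_response",
)

algo = ClassificationAccuracy(ClassificationAccuracyConfig(valid_labels=["0", "1"]))
eval_outputs = algo.evaluate(dataset_config=data_config, num_records=100, save=True)
for eval_output in eval_outputs:
    print(eval_output.dataset_scores)  # accuracy, balanced accuracy, precision, recall
```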