fmeval.eval_algorithms.classification_accuracy

  1import logging
  2import warnings
  3from dataclasses import dataclass
  4from typing import Any, Callable, Dict, List, Optional, Tuple, Union
  5
  6from ray.data import Dataset
  7from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score
  8
  9import fmeval.util as util
 10from fmeval.constants import (
 11    DatasetColumns,
 12    MEAN,
 13)
 14from fmeval.data_loaders.util import get_dataset
 15from fmeval.data_loaders.data_config import DataConfig
 16from fmeval.eval_algorithms.common import save_dataset
 17from fmeval.eval_algorithms.eval_algorithm import (
 18    EvalAlgorithmInterface,
 19    EvalAlgorithmConfig,
 20)
 21from fmeval.eval_algorithms import (
 22    EvalAlgorithm,
 23    EvalOutput,
 24    EvalScore,
 25    CategoryScore,
 26    get_default_prompt_template,
 27)
 28from fmeval.eval_algorithms.save_strategy import SaveStrategy, FileSaveStrategy
 29from fmeval.eval_algorithms.util import (
 30    validate_dataset,
 31    category_wise_aggregation,
 32    generate_output_dataset_path,
 33    get_dataset_configs,
 34    create_model_invocation_pipeline,
 35)
 36from fmeval.model_runners.model_runner import ModelRunner
 37from fmeval.perf_util import timed_block
 38from fmeval.transforms.transform import Transform
 39from fmeval.transforms.transform_pipeline import TransformPipeline
 40from fmeval.transforms.util import validate_call
 41
 42CLASSIFICATION_ACCURACY_SCORE = "classification_accuracy_score"
 43BALANCED_ACCURACY_SCORE = "balanced_accuracy_score"
 44PRECISION_SCORE = "precision_score"
 45RECALL_SCORE = "recall_score"
 46UNKNOWN_LABEL = "unknown"
 47CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME = "classified_model_output"
 48OUTPUT_KEYS = [CLASSIFICATION_ACCURACY_SCORE, CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME]
 49CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
 50    BALANCED_ACCURACY_SCORE: balanced_accuracy_score,
 51    PRECISION_SCORE: precision_score,
 52    RECALL_SCORE: recall_score,
 53}
 54UNIQUENESS_FACTOR = 0.05
 55
 56logger = logging.getLogger(__name__)
 57
 58
 59def convert_model_output_to_label(model_output: str, valid_labels: List[str]) -> str:
 60    """Convert model output to string class label. The model is expected to return a label directly (if it has a
 61    classification head), or a string containing a label (if it has a language modelling head). In the latter case we
  62    strip any additional text (e.g. "The answer is 2." --> "2"). If no valid label is contained in the
 63    `model_output` an "unknown" label is returned. Users can define other `converter_fn`s, e.g. to translate a text
 64    label to string ("NEGATIVE" --> "0").
 65
 66    :param model_output: Value returned by the model.
 67    :param valid_labels: Valid labels.
 68    :return: `model_output` transformed into a label
 69    """
 70    # normalise to lowercase & strip
 71    valid_labels = [label.lower().strip() for label in valid_labels]
 72
 73    response_words = model_output.split(" ")
 74    predicted_labels = [word.lower().strip() for word in response_words if word.lower().strip() in valid_labels]
 75    # if there is more than one label in the model output we pick the first
 76    string_label = predicted_labels[0] if predicted_labels else UNKNOWN_LABEL
 77
 78    return string_label
 79
 80
 81class ClassificationAccuracyScores(Transform):
 82    """This transform augments its input record with computed classification accuracy scores."""
 83
 84    def __init__(
 85        self,
 86        target_output_key: str = DatasetColumns.TARGET_OUTPUT.value.name,
 87        model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
 88        classified_model_output_key: str = CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME,
 89        classification_accuracy_score_key: str = CLASSIFICATION_ACCURACY_SCORE,
 90        valid_labels: Optional[List[str]] = None,
 91        converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label,
 92    ):
 93        """ClassificationAccuracyScores initializer.
 94
 95        :param target_output_key: The record key corresponding to the target output.
 96        :param model_output_key: The record key corresponding to the model output.
 97        :param classified_model_output_key: The key to use for the classified model output
 98            that will be added to the record.
 99        :param classification_accuracy_score_key: The key to use for the classification accuracy
100            score that will be added to the record.
101        :param valid_labels: See corresponding parameter in ClassificationAccuracyConfig.
102        :param converter_fn: See corresponding parameter in ClassificationAccuracyConfig.
103        """
104        super().__init__(
105            target_output_key,
106            model_output_key,
107            classified_model_output_key,
108            classification_accuracy_score_key,
109            valid_labels,
110            converter_fn,
111        )
112        self.register_input_output_keys(
113            input_keys=[target_output_key, model_output_key],
114            output_keys=[classified_model_output_key, classification_accuracy_score_key],
115        )
116        self.target_output_key = target_output_key
117        self.model_output_key = model_output_key
118        self.classified_model_output_key = classified_model_output_key
119        self.classification_accuracy_score_key = classification_accuracy_score_key
120        self.valid_labels = valid_labels
121        self.converter_fn = converter_fn
122
123    @validate_call
124    def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
125        """Augment the input record with computed classification accuracy scores.
126
127        :param record: The input record.
128        :returns: The input record with the classification accuracy score
129            and the classified model output added in.
130        """
131        target_output = record[self.target_output_key]
132        model_output = record[self.model_output_key]
133        record[self.classified_model_output_key] = self.converter_fn(model_output, self.valid_labels)  # type: ignore
134        record[self.classification_accuracy_score_key] = int(
135            record[self.classified_model_output_key] == str(target_output)
136        )
137        return record
138
139
140@dataclass(frozen=True)
141class ClassificationAccuracyConfig(EvalAlgorithmConfig):
142    """Configures the Classification Accuracy evaluation algorithm.
143
144    :param valid_labels: The labels of the classes predicted from the model.
145    :param converter_fn: Function to process model output to labels, defaults to simple integer conversion.
146    :param multiclass_average_strategy: `average` to be passed to sklearn's precision and recall scores.
147        This determines how scores are aggregated in the multiclass classification setting
148        (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
149        Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='micro'.
150    """
151
152    valid_labels: Optional[List[str]] = None
153    converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label
154    multiclass_average_strategy: Optional[str] = "micro"
155
156    def __post_init__(self):
157        if self.valid_labels:
158            for i, label in enumerate(self.valid_labels):
159                if not isinstance(label, str):
160                    warnings.warn("Valid labels should be strings, casting.")
161                    self.valid_labels[i] = str(label)
162
163
164class ClassificationAccuracy(EvalAlgorithmInterface):
165    """This evaluation measures how accurately a model performs in text classification tasks. Our built-in example task is sentiment classification where the model predicts whether a user review is positive or negative.
166    The accuracy of its response is measured by comparing model output to target answer under different metrics:
167
168    1. Classification accuracy: Is `model_output == target_answer`? This metric is computed for each datapoint as well as on average over the whole dataset.
  169    2. Precision: true positives / (true positives + false positives), computed once for the whole dataset. Its parameter `multiclass_average_strategy` can be set in the `ClassificationAccuracyConfig`.
  170    3. Recall: true positives / (true positives + false negatives), computed once for the whole dataset. Its parameter `multiclass_average_strategy` can be set in the `ClassificationAccuracyConfig`.
171    4. Balanced classification accuracy: Same as accuracy in the binary case, otherwise averaged recall per class. This metric is computed once for the whole dataset.
172
173    All metrics are reported on average over `num_records` datapoints and per category, resulting in a number between 0
174    (worst) and 1 (best) for each metric.
175
176    """
177
178    eval_name = EvalAlgorithm.CLASSIFICATION_ACCURACY.value
179
180    def __init__(self, eval_algorithm_config: ClassificationAccuracyConfig = ClassificationAccuracyConfig()):
181        """Default constructor
182
183        :param eval_algorithm_config: Classification Accuracy eval algorithm config.
184        """
185        super().__init__(eval_algorithm_config)
186        self.valid_labels = eval_algorithm_config.valid_labels
187        self.converter_fn = eval_algorithm_config.converter_fn
188        self.multiclass_average_strategy = eval_algorithm_config.multiclass_average_strategy
189
190    def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:
191        """Compute classification accuracy metrics for a single sample.
192
193        :param target_output: The expected/desired model output.
194        :param model_output: The actual model output.
195        :returns: A single-element list with an EvalScore for the classification accuracy score.
196        """
197        util.require(
198            self.valid_labels,
199            "ClassificationAccuracy evaluate_sample method requires the `valid_labels` "
200            "attribute of the ClassificationAccuracy instance to be set.",
201        )
202        sample = {
203            DatasetColumns.TARGET_OUTPUT.value.name: target_output,
204            DatasetColumns.MODEL_OUTPUT.value.name: model_output,
205        }
206        pipeline = self._build_pipeline(self.valid_labels)
207        result = pipeline.execute_record(sample)
208        return [
209            EvalScore(
210                name=CLASSIFICATION_ACCURACY_SCORE,
211                value=result[CLASSIFICATION_ACCURACY_SCORE],  # type: ignore
212            )
213        ]
214
215    def _build_pipeline(self, valid_labels: Optional[List[str]]) -> TransformPipeline:
216        return TransformPipeline(
217            [ClassificationAccuracyScores(valid_labels=valid_labels, converter_fn=self.converter_fn)]
218        )
219
220    def evaluate(
221        self,
222        model: Optional[ModelRunner] = None,
223        dataset_config: Optional[Union[DataConfig, List[DataConfig]]] = None,
224        prompt_template: Optional[str] = None,
225        num_records: int = 100,
226        save: bool = False,
227        save_strategy: Optional[SaveStrategy] = None,
228    ) -> List[EvalOutput]:
229        """Compute classification accuracy metrics on one or more datasets.
230
231        :param model: An instance of ModelRunner representing the model under evaluation.
232            If this argument is None, the `dataset_config` argument must not be None,
233            and must correspond to a dataset that already contains a column with model outputs.
234        :param dataset_config: Configures a single dataset or list of datasets used for the
235            evaluation. If not provided, this method will run evaluations using all of its
236            supported built-in datasets.
237        :param prompt_template: A template used to generate prompts that are fed to the model.
238            If not provided, defaults will be used. If provided, `model` must not be None.
239        :param num_records: The number of records to be sampled randomly from the input dataset(s)
240            used to perform the evaluation(s).
241        :param save: If set to true, prompt responses and scores will be saved to a file.
242        :param save_strategy: Specifies the strategy to use to save the localized outputs of the evaluations. If not
243            specified, it will save it to the path that can be configured by the EVAL_RESULTS_PATH environment variable.
244            If that environment variable is also not configured, it will be saved to the default path `/tmp/eval_results/`.
245
246        :return: A list of EvalOutput objects.
247        """
248        dataset_configs = get_dataset_configs(dataset_config, self.eval_name)
249        eval_outputs: List[EvalOutput] = []
250        for dataset_config in dataset_configs:
251            dataset = get_dataset(dataset_config, num_records)
252
253            validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name])
254            valid_labels = (
255                self.valid_labels
256                if self.valid_labels
257                else dataset.unique(column=DatasetColumns.TARGET_OUTPUT.value.name)
258            )
259            row_count = dataset.count()
260            if len(valid_labels) / (row_count + 1) < UNIQUENESS_FACTOR:  # pragma: no cover
261                logger.warning(
262                    f"The number of classes: {len(valid_labels)} in the dataset is too large "
263                    f"for the number of rows in the dataset: {row_count}",
264                )
265
266            pipeline = self._build_pipeline(valid_labels)
267            dataset_prompt_template = None
268            if DatasetColumns.MODEL_OUTPUT.value.name not in dataset.columns():
269                util.require(model, "No ModelRunner provided. ModelRunner is required for inference on model_inputs")
270                validate_dataset(dataset, [DatasetColumns.MODEL_INPUT.value.name])
271                dataset_prompt_template = (
272                    get_default_prompt_template(dataset_config.dataset_name) if not prompt_template else prompt_template
273                )
274                model_invocation_pipeline = create_model_invocation_pipeline(model, dataset_prompt_template)
275                pipeline = TransformPipeline([model_invocation_pipeline, pipeline])
276
277            output_path = generate_output_dataset_path(
278                path_to_parent_dir=util.get_eval_results_path(),
279                eval_name=self.eval_name,
280                dataset_name=dataset_config.dataset_name,
281            )
282            with timed_block(f"Computing score and aggregation on dataset {dataset_config.dataset_name}", logger):
283                dataset = pipeline.execute(dataset)
284                dataset_scores, category_scores = self._generate_dataset_and_category_level_scores(dataset)
285                eval_outputs.append(
286                    EvalOutput(
287                        eval_name=self.eval_name,
288                        dataset_name=dataset_config.dataset_name,
289                        prompt_template=dataset_prompt_template,
290                        dataset_scores=dataset_scores,
291                        category_scores=category_scores,
292                        output_path=output_path,
293                    )
294                )
295
296            if save:
297                save_dataset(
298                    dataset=dataset,
299                    score_names=[CLASSIFICATION_ACCURACY_SCORE],
300                    save_strategy=save_strategy if save_strategy else FileSaveStrategy(output_path),
301                )
302
303        return eval_outputs
304
305    def _get_score(self, y_true, y_pred, score_fn: Callable[..., float]) -> float:
306        """
307        Method to generate accuracy score
308        :param y_true: Ground truth (correct) target values.
309        :param y_pred: Estimated targets as returned by a classifier.
310        :param score_fn: Function for computing one of the classification accuracy scores.
311        :returns: Computed score
312        """
313        if score_fn == recall_score or score_fn == precision_score:
314            return score_fn(y_true, y_pred, average=self.multiclass_average_strategy)
315        return score_fn(y_true, y_pred)
316
317    def _generate_dataset_and_category_level_scores(
318        self, dataset: Dataset
319    ) -> Tuple[List[EvalScore], Optional[List[CategoryScore]]]:
320        df = dataset.to_pandas()
321        dataset_scores = [
322            EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=dataset.mean(CLASSIFICATION_ACCURACY_SCORE))
323        ]
324
325        for eval_score, score_fn in CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS.items():
326            dataset_scores.append(
327                EvalScore(
328                    name=eval_score,
329                    value=self._get_score(
330                        # TODO dataloader should ensure target output is string
331                        y_true=df[DatasetColumns.TARGET_OUTPUT.value.name],
332                        y_pred=df[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME],
333                        score_fn=score_fn,
334                    ),
335                )
336            )
337
338        category_scores: Optional[Dict[str, CategoryScore]] = None
339        if DatasetColumns.CATEGORY.value.name in dataset.columns():
340            category_scores = {
341                name: CategoryScore(name=name, scores=[]) for name in dataset.unique(DatasetColumns.CATEGORY.value.name)
342            }
343            category_aggregate: Dataset = category_wise_aggregation(dataset, CLASSIFICATION_ACCURACY_SCORE, MEAN)
344            for row in category_aggregate.iter_rows():
345                category_scores[row[DatasetColumns.CATEGORY.value.name]].scores.append(
346                    EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=row[f"mean({CLASSIFICATION_ACCURACY_SCORE})"])
347                )
348                categorical_y_true = df.loc[
349                    df[DatasetColumns.CATEGORY.value.name] == row[DatasetColumns.CATEGORY.value.name],
350                    DatasetColumns.TARGET_OUTPUT.value.name,
351                ]
352                categorical_y_pred = df.loc[
353                    df[DatasetColumns.CATEGORY.value.name] == row[DatasetColumns.CATEGORY.value.name],
354                    CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME,
355                ]
356                for eval_score, score_fn in CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS.items():
357                    category_scores[row[DatasetColumns.CATEGORY.value.name]].scores.append(
358                        EvalScore(
359                            name=eval_score,
360                            value=self._get_score(
361                                y_true=categorical_y_true, y_pred=categorical_y_pred, score_fn=score_fn
362                            ),
363                        )
364                    )
365
366        return dataset_scores, list(category_scores.values()) if category_scores else None
CLASSIFICATION_ACCURACY_SCORE = 'classification_accuracy_score'
BALANCED_ACCURACY_SCORE = 'balanced_accuracy_score'
PRECISION_SCORE = 'precision_score'
RECALL_SCORE = 'recall_score'
UNKNOWN_LABEL = 'unknown'
CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME = 'classified_model_output'
OUTPUT_KEYS = ['classification_accuracy_score', 'classified_model_output']
CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {'balanced_accuracy_score': <function balanced_accuracy_score>, 'precision_score': <function precision_score>, 'recall_score': <function recall_score>}
UNIQUENESS_FACTOR = 0.05
def convert_model_output_to_label(model_output: str, valid_labels: List[str]) -> str:

Convert model output to string class label. The model is expected to return a label directly (if it has a classification head), or a string containing a label (if it has a language modelling head). In the latter case we strip any additional text (e.g. "The answer is 2." --> "2"). If no valid label is contained in the model_output, an "unknown" label is returned. Users can define other converter_fns, e.g. to translate a text label to a string ("NEGATIVE" --> "0").

Parameters
  • model_output: Value returned by the model.
  • valid_labels: Valid labels.
Returns

model_output transformed into a label
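
For illustration, a short usage sketch of this converter; the model outputs and the label set below are invented examples:

    from fmeval.eval_algorithms.classification_accuracy import convert_model_output_to_label

    # The label "1" appears as a standalone word in the response, so it is extracted.
    convert_model_output_to_label("I would say 1", valid_labels=["0", "1"])  # returns "1"

    # No valid label appears as a standalone word, so the sentinel "unknown" label is returned.
    convert_model_output_to_label("I cannot tell", valid_labels=["0", "1"])  # returns "unknown"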

class ClassificationAccuracyScores(fmeval.transforms.transform.Transform):

This transform augments its input record with computed classification accuracy scores.

ClassificationAccuracyScores( target_output_key: str = 'target_output', model_output_key: str = 'model_output', classified_model_output_key: str = 'classified_model_output', classification_accuracy_score_key: str = 'classification_accuracy_score', valid_labels: Optional[List[str]] = None, converter_fn: Callable[[str, List[str]], str] = <function convert_model_output_to_label>)

ClassificationAccuracyScores initializer.

Parameters
  • target_output_key: The record key corresponding to the target output.
  • model_output_key: The record key corresponding to the model output.
  • classified_model_output_key: The key to use for the classified model output that will be added to the record.
  • classification_accuracy_score_key: The key to use for the classification accuracy score that will be added to the record.
  • valid_labels: See corresponding parameter in ClassificationAccuracyConfig.
  • converter_fn: See corresponding parameter in ClassificationAccuracyConfig.
target_output_key
model_output_key
classified_model_output_key
classification_accuracy_score_key
valid_labels
converter_fn
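
A rough sketch of applying this transform to a single record, using the default key names documented above; the record values are invented:

    from fmeval.eval_algorithms.classification_accuracy import ClassificationAccuracyScores

    scores_transform = ClassificationAccuracyScores(valid_labels=["0", "1"])

    # Record keys follow the default target_output_key / model_output_key values.
    record = {"target_output": "1", "model_output": "I would say 1"}
    record = scores_transform(record)

    # The transform adds "classified_model_output" ("1") and "classification_accuracy_score" (1).
    print(record["classified_model_output"], record["classification_accuracy_score"])
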
@dataclass(frozen=True)
class ClassificationAccuracyConfig(fmeval.eval_algorithms.eval_algorithm.EvalAlgorithmConfig):

Configures the Classification Accuracy evaluation algorithm.

Parameters
  • valid_labels: The labels of the classes predicted from the model.
  • converter_fn: Function to process model output into a label; defaults to convert_model_output_to_label.
  • multiclass_average_strategy: average to be passed to sklearn's precision and recall scores. This determines how scores are aggregated in the multiclass classification setting (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html). Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='micro'.
ClassificationAccuracyConfig( valid_labels: Optional[List[str]] = None, converter_fn: Callable[[str, List[str]], str] = <function convert_model_output_to_label>, multiclass_average_strategy: Optional[str] = 'micro')
valid_labels: Optional[List[str]] = None
def converter_fn(model_output: str, valid_labels: List[str]) -> str:

Convert model output to string class label. The model is expected to return a label directly (if it has a classification head), or a string containing a label (if it has a language modelling head). In the latter case we strip any additional text (e.g. "The answer is 2." --> "2"). If no valid label is contained in the model_output, an "unknown" label is returned. Users can define other converter_fns, e.g. to translate a text label to a string ("NEGATIVE" --> "0").

Parameters
  • model_output: Value returned by the model.
  • valid_labels: Valid labels.
Returns

model_output transformed into a label

multiclass_average_strategy: Optional[str] = 'micro'
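
For example, a config for a binary sentiment task might look like the following; the label set is illustrative:

    from fmeval.eval_algorithms.classification_accuracy import ClassificationAccuracyConfig

    # String labels; non-string labels would be cast to strings with a warning (see __post_init__ above).
    config = ClassificationAccuracyConfig(
        valid_labels=["0", "1"],
        multiclass_average_strategy="micro",  # forwarded to sklearn precision_score/recall_score as `average`
    )
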
class ClassificationAccuracy(fmeval.eval_algorithms.eval_algorithm.EvalAlgorithmInterface):

This evaluation measures how accurately a model performs in text classification tasks. Our built-in example task is sentiment classification where the model predicts whether a user review is positive or negative. The accuracy of its response is measured by comparing model output to target answer under different metrics:

  1. Classification accuracy: Is model_output == target_answer? This metric is computed for each datapoint as well as on average over the whole dataset.
  2. Precision: true positives / (true positives + false positives), computed once for the whole dataset. Its parameter multiclass_average_strategy can be set in the ClassificationAccuracyConfig.
  3. Recall: true positives / (true positives + false negatives), computed once for the whole dataset. Its parameter multiclass_average_strategy can be set in the ClassificationAccuracyConfig.
  4. Balanced classification accuracy: Same as accuracy in the binary case, otherwise averaged recall per class. This metric is computed once for the whole dataset.

All metrics are reported on average over num_records datapoints and per category, resulting in a number between 0 (worst) and 1 (best) for each metric.

ClassificationAccuracy( eval_algorithm_config: ClassificationAccuracyConfig = ClassificationAccuracyConfig(valid_labels=None, converter_fn=<function convert_model_output_to_label>, multiclass_average_strategy='micro'))

Default constructor

Parameters
  • eval_algorithm_config: Classification Accuracy eval algorithm config.
eval_name = 'classification_accuracy'
valid_labels
converter_fn
multiclass_average_strategy
def evaluate_sample( self, target_output: str, model_output: str) -> List[fmeval.eval_algorithms.EvalScore]:

Compute classification accuracy metrics for a single sample.

Parameters
  • target_output: The expected/desired model output.
  • model_output: The actual model output.
Returns

A single-element list with an EvalScore for the classification accuracy score.
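
A minimal usage sketch; the labels and strings below are invented, and valid_labels must be supplied via the config for evaluate_sample to work:

    from fmeval.eval_algorithms.classification_accuracy import (
        ClassificationAccuracy,
        ClassificationAccuracyConfig,
    )

    # valid_labels must be set for evaluate_sample; these values are illustrative.
    algo = ClassificationAccuracy(ClassificationAccuracyConfig(valid_labels=["0", "1"]))

    scores = algo.evaluate_sample(target_output="1", model_output="The sentiment is 1")
    # Roughly: [EvalScore(name='classification_accuracy_score', value=1)]
    print(scores)
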
def evaluate( self, model: Optional[fmeval.model_runners.model_runner.ModelRunner] = None, dataset_config: Union[fmeval.data_loaders.data_config.DataConfig, List[fmeval.data_loaders.data_config.DataConfig], NoneType] = None, prompt_template: Optional[str] = None, num_records: int = 100, save: bool = False, save_strategy: Optional[fmeval.eval_algorithms.save_strategy.SaveStrategy] = None) -> List[fmeval.eval_algorithms.EvalOutput]:

Compute classification accuracy metrics on one or more datasets.

Parameters
  • model: An instance of ModelRunner representing the model under evaluation. If this argument is None, the dataset_config argument must not be None, and must correspond to a dataset that already contains a column with model outputs.
  • dataset_config: Configures a single dataset or list of datasets used for the evaluation. If not provided, this method will run evaluations using all of its supported built-in datasets.
  • prompt_template: A template used to generate prompts that are fed to the model. If not provided, defaults will be used. If provided, model must not be None.
  • num_records: The number of records to be sampled randomly from the input dataset(s) used to perform the evaluation(s).
  • save: If set to true, prompt responses and scores will be saved to a file.
  • save_strategy: Specifies the strategy used to save the localized outputs of the evaluations. If not specified, outputs are saved to the path configured by the EVAL_RESULTS_PATH environment variable; if that variable is also not set, they are saved to the default path /tmp/eval_results/.
Returns

A list of EvalOutput objects.
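
Putting it together, a hedged end-to-end sketch of evaluate on a custom dataset that already contains model outputs (so no ModelRunner is needed). The dataset URI, dataset name, and location fields below are placeholders for your own data, not a prescription:

    from fmeval.data_loaders.data_config import DataConfig
    from fmeval.eval_algorithms.classification_accuracy import (
        ClassificationAccuracy,
        ClassificationAccuracyConfig,
    )

    # Placeholder dataset config: a JSON Lines file with one record per line.
    data_config = DataConfig(
        dataset_name="my_sentiment_dataset",
        dataset_uri="path/to/dataset.jsonl",
        dataset_mime_type="application/jsonlines",
        model_input_location="review",           # field holding the model input
        model_output_location="model_response",  # field holding a pre-computed model output
        target_output_location="label",          # field holding the ground-truth label
    )

    algo = ClassificationAccuracy(ClassificationAccuracyConfig(valid_labels=["0", "1"]))

    # Because the dataset already has a model output column, `model` can be omitted.
    eval_outputs = algo.evaluate(dataset_config=data_config, num_records=100, save=True)
    print(eval_outputs[0].dataset_scores)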