fmeval.constants

  1from enum import Enum
  2from dataclasses import dataclass
  3from collections import OrderedDict
  4from typing import Optional
  5
  6# Output results path
  7
  8EVAL_RESULTS_PATH = "EVAL_RESULTS_PATH"
  9DEFAULT_EVAL_RESULTS_PATH = "/tmp/eval_results/"
 10
 11# Number of actors to use
 12PARALLELIZATION_FACTOR = "PARALLELIZATION_FACTOR"
 13PARTITION_MULTIPLIER = 5
 14
 15# Environment variables for SageMaker endpoint urls
 16SAGEMAKER_SERVICE_ENDPOINT_URL = "SAGEMAKER_SERVICE_ENDPOINT_URL"
 17SAGEMAKER_RUNTIME_ENDPOINT_URL = "SAGEMAKER_RUNTIME_ENDPOINT_URL"
 18
 19# We default the boto3 client region to us-west-2 as the dataset bucket cannot be accessed from opt-in regions.
 20BUILT_IN_DATASET_PREFIX = "s3://fmeval/datasets"
 21BUILT_IN_DATASET_DEFAULT_REGION = "us-west-2"
 22
 23# Mapping of iso region to built in dataset region in the same partition
 24BUILT_IN_DATASET_ISO_REGIONS = {"us-isof-south-1": "us-isof-south-1", "us-isof-east-1": "us-isof-south-1"}
 25
 26# Environment variable for disabling telemetry
 27DISABLE_FMEVAL_TELEMETRY = "DISABLE_FMEVAL_TELEMETRY"
 28
 29
 30@dataclass(frozen=True)
 31class Column:
 32    """
 33    This class represents a column in the Ray Dataset produced by
 34    an evaluation algorithm's `evaluate` method.
 35
 36    Note that some columns are created during the "data loading" stage,
 37    when the initial Ray Dataset object is created by data_loaders.util.get_dataset,
 38    while the remaining columns are created during the execution of `evaluate`.
 39    Only the contents of the columns created during the data loading stage
 40    have the potential to be casted to strings.
 41
 42    :param name: The name of the column as it appears in the Ray Dataset.
 43    :param should_cast: Whether the contents of this column should
 44        be casted to strings during data loading.
 45        This parameter is None (as opposed to False) for columns that do
 46        not exist during data loading to make it clear that casting these
 47        columns is not even a possibility to begin with.
 48    """
 49
 50    name: str
 51    should_cast: Optional[bool] = None
 52
 53
 54class DatasetColumns(Enum):
 55    """
 56    This Enum represents the columns that appear in the finalized
 57    Ray Dataset produced during the course of executing an eval algorithm's
 58    `evaluate` method.
 59
 60    These are the only columns (aside from score columns) whose
 61    data gets written to output records by `util.save_dataset`.
 62    Other algorithm-specific columns that get produced as intermediate
 63    results (for example, CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME in
 64    ClassificationAccuracy) are not included here, and thus won't
 65    get saved by `util.save_dataset`.
 66    """
 67
 68    MODEL_INPUT = Column(name="model_input", should_cast=True)
 69    PROMPT = Column(name="prompt")
 70    MODEL_OUTPUT = Column(name="model_output", should_cast=True)
 71    MODEL_LOG_PROBABILITY = Column(name="model_log_probability")
 72    TARGET_OUTPUT = Column(name="target_output", should_cast=True)
 73    CATEGORY = Column(name="category", should_cast=True)
 74    CONTEXT = Column(name="context", should_cast=True)
 75    SENT_MORE_INPUT = Column(name="sent_more_input", should_cast=True)
 76    SENT_LESS_INPUT = Column(name="sent_less_input", should_cast=True)
 77    SENT_MORE_PROMPT = Column(name="sent_more_prompt")
 78    SENT_LESS_PROMPT = Column(name="sent_less_prompt")
 79    SENT_MORE_LOG_PROB = Column(name="sent_more_log_prob", should_cast=False)
 80    SENT_LESS_LOG_PROB = Column(name="sent_less_log_prob", should_cast=False)
 81    ERROR = Column(name="error", should_cast=False)
 82
 83
 84DATASET_COLUMNS = OrderedDict((col.value.name, col) for col in DatasetColumns)
 85COLUMNS_WITH_LISTS = [DatasetColumns.CONTEXT.value.name]
 86
 87# This suffix must be included at the end of all
 88# DataConfig attribute names where the attribute
 89# represents a mechanism for locating the data for
 90# a column. An example mechanism is a JMESPath query
 91# (when the dataset format is JSON/JSON Lines).
 92DATA_CONFIG_LOCATION_SUFFIX = "_location"
 93
 94# Supported MIME types
 95MIME_TYPE_JSON = "application/json"
 96MIME_TYPE_JSONLINES = "application/jsonlines"
 97
 98SUPPORTED_MIME_TYPES = [MIME_TYPE_JSON, MIME_TYPE_JSONLINES]
 99
100# Aggregation methods
101MEAN = "mean"
102
103# Configures `save_dataset` behavior regarding how many
104# EvalOutputRecords to accumulate before writing them
105# to the output JSON Lines file.
106EVAL_OUTPUT_RECORDS_BATCH_SIZE = 1024
107
108# Dataloader seed
109SEED = 1234
110
111# Semantic robustness perturbation types
112BUTTER_FINGER = "butter_finger"
113RANDOM_UPPER_CASE = "random_upper_case"
114WHITESPACE_ADD_REMOVE = "whitespace_add_remove"
115
116PREFIX_FOR_DELTA_SCORES = "delta_"
117
118# Check if model is deterministic for first NUM_ROWS_DETERMINISTIC rows of dataset
119NUM_ROWS_DETERMINISTIC = 5
120
121MAX_ROWS_TO_TAKE = 100000
122
123# The absolute tolerance used when performing approximate numerical comparisons,
124# specifically, when comparing EvalScore objects.
125ABS_TOL = 1e-3
126
127# Jumpstart
128JUMPSTART_MODEL_ID = "jumpstart_model_id"
129JUMPSTART_MODEL_VERSION = "jumpstart_model_version"
130JUMPSTART_MODEL_TYPE = "jumpstart_model_type"
131MODEL_ID = "model_id"
132SPEC_KEY = "spec_key"
133DEFAULT_PAYLOADS = "default_payloads"
134SDK_MANIFEST_FILE = "models_manifest.json"
135PROPRIETARY_SDK_MANIFEST_FILE = "proprietary-sdk-manifest.json"
136JUMPSTART_BUCKET_BASE_URL_FORMAT = "https://jumpstart-cache-prod-{}.s3.{}.amazonaws.com"
137JUMPSTART_BUCKET_BASE_URL_FORMAT_ENV_VAR = "JUMPSTART_BUCKET_BASE_URL_FORMAT"
138GENERATED_TEXT_JMESPATH_EXPRESSION = "*.output_keys.generated_text"
139INPUT_LOG_PROBS_JMESPATH_EXPRESSION = "*.output_keys.input_logprobs"
140EMBEDDING_JMESPATH_EXPRESSION = "embedding"
141IS_EMBEDDING_MODEL = "is_embedding_model"
142
143# BERTScore
144BERTSCORE_DEFAULT_MODEL = "microsoft/deberta-xlarge-mnli"
145
146
147# S3 multi-part upload constants
148UPLOAD_ID = "UploadId"
149PARTS = "Parts"
150E_TAG = "ETag"
151PART_NUMBER = "PartNumber"
EVAL_RESULTS_PATH = 'EVAL_RESULTS_PATH'
DEFAULT_EVAL_RESULTS_PATH = '/tmp/eval_results/'
PARALLELIZATION_FACTOR = 'PARALLELIZATION_FACTOR'
PARTITION_MULTIPLIER = 5
SAGEMAKER_SERVICE_ENDPOINT_URL = 'SAGEMAKER_SERVICE_ENDPOINT_URL'
SAGEMAKER_RUNTIME_ENDPOINT_URL = 'SAGEMAKER_RUNTIME_ENDPOINT_URL'
BUILT_IN_DATASET_PREFIX = 's3://fmeval/datasets'
BUILT_IN_DATASET_DEFAULT_REGION = 'us-west-2'
BUILT_IN_DATASET_ISO_REGIONS = {'us-isof-south-1': 'us-isof-south-1', 'us-isof-east-1': 'us-isof-south-1'}
DISABLE_FMEVAL_TELEMETRY = 'DISABLE_FMEVAL_TELEMETRY'
@dataclass(frozen=True)
class Column:
    """
    A single column of the Ray Dataset produced by an evaluation
    algorithm's `evaluate` method.

    Some columns already exist after the "data loading" stage, i.e. once
    data_loaders.util.get_dataset has created the initial Ray Dataset;
    the others are created while `evaluate` executes. Only columns from
    the data loading stage can potentially have their contents cast to
    strings.

    :param name: Name of the column as it appears in the Ray Dataset.
    :param should_cast: Whether the column's contents should be cast to
        strings during data loading. None (as opposed to False) marks a
        column that does not exist during data loading, making it clear
        that casting it is not a possibility to begin with.
    """

    # Name of the column as it appears in the Ray Dataset.
    name: str
    # None means the column does not exist during data loading.
    should_cast: Optional[bool] = None

This class represents a column in the Ray Dataset produced by an evaluation algorithm's evaluate method.

Note that some columns are created during the "data loading" stage, when the initial Ray Dataset object is created by `data_loaders.util.get_dataset`, while the remaining columns are created during the execution of `evaluate`. Only the contents of the columns created during the data loading stage have the potential to be cast to strings.

Parameters
  • name: The name of the column as it appears in the Ray Dataset.
  • should_cast: Whether the contents of this column should be cast to strings during data loading. This parameter is `None` (as opposed to `False`) for columns that do not exist during data loading, to make it clear that casting these columns is not even a possibility to begin with.
Column(name: str, should_cast: Optional[bool] = None)
name: str
should_cast: Optional[bool] = None
class DatasetColumns(Enum):
    """
    The columns found in the finalized Ray Dataset that is produced
    while an eval algorithm's `evaluate` method executes.

    Score columns aside, only the columns listed here have their data
    written to output records by `util.save_dataset`. Intermediate,
    algorithm-specific columns (for example,
    CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME in ClassificationAccuracy) are not
    members of this Enum and are therefore not saved by `util.save_dataset`.
    """

    # Declaration order is preserved when the Enum is iterated.
    MODEL_INPUT = Column("model_input", should_cast=True)
    PROMPT = Column("prompt")
    MODEL_OUTPUT = Column("model_output", should_cast=True)
    MODEL_LOG_PROBABILITY = Column("model_log_probability")
    TARGET_OUTPUT = Column("target_output", should_cast=True)
    CATEGORY = Column("category", should_cast=True)
    CONTEXT = Column("context", should_cast=True)

    # Paired "sent_more" / "sent_less" columns.
    SENT_MORE_INPUT = Column("sent_more_input", should_cast=True)
    SENT_LESS_INPUT = Column("sent_less_input", should_cast=True)
    SENT_MORE_PROMPT = Column("sent_more_prompt")
    SENT_LESS_PROMPT = Column("sent_less_prompt")
    SENT_MORE_LOG_PROB = Column("sent_more_log_prob", should_cast=False)
    SENT_LESS_LOG_PROB = Column("sent_less_log_prob", should_cast=False)

    ERROR = Column("error", should_cast=False)

This Enum represents the columns that appear in the finalized Ray Dataset produced during the course of executing an eval algorithm's evaluate method.

These are the only columns (aside from score columns) whose data gets written to output records by util.save_dataset. Other algorithm-specific columns that get produced as intermediate results (for example, CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME in ClassificationAccuracy) are not included here, and thus won't get saved by util.save_dataset.

MODEL_INPUT = <DatasetColumns.MODEL_INPUT: Column(name='model_input', should_cast=True)>
PROMPT = <DatasetColumns.PROMPT: Column(name='prompt', should_cast=None)>
MODEL_OUTPUT = <DatasetColumns.MODEL_OUTPUT: Column(name='model_output', should_cast=True)>
MODEL_LOG_PROBABILITY = <DatasetColumns.MODEL_LOG_PROBABILITY: Column(name='model_log_probability', should_cast=None)>
TARGET_OUTPUT = <DatasetColumns.TARGET_OUTPUT: Column(name='target_output', should_cast=True)>
CATEGORY = <DatasetColumns.CATEGORY: Column(name='category', should_cast=True)>
CONTEXT = <DatasetColumns.CONTEXT: Column(name='context', should_cast=True)>
SENT_MORE_INPUT = <DatasetColumns.SENT_MORE_INPUT: Column(name='sent_more_input', should_cast=True)>
SENT_LESS_INPUT = <DatasetColumns.SENT_LESS_INPUT: Column(name='sent_less_input', should_cast=True)>
SENT_MORE_PROMPT = <DatasetColumns.SENT_MORE_PROMPT: Column(name='sent_more_prompt', should_cast=None)>
SENT_LESS_PROMPT = <DatasetColumns.SENT_LESS_PROMPT: Column(name='sent_less_prompt', should_cast=None)>
SENT_MORE_LOG_PROB = <DatasetColumns.SENT_MORE_LOG_PROB: Column(name='sent_more_log_prob', should_cast=False)>
SENT_LESS_LOG_PROB = <DatasetColumns.SENT_LESS_LOG_PROB: Column(name='sent_less_log_prob', should_cast=False)>
ERROR = <DatasetColumns.ERROR: Column(name='error', should_cast=False)>
DATASET_COLUMNS = OrderedDict([('model_input', <DatasetColumns.MODEL_INPUT: Column(name='model_input', should_cast=True)>), ('prompt', <DatasetColumns.PROMPT: Column(name='prompt', should_cast=None)>), ('model_output', <DatasetColumns.MODEL_OUTPUT: Column(name='model_output', should_cast=True)>), ('model_log_probability', <DatasetColumns.MODEL_LOG_PROBABILITY: Column(name='model_log_probability', should_cast=None)>), ('target_output', <DatasetColumns.TARGET_OUTPUT: Column(name='target_output', should_cast=True)>), ('category', <DatasetColumns.CATEGORY: Column(name='category', should_cast=True)>), ('context', <DatasetColumns.CONTEXT: Column(name='context', should_cast=True)>), ('sent_more_input', <DatasetColumns.SENT_MORE_INPUT: Column(name='sent_more_input', should_cast=True)>), ('sent_less_input', <DatasetColumns.SENT_LESS_INPUT: Column(name='sent_less_input', should_cast=True)>), ('sent_more_prompt', <DatasetColumns.SENT_MORE_PROMPT: Column(name='sent_more_prompt', should_cast=None)>), ('sent_less_prompt', <DatasetColumns.SENT_LESS_PROMPT: Column(name='sent_less_prompt', should_cast=None)>), ('sent_more_log_prob', <DatasetColumns.SENT_MORE_LOG_PROB: Column(name='sent_more_log_prob', should_cast=False)>), ('sent_less_log_prob', <DatasetColumns.SENT_LESS_LOG_PROB: Column(name='sent_less_log_prob', should_cast=False)>), ('error', <DatasetColumns.ERROR: Column(name='error', should_cast=False)>)])
COLUMNS_WITH_LISTS = ['context']
DATA_CONFIG_LOCATION_SUFFIX = '_location'
MIME_TYPE_JSON = 'application/json'
MIME_TYPE_JSONLINES = 'application/jsonlines'
SUPPORTED_MIME_TYPES = ['application/json', 'application/jsonlines']
MEAN = 'mean'
EVAL_OUTPUT_RECORDS_BATCH_SIZE = 1024
SEED = 1234
BUTTER_FINGER = 'butter_finger'
RANDOM_UPPER_CASE = 'random_upper_case'
WHITESPACE_ADD_REMOVE = 'whitespace_add_remove'
PREFIX_FOR_DELTA_SCORES = 'delta_'
NUM_ROWS_DETERMINISTIC = 5
MAX_ROWS_TO_TAKE = 100000
ABS_TOL = 0.001
JUMPSTART_MODEL_ID = 'jumpstart_model_id'
JUMPSTART_MODEL_VERSION = 'jumpstart_model_version'
JUMPSTART_MODEL_TYPE = 'jumpstart_model_type'
MODEL_ID = 'model_id'
SPEC_KEY = 'spec_key'
DEFAULT_PAYLOADS = 'default_payloads'
SDK_MANIFEST_FILE = 'models_manifest.json'
PROPRIETARY_SDK_MANIFEST_FILE = 'proprietary-sdk-manifest.json'
JUMPSTART_BUCKET_BASE_URL_FORMAT = 'https://jumpstart-cache-prod-{}.s3.{}.amazonaws.com'
JUMPSTART_BUCKET_BASE_URL_FORMAT_ENV_VAR = 'JUMPSTART_BUCKET_BASE_URL_FORMAT'
GENERATED_TEXT_JMESPATH_EXPRESSION = '*.output_keys.generated_text'
INPUT_LOG_PROBS_JMESPATH_EXPRESSION = '*.output_keys.input_logprobs'
EMBEDDING_JMESPATH_EXPRESSION = 'embedding'
IS_EMBEDDING_MODEL = 'is_embedding_model'
BERTSCORE_DEFAULT_MODEL = 'microsoft/deberta-xlarge-mnli'
UPLOAD_ID = 'UploadId'
PARTS = 'Parts'
E_TAG = 'ETag'
PART_NUMBER = 'PartNumber'