# Comprehensive config for the DataOps Data Quality module.
# Exercises ALL non-excluded schema properties at full depth.
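# Defines Glue Data Quality rulesets for customer, order, and Redshift
# inventory data, plus a recommendation-based ruleset, dynamic S3 targets,
# and SMUS publishing settings, wired to a DataOps project for resource
# resolution.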

# Name of the DataOps project this module attaches to; used for data quality
# ruleset integration and naming.
projectName: test-dataops-project

# ARN of the SNS topic that receives job notifications and workflow alerts.
# Auto-resolved from the project when projectName is set.
# Quoted because the value carries {{...}} template placeholders.
notificationTopicArn: 'arn:{{partition}}:sns:{{region}}:{{account}}:test-topic'

# Map of ruleset names to Glue Data Quality ruleset definitions for automated table validation.
rulesets:
  # Ruleset exercising structured rule array with all DataQualityRule properties
  customer-data-quality:
    # Description explaining the purpose and scope of the ruleset.
    description: Validate customer data completeness and uniqueness
    # Target table specifying which Glue Catalog table to validate.
    targetTable:
      # AWS account ID for cross-account Glue Catalog access.
      # Quoted: a plain value starting with `{` would parse as a flow mapping.
      catalogId: '{{account}}'
      # Glue database name containing the target table.
      databaseName: project:databaseName/customer-data
      # Glue table name to validate with data quality rules.
      tableName: customers
    # Ruleset as an array of structured rule objects.
    ruleset:
      # IsComplete rule — checks column has no nulls
      - ruleType: IsComplete
        # Column name for column-specific rules.
        column: customer_id

      # Uniqueness rule — percentage-based threshold check
      - ruleType: Uniqueness
        column: email
        # Comparison operator for threshold and value-based rules.
        # Quoted: `>` at the start of a plain value is a YAML indicator.
        comparisonOperator: '>'
        # Threshold value (0.0–1.0) for percentage-based rules.
        threshold: 0.95

      # RowCount rule — numeric value comparison
      - ruleType: RowCount
        comparisonOperator: '>'
        # Numeric value for count and statistical rules.
        value: 100

      # ColumnValues rule — allowed values list with 'in' operator
      - ruleType: ColumnValues
        column: status
        comparisonOperator: in
        # Allowed values list for ColumnValues rule with 'in' operator.
        values:
          - active
          - inactive
          - pending

      # ColumnDataType rule — validates column data type
      - ruleType: ColumnDataType
        column: created_at
        # Expected data type for ColumnDataType rule.
        dataType: DATE

      # DataFreshness rule — validates data recency
      # NOTE(review): no comparisonOperator is given for this rule; confirm
      # how the module combines `duration` into the generated rule.
      - ruleType: DataFreshness
        column: updated_at
        # Duration specifying maximum data age.
        duration: '24 hours'

      # CustomSql rule — custom SQL-based validation
      - ruleType: CustomSql
        # SQL query for CustomSql rule, must return a single numeric value.
        # Glue DQDL exposes the dataset under evaluation through the alias
        # `primary`, so the query selects from `primary` rather than from the
        # catalog table name.
        # NOTE(review): assumes the module passes `sql` through to DQDL
        # unchanged — confirm.
        sql: 'SELECT COUNT(*) FROM primary WHERE customer_id IS NULL'
        comparisonOperator: '='
        value: 0

      # Rule with WHERE clause — conditional validation
      - ruleType: IsComplete
        column: phone_number
        # SQL WHERE clause to filter rows before applying the rule.
        # Double-quoted because the clause itself contains single quotes.
        where: "country = 'US'"

  # Ruleset exercising raw DQDL string form
  order-data-quality:
    # Description explaining the purpose and scope of the ruleset.
    # Matches the rules below: a completeness check, an allowed-values check
    # and a row-count check (there is no freshness rule in this ruleset).
    description: Validate order data completeness and values
    # Target table to validate; no catalogId is given for this ruleset.
    targetTable:
      # Glue database name containing the target table.
      databaseName: project:databaseName/order-data
      # Glue table name to validate with data quality rules.
      tableName: orders
    # Ruleset as a raw DQDL string. The literal block scalar (`|`) keeps the
    # line breaks below as written, so no comments may be placed inside it.
    ruleset: |
      Rules = [
        IsComplete "order_id",
        ColumnValues "status" in ["pending", "completed", "cancelled"],
        RowCount > 0
      ]

  # Ruleset with Redshift source metadata and SMUS asset mapping
  redshift-inventory-quality:
    # Description explaining the purpose and scope of the ruleset.
    description: Validate inventory data from Redshift
    # Target table to validate; no catalogId is given for this ruleset.
    targetTable:
      # Glue database name containing the target table.
      databaseName: project:databaseName/inventory-data
      # Glue table name to validate with data quality rules.
      tableName: inventory
    # Source configuration describing where the data lives.
    source:
      # Kind of backing store; Redshift for this ruleset.
      sourceType: redshift
      # Connection reference — presumably resolved from the project's
      # connections, like the project:databaseName/... references; confirm.
      connectionName: project:connections/redshift-jdbc
      # Redshift table name; `public.inventory` looks schema-qualified
      # (schema.table) — confirm the expected format.
      redshiftTable: public.inventory
    # DataZone asset ID for SMUS publishing.
    smusAssetId: asset-abc-123
    # Ruleset as an array of structured rule objects (a single rule here).
    ruleset:
      # IsComplete rule on the product_id column.
      - ruleType: IsComplete
        column: product_id

  # Recommendation-based ruleset (no explicit rules)
  auto-recommended-rules:
    # Description explaining the purpose and scope of the ruleset.
    description: Auto-generated rules from Glue DQ recommendations
    # Same database/table pair as customer-data-quality; no catalogId is given.
    targetTable:
      # Glue database name containing the target table.
      databaseName: project:databaseName/customer-data
      # Glue table name to validate with data quality rules.
      tableName: customers
    # Glue Data Quality recommendation run ID. This entry has no `ruleset`
    # key; presumably the rules are taken from this run — confirm against
    # the module.
    recommendationRunId: dqrun-abc-123-def

# Dynamic targets for runtime table discovery.
dynamicTargets:
  # Name of this target entry (presumably an identifier; confirm how it is
  # used by the module).
  - name: raw-parquet-data
    # S3 directory URI for this target.
    s3DirUri: s3://my-data-lake/raw/parquet/
    # Source metadata for the target: S3-backed data in Parquet format.
    source:
      sourceType: s3
      s3Format: parquet

# SMUS publishing configuration for DataZone integration.
smusPublishing:
  # DataZone domain ID (presumably the domain results are published to;
  # confirm).
  domainId: dzd_my_domain
  # Account and region placeholders. Quoted because a plain value starting
  # with `{` would otherwise be parsed as a YAML flow mapping.
  accountId: '{{account}}'
  region: '{{region}}'
  # NOTE(review): roleArn and domainKmsKeyArn are commented out, so this
  # config does not set them — confirm that is intentional.
  # roleArn: arn:{{partition}}:iam::{{account}}:role/dq-publisher
  # domainKmsKeyArn: arn:{{partition}}:kms:{{region}}:{{account}}:key/abc-123
