# redshift.config.reference.hocon
{
# Specifies the cloud provider that the application will be deployed into
"cloud": "aws"
# Data Lake (S3) region
# This field is optional if it can be resolved with the AWS region provider chain,
# which checks places like environment variables, system properties and the AWS profile file.
# https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/regions/providers/DefaultAwsRegionProviderChain.html
"region": "us-east-1",
# SQS queue name used by Transformer and Loader to communicate
"messageQueue": "test-queue",
# Optional. S3 path that holds JSONPaths
"jsonpaths": "s3://bucket/jsonpaths/",
# Warehouse connection details
"storage" : {
# The destination DB
"type": "redshift",
# Redshift hostname
"host": "redshift.amazonaws.com",
# Database name
"database": "snowplow",
# Database port. Optional, default value 5439
"port": 5439,
# AWS Role ARN allowing Redshift to load data from S3
# Must be provided if 'NoCreds' is chosen as the load auth method.
"roleArn": "arn:aws:iam::123456789876:role/RedshiftLoadRole",
# DB schema name
"schema": "atomic",
# DB user with permissions to load data
"username": "admin",
# DB password
"password": "Supersecret1",
# Custom JDBC configuration. Optional, default value { "ssl": true }
"jdbc": { "ssl": true },
# MAXERROR, the number of acceptable loading errors. Optional, default value 10
"maxError": 10
# Specifies the auth method to use with the 'COPY' statement.
# Optional, the default method is 'NoCreds'
"loadAuthMethod": {
# With 'NoCreds', no credentials will be passed to the 'COPY' statement.
# The Redshift cluster needs to be configured with an AWS Role ARN allowing Redshift to load data from S3,
# and this Role ARN needs to be passed in the 'roleArn' field above.
# More information can be found here:
# https://docs.aws.amazon.com/redshift/latest/dg/copy-usage_notes-access-permissions.html
"type": "NoCreds"
}
#"loadAuthMethod": {
# # With 'TempCreds', temporary credentials will be created for every
# # load operation and these temporary credentials will be passed to
# # the 'COPY' statement. This way, the Redshift cluster doesn't have to be
# # configured beforehand; access is provided by the temporary credentials.
# "type": "TempCreds"
# # IAM role that is used while creating temporary credentials
# # The created credentials will allow access to the resources specified in the given role
# # In our case, "s3:GetObject*", "s3:ListBucket", and "s3:GetBucketLocation" permissions
# # for the transformer output S3 bucket should be specified in the role.
# "roleArn": "arn:aws:iam::123456789:role/role_name"
#}
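# A minimal sketch of the IAM policy such a role could carry, using the
# permissions listed above; 'acme-transformer-output' is a hypothetical
# bucket name standing in for the transformer output bucket:
# {
#   "Version": "2012-10-17",
#   "Statement": [{
#     "Effect": "Allow",
#     "Action": ["s3:GetObject*", "s3:ListBucket", "s3:GetBucketLocation"],
#     "Resource": [
#       "arn:aws:s3:::acme-transformer-output",
#       "arn:aws:s3:::acme-transformer-output/*"
#     ]
#   }]
# }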
},
"schedules": {
# Periodic schedules to stop loading, e.g. for a Redshift maintenance window
# Any number of schedules is supported, but it is recommended not to overlap them
# Schedules use the machine's local timezone (UTC is recommended)
"noOperation": [
{
# Human-readable name of the no-op window
"name": "Maintenance window",
# Cron expression with second granularity
"when": "0 0 12 * * ?",
# For how long the loader should be paused
"duration": "1 hour"
}
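# Additional windows can be listed here, e.g. (a hypothetical example using
# the same six-field cron format: second, minute, hour, day-of-month, month, day-of-week):
# {
#   "name": "Nightly housekeeping",
#   "when": "0 30 4 * * ?",
#   "duration": "2 hours"
# }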
]
}
# Observability and reporting options
"monitoring": {
# Snowplow tracking (optional)
"snowplow": {
"appId": "redshift-loader",
"collector": "snplow.acme.com",
},
# An endpoint for alerts and informational events
# Everything sent to the Snowplow collector (as properly formed self-describing events)
# will also be sent to the webhook as POST payloads with self-describing JSONs
"webhook": {
# An actual HTTP endpoint
"endpoint": "https://webhook.acme.com",
# Set of arbitrary key-value pairs attached to the payload
"tags": {
"pipeline": "production"
}
},
# Optional, for tracking runtime exceptions
"sentry": {
"dsn": "http://sentry.acme.com"
},
# Optional, configure how metrics are reported
"metrics": {
# Optional, send metrics to StatsD server
"statsd": {
"hostname": "localhost",
"port": 8125,
# Any key-value pairs to be tagged on every StatsD metric
"tags": {
"app": "rdb-loader"
}
# Optional, override the default metric prefix
# "prefix": "snowplow.rdbloader."
},
# Optional, print metrics on stdout (with slf4j)
"stdout": {
# Optional, override the default metric prefix
# "prefix": "snowplow.rdbloader."
}
# Optional, period for metrics emitted periodically
# Default value 5 minutes
# There is only one periodic metric at the moment: minimum_age_of_loaded_data,
# which specifies how old the latest event in the warehouse is.
"period": "5 minutes"
},
# Optional, configuration for periodic checks for unloaded/corrupted folders
"folders": {
# Path where the Loader can store auxiliary logs
# The Loader should be able to write here; Redshift should be able to load from here
"staging": "s3://acme-snowplow/loader/logs/",
# How often to check
"period": "1 hour"
# Specifies how far back in time folder monitoring will check
"since": "14 days"
# Specifies how close to the present folder monitoring will check
"until": "7 days"
# Path to the transformer archive (must be the same as the Transformer's `output.path`)
"transformerOutput": "s3://acme-snowplow/loader/transformed/"
# How many times the check can fail before generating an alarm instead of a warning
"failBeforeAlarm": 3
},
# Periodic DB health-check, raising a warning if the DB hasn't responded to `SELECT 1`
"healthCheck": {
# How often to query the DB
"frequency": "20 minutes",
# How long to wait for a response
"timeout": "15 seconds"
}
},
# Immediate retries configuration
# Unlike retryQueue, these retries happen immediately, without proceeding to another message
"retries": {
# Starting backoff period
"backoff": "30 seconds"
# The strategy to use when deciding on the next backoff
"strategy": "EXPONENTIAL"
# How many attempts to make before sending the message into the retry queue
# If missing, the loader will keep retrying until the cumulative bound is reached
"attempts": 3,
# When the backoff reaches this delay the Loader will stop retrying
# If both cumulativeBound and attempts are missing, the Loader will retry indefinitely
"cumulativeBound": "1 hour"
},
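# A rough sketch of how the retries settings above combine, assuming the
# EXPONENTIAL strategy roughly doubles the delay between attempts:
# attempt 1 after 30 seconds, attempt 2 after 1 minute, attempt 3 after 2 minutes,
# after which the message goes to the retry queue; without 'attempts', retries
# would continue until the delay reached the 1 hour cumulativeBound.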
# Check the target destination to make sure it is ready.
# Retry the check until the target is ready, blocking the application in the meantime
"readyCheck": {
# Starting backoff period
"backoff": "15 seconds"
# The strategy to use when deciding on the next backoff
"strategy": "CONSTANT"
},
# Retries configuration for the initialization block
# All exceptions thrown during initialization will be retried
"initRetries": {
# Starting backoff period
"backoff": "30 seconds"
# The strategy to use when deciding on the next backoff
"strategy": "EXPONENTIAL"
# How many attempts to make before sending the message into the retry queue
# If missing, the loader will keep retrying until the cumulative bound is reached
"attempts": 3,
# When the backoff reaches this delay the Loader will stop retrying
# If both cumulativeBound and attempts are missing, the Loader will retry indefinitely
"cumulativeBound": "1 hour"
},
# Additional backlog of recently failed folders that could be automatically retried
# The Retry Queue saves a failed folder and then re-reads the info from the shredding_complete S3 file
"retryQueue": {
# How often a batch of failed folders should be pulled into a discovery queue
"period": "30 minutes",
# How many failures should be kept in memory
# After the limit is reached new failures are dropped
"size": 64,
# How many attempts to make for each folder
# After the limit is reached new failures are dropped
"maxAttempts": 3,
# Artificial pause after each failed folder is added to the queue
"interval": "5 seconds"
},
"timeouts": {
# How long loading (the actual COPY statements) can take before considering Redshift unhealthy
# Without any progress (i.e. a different subfolder) within this period, the loader
# will abort the transaction
"loading": "1 hour",
# How long non-loading steps (such as ALTER TABLE or metadata queries) can take
# before considering Redshift unhealthy
"nonLoading": "10 minutes"
# SQS visibility timeout is the time window in which a message must be
# deleted (acknowledged), otherwise it is considered abandoned.
# If a message has been pulled but hasn't been deleted, the time before it
# re-appears to another consumer is equal to the visibility timeout.
# Another consequence is that if the Loader has failed while processing a message,
# the next time it will get this (or any other) message from the queue is delayed by this amount
"sqsVisibility": "5 minutes"
}
# Optional. Configure telemetry
# All the fields are optional
"telemetry": {
# Set to true to disable telemetry
"disable": false
# Interval for the heartbeat event
"interval": 15 minutes
# HTTP method used to send the heartbeat event
"method": "POST"
# URI of the collector receiving the heartbeat event
"collectorUri": "collector-g.snowplowanalytics.com"
# Port of the collector receiving the heartbeat event
"collectorPort": 443
# Whether to use https or not
"secure": true
# Identifier intended to tie events together across modules,
# infrastructure and apps when used consistently
"userProvidedId": "my_pipeline"
# ID automatically generated upon running a module's deployment script
# Intended to identify each independent module, and the infrastructure it controls
"autoGeneratedId": "hfy67e5ydhtrd"
# Unique identifier for the VM instance
# Unique for each instance of the app running within a module
"instanceId": "665bhft5u6udjf"
# Name of the terraform module that deployed the app
"moduleName": "rdb-loader-ce"
# Version of the terraform module that deployed the app
"moduleVersion": "1.0.0"
}
}