# redshift.config.reference.hocon
{
# Specifies the cloud provider that the application will be deployed into
"cloud": "aws"
# Data Lake (S3) region
# This field is optional if it can be resolved with the AWS region provider chain,
# which checks places like environment variables, system properties and the AWS profile file.
# https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/regions/providers/DefaultAwsRegionProviderChain.html
"region": "us-east-1",
# SQS queue name used by Transformer and Loader to communicate
"messageQueue": "test-queue",
# Optional. S3 path that holds JSONPaths
"jsonpaths": "s3://bucket/jsonpaths/",
# Warehouse connection details
"storage" : {
# The destination DB
"type": "redshift",
# Redshift hostname
"host": "redshift.amazonaws.com",
# Database name
"database": "snowplow",
# Database port. Optional, default value 5439
"port": 5439,
# AWS Role ARN allowing Redshift to load data from S3
# Must be provided if 'NoCreds' is chosen as the load auth method.
"roleArn": "arn:aws:iam::123456789876:role/RedshiftLoadRole",
# DB schema name
"schema": "atomic",
# DB user with permissions to load data
"username": "admin",
# DB password
"password": "Supersecret1",
# Custom JDBC configuration. Optional, default value { "ssl": true }
"jdbc": { "ssl": true },
# MAXERROR, the number of acceptable loading errors. Optional, default value 10
"maxError": 10
# Specifies the auth method to use with the 'COPY' statement.
# Optional, the default method is 'NoCreds'
"loadAuthMethod": {
# With 'NoCreds', no credentials will be passed to the 'COPY' statement.
# The Redshift cluster needs to be configured with an AWS Role ARN allowing Redshift to load data from S3,
# and this Role ARN needs to be passed in the 'roleArn' field above.
# More information can be found here:
# https://docs.aws.amazon.com/redshift/latest/dg/copy-usage_notes-access-permissions.html
"type": "NoCreds"
}
#"loadAuthMethod": {
# # With 'TempCreds', temporary credentials will be created for every
# # load operation and these temporary credentials will be passed to
# # the 'COPY' statement. This way, the Redshift cluster doesn't have to be
# # configured beforehand; access is provided by the temporary credentials.
# "type": "TempCreds"
# # IAM role that is used while creating temporary credentials
# # The created credentials will allow access to the resources specified in the given role
# # In our case, "s3:GetObject*", "s3:ListBucket", and "s3:GetBucketLocation" permissions
# # for the transformer output S3 bucket should be specified in the role.
# "roleArn": "arn:aws:iam::123456789:role/role_name"
#}
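# A minimal sketch of the IAM policy such a role could carry, using the
# permissions listed above; 'acme-transformer-output' is a hypothetical
# bucket name standing in for the transformer output bucket:
# {
#   "Version": "2012-10-17",
#   "Statement": [{
#     "Effect": "Allow",
#     "Action": ["s3:GetObject*", "s3:ListBucket", "s3:GetBucketLocation"],
#     "Resource": [
#       "arn:aws:s3:::acme-transformer-output",
#       "arn:aws:s3:::acme-transformer-output/*"
#     ]
#   }]
# }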
},
"schedules": {
# Periodic schedules to stop loading, e.g. for a Redshift maintenance window
# Any number of schedules is supported, but it is recommended not to overlap them
# Schedules use the machine's local timezone (UTC is recommended)
"noOperation": [
{
# Human-readable name of the no-op window
"name": "Maintenance window",
# Cron expression with second granularity
"when": "0 0 12 * * ?",
# For how long the loader should be paused
"duration": "1 hour"
}
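# Additional windows can be listed here, e.g. (a hypothetical example using
# the same six-field cron format: second, minute, hour, day-of-month, month, day-of-week):
# {
#   "name": "Nightly housekeeping",
#   "when": "0 30 4 * * ?",
#   "duration": "2 hours"
# }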
]
}
# Observability and reporting options
"monitoring": {
# Snowplow tracking (optional)
"snowplow": {
"appId": "redshift-loader",
"collector": "snplow.acme.com",
},
# An endpoint for alerts and informational events
# Everything sent to the Snowplow collector (as properly formed self-describing events)
# will also be sent to the webhook as POST payloads with self-describing JSONs
"webhook": {
# An actual HTTP endpoint
"endpoint": "https://webhook.acme.com",
# Set of arbitrary key-value pairs attached to the payload
"tags": {
"pipeline": "production"
}
},
# Optional, for tracking runtime exceptions
"sentry": {
"dsn": "http://sentry.acme.com"
},
# Optional, configure how metrics are reported
"metrics": {
# Optional, send metrics to StatsD server
"statsd": {
"hostname": "localhost",
"port": 8125,
# Any key-value pairs to be tagged on every StatsD metric
"tags": {
"app": "rdb-loader"
}
# Optional, override the default metric prefix
# "prefix": "snowplow.rdbloader."
},
# Optional, print metrics on stdout (with slf4j)
"stdout": {
# Optional, override the default metric prefix
# "prefix": "snowplow.rdbloader."
}
# Optional, period for metrics emitted periodically
# Default value 5 minutes
# There is only one periodic metric at the moment: minimum_age_of_loaded_data,
# which specifies how old the latest event in the warehouse is.
"period": "5 minutes"
},
# Optional, configuration for periodic checks for unloaded/corrupted folders
"folders": {
# Path where the Loader can store auxiliary logs
# The Loader should be able to write here; Redshift should be able to load from here
"staging": "s3://acme-snowplow/loader/logs/",
# How often to check
"period": "1 hour"
# Specifies how far back in time folder monitoring will check
"since": "14 days"
# Specifies how close to the present folder monitoring will check
"until": "7 days"
# Path to the transformer archive (must be the same as the Transformer's `output.path`)
"transformerOutput": "s3://acme-snowplow/loader/transformed/"
# How many times the check can fail before generating an alarm instead of a warning
"failBeforeAlarm": 3
},
# Periodic DB health-check, raising a warning if the DB hasn't responded to `SELECT 1`
"healthCheck": {
# How often to query the DB
"frequency": "20 minutes",
# How long to wait for a response
"timeout": "15 seconds"
}
},
# Immediate retries configuration
# Unlike retryQueue, these retries happen immediately, without proceeding to another message
"retries": {
# Starting backoff period
"backoff": "30 seconds"
# The strategy to use when deciding on the next backoff
"strategy": "EXPONENTIAL"
# How many attempts to make before sending the message into the retry queue
# If missing, the loader will keep retrying until the cumulative bound is reached
"attempts": 3,
# When the backoff reaches this delay the Loader will stop retrying
# If both cumulativeBound and attempts are missing, the Loader will retry indefinitely
"cumulativeBound": "1 hour"
},
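# A rough sketch of how the retries settings above combine, assuming the
# EXPONENTIAL strategy roughly doubles the delay between attempts:
# attempt 1 after 30 seconds, attempt 2 after 1 minute, attempt 3 after 2 minutes,
# after which the message goes to the retry queue; without 'attempts', retries
# would continue until the delay reached the 1 hour cumulativeBound.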
# Check the target destination to make sure it is ready.
# Retry the check until the target is ready, blocking the application in the meantime
"readyCheck": {
# Starting backoff period
"backoff": "15 seconds"
# The strategy to use when deciding on the next backoff
"strategy": "CONSTANT"
},
# Retries configuration for the initialization block
# All exceptions thrown during initialization will be retried
"initRetries": {
# Starting backoff period
"backoff": "30 seconds"
# The strategy to use when deciding on the next backoff
"strategy": "EXPONENTIAL"
# How many attempts to make before sending the message into the retry queue
# If missing, the loader will keep retrying until the cumulative bound is reached
"attempts": 3,
# When the backoff reaches this delay the Loader will stop retrying
# If both cumulativeBound and attempts are missing, the Loader will retry indefinitely
"cumulativeBound": "1 hour"
},
# Additional backlog of recently failed folders that could be automatically retried
# The Retry Queue saves a failed folder and then re-reads the info from the shredding_complete S3 file
"retryQueue": {
# How often a batch of failed folders should be pulled into a discovery queue
"period": "30 minutes",
# How many failures should be kept in memory
# After the limit is reached new failures are dropped
"size": 64,
# How many attempts to make for each folder
# After the limit is reached new failures are dropped
"maxAttempts": 3,
# Artificial pause after each failed folder is added to the queue
"interval": "5 seconds"
},
"timeouts": {
# How long loading (the actual COPY statements) can take before considering Redshift unhealthy
# Without any progress (i.e. a different subfolder) within this period, the loader
# will abort the transaction
"loading": "1 hour",
# How long non-loading steps (such as ALTER TABLE or metadata queries) can take
# before considering Redshift unhealthy
"nonLoading": "10 minutes"
# SQS visibility timeout is the time window in which a message must be
# deleted (acknowledged), otherwise it is considered abandoned.
# If a message has been pulled but hasn't been deleted, the time before it
# re-appears to another consumer is equal to the visibility timeout.
# Another consequence is that if the Loader has failed while processing a message,
# the next time it will get this (or any other) message from the queue is delayed by this amount
"sqsVisibility": "5 minutes"
}
# Optional. Configure telemetry
# All the fields are optional
"telemetry": {
# Set to true to disable telemetry
"disable": false
# Interval for the heartbeat event
"interval": 15 minutes
# HTTP method used to send the heartbeat event
"method": "POST"
# URI of the collector receiving the heartbeat event
"collectorUri": "collector-g.snowplowanalytics.com"
# Port of the collector receiving the heartbeat event
"collectorPort": 443
# Whether to use https or not
"secure": true
# Identifier intended to tie events together across modules,
# infrastructure and apps when used consistently
"userProvidedId": "my_pipeline"
# ID automatically generated upon running a module's deployment script
# Intended to identify each independent module, and the infrastructure it controls
"autoGeneratedId": "hfy67e5ydhtrd"
# Unique identifier for the VM instance
# Unique for each instance of the app running within a module
"instanceId": "665bhft5u6udjf"
# Name of the terraform module that deployed the app
"moduleName": "rdb-loader-ce"
# Version of the terraform module that deployed the app
"moduleVersion": "1.0.0"
}
}