# instance.properties
## The following properties are commonly used throughout Sleeper.
# A string to uniquely identify this deployment. This should be no longer than 20 chars. It should be
# globally unique as it will be used to name AWS resources such as S3 buckets.
sleeper.id=full-example
# The S3 bucket containing the jar files of the Sleeper components.
sleeper.jars.bucket=the name of the bucket containing your jars, e.g. sleeper-<insert-unique-name-here>-jars
# A comma-separated list of the jars containing application specific iterator code. These jars are
# assumed to be in the bucket given by sleeper.jars.bucket, e.g. if that bucket contains two iterator
# jars called iterator1.jar and iterator2.jar then the property should be
# 'sleeper.userjars=iterator1.jar,iterator2.jar'.
# sleeper.userjars=
# A name for a tag to identify the stack that deployed a resource. This will be set for all AWS
# resources, to the ID of the CDK stack that they are deployed under. This can be used to organise the
# cost explorer for billing.
sleeper.stack.tag.name=DeploymentStack
# Whether to keep the sleeper table bucket, Dynamo tables, query results bucket, etc., when the
# instance is destroyed.
sleeper.retain.infra.after.destroy=true
# The optional stacks to deploy.
sleeper.optional.stacks=CompactionStack,GarbageCollectorStack,IngestStack,PartitionSplittingStack,QueryStack,AthenaStack,EmrServerlessBulkImportStack,EmrStudioStack,DashboardStack
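# As an illustrative sketch (not part of the original template), a smaller deployment could trim
# this list, keeping for example just the stacks needed for standard ingest, compaction and queries:
# sleeper.optional.stacks=IngestStack,CompactionStack,GarbageCollectorStack,QueryStack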
# The AWS account number. This is the AWS account that the instance will be deployed to.
sleeper.account=1234567890
# The AWS region to deploy to.
sleeper.region=eu-west-2
# The id of the VPC to deploy to.
sleeper.vpc=1234567890
# Whether to check that the VPC that the instance is deployed to has an S3 endpoint. If there is no S3
# endpoint then the NAT costs can be very significant.
sleeper.vpc.endpoint.check=true
# A comma-separated list of subnets to deploy to. ECS tasks will be run across multiple subnets. EMR
# clusters will be deployed in a subnet chosen when the cluster is created.
sleeper.subnets=subnet-abcdefgh
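# Multiple subnets can be given as a comma-separated list; the second ID below is an illustrative
# placeholder only, e.g.:
# sleeper.subnets=subnet-abcdefgh,subnet-ijklmnop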
# The Hadoop filesystem used to connect to S3.
sleeper.filesystem=s3a://
# An email address used by the TopicStack to publish SNS notifications of errors.
# sleeper.errors.email=
# The visibility timeout on the queues used in ingest, query, etc.
sleeper.queue.visibility.timeout.seconds=900
# The length of time in days that CloudWatch logs from lambda functions, ECS containers, etc., are
# retained.
# See https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-logs-loggroup.html
# for valid options.
# Use -1 to indicate infinite retention.
sleeper.log.retention.days=30
# Used to set the value of fs.s3a.connection.maximum on the Hadoop configuration. This controls the
# maximum number of http connections to S3.
# See https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html
sleeper.fs.s3a.max-connections=25
# Used to set the value of fs.s3a.block.size on the Hadoop configuration. Uploads to S3 happen in
# blocks, and this sets the size of blocks. If a larger value is used, then more data is buffered
# before the upload begins.
# See https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html
sleeper.fs.s3a.upload.block.size=32M
# The version of Fargate to use.
sleeper.fargate.version=1.4.0
# The amount of memory for the lambda that creates ECS tasks to execute compaction and ingest jobs.
sleeper.task.runner.memory=1024
# The timeout in seconds for the lambda that creates ECS tasks to execute compaction jobs and ingest
# jobs.
# This must be >0 and <= 900.
sleeper.task.runner.timeout.seconds=900
# The namespaces for the metrics used in the metrics stack.
sleeper.metrics.namespace=Sleeper
# If true, properties will be reloaded every time a long running job is started or a lambda is run.
# This will mainly be used in test scenarios to ensure properties are up to date.
sleeper.properties.force.reload=false
# If set, this property will be used as a prefix for the names of ECR repositories. If unset, then the
# instance ID will be used to determine the names instead.
# Note: This is only used by the deployment scripts to upload Docker images, not the CDK. We may add
# the ability to use this in the CDK in the future.
# sleeper.ecr.repository.prefix=
# This specifies whether point in time recovery is enabled for the DynamoDB state store. This is set
# on the DynamoDB tables.
sleeper.metadata.dynamo.pointintimerecovery=false
# This specifies whether point in time recovery is enabled for the S3 state store. This is set on the
# revision DynamoDB table.
sleeper.metadata.s3.dynamo.pointintimerecovery=false
# This specifies whether point in time recovery is enabled for the Sleeper table index. This is set on
# the DynamoDB tables.
sleeper.tables.index.dynamo.pointintimerecovery=false
# The timeout in minutes for when the table properties provider cache should be cleared, forcing table
# properties to be reloaded from S3.
sleeper.table.properties.provider.timeout.minutes=60
## The following properties relate to standard ingest.
# The name of the ECR repository for the ingest container. The Docker image from the ingest module
# should have been uploaded to an ECR repository of this name in this account.
sleeper.ingest.repo=<insert-unique-sleeper-id>/ingest
# The maximum number of concurrent ECS tasks to run.
sleeper.ingest.max.concurrent.tasks=200
# The frequency in minutes with which an EventBridge rule runs to trigger a lambda that, if necessary,
# runs more ECS tasks to perform ingest jobs.
sleeper.ingest.task.creation.period.minutes=1
# The frequency, in seconds, with which change message visibility requests are sent to extend the
# visibility of messages on the ingest queue so that they are not processed by other processes.
# This should be less than the value of sleeper.queue.visibility.timeout.seconds.
sleeper.ingest.keepalive.period.seconds=300
# This sets the value of fs.s3a.experimental.input.fadvise on the Hadoop configuration used to read
# and write files to and from S3 in ingest jobs. Changing this value allows you to fine-tune how files
# are read. Possible values are "normal", "sequential" and "random". More information is available
# here:
# https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/performance.html#fadvise.
sleeper.ingest.fs.s3a.experimental.input.fadvise=sequential
# The amount of CPU used by Fargate tasks that perform ingest jobs.
# Note that only certain combinations of CPU and memory are valid.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.ingest.task.cpu=2048
# The amount of memory used by Fargate tasks that perform ingest jobs.
# Note that only certain combinations of CPU and memory are valid.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.ingest.task.memory=4096
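# For reference (not in the original file): Fargate only accepts certain CPU/memory pairings. With
# 2048 CPU units (2 vCPU), task memory must be between 4096 and 16384 MB, so the 2048/4096
# combination above is a valid pairing; other pairings are listed at the AWS link above.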
# The frequency in seconds with which ingest tasks refresh their view of the partitions.
# (NB Refreshes only happen once a batch of data has been written so this is a lower bound on the
# refresh frequency.)
sleeper.ingest.partition.refresh.period=120
# A comma-separated list of buckets that contain files to be ingested via ingest jobs. The buckets
# should already exist, i.e. they will not be created as part of the CDK deployment of this instance
# of Sleeper. The ingest and bulk import stacks will be given read access to these buckets so that
# they can consume data from them.
# sleeper.ingest.source.bucket=
# A comma-separated list of role names which should be able to ingest data into Sleeper.
# sleeper.ingest.source.role=
# The way in which records are held in memory before they are written to a local store.
# Valid values are 'arraylist' and 'arrow'.
# The arraylist method is simpler, but it is slower and requires careful tuning of the number of
# records in each batch.
sleeper.ingest.record.batch.type=arrow
# The way in which partition files are written to the main Sleeper store.
# Valid values are 'direct' (which writes using the s3a Hadoop file system) and 'async' (which writes
# locally and then copies the completed Parquet file asynchronously into S3).
# The direct method is simpler but the async method should provide better performance when the number
# of partitions is large.
sleeper.ingest.partition.file.writer.type=async
# Flag to enable/disable storage of tracking information for ingest jobs and tasks.
sleeper.ingest.status.store.enabled=true
# The time to live in seconds for ingest job updates in the status store. Default is 1 week.
# The expiry time is fixed when an update is saved to the store, so changing this will only affect new
# data.
sleeper.ingest.job.status.ttl=604800
# The time to live in seconds for ingest task updates in the status store. Default is 1 week.
# The expiry time is fixed when an update is saved to the store, so changing this will only affect new
# data.
sleeper.ingest.task.status.ttl=604800
# The time in seconds to wait for ingest jobs to appear on the queue before an ingest task terminates.
# Must be >= 0 and <= 20.
# See also
# https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-short-and-long-polling.html
sleeper.ingest.job.queue.wait.time=20
# The maximum number of records written to local file in an ingest job. (Records are written in sorted
# order to local disk before being uploaded to S3. Increasing this value increases the amount of time
# before data is visible in the system, but increases the number of records written to S3 in a batch,
# therefore reducing costs.)
# (arraylist-based ingest only)
sleeper.ingest.max.local.records=100000000
# The maximum number of records to read into memory in an ingest job. (Up to
# sleeper.ingest.memory.max.batch.size records are read into memory before being sorted and written to
# disk. This process is repeated until sleeper.ingest.max.local.records records have been written to
# local files. Then the sorted files are merged and the data is written to sorted files in S3.)
# (arraylist-based ingest only)
sleeper.ingest.memory.max.batch.size=1000000
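# As a worked illustration of the two defaults above (not part of the original file): with
# 100,000,000 max local records and 1,000,000 records per in-memory batch, up to roughly
# 100,000,000 / 1,000,000 = 100 sorted local files are written before they are merged and uploaded
# to S3.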
# The number of bytes to allocate to the Arrow working buffer. This buffer is used for sorting and
# other sundry activities. Note that this is off-heap memory, which is in addition to the memory
# assigned to the JVM.
# (arrow-based ingest only) [256MB]
sleeper.ingest.arrow.working.buffer.bytes=268435456
# The number of bytes to allocate to the Arrow batch buffer, which is used to hold the records before
# they are written to local disk. A larger value means that the local disk holds fewer, larger files,
# which are more efficient to merge together during an upload to S3. Larger values may require a
# larger working buffer. Note that this is off-heap memory, which is in addition to the memory
# assigned to the JVM.
# (arrow-based ingest only) [1GB]
sleeper.ingest.arrow.batch.buffer.bytes=1073741824
# The maximum number of bytes to store on the local disk before uploading to the main Sleeper store. A
# larger value reduces the number of S3 PUTs that are required to upload the data to S3 and results
# in fewer files per partition.
# (arrow-based ingest only) [2GB]
sleeper.ingest.arrow.max.local.store.bytes=2147483648
# The number of records to write at once into an Arrow file in the local store. A single Arrow file
# contains many of these micro-batches and so this parameter does not significantly affect the final
# size of the Arrow file. Larger values may require a larger working buffer.
# (arrow-based ingest only) [1K]
sleeper.ingest.arrow.max.single.write.to.file.records=1024
# The implementation of the async S3 client to use for upload during ingest.
# Valid values are 'java' or 'crt'. This determines the implementation of S3AsyncClient that gets
# used.
# With 'java' it makes a single PutObject request for each file.
# With 'crt' it uses the AWS Common Runtime (CRT) to make multipart uploads.
# Note that the CRT option is recommended. Using the Java option may cause failures if any file is
# >5GB in size, and will lead to the following warning:
# "The provided S3AsyncClient is not an instance of S3CrtAsyncClient, and thus multipart
# upload/download feature is not enabled and resumable file upload is not supported. To benefit from
# maximum throughput, consider using S3AsyncClient.crtBuilder().build() instead."
# (async partition file writer only)
sleeper.ingest.async.client.type=crt
# The part size in bytes to use for multipart uploads.
# (CRT async ingest only) [128MB]
sleeper.ingest.async.crt.part.size.bytes=134217728
# The target throughput for multipart uploads, in GB/s. Determines how many parts should be uploaded
# simultaneously.
# (CRT async ingest only)
sleeper.ingest.async.crt.target.throughput.gbps=10
# The amount of memory in MB for the lambda that receives submitted requests to ingest files.
sleeper.ingest.batcher.submitter.memory.mb=1024
# The timeout in seconds for the lambda that receives submitted requests to ingest files.
sleeper.ingest.batcher.submitter.timeout.seconds=20
# The amount of memory in MB for the lambda that creates ingest jobs from submitted file ingest
# requests.
sleeper.ingest.batcher.job.creation.memory.mb=1024
# The timeout in seconds for the lambda that creates ingest jobs from submitted file ingest requests.
sleeper.ingest.batcher.job.creation.timeout.seconds=900
# The rate at which the ingest batcher job creation lambda runs (in minutes, must be >=1).
sleeper.ingest.batcher.job.creation.period.minutes=1
## The following properties relate to bulk import, i.e. ingesting data using Spark jobs running on EMR
## or EKS.
# The class to use to perform the bulk import. The default value below uses Spark Dataframes. There is
# an alternative option that uses RDDs (sleeper.bulkimport.job.runner.rdd.BulkImportJobRDDDriver).
sleeper.bulk.import.class.name=sleeper.bulkimport.job.runner.dataframelocalsort.BulkImportDataframeLocalSortDriver
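# As an illustration, switching to the RDD-based runner mentioned above would mean setting the
# property as follows (shown commented out here):
# sleeper.bulk.import.class.name=sleeper.bulkimport.job.runner.rdd.BulkImportJobRDDDriver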
# The compression codec for map status results. Used to set spark.shuffle.mapStatus.compression.codec.
# Stops "Decompression error: Version not supported" errors - only a value of "lz4" has been tested.
sleeper.bulk.import.emr.spark.shuffle.mapStatus.compression.codec=lz4
# If true then speculative execution of tasks will be performed. Used to set spark.speculation.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.speculation=false
# Fraction of tasks which must be complete before speculation is enabled for a particular stage. Used
# to set spark.speculation.quantile.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.spark.speculation.quantile=0.75
# The amount of memory allocated to a Spark executor. Used to set spark.executor.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.memory=16g
# The amount of memory allocated to the Spark driver. Used to set spark.driver.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.memory=16g
# The number of executors. Used to set spark.executor.instances.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.instances=29
# The memory overhead for an executor. Used to set spark.executor.memoryOverhead.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.memory.overhead=2g
# The memory overhead for the driver. Used to set spark.driver.memoryOverhead.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.memory.overhead=2g
# The default parallelism for a Spark job. Used to set spark.default.parallelism.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.default.parallelism=290
# The number of partitions used in a Spark SQL/dataframe shuffle operation. Used to set
# spark.sql.shuffle.partitions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.sql.shuffle.partitions=290
# (Non-persistent or persistent EMR mode only) An EC2 keypair to use for the EC2 instances. Specifying
# this will allow you to SSH to the nodes in the cluster while it's running.
sleeper.bulk.import.emr.keypair.name=my-key
# (Non-persistent or persistent EMR mode only) Specifying this security group causes the group to be
# added to the EMR master's list of security groups.
# sleeper.bulk.import.emr.master.additional.security.group=
# (Non-persistent or persistent EMR mode only) The number of cores used by an executor. Used to set
# spark.executor.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.cores=5
# (Non-persistent or persistent EMR mode only) The number of cores used by the driver. Used to set
# spark.driver.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.cores=5
# (Non-persistent or persistent EMR mode only) The default timeout for network interactions in Spark.
# Used to set spark.network.timeout.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.network.timeout=800s
# (Non-persistent or persistent EMR mode only) The interval between heartbeats from executors to the
# driver. Used to set spark.executor.heartbeatInterval.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.heartbeat.interval=60s
# (Non-persistent or persistent EMR mode only) Whether Spark should use dynamic allocation to scale
# resources up and down. Used to set spark.dynamicAllocation.enabled.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.dynamic.allocation.enabled=false
# (Non-persistent or persistent EMR mode only) The fraction of heap space used for execution and
# storage. Used to set spark.memory.fraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.memory.fraction=0.80
# (Non-persistent or persistent EMR mode only) The amount of storage memory immune to eviction,
# expressed as a fraction of the heap space used for execution and storage. Used to set
# spark.memory.storageFraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.memory.storage.fraction=0.30
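# As a rough worked example based on the executor settings above (approximate figures only): with
# spark.executor.memory=16g, about 0.80 * 16g = 12.8g of heap is available for execution and
# storage, of which about 0.30 * 12.8g = 3.8g is storage memory immune to eviction.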
# (Non-persistent or persistent EMR mode only) JVM options passed to the executors. Used to set
# spark.executor.extraJavaOptions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.extra.java.options=-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark -XX:InitiatingHeapOccupancyPercent=35 -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p'
# (Non-persistent or persistent EMR mode only) JVM options passed to the driver. Used to set
# spark.driver.extraJavaOptions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.extra.java.options=-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark -XX:InitiatingHeapOccupancyPercent=35 -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p'
# (Non-persistent or persistent EMR mode only) The maximum number of executor failures before YARN can
# fail the application. Used to set spark.yarn.scheduler.reporterThread.maxFailures.
# See
# https://aws.amazon.com/blogs/big-data/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr/.
sleeper.bulk.import.emr.spark.yarn.scheduler.reporter.thread.max.failures=5
# (Non-persistent or persistent EMR mode only) The storage to use for temporary caching. Used to set
# spark.storage.level.
# See
# https://aws.amazon.com/blogs/big-data/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr/.
sleeper.bulk.import.emr.spark.storage.level=MEMORY_AND_DISK_SER
# (Non-persistent or persistent EMR mode only) Whether to compress serialized RDD partitions. Used to
# set spark.rdd.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.rdd.compress=true
# (Non-persistent or persistent EMR mode only) Whether to compress map output files. Used to set
# spark.shuffle.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.shuffle.compress=true
# (Non-persistent or persistent EMR mode only) Whether to compress data spilled during shuffles. Used
# to set spark.shuffle.spill.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.shuffle.spill.compress=true
# (Non-persistent or persistent EMR mode only) The size of the EBS volume in gibibytes (GiB).
# This can be a number from 10 to 1024.
sleeper.bulk.import.emr.ebs.volume.size.gb=256
# (Non-persistent or persistent EMR mode only) The type of the EBS volume.
# Valid values are 'gp2', 'gp3', 'io1', 'io2'.
sleeper.bulk.import.emr.ebs.volume.type=gp2
# (Non-persistent or persistent EMR mode only) The number of EBS volumes per instance.
# This can be a number from 1 to 25.
sleeper.bulk.import.emr.ebs.volumes.per.instance=4
# The architecture for EMR Serverless to use. X86_64 or ARM64 (Coming soon)
sleeper.bulk.import.emr.serverless.architecture=X86_64
# The version of EMR Serverless to use.
sleeper.bulk.import.emr.serverless.release=emr-6.13.0
# The name of the repository for the EMR serverless container. The Docker image from the bulk-import
# module should have been uploaded to an ECR repository of this name in this account.
sleeper.bulk.import.emr.serverless.repo=<insert-unique-sleeper-id>/bulk-import-runner-emr-serverless
# Set to true to allow an EMR Serverless Application to start automatically when a job is submitted.
sleeper.bulk.import.emr.serverless.autostart.enabled=true
# Set to true to allow an EMR Serverless Application to stop automatically when there are no jobs to
# process.
# Turning this off when pre-initialised capacity is also turned off is not recommended.
sleeper.bulk.import.emr.serverless.autostop.enabled=true
# The number of minutes of inactivity before EMR Serverless stops the application.
sleeper.bulk.import.emr.serverless.autostop.timeout=15
# The number of cores used by a Serverless executor. Used to set spark.executor.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.cores=4
# The amount of memory allocated to a Serverless executor. Used to set spark.executor.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.memory=16G
# The amount of storage allocated to a Serverless executor.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.emr-serverless.executor.disk=200G
# The number of executors to be used with Serverless. Used to set spark.executor.instances.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.instances=36
# The number of cores used by the Serverless Spark driver. Used to set spark.driver.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.driver.cores=4
# The amount of memory allocated to the Serverless Spark driver. Used to set spark.driver.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.driver.memory=16G
# The path to JAVA_HOME to be used by the custom image for bulk import.
sleeper.bulk.import.emr.serverless.spark.executorEnv.JAVA_HOME=/usr/lib/jvm/jre-11
# Whether Spark should use dynamic allocation to scale resources up and down. Used to set
# spark.dynamicAllocation.enabled. See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.dynamic.allocation.enabled=false
# Whether to compress serialized RDD partitions. Used to set spark.rdd.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.rdd.compress=true
# Whether to compress map output files. Used to set spark.shuffle.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.shuffle.compress=true
# Whether to compress data spilled during shuffles. Used to set spark.shuffle.spill.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.shuffle.spill.compress=true
# The default parallelism for a Spark job. Used to set spark.default.parallelism.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.default.parallelism=288
# The number of partitions used in a Spark SQL/dataframe shuffle operation. Used to set
# spark.sql.shuffle.partitions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.sql.shuffle.partitions=288
# The default timeout for network interactions in Spark. Used to set spark.network.timeout.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.network.timeout=800s
# The interval between heartbeats from executors to the driver. Used to set
# spark.executor.heartbeatInterval.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.heartbeat.interval=60s
# The fraction of heap space used for execution and storage. Used to set spark.memory.fraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.memory.fraction=0.80
# The amount of storage memory immune to eviction, expressed as a fraction of the heap space used for
# execution and storage. Used to set spark.memory.storageFraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.memory.storage.fraction=0.30
# If true then speculative execution of tasks will be performed. Used to set spark.speculation.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.speculation=false
# Fraction of tasks which must be complete before speculation is enabled for a particular stage. Used
# to set spark.speculation.quantile.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.speculation.quantile=0.75
# The compression codec for map status results. Used to set spark.shuffle.mapStatus.compression.codec.
# Stops "Decompression error: Version not supported" errors - only a value of "lz4" has been tested.
sleeper.bulk.import.emr.serverless.spark.shuffle.mapStatus.compression.codec=lz4
# Set to enable the pre-initialised capacity option for the EMR Serverless application.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.enabled=false
# The number of executors to pre-initialise.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.count=72
# The number of vCPUs per executor for the pre-initialised capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.cores=4vCPU
# The amount of memory per executor for the pre-initialised capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.memory=18GB
# The amount of storage per executor for the pre-initialised capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.disk=200GB
# The number of drivers to pre-initialise.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.count=5
# The number of vCPUs per driver for the pre-initialised capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.cores=4vCPU
# The amount of memory per driver for the pre-initialised capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.memory=18GB
# The amount of storage per driver for the pre-initialised capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.disk=20GB
# (Non-persistent EMR mode only) The default EMR release label to be used when creating an EMR cluster
# for bulk importing data using Spark running on EMR.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.release.label=emr-6.13.0
# (Non-persistent EMR mode only) Which architecture to be used for EC2 instance types in the EMR
# cluster. Must be either "x86_64", "arm64" or "x86_64,arm64". For more information, see the Bulk
# import using EMR - Instance types section in docs/05-ingest.md
sleeper.default.bulk.import.emr.instance.architecture=x86_64
# (Non-persistent EMR mode only) The default EC2 x86_64 instance types and weights to be used for the
# master node of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.default.bulk.import.emr.master.x86.instance.types=m6i.xlarge
# (Non-persistent EMR mode only) The default EC2 x86_64 instance types and weights to be used for the
# executor nodes of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.default.bulk.import.emr.executor.x86.instance.types=m6i.4xlarge
# (Non-persistent EMR mode only) The default EC2 ARM64 instance types and weights to be used for the
# master node of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.default.bulk.import.emr.master.arm.instance.types=m6g.xlarge
# (Non-persistent EMR mode only) The default EC2 ARM64 instance types and weights to be used for the
# executor nodes of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.default.bulk.import.emr.executor.arm.instance.types=m6g.4xlarge
# (Non-persistent EMR mode only) The default purchasing option to be used for the executor nodes of
# the EMR cluster.
# Valid values are ON_DEMAND or SPOT.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.executor.market.type=SPOT
# (Non-persistent EMR mode only) The default initial number of capacity units to provision as EC2
# instances for executors in the EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.executor.initial.instances=2
# (Non-persistent EMR mode only) The default maximum number of capacity units to provision as EC2
# instances for executors in the EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.executor.max.instances=10
# (Persistent EMR mode only) The EMR release used to create the persistent EMR cluster.
sleeper.bulk.import.persistent.emr.release.label=emr-6.13.0
# (Persistent EMR mode only) Which architecture to be used for EC2 instance types in the EMR cluster.
# Must be either "x86_64" "arm64" or "x86_64,arm64". For more information, see the Bulk import using
# EMR - Instance types section in docs/05-ingest.md
sleeper.bulk.import.persistent.emr.instance.architecture=x86_64
# (Persistent EMR mode only) The EC2 x86_64 instance types and weights used for the master node of the
# persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.bulk.import.persistent.emr.master.x86.instance.types=m6i.xlarge
# (Persistent EMR mode only) The EC2 x86_64 instance types and weights used for the executor nodes of
# the persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.bulk.import.persistent.emr.executor.x86.instance.types=m6i.4xlarge
# (Persistent EMR mode only) The EC2 ARM64 instance types and weights used for the master node of the
# persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.bulk.import.persistent.emr.master.arm.instance.types=m6g.xlarge
# (Persistent EMR mode only) The EC2 ARM64 instance types and weights used for the executor nodes of
# the persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/05-ingest.md
sleeper.bulk.import.persistent.emr.executor.arm.instance.types=m6g.4xlarge
# (Persistent EMR mode only) Whether the persistent EMR cluster should use managed scaling or not.
sleeper.bulk.import.persistent.emr.use.managed.scaling=true
# (Persistent EMR mode only) The minimum number of capacity units to provision as EC2 instances for
# executors in the persistent EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# If managed scaling is not used then the cluster will be of fixed size, with a number of instances
# equal to this value.
sleeper.bulk.import.persistent.emr.min.capacity=1
# (Persistent EMR mode only) The maximum number of capacity units to provision as EC2 instances for
# executors in the persistent EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This value is only used if managed scaling is used.
sleeper.bulk.import.persistent.emr.max.capacity=10
# (Persistent EMR mode only) This controls the number of EMR steps that can run concurrently.
sleeper.bulk.import.persistent.emr.step.concurrency.level=2
# (EKS mode only) The name of the ECR repository where the Docker image for the bulk import container
# is stored.
sleeper.bulk.import.eks.repo=<insert-unique-sleeper-id>/bulk-import-runner
# (EKS mode only) Names of AWS IAM roles which should have access to administer the EKS cluster.
# sleeper.bulk.import.eks.cluster.admin.roles=
# (EKS mode only) Set to true if sleeper.bulk.import.eks.repo contains the image built with native
# Hadoop libraries. By default when deploying with the EKS stack enabled, an image will be built based
# on the official Spark Docker image, so this should be false.
sleeper.bulk.import.eks.is.native.libs.image=false
## The following properties relate to the splitting of partitions.
# The frequency in minutes with which the lambda that finds partitions that need splitting runs.
sleeper.partition.splitting.period.minutes=30
# When a partition needs splitting, a partition splitting job is created. This reads in the sketch
# files associated with the files in the partition in order to identify the median. This parameter
# controls the maximum number of files that are read in.
sleeper.partition.splitting.files.maximum=50
# The amount of memory in MB for the lambda function used to identify partitions that need to be
# split.
sleeper.partition.splitting.finder.memory=2048
# The timeout in seconds for the lambda function used to identify partitions that need to be split.
sleeper.partition.splitting.finder.timeout.seconds=900
# The memory for the lambda function used to split partitions.
sleeper.partition.splitting.memory=2048
# The timeout in seconds for the lambda function used to split partitions.
sleeper.partition.splitting.timeout.seconds=900
# This is the default value of the partition splitting threshold. Partitions with more than this
# number of records will be split. This value can be overridden on a per-table basis.
sleeper.default.partition.splitting.threshold=1000000000
## The following properties relate to garbage collection.
# The frequency in minutes with which the garbage collector lambda is run.
sleeper.gc.period.minutes=15
# The memory in MB for the lambda function used to perform garbage collection.
sleeper.gc.memory=1024
# The size of the batch of files ready for garbage collection requested from the State Store.
sleeper.gc.batch.size=2000
# A file will not be deleted until this number of minutes have passed after it has been marked as
# ready for garbage collection. The reason for not deleting files immediately after they have been
# marked as ready for garbage collection is that they may still be in use by queries. This property
# can be overridden on a per-table basis.
sleeper.default.gc.delay.minutes=15
## The following properties relate to compactions.
# The name of the repository for the compaction container. The Docker image from the
# compaction-job-execution module should have been uploaded to an ECR repository of this name in this
# account.
sleeper.compaction.repo=<insert-unique-sleeper-id>/compaction-job-execution
# The visibility timeout for the queue of compaction jobs.
sleeper.compaction.queue.visibility.timeout.seconds=900
# The frequency, in seconds, with which change message visibility requests are sent to extend the
# visibility of messages on the compaction job queue so that they are not processed by other
# processes.
# This should be less than the value of sleeper.compaction.queue.visibility.timeout.seconds.
sleeper.compaction.keepalive.period.seconds=300
# The rate at which the compaction job creation lambda runs (in minutes, must be >=1).
sleeper.compaction.job.creation.period.minutes=1
# The amount of memory for the lambda that creates compaction jobs.
sleeper.compaction.job.creation.memory=1024
# The timeout for the lambda that creates compaction jobs in seconds.
sleeper.compaction.job.creation.timeout.seconds=900
# The maximum number of concurrent compaction tasks to run.
sleeper.compaction.max.concurrent.tasks=300
# The rate at which a check to see if compaction ECS tasks need to be created is made (in minutes,
# must be >= 1).
sleeper.compaction.task.creation.period.minutes=1
# The CPU architecture to run compaction tasks on. Valid values are X86_64 and ARM64.
# See Task CPU architecture at
# https://docs.aws.amazon.com/AmazonECS/latest/developerguide/AWS_Fargate.html
sleeper.compaction.task.cpu.architecture=X86_64
# The CPU for a compaction task using an ARM64 architecture.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.compaction.task.arm.cpu=1024
# The memory for a compaction task using an ARM64 architecture.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.compaction.task.arm.memory=4096
# The CPU for a compaction task using an x86_64 architecture.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.compaction.task.x86.cpu=1024
# The memory for a compaction task using an x86_64 architecture.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.compaction.task.x86.memory=4096
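# As with the ingest task settings above, Fargate only accepts certain CPU/memory pairings: 1024
# CPU units (1 vCPU) supports task memory from 2048 to 8192 MB, so the 1024/4096 pairings above are
# valid.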
# What launch type should compaction containers use? Valid options: FARGATE, EC2.
sleeper.compaction.ecs.launch.type=FARGATE
# The EC2 instance type to use for compaction tasks (when using EC2-based compactions).
sleeper.compaction.ec2.type=t3.xlarge
# The minimum number of instances for the EC2 cluster (when using EC2-based compactions).
sleeper.compaction.ec2.pool.minimum=0
# The initial desired number of instances for the EC2 cluster (when using EC2-based compactions).
# This can be set by dividing the initial maximum number of containers by the number that should
# fit on the chosen instance type.
sleeper.compaction.ec2.pool.desired=0
# The maximum number of instances for the EC2 cluster (when using EC2-based compactions).
sleeper.compaction.ec2.pool.maximum=75
# The size in GiB of the root EBS volume attached to the EC2 instances (when using EC2-based
# compactions).
sleeper.compaction.ec2.root.size=50
# Flag to enable/disable storage of tracking information for compaction jobs and tasks.
sleeper.compaction.status.store.enabled=true
# The time to live in seconds for compaction job updates in the status store. Default is 1 week.
# The expiry time is fixed when an update is saved to the store, so changing this will only affect new
# data.
sleeper.compaction.job.status.ttl=604800
# The time to live in seconds for compaction task updates in the status store. Default is 1 week.
# The expiry time is fixed when an update is saved to the store, so changing this will only affect new
# data.
sleeper.compaction.task.status.ttl=604800
# The name of the class that defines how compaction jobs should be created. This should implement
# sleeper.compaction.strategy.CompactionStrategy. The value of this property is the default value
# which can be overridden on a per-table basis.
sleeper.default.compaction.strategy.class=sleeper.compaction.strategy.impl.SizeRatioCompactionStrategy
# The minimum number of files to read in a compaction job. Note that the state store must support
# atomic updates for this many files. For the DynamoDBStateStore this is 11. It can be overridden on a
# per-table basis.
# (NB This does not apply to splitting jobs which will run even if there is only 1 file.)
# This is a default value and will be used if not specified in the table.properties file.
sleeper.default.compaction.files.batch.size=11
# Used by the SizeRatioCompactionStrategy to decide if a group of files should be compacted.
# If the file sizes are s_1, ..., s_n then the files are compacted if s_1 + ... + s_{n-1} >= ratio *
# s_n.
# It can be overridden on a per-table basis.
sleeper.default.table.compaction.strategy.sizeratio.ratio=3
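# As a worked example (illustrative sizes only): with the default ratio of 3, five 100MB files are
# compacted, since 100 + 100 + 100 + 100 = 400 >= 3 * 100 = 300; three 100MB files plus a 900MB file
# are not, since 100 + 100 + 100 = 300 < 3 * 900 = 2700.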
# Used by the SizeRatioCompactionStrategy to control the maximum number of jobs that can be running
# concurrently per partition. It can be overridden on a per-table basis.
sleeper.default.table.compaction.strategy.sizeratio.max.concurrent.jobs.per.partition=100000
## The following properties relate to queries.
# The maximum number of simultaneous connections to S3 from a single query runner. This is separated
# from the main one as it's common for a query runner to need to open more files at once.
sleeper.query.s3.max-connections=1024
# The amount of memory in MB for the lambda that executes queries.
sleeper.query.processor.memory=2048
# The timeout for the lambda that executes queries in seconds.
sleeper.query.processor.timeout.seconds=900
# The frequency with which the query processing lambda refreshes its knowledge of the system state
# (i.e. the partitions and the mapping from partition to files), in seconds.
sleeper.query.processor.state.refresh.period.seconds=60
# The maximum number of records to include in a batch of query results sent to the results queue from
# the query processing lambda.
sleeper.query.processor.results.batch.size=2000
# The size of the thread pool for retrieving records in a query processing lambda.
sleeper.query.processor.record.retrieval.threads=10
# This value is used to set the time-to-live on the tracking of the queries in the DynamoDB-based
# query tracker.
sleeper.query.tracker.ttl.days=1
# The length of time the results of queries remain in the query results bucket before being deleted.
sleeper.query.results.bucket.expiry.days=7
# The default value of the rowgroup size used when the results of queries are written to Parquet
# files. The value given below is 8MiB. This value can be overridden using the query config.
sleeper.default.query.results.rowgroup.size=8388608
# The default value of the page size used when the results of queries are written to Parquet files.
# The value given below is 128KiB. This value can be overridden using the query config.
sleeper.default.query.results.page.size=131072
## The following properties relate to the dashboard.
# The period in minutes used in the dashboard.
sleeper.dashboard.time.window.minutes=5
## The following properties relate to logging.
# The logging level for logging Sleeper classes. This does not apply to the MetricsLogger which is
# always set to INFO.
sleeper.logging.level=INFO
# The logging level for Apache logs that are not Parquet.
sleeper.logging.apache.level=INFO
# The logging level for Parquet logs.
sleeper.logging.parquet.level=WARN
# The logging level for AWS logs.
sleeper.logging.aws.level=INFO
# The logging level for everything else.
sleeper.logging.root.level=INFO
## The following properties relate to the integration with Athena.
# The number of days before objects in the spill bucket are deleted.
sleeper.athena.spill.bucket.ageoff.days=1
# The fully qualified composite classes to deploy. These are the classes that interact with Athena.
# You can choose to remove one if you don't need them. Both are deployed by default.
sleeper.athena.handler.classes=sleeper.athena.composite.SimpleCompositeHandler,sleeper.athena.composite.IteratorApplyingCompositeHandler
# The amount of memory in MB for the Athena composite handler.
sleeper.athena.handler.memory=4096
# The timeout in seconds for the Athena composite handler.
sleeper.athena.handler.timeout.seconds=900
## The following properties relate to default values used by table properties.
# The readahead range set on the Hadoop configuration when reading Parquet files in a query
# (see https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html).
sleeper.default.fs.s3a.readahead.range=64K
# The size of the row group in the Parquet files (default is 8MiB).
sleeper.default.rowgroup.size=8388608
# The size of the pages in the Parquet files (default is 128KiB).
sleeper.default.page.size=131072
# The compression codec to use in the Parquet files.
# Valid values are: [uncompressed, snappy, gzip, lzo, brotli, lz4, zstd]
sleeper.default.compression.codec=zstd
# Whether dictionary encoding should be used for row key columns in the Parquet files.
sleeper.default.parquet.dictionary.encoding.rowkey.fields=false
# Whether dictionary encoding should be used for sort key columns in the Parquet files.
sleeper.default.parquet.dictionary.encoding.sortkey.fields=false
# Whether dictionary encoding should be used for value columns in the Parquet files.
sleeper.default.parquet.dictionary.encoding.value.fields=false
# Used to set parquet.columnindex.truncate.length, see documentation here:
# https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md
# The length in bytes to truncate binary values in a column index.
sleeper.default.parquet.columnindex.truncate.length=128
# Used to set parquet.statistics.truncate.length, see documentation here:
# https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md
# The length in bytes to truncate the min/max binary values in row groups.
sleeper.default.parquet.statistics.truncate.length=2147483647
# This specifies whether queries and scans against DynamoDB tables used in the DynamoDB state store
# are strongly consistent. This default can be overridden by a table property.
sleeper.default.table.dynamo.strongly.consistent.reads=false
# Specifies the minimum number of leaf partitions that are needed to run a bulk import job. If this
# minimum has not been reached, bulk import jobs will refuse to start.
sleeper.default.bulk.import.min.leaf.partitions=64
# Specifies the minimum total file size required for an ingest job to be batched and sent. An ingest
# job will be created if the batcher runs while this much data is waiting, and the minimum number of
# files is also met.
sleeper.default.ingest.batcher.job.min.size=1G
# Specifies the maximum total file size for a job in the ingest batcher. If more data is waiting than
# this, it will be split into multiple jobs. If a single file exceeds this, it will still be ingested
# in its own job. It's also possible some data may be left for a future run of the batcher if some
# recent files overflow the size of a job but aren't enough to create a job on their own.
sleeper.default.ingest.batcher.job.max.size=5G
# Specifies the minimum number of files for a job in the ingest batcher. An ingest job will be created
# if the batcher runs while this many files are waiting, and the minimum size of files is also met.
sleeper.default.ingest.batcher.job.min.files=1
# Specifies the maximum number of files for a job in the ingest batcher. If more files are waiting
# than this, they will be split into multiple jobs. It's possible some data may be left for a future
# run of the batcher if some recent files overflow the size of a job but aren't enough to create a job
# on their own.
sleeper.default.ingest.batcher.job.max.files=100
# Specifies the maximum time in seconds that a file can be held in the batcher before it will be
# included in an ingest job. When any file has been waiting for longer than this, jobs will be created
# for all the currently held files, even if other criteria for a batch are not met.
sleeper.default.ingest.batcher.file.max.age.seconds=300
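# As an illustrative example using the defaults above: a single 1.2G file meets both the minimum
# size (1G) and the minimum file count (1), so a job is created for it on the next run of the
# batcher; three 100MB files fall short of the 1G minimum, so they wait until one of them has been
# held for 300 seconds, at which point a job is created for all of the held files.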
# Specifies the target ingest queue where batched jobs are sent.
# Valid values are: [standard_ingest, bulk_import_emr, bulk_import_persistent_emr, bulk_import_eks,
# bulk_import_emr_serverless]
sleeper.default.ingest.batcher.ingest.mode=standard_ingest
# The time in minutes that the tracking information is retained for a file before the records of its
# ingest are deleted (e.g. which ingest job it was assigned to, the time this occurred, the size of the
# file).
# The expiry time is fixed when a file is saved to the store, so changing this will only affect new
# data.
# Defaults to 1 week.
sleeper.default.ingest.batcher.file.tracking.ttl.minutes=10080