Skip to content

Commit

Permalink
pkg/util/metric: increase bucket counts for Prometheus histograms
Browse files Browse the repository at this point in the history
This patch increases the fidelity of the histogram buckets for
the new Prometheus model. This is primarily done by increasing the
bucket counts for all latency buckets; individual bucket sets may
also be manually tweaked according to feedback from various
engineering teams for their own use cases.

Release note (ops change): Prometheus histograms will now export
more buckets across the board to improve precision & fidelity of
information reported by histogram metrics, such as quantiles.
This will lead to an increase in storage requirements to process
these histogram metrics in downstream systems like Prometheus,
but should still be a marked improvement when compared to the
legacy HdrHistogram model. If users have issues with the precision
of these bucket boundaries, they can set the environment variable
`COCKROACH_ENABLE_HDR_HISTOGRAMS=true` to revert to the legacy
HdrHistogram model. Reverting is not otherwise recommended, because
HdrHistogram strains systems like Prometheus with an excessive
number of histogram buckets. Note that HdrHistograms are slated for
full deprecation in upcoming releases.
  • Loading branch information
abarganier committed Feb 2, 2023
1 parent a28aa6c commit 4b32a98
Show file tree
Hide file tree
Showing 2 changed files with 241 additions and 61 deletions.
294 changes: 237 additions & 57 deletions pkg/util/metric/histogram_buckets.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,65 @@ package metric
var IOLatencyBuckets = []float64{
// Generated via TestHistogramBuckets/IOLatencyBuckets.
10000.000000, // 10µs
26826.957953, // 26.826µs
71968.567300, // 71.968µs
193069.772888, // 193.069µs
517947.467923, // 517.947µs
1389495.494373, // 1.389495ms
3727593.720315, // 3.727593ms
10000000.000000, // 9.999999ms
26826957.952797, // 26.826957ms
71968567.300115, // 71.968567ms
193069772.888325, // 193.069772ms
517947467.923120, // 517.947467ms
1389495494.373135, // 1.389495494s
3727593720.314933, // 3.72759372s
9999999999.999981, // 9.999999999s
12638.482029, // 12.638µs
15973.122801, // 15.973µs
20187.602547, // 20.187µs
25514.065200, // 25.514µs
32245.905453, // 32.245µs
40753.929659, // 40.753µs
51506.780762, // 51.506µs
65096.752305, // 65.096µs
82272.413417, // 82.272µs
103979.841848, // 103.979µs
131414.736261, // 131.414µs
166088.278263, // 166.088µs
209910.372011, // 209.91µs
265294.846443, // 265.294µs
335292.414925, // 335.292µs
423758.716060, // 423.758µs
535566.691771, // 535.566µs
676875.000946, // 676.875µs
855467.253557, // 855.467µs
1081180.751077, // 1.08118ms
1366448.349295, // 1.366448ms
1726983.290659, // 1.726983ms
2182644.728397, // 2.182644ms
2758531.617629, // 2.758531ms
3486365.227678, // 3.486365ms
4406236.427774, // 4.406236ms
5568813.990945, // 5.568813ms
7038135.554932, // 7.038135ms
8895134.973108, // 8.895134ms
11242100.350621, // 11.2421ms
14208308.325339, // 14.208308ms
17957144.943716, // 17.957144ms
22695105.366947, // 22.695105ms
28683168.133420, // 28.683168ms
36251170.499885, // 36.25117ms
45815976.690545, // 45.815976ms
57904439.806025, // 57.904439ms
73182422.190762, // 73.182422ms
92491472.772173, // 92.491472ms
116895181.649858, // 116.895181ms
147737765.259851, // 147.737765ms
186718109.129192, // 186.718109ms
235983346.678219, // 235.983346ms
298247128.621688, // 298.247128ms
376939097.538835, // 376.939097ms
476393801.040133, // 476.393801ms
602089449.333611, // 602.089449ms
760949668.545986, // 760.949668ms
961724871.115294, // 961.724871ms
1215474250.076283, // 1.21547425s
1536174946.671824, // 1.536174946s
1941491945.743876, // 1.941491945s
2453751106.639811, // 2.453751106s
3101168926.574770, // 3.101168926s
3919406774.847209, // 3.919406774s
4953535208.959157, // 4.953535208s
6260516572.014802, // 6.260516572s
7912342618.981298, // 7.912342618s
9999999999.999969, // 9.999999999s
}

// NetworkLatencyBuckets are prometheus histogram buckets suitable for a histogram
Expand All @@ -39,21 +84,66 @@ var IOLatencyBuckets = []float64{
// range during normal operation.
var NetworkLatencyBuckets = []float64{
// Generated via TestHistogramBuckets/NetworkLatencyBuckets.
500000.000000, // 500µs
860513.842995, // 860.513µs
1480968.147973, // 1.480968ms
2548787.184731, // 2.548787ms
4386533.310619, // 4.386533ms
7549345.273094, // 7.549345ms
12992632.226094, // 12.992632ms
22360679.774998, // 22.360679ms
38483348.970335, // 38.483348ms
66230909.027573, // 66.230909ms
113985228.104760, // 113.985228ms
196171733.362212, // 196.171733ms
337616984.325077, // 337.616984ms
581048177.284016, // 581.048177ms
999999999.999999, // 999.999999ms
500000.000000, // 500µs
568747.715565, // 568.747µs
646947.927922, // 646.947µs
735900.312190, // 735.9µs
837083.242884, // 837.083µs
952178.364257, // 952.178µs
1083098.538963, // 1.083098ms
1232019.639535, // 1.232019ms
1401416.711034, // 1.401416ms
1594105.105912, // 1.594105ms
1813287.274717, // 1.813287ms
2062605.990318, // 2.062605ms
2346204.890209, // 2.346204ms
2668797.343109, // 2.668797ms
3035744.784401, // 3.035744ms
3453145.822334, // 3.453145ms
3927937.595933, // 3.927937ms
4468011.069141, // 4.468011ms
5082342.177389, // 5.082342ms
5781141.006222, // 5.781141ms
6576021.481300, // 6.576021ms
7480194.389996, // 7.480194ms
8508686.942589, // 8.508686ms
9678592.522117, // 9.678592ms
11009354.773683, // 11.009354ms
12523090.754761, // 12.52309ms
14244958.517175, // 14.244958ms
16203575.229933, // 16.203575ms
18431492.792031, // 18.431492ms
20965738.839853, // 20.965738ms
23848432.140611, // 23.848432ms
27127482.599575, // 27.127482ms
30857387.515093, // 30.857387ms
35100137.315047, // 35.100137ms
39926245.827925, // 39.926245ms
45415922.211464, // 45.415922ms
51660404.016126, // 51.660404ms
58763473.538708, // 58.763473ms
66843182.667648, // 66.843182ms
76033814.886682, // 76.033814ms
86488117.045035, // 86.488117ms
98379837.985822, // 98.379837ms
111906616.224248, // 111.906616ms
127293264.668375, // 127.293264ms
144795506.973983, // 144.795506ms
164704227.631154, // 164.704227ms
187350306.418342, // 187.350306ms
213110117.571795, // 213.110117ms
242411785.065635, // 242.411785ms
275742297.964389, // 275.742297ms
313655604.103963, // 313.655604ms
356781816.616787, // 356.781816ms
405837686.312094, // 405.837686ms
461638513.960647, // 461.638513ms
525111700.464186, // 525.1117ms
597312160.111267, // 597.31216ms
679439853.085354, // 679.439853ms
772859728.612681, // 772.859728ms
879124410.201811, // 879.12441ms
1000000000.000001, // 1s
}

// BatchProcessLatencyBuckets are prometheus histogram buckets suitable for a
Expand All @@ -62,20 +152,65 @@ var NetworkLatencyBuckets = []float64{
var BatchProcessLatencyBuckets = []float64{
// Generated via TestHistogramBuckets/BatchProcessLatencyBuckets.
500000000.000000, // 500ms
789604072.059876, // 789.604072ms
1246949181.227077, // 1.246949181s
1969192302.297256, // 1.969192302s
3109764521.125753, // 3.109764521s
4910965458.056452, // 4.910965458s
7755436646.853539, // 7.755436646s
12247448713.915894, // 12.247448713s
19341270753.704967, // 19.341270753s
30543892291.876068, // 30.543892291s
48235163460.447227, // 48.23516346s
76173362969.685760, // 1m16.173362969s
120293595166.717728, // 2m0.293595166s
189968625172.725128, // 3m9.968625172s
300000000000.000183, // 5m0s
557259285.358743, // 557.259285ms
621075822.237074, // 621.075822ms
692200537.706851, // 692.200537ms
771470353.934916, // 771.470353ms
859818036.218456, // 859.818036ms
958283168.803309, // 958.283168ms
1068024387.637287, // 1.068024387s
1190333014.000928, // 1.190333014s
1326648249.442152, // 1.326648249s
1478574110.813123, // 1.47857411s
1647898304.683320, // 1.647898304s
1836613263.223422, // 1.836613263s
2046939589.088547, // 2.046939589s
2281352185.176006, // 2.281352185s
2542609376.725576, // 2.542609376s
2833785368.441068, // 2.833785368s
3158306418.555065, // 3.158306418s
3519991155.495853, // 3.519991155s
3923095511.561431, // 3.923095511s
4372362802.333632, // 4.372362802s
4873079541.115184, // 4.873079541s
5431137645.156319, // 5.431137645s
6053103765.649553, // 6.053103765s
6746296557.296375, // 6.746296557s
7518872796.674253, // 7.518872796s
8379923362.755980, // 8.379923362s
9339580208.980864, // 9.339580208s
10409135585.614676, // 10.409135585s
11601174915.283792, // 11.601174915s
12929724885.225649, // 12.929724885s
14410418498.852003, // 14.410418498s
16060679028.781363, // 16.060679028s
17899925035.909710, // 17.899925035s
19949798866.972237, // 19.949798866s
22234421319.319225, // 22.234421319s
24780675469.538071, // 24.780675469s
27618523005.723442, // 27.618523005s
30781356785.666904, // 30.781356785s
34306393769.506477, // 34.306393769s
38235112950.461639, // 38.23511295s
42613743436.770157, // 42.613743436s
47493808428.070732, // 47.493808428s
52932731487.183495, // 52.932731487s
58994512241.268242, // 58.994512241s
65750479463.313522, // 1m5.750479463s
73280130395.441635, // 1m13.280130395s
81672066190.318619, // 1m21.67206619s
91025034477.977859, // 1m31.025034477s
101449091325.905777, // 1m41.449091325s
113066896265.136261, // 1m53.066896265s
126015155620.881943, // 2m6.01515562s
140446231131.326965, // 2m20.446231131s
156529932783.144257, // 2m36.529932783s
174455516959.974152, // 2m54.455516959s
194433913416.010529, // 3m14.433913416s
216700207279.419586, // 3m36.700207279s
241516405291.241699, // 4m1.516405291s
269174518830.019897, // 4m29.17451883s
300000000000.000854, // 5m0s
}

// LongRunning60mLatencyBuckets are prometheus histogram buckets suitable
Expand All @@ -84,20 +219,65 @@ var BatchProcessLatencyBuckets = []float64{
var LongRunning60mLatencyBuckets = []float64{
// Generated via TestHistogramBuckets/LongRunning60mLatencyBuckets.
500000000.000000, // 500ms
942961049.923126, // 942.961049ms
1778351083.344248, // 1.778351083s
3353831609.364442, // 3.353831609s
6325065151.263324, // 6.325065151s
11928580151.734879, // 11.928580151s
22496372927.944168, // 22.496372927s
42426406871.192848, // 42.426406871s
80012898335.451462, // 1m20.012898335s
150898093243.579315, // 2m30.898093243s
284582048872.726685, // 4m44.582048872s
536699575188.601318, // 8m56.699575188s
1012173589826.278687, // 16m52.173589826s
1908880541934.094238, // 31m48.880541934s
3599999999999.998535, // 59m59.999999999s
581230667.894489, // 581.230667ms
675658178.602148, // 675.658178ms
785426508.834601, // 785.426508ms
913027948.623944, // 913.027948ms
1061359688.770060, // 1.061359688s
1233789601.560218, // 1.233789601s
1434232708.312242, // 1.434232708s
1667240069.936893, // 1.667240069s
1938102118.779750, // 1.938102118s
2252968777.892157, // 2.252968777s
2618989095.039379, // 2.618989095s
3044473561.836243, // 3.044473561s
3539082803.466387, // 3.539082803s
4114046923.185338, // 4.114046923s
4782420481.824564, // 4.782420481s
5559378901.606352, // 5.559378901s
6462563024.118382, // 6.462563024s
7512479645.637113, // 7.512479645s
8732967123.954826, // 8.732967123s
10151736628.313759, // 10.151736628s
11801001321.527510, // 11.801001321s
13718207759.870365, // 13.718207759s
15946886117.169632, // 15.946886117s
18537638537.439724, // 18.537638537s
21549288056.605419, // 21.549288056s
25050214179.583008, // 25.050214179s
29119905436.998066, // 29.119905436s
33850764172.341507, // 33.850764172s
39350204537.257782, // 39.350204537s
45743091329.950188, // 45.743091329s
53174575050.531136, // 53.17457505s
61813387543.251701, // 1m1.813387543s
71855673053.170151, // 1m11.855673053s
83529441681.404266, // 1m23.529441681s
97099746354.672745, // 1m37.099746354s
112874700852.223846, // 1m52.874700852s
131212475529.457443, // 2m11.212475529s
152529429576.151703, // 2m32.529429576s
177309564452.224213, // 2m57.309564452s
206115513141.294464, // 3m26.115513141s
239601314733.059875, // 3m59.601314733s
278527264381.388123, // 4m38.527264381s
323777175806.438293, // 5m23.777175806s
376378448285.935181, // 6m16.378448285s
437525393756.650940, // 7m17.525393756s
508606353667.955078, // 8m28.606353667s
591235221275.612671, // 9m51.235221275s
687288085089.540771, // 11m27.288085089s
798945825465.036499, // 13m18.945825465s
928743631493.114136, // 15m28.743631493s
1079628562470.991943, // 17m59.62856247s
1255026460885.963623, // 20m55.026460885s
1458919736172.010742, // 24m18.919736172s
1695937785319.419434, // 28m15.937785319s
1971462103337.413574, // 32m51.462103337s
2291748470102.958496, // 38m11.748470102s
2664068987848.231934, // 44m24.068987848s
3096877194248.046875, // 51m36.877194248s
3600000000000.007812, // 1h0m0s
}

// Count1KBuckets are prometheus histogram buckets suitable for a histogram that
Expand Down
8 changes: 4 additions & 4 deletions pkg/util/metric/histogram_buckets_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,22 +48,22 @@ func TestHistogramBuckets(t *testing.T) {
require.InDeltaSlice(t, exp, act, 1 /* delta */, "Please update the bucket boundaries for %s", t.Name())
}
t.Run("IOLatencyBuckets", func(t *testing.T) {
exp := prometheus.ExponentialBucketsRange(10e3, 10e9, 15)
exp := prometheus.ExponentialBucketsRange(10e3, 10e9, 60)
verifyAndPrint(t, exp, IOLatencyBuckets, LATENCY)
})

t.Run("NetworkLatencyBuckets", func(t *testing.T) {
exp := prometheus.ExponentialBucketsRange(500e3, 1e9, 15)
exp := prometheus.ExponentialBucketsRange(500e3, 1e9, 60)
verifyAndPrint(t, exp, NetworkLatencyBuckets, LATENCY)
})

t.Run("BatchProcessLatencyBuckets", func(t *testing.T) {
exp := prometheus.ExponentialBucketsRange(500e6, 300e9, 15)
exp := prometheus.ExponentialBucketsRange(500e6, 300e9, 60)
verifyAndPrint(t, exp, BatchProcessLatencyBuckets, LATENCY)
})

t.Run("LongRunning60mLatencyBuckets", func(t *testing.T) {
exp := prometheus.ExponentialBucketsRange(500e6, 3600e9, 15)
exp := prometheus.ExponentialBucketsRange(500e6, 3600e9, 60)
verifyAndPrint(t, exp, LongRunning60mLatencyBuckets, LATENCY)
})

Expand Down

0 comments on commit 4b32a98

Please sign in to comment.