rmorgan_cluster_balance.xml
<form>
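<!-- Cluster balance and stability dashboard: compares the indexers (host=idx*) of an
indexer cluster against each other using _introspection hostwide data, splunkd
CacheManager / HotBucketRoller events and remote_searches.log search activity.
Most panels post-process the single base search (id="base") defined in the hidden
panel at the bottom of this dashboard. -->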
<label>Cluster balance and stability</label>
<init>
<set token="index_splunkd">index=_internal sourcetype=splunkd</set>
<set token="index_introspection">index=_introspection</set>
<set token="index_metrics">index=_internal sourcetype=splunkd</set>
<set token="index_remote">index=_internal source=*remote_searches.log*</set>
</init>
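<!-- The index_* tokens above centralise where each data source lives, so the base
search below can be repointed in one place if these indexes or sourcetypes differ
in your environment. -->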
<fieldset submitButton="false">
<input type="time" token="time">
<label>Time range</label>
<default>
<earliest>-24h@h</earliest>
<latest>now</latest>
</default>
</input>
<input type="dropdown" token="selected_resolution">
<label>Chart resolution</label>
<choice value="100">low (100)</choice>
<choice value="250">medium (250)</choice>
<choice value="500">high (500)</choice>
<choice value="1000">break my browser (1000)</choice>
<default>250</default>
<initialValue>250</initialValue>
</input>
<input type="text" token="selected_limit">
<label>Select limit for charts</label>
<default>25</default>
<prefix>limit=</prefix>
</input>
</fieldset>
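<!-- Hidden (depends="$debug$") row: derives the span tokens from the selected time range
and chart resolution so each chart renders roughly $selected_resolution$ buckets,
with per-chart minimum spans of 1s / 10s / 31s / 60s / 1h. -->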
<row>
<panel depends="$debug$">
<title></title>
<input type="text" token="selected_span_1s" searchWhenChanged="true">
<label>selected_span_1s</label>
<default>1</default>
<prefix>span=</prefix>
<suffix>s</suffix>
</input>
<input type="text" token="selected_span_10s">
<label>selected_span_10s</label>
<default>10</default>
<prefix>span=</prefix>
<suffix>s</suffix>
</input>
<input type="text" token="selected_span_31s">
<label>selected_span_31s</label>
<default>31</default>
<prefix>span=</prefix>
<suffix>s</suffix>
</input>
<input type="text" token="selected_span_60s">
<label>selected_span_60s</label>
<default>60</default>
<prefix>span=</prefix>
<suffix>s</suffix>
</input>
<input type="text" token="selected_span_1hr">
<label>selected_span_1hr</label>
<default>3600</default>
<prefix>span=</prefix>
<suffix>s</suffix>
</input>
</panel>
<panel depends="$debug$">
<title>Compute timespan</title>
<table>
<search>
<done>
<set token="form.selected_span_1s">$result.span_1s$</set>
<set token="form.selected_span_10s">$result.span_10s$</set>
<set token="form.selected_span_31s">$result.span_31s$</set>
<set token="form.selected_span_60s">$result.span_60s$</set>
<set token="form.selected_span_1hr">$result.span_1hr$</set>
</done>
<query>| makeresults
| addinfo
| eval
duration_seconds=info_max_time-info_min_time,
resolution=$selected_resolution$,
span_1s=if(duration_seconds/resolution<1,1,round(duration_seconds/resolution,0)),
span_10s=if(duration_seconds/resolution<10,10,round(duration_seconds/resolution,0)),
span_31s=if(duration_seconds/resolution<31,31,round(duration_seconds/resolution,0)),
span_60s=if(duration_seconds/resolution<60,60,round(duration_seconds/resolution,0)),
span_1hr=if(duration_seconds/resolution<(60*60),(60*60),round(duration_seconds/resolution,0))
| fields span_*
| fields - _time</query>
<earliest>$time.earliest$</earliest>
<latest>$time.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
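<!-- Balance overview: for every KPI produced by the base search, plot the coefficient of
variation (stdev/avg) across the indexers. Values near 0 mean the cluster is behaving
uniformly; sustained spikes highlight KPIs where indexers diverge. Clicking a series
sets $selected_kpi$ for the drilldown panels below. -->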
<row>
<panel>
<chart>
<title>Normalised stdev for each KPI</title>
<search base="base">
<query>| fields - age_days launch* virtual*
| fillnull value=0
| timechart $selected_span_31s$
avg(*) as *_avg
stdev(*) as *_stdev
| foreach *_avg
[| eval <<MATCHSTR>>_normalized=round(<<MATCHSTR>>_stdev/<<MATCHSTR>>_avg,2)
]
| table _time *_normalized
| rename *_normalized as *, churn_* as *</query>
</search>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">line</option>
<option name="charting.chart.overlayFields">stdev(churn_downloaded_gb)</option>
<option name="charting.drilldown">all</option>
<option name="height">458</option>
<option name="refresh.display">progressbar</option>
<drilldown>
<set token="form.selected_kpi">$click.name2$</set>
</drilldown>
</chart>
</panel>
</row>
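<!-- Per-indexer anomaly score: for each timeslice and KPI, an indexer scores 1 when it
deviates from the cluster average by more than $selected_anomaly_threshold$ standard
deviations. Scores are summed per host, normalised by the number of timeslices and
stacked, with instance age (age_days) overlaid. Clicking toggles the host in
$selected_hosts$ and selects the KPI. -->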
<row>
<panel>
<input type="text" token="selected_anomaly_threshold">
<label>Threshold for anomaly</label>
<default>1.5</default>
</input>
<chart>
<title>Indexers by their delta from average</title>
<search base="base">
<query>| fillnull value="?" age_days
| eventstats
stdev(*) as stack_*_stdev
avg(*) as stack_*_avg
by _time
| rename age_days as _age_days
| foreach stack_*_stdev
[| eval
"stack_<<MATCHSTR>>_stdev"=coalesce('stack_<<MATCHSTR>>_stdev',0),
"stack_<<MATCHSTR>>_avg"=coalesce('stack_<<MATCHSTR>>_avg',0),
"<<MATCHSTR>>_anomoly_score"=if(abs('stack_<<MATCHSTR>>_avg'-'<<MATCHSTR>>')/'stack_<<MATCHSTR>>_stdev'>$selected_anomaly_threshold$,1,0)]
| stats
min(_time) as min_time
max(_time) as max_time
sum(*_anomaly_score) as a_*
by host _age_days
| rename _age_days as age_days, churn_* as *
| eval duration_seconds=(max_time-min_time)/$form.selected_span_31s$
| foreach a_*
[| eval <<MATCHSTR>>=<<FIELD>>/duration_seconds
]
| fields - m*_time a_* duration_seconds
| sort - age_days</query>
</search>
<option name="charting.axisY2.enabled">1</option>
<option name="charting.axisY2.scale">linear</option>
<option name="charting.chart">column</option>
<option name="charting.chart.overlayFields">age_days</option>
<option name="charting.chart.stackMode">stacked</option>
<option name="charting.drilldown">all</option>
<option name="height">324</option>
<option name="refresh.display">progressbar</option>
<drilldown>
<eval token="host_index">mvfind($form.selected_hosts$,$click.value$)</eval>
<eval token="form.selected_hosts">if(isnull($host_index$), mvdedup(mvappend($form.selected_hosts$,$click.value$)),mvappend(mvindex($form.selected_hosts$,0,$host_index$-1),mvindex($form.selected_hosts$,$host_index$+1,mvcount($form.selected_hosts$))))</eval>
<eval token="form.selected_kpi">$click.name2$</eval>
</drilldown>
</chart>
</panel>
</row>
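<!-- Cluster membership over time: stacked area of event counts per host, sorted by when
each host first reported, which makes scale-up, scale-down and restarts visible.
Clicking toggles the host in $selected_hosts$. -->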
<row>
<panel>
<title>Count of hosts in the cluster</title>
<chart>
<search base="base">
<query>| eventstats
min(_time) as min_time
max(_time) as max_time
by host
| eval
duration_seconds=max_time-min_time
| sort + min_time
| bin _time $selected_span_31s$
| chart limit=50 count by _time host
| foreach *
[| eval <<FIELD>>=if(<<FIELD>>=0,null(),<<FIELD>>) ]</query>
</search>
<option name="charting.chart">area</option>
<option name="charting.chart.stackMode">stacked</option>
<option name="charting.drilldown">all</option>
<option name="charting.layout.splitSeries">0</option>
<option name="height">303</option>
<option name="refresh.display">progressbar</option>
<drilldown>
<eval token="host_index">mvfind($form.selected_hosts$,$click.name2$)</eval>
<eval token="form.selected_hosts">if(isnull($host_index$), mvdedup(mvappend($form.selected_hosts$,$click.name2$)),mvappend(mvindex($form.selected_hosts$,0,$host_index$-1),mvindex($form.selected_hosts$,$host_index$+1,mvcount($form.selected_hosts$))))</eval>
</drilldown>
</chart>
</panel>
</row>
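<!-- KPI drilldown: plots $selected_kpi$ for the hosts chosen in $selected_hosts$,
optionally overlaid with the cluster-wide average, sum, stdev or p10 of the same KPI. -->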
<row>
<panel>
<title>$debug_value$</title>
<input type="multiselect" token="selected_hosts">
<label>Select hosts</label>
<fieldForLabel>host</fieldForLabel>
<fieldForValue>host</fieldForValue>
<search base="base">
<query>| stats count by host</query>
</search>
<prefix>host IN (</prefix>
<suffix>)</suffix>
<delimiter>, </delimiter>
<change>
<condition match="mvcount('form.selected_hosts')!=0">
<set token="debug_value">$value$</set>
</condition>
<condition>
<set token="selected_hosts">host=*</set>
</condition>
</change>
</input>
<input type="dropdown" token="selected_kpi">
<label>Select KPI</label>
<fieldForLabel>column</fieldForLabel>
<fieldForValue>column</fieldForValue>
<search base="base">
<query>| head 1
| fields - _time age_days instance_type host label launch_time_epoch report stack
| transpose
| fields + column</query>
</search>
<default>cores_p99_9</default>
</input>
<input type="multiselect" token="selected_overlay">
<label>Show overlay</label>
<choice value="_stack_average as stack_average">Average</choice>
<choice value="_stack_sum as stack_sum">Sum</choice>
<choice value="_stack_stdev as stack_stdev">Stdev</choice>
<choice value="_stack_p10 as stack_p10">p10</choice>
<prefix>| rename </prefix>
<delimiter>, </delimiter>
</input>
<chart>
<title>Selected indexers vs the average</title>
<search base="base">
<query>| eval at_{host}=if(searchmatch("$selected_hosts$"),'$selected_kpi$',null())
| timechart $selected_span_31s$
avg(at_*) as *
avg($selected_kpi$) as _stack_average
sum($selected_kpi$) as _stack_sum
stdev($selected_kpi$) as _stack_stdev
p10($selected_kpi$) as _stack_p10
| foreach *
[| eval "<<FIELD>>"=if('<<FIELD>>'=0,null(),'<<FIELD>>')
]
$selected_overlay$</query>
</search>
<option name="charting.chart">line</option>
<option name="charting.chart.overlayFields">stack_average</option>
<option name="charting.drilldown">all</option>
<option name="refresh.display">progressbar</option>
<drilldown>
<set token="form.selected_host">$click.name2$</set>
</drilldown>
</chart>
</panel>
</row>
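<!-- Host drilldown: single value linking to the per-host IO dashboard in the coresearch
app, carrying the selected host and its observed time range. -->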
<row>
<panel>
<input type="text" token="selected_host">
<label>Host</label>
</input>
<single>
<search base="base">
<query>| where host="$selected_host$"
| stats
min(_time) as min_time
max(_time) as max_time
by host
| eval label="Drill down for host performance data"
| table label *</query>
</search>
<option name="refresh.display">progressbar</option>
<drilldown>
<link target="_blank">/app/coresearch/io_information_for_host_iostats_iowait_iotop?form.selected_host=$row.host$&form.time.earliest=$row.min_time$&form.time.latest=$row.max_time$&form.selected_</link>
</drilldown>
</single>
</panel>
</row>
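<!-- Base search (id="base"): only rendered when $debug$ is set. A series of
tstats prestats=true append=true blocks collects hostwide introspection metrics,
CacheManager eviction/download/upload activity, hot-to-warm bucket rolls and
remote search concurrency/cache statistics, tags each block with a "report" name,
then a single stats + eval pass pivots the shared count/elapsed_ms/kb fields into
per-report KPIs for every host and timeslice. -->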
<row>
<panel depends="$debug$">
<title>base churn</title>
<table>
<title>base</title>
<search id="base">
<query>| tstats prestats=true
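``` hostwide introspection - CPU, load average and memory per indexer ```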
avg(data.cpu_idle_pct) as cpu_avg_idle
p25(data.cpu_idle_pct) as cpu_p75_idle
p10(data.cpu_idle_pct) as cpu_p90_idle
p05(data.cpu_idle_pct) as cpu_p95_idle
p01(data.cpu_idle_pct) as cpu_p99_idle
p00.5(data.cpu_idle_pct) as cpu_p99_1_idle
p00.1(data.cpu_idle_pct) as cpu_p99_9_idle
latest(data.splunk_version) as splunk_version
latest(data.cpu_count) as cpu_count
latest(data.virtual_cpu_count) as virtual_cpu_count
avg(data.normalized_load_avg_1min) as normalized_load_avg_avg
p75(data.normalized_load_avg_1min) as normalized_load_avg_p75
p90(data.normalized_load_avg_1min) as normalized_load_avg_p90
p95(data.normalized_load_avg_1min) as normalized_load_avg_p95
p99(data.normalized_load_avg_1min) as normalized_load_avg_p99
p99.9(data.normalized_load_avg_1min) as normalized_load_avg_p99_9
min(data.cpu_idle_pct) as cpu_max_idle
avg(data.mem_used) as mem_mb_used_avg
p99(data.mem_used) as mem_mb_used_p99
max(data.mem_used) as mem_mb_used_max
max(data.mem) as mem_mb_total
where $index_introspection$ component=hostwide host=idx*
by _time host $selected_span_10s$
| eval report=coalesce(report, "hostwide")
| tstats prestats=true append=true
``` cachemanager evictions - how much data is being evicted from the cache and how long it takes ```
count
sum(PREFIX(bytes_evicted=))
sum(PREFIX(elapsed_ms=))
where
$index_splunkd$ host=idx* CacheManager TERM(bytes_evicted=*) TERM(elapsed_ms=*)
by _time host $selected_span_10s$
| eval report=coalesce(report, "churn_deleted")
| tstats prestats=true append=true
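``` cachemanager downloads - count, volume (kb) and elapsed time of successful bucket downloads ```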
count
sum(PREFIX(elapsed_ms=))
sum(PREFIX(kb=))
where $index_splunkd$ cachemanager cachemanagerDownloadExecutorWorker TERM(action=download) TERM(status=succeeded)
by _time host $selected_span_10s$
| eval report=coalesce(report, "churn_downloaded")
| tstats prestats=true append=true
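``` cachemanager uploads - count, volume (kb) and elapsed time of successful bucket uploads ```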
count
sum(PREFIX(elapsed_ms=))
sum(PREFIX(kb=))
where $index_splunkd$ cachemanager cachemanagerUploadExecutorWorker TERM(action=upload) TERM(status=succeeded)
by _time host $selected_span_10s$
| eval report=coalesce(report, "churn_uploaded")
| tstats prestats=true append=true
``` bucketroller - this tells us how many buckets are being generated by the platform ```
count
sum(PREFIX(bucketsize=))
dc(PREFIX(idx=))
where
$index_splunkd$ sourcetype=splunkd host=idx* hotbucketroller finished moving hot to warm TERM(bucketSize=*) TERM(caller=*) TERM(_maxHotBucketSize=*)
by _time host $selected_span_10s$
| eval report=coalesce(report, "churn_created")
| tstats prestats=true append=true
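``` remote searches - active search concurrency statistics per indexer ```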
max(PREFIX(active_searches=))
mode(PREFIX(active_searches=))
p99.9(PREFIX(active_searches=))
p99(PREFIX(active_searches=))
p75(PREFIX(active_searches=))
avg(PREFIX(active_searches=))
dc(host)
where
$index_remote$ TERM(active_searches=*) host=idx*
by _time host $selected_span_10s$
| eval report=coalesce(report,"concurrency")
| tstats prestats=true append=true
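``` non-tstats searches fully served from the local cache (zero bucketcache miss wait) ```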
count
where
$index_remote$ NOT TERM(tstats) TERM(drop_count=*) TERM(scan_count=*) TERM(eliminated_buckets=*) TERM(considered_events=*) TERM(decompressed_slices=*) TERM(events_count=*) TERM(total_slices=*) TERM(considered_buckets=*) TERM(search_rawdata_bucketcache_error=*) TERM(search_rawdata_bucketcache_miss=*) TERM(search_index_bucketcache_error=*) TERM(search_index_bucketcache_hit=*) TERM(search_index_bucketcache_miss=*) TERM(search_rawdata_bucketcache_hit=*) TERM(search_rawdata_bucketcache_miss_wait=*) TERM(search_rawdata_bucketcache_miss_wait=0.000) TERM(search_index_bucketcache_miss_wait=0.000) TERM(search_index_bucketcache_miss_wait=*) host=idx*
by _time host $selected_span_10s$
| eval report=coalesce(report,"spl_inside_cache")
| tstats prestats=true append=true
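``` non-tstats searches that had to wait on bucketcache misses, plus their total miss wait ```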
count
sum(PREFIX(search_index_bucketcache_miss_wait=))
sum(PREFIX(search_rawdata_bucketcache_miss_wait=))
where
$index_remote$ NOT TERM(tstats) TERM(drop_count=*) TERM(scan_count=*) TERM(eliminated_buckets=*) TERM(considered_events=*) TERM(decompressed_slices=*) TERM(events_count=*) TERM(total_slices=*) TERM(considered_buckets=*) TERM(search_rawdata_bucketcache_error=*) TERM(search_rawdata_bucketcache_miss=*) TERM(search_index_bucketcache_error=*) TERM(search_index_bucketcache_hit=*) TERM(search_index_bucketcache_miss=*) TERM(search_rawdata_bucketcache_hit=*) TERM(search_rawdata_bucketcache_miss_wait=*) TERM(search_index_bucketcache_miss_wait=*) NOT(TERM(search_rawdata_bucketcache_miss_wait=0.000) OR TERM(search_index_bucketcache_miss_wait=0.000)) host=idx*
by _time host $selected_span_10s$
| eval report=coalesce(report,"spl_outside_cache")
| tstats prestats=true append=true
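``` tstats searches fully served from the local cache (zero index bucketcache miss wait) ```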
count
where
$index_remote$ TERM(tstats) TERM(drop_count=*) TERM(scan_count=*) TERM(eliminated_buckets=*) TERM(considered_events=*) TERM(decompressed_slices=*) TERM(events_count=*) TERM(total_slices=*) TERM(considered_buckets=*) TERM(search_rawdata_bucketcache_error=*) TERM(search_rawdata_bucketcache_miss=*) TERM(search_index_bucketcache_error=*) TERM(search_index_bucketcache_hit=*) TERM(search_index_bucketcache_miss=*) TERM(search_rawdata_bucketcache_hit=*) TERM(search_rawdata_bucketcache_miss_wait=*) TERM(search_index_bucketcache_miss_wait=0.000) TERM(search_index_bucketcache_miss_wait=*) host=idx*
by _time host $selected_span_10s$
| eval report=coalesce(report,"tstats_inside_cache")
| tstats prestats=true append=true
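``` tstats searches not fully served from the cache (non-zero miss wait), plus their total index miss wait ```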
count
sum(PREFIX(search_index_bucketcache_miss_wait=))
where
$index_remote$ TERM(tstats) TERM(drop_count=*) TERM(scan_count=*) TERM(eliminated_buckets=*) TERM(considered_events=*) TERM(decompressed_slices=*) TERM(events_count=*) TERM(total_slices=*) TERM(considered_buckets=*) TERM(search_rawdata_bucketcache_error=*) TERM(search_rawdata_bucketcache_miss=*) TERM(search_index_bucketcache_error=*) TERM(search_index_bucketcache_hit=*) TERM(search_index_bucketcache_miss=*) TERM(search_rawdata_bucketcache_hit=*) TERM(search_rawdata_bucketcache_miss_wait=*) TERM(search_index_bucketcache_miss_wait=*) NOT(TERM(search_rawdata_bucketcache_miss_wait=0.000)) host=idx*
by _time host $selected_span_10s$
| eval report=coalesce(report,"tstats_outside_cache")
| stats
``` count and elapsed_ms are featured in multiple reports and need to be reassigned ```
count
``` search metrics ```
dc(host) as search_cluster_size
max(PREFIX(active_searches=)) as search_concurrency_max
mode(PREFIX(active_searches=)) as search_concurrency_mode
p99.9(PREFIX(active_searches=)) as search_concurrency_p99_9
p99(PREFIX(active_searches=)) as search_concurrency_p99
p75(PREFIX(active_searches=)) as search_concurrency_p75
avg(PREFIX(active_searches=)) as search_concurrency_avg
sum(PREFIX(search_index_bucketcache_miss_wait=)) as index_miss_wait
sum(PREFIX(search_rawdata_bucketcache_miss_wait=)) as rawdata_miss_wait
``` cache churn ```
dc(PREFIX(idx=)) as churn_created_dc_indexes
sum(PREFIX(elapsed_ms=)) as elapsed_ms
``` bucket roller ```
sum(PREFIX(bucketsize=)) as churn_created_bytes
``` cache manager download / upload volume ```
sum(PREFIX(kb=)) as kb
``` cache manager eviction ```
sum(PREFIX(bytes_evicted=)) as churn_deleted_bytes
``` hostwide data ```
avg(data.cpu_idle_pct) as cpu_avg_idle
p25(data.cpu_idle_pct) as cpu_p75_idle
p10(data.cpu_idle_pct) as cpu_p90_idle
p05(data.cpu_idle_pct) as cpu_p95_idle
p01(data.cpu_idle_pct) as cpu_p99_idle
p00.5(data.cpu_idle_pct) as cpu_p99_1_idle
p00.1(data.cpu_idle_pct) as cpu_p99_9_idle
latest(data.splunk_version) as splunk_version
latest(data.cpu_count) as cpu_count
latest(data.virtual_cpu_count) as virtual_cpu_count
avg(data.normalized_load_avg_1min) as normalized_load_avg_avg
p75(data.normalized_load_avg_1min) as normalized_load_avg_p75
p90(data.normalized_load_avg_1min) as normalized_load_avg_p90
p95(data.normalized_load_avg_1min) as normalized_load_avg_p95
p99(data.normalized_load_avg_1min) as normalized_load_avg_p99
p99.9(data.normalized_load_avg_1min) as normalized_load_avg_p99_9
min(data.cpu_idle_pct) as cpu_max_idle
avg(data.mem_used) as mem_mb_used_avg
p99(data.mem_used) as mem_mb_used_p99
max(data.mem_used) as mem_mb_used_max
max(data.mem) as mem_mb_total
by _time host report
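``` reassign the shared fields to per-report names, e.g. {report}_count becomes churn_downloaded_count ```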
| eval
{report}_count=count,
{report}_elapsed_ms=elapsed_ms,
{report}_kb=kb
| fields - count elapsed_ms
| eval
churn_deleted_gb=churn_deleted_bytes/pow(1024,3),
churn_created_gb=churn_created_bytes/pow(1024,3),
churn_downloaded_gb=churn_downloaded_kb/pow(1024,2),
churn_uploaded_gb=churn_uploaded_kb/pow(1024,2),
churn_created_dc_indexes=if(churn_created_dc_indexes=0,null(),churn_created_dc_indexes)
``` search concurrency clean up ```
| foreach *_miss_wait
[| eval search_{report}_<<MATCHSTR>>_wait_sum=<<FIELD>>]
| fields - count index_miss_wait rawdata_miss_wait report
``` host wide clean up```
| eval normalized_load_avg_avg=round(normalized_load_avg_avg,2)
| foreach *_idle
[| eval <<MATCHSEG1>>=100-round(<<FIELD>>,2)]
| foreach mem_*
[| eval <<FIELD>>=round(<<FIELD>>,3)]
| foreach cpu_*
[| eval cores_<<MATCHSTR>>=virtual_cpu_count*round(<<FIELD>>/100,3)]
| fields - *_idle
| fields - count churn_deleted_bytes kb churn_uploaded_kb churn_downloaded_kb churn_created_bytes
| stats list(*) as * by _time host
</query>
<earliest>$time.earliest$</earliest>
<latest>$time.latest$</latest>
</search>
<option name="count">1</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
</form>