Preliminary intra-site canary metrics from kernel upgrade #12
This is the final spreadsheet showing the differences detected in NDT download speeds and in the speed/minRTT correlation on the gru03 machines. The mlab1 machine was identified as a possible canary; canaries were deployed worldwide on a single, unidentified machine per site.
This is the query used in the spreadsheet.

```sql
WITH
metro_stats AS (
SELECT * FROM `mlab-sandbox.gfr.metro_stats`
),
date_agg AS (
SELECT metro, site, machine, protocol, complete, slow, error,
SUM(tests) AS tests,
ROUND(AVG(log_mean_speed),3) AS log_mean_speed,
ROUND(AVG(logMeanMinRTT),3) AS logMeanMinRTT,
ROUND(AVG(pearson),3) AS pearson,
ROUND(AVG(under_1),3) AS under_1,
ROUND(AVG(_1_3),3) AS _1_3,
ROUND(AVG(_3_10),3) AS _3_10,
ROUND(AVG(_10_30),3) AS _10_30,
ROUND(AVG(_30_100),3) AS _30_100,
ROUND(AVG(_100_300),3) AS _100_300,
ROUND(AVG(over_300),3) AS over_300
FROM metro_stats
--WHERE metro = "syd"
WHERE protocol = "JSON" AND complete IS true
GROUP BY metro, site, machine, protocol, complete, slow, error
ORDER BY protocol, complete DESC, slow, error, site, machine
),
deviations AS (
SELECT metro, site,
ROUND(SAFE_DIVIDE(STDDEV(log_mean_speed), AVG(log_mean_speed)),4) AS logMeanSpeedStdev,
ROUND(SAFE_DIVIDE(STDDEV(logMeanMinRTT), AVG(logMeanMinRTT)),4) AS logMeanMinRTTStdev,
ROUND(SAFE_DIVIDE(STDDEV(pearson), AVG(pearson)),4) AS pearsonStdev,
FROM date_agg
WHERE complete = true AND error = false AND slow = false
GROUP BY metro, site, protocol, complete, slow, error
ORDER BY logMeanSpeedStdev DESC
)
#SELECT * FROM deviations
SELECT * FROM metro_stats
WHERE metro = "gru" AND site = "gru03"
AND protocol = "JSON" AND complete AND slow = false AND error = false
ORDER BY site, test_date, machine
```
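To see which machine on a site drives those deviations, a per-machine view over the same table helps. Here is a minimal sketch (the `site_z` column and the idea of ranking by it are illustrations added here, not part of the original analysis):

```sql
-- Sketch: distance of each machine's log_mean_speed from its site mean,
-- in units of the site's standard deviation. A large |site_z| on one
-- machine (e.g. the canary) flags it as the site outlier.
SELECT metro, site, machine,
  ROUND(AVG(log_mean_speed), 3) AS log_mean_speed,
  ROUND(SAFE_DIVIDE(
    AVG(log_mean_speed) - AVG(AVG(log_mean_speed)) OVER site_window,
    STDDEV(AVG(log_mean_speed)) OVER site_window), 2) AS site_z
FROM `mlab-sandbox.gfr.metro_stats`
WHERE protocol = "JSON" AND complete AND NOT slow AND NOT error
GROUP BY metro, site, machine
WINDOW site_window AS (PARTITION BY metro, site)
ORDER BY ABS(site_z) DESC
```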
This is a saved query that was probably used to generate the table that the spreadsheet uses: https://console.cloud.google.com/bigquery?sq=581276032543:cb730dd999c54182a129c7d9ccbff01c

```sql
# Metro stats
# Order:
# 2020-06-06 ndt5 per machine client stats, last 10 days
# 2020-06-07 Client stats
# 2020-06-16 Metro stats
CREATE OR REPLACE TABLE `mlab-sandbox.gfr.metro_stats`
PARTITION BY test_date
CLUSTER BY metro, site
AS
WITH
primary AS (
SELECT DATE(TIMESTAMP_SECONDS(log_time)) AS test_date,
TIMESTAMP_SECONDS(log_time) AS log_time,
* EXCEPT(log_time)
FROM `mlab-oti.base_tables.ndt5`
WHERE DATE(_PARTITIONTIME) >= DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY) # PARTITIONED on this.
),
# Ideally, this should be based on binomial distribution likelihood.
# However, for now I'm using a simpler criterion that is sub-optimal.
good_clients AS (
SELECT * FROM `mlab-sandbox.gfr.client_weekly_stats`
WHERE date >= DATE_SUB(CURRENT_DATE(), INTERVAL 8 DAY)
AND client NOT IN
("45.56.98.222", "35.192.37.249", "35.225.75.192", "23.228.128.99",
"2600:3c03::f03c:91ff:fe33:819", "2605:a601:f1ff:fffe::99")
# Exclude clients with more than twice as many tests as machines in the metro
--AND total_tests < 2*machines
# Good clients will have similar counts across all machines
AND max <= min + SQRT(machines) -- up to 3 machines -> max = 1, 4 machines -> max = 2, ALL 15 machines -> spread < 4
--AND max <= 3
--AND test_machines = machines
--AND total_tests > 25
--AND zeros = 0 AND ones = 0 AND twos=0
--AND max <= min + 2
# If there are fewer tests than machines, we expect lots of singletons
--AND (total_tests > machines OR ones >= twos)
ORDER BY machines DESC, metro
),
---------------------------------------------------------------------------
downloads AS (
SELECT
test_date, log_time,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
ParseInfo,
result.S2C.ClientIP AS client,
result.S2C.UUID AS uuid,
result.Control.MessageProtocol AS protocol,
result.S2C.MeanThroughputMbps AS mbps,
result.S2C.TCPInfo.Retransmits, # empty/NULL
result.S2C.TCPInfo.MinRTT, # empty/NULL
result.S2C.MinRTT AS appMinRTT,
result.S2C.SumRTT, result.S2C.CountRTT, # empty/NULL
result.S2C.MaxRTT AS appMaxRTT, # empty/NULL
TIMESTAMP_DIFF(result.S2C.EndTime, result.S2C.StartTime, MICROSECOND)/1000000 AS test_duration,
result.S2C.Error != "" AS error,
result.S2C.MeanThroughputMbps <= 0.1 AS slow,
TIMESTAMP_DIFF(result.S2C.EndTime, result.S2C.StartTime, MICROSECOND)/1000000 BETWEEN 9 AND 13 AS complete,
FROM primary
--WHERE ParseInfo.TaskFilename LIKE "%lga%"
),
--------------------------------------------------------------
good_downloads AS (
SELECT D.*
FROM downloads D JOIN good_clients G ON D.client = G.client AND D.metro = G.metro AND D.test_date = G.date
),
stats AS (
SELECT test_date, metro, site, machine, protocol, complete, slow, error, count(uuid) AS tests,
ROUND(EXP(AVG(IF(mbps > 0, LN(mbps), NULL))),2) AS log_mean_speed,
ROUND(SAFE_DIVIDE(COUNTIF(appMinRTT < 10000000), COUNT(uuid)),3) AS rttUnder10,
ROUND(APPROX_QUANTILES(appMinRTT, 101)[OFFSET(50)]/1000000,2) AS medianMinRTT,
ROUND(AVG(appMinRTT)/1000000,2) AS meanMinRTT,
ROUND(EXP(AVG(IF(appMinRTT > 0, LN(appMinRTT/1000000), 0))),2) AS logMeanMinRTT,
AVG(Retransmits) AS avgRetransmits,
# Pearson correlation between ln(minRTT) and ln(bandwidth). Ln produces much higher correlation (.5 vs .3)
# suggesting that the long tail of high speed / low RTT undermines the correlation without the LOG.
ROUND(CORR(IF(appMinRTT > 0, LN(1/appMinRTT), NULL) , IF(mbps > 0, LN(mbps), NULL)), 3) AS pearson,
--ROUND(AVG(SAFE_DIVIDE(SumRTT,CountRTT))/1000000,2) AS meanAppAvgRTT,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(10)],2) AS q10,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(25)],2) AS q25,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(50)],2) AS q50,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(75)],2) AS q75,
ROUND(APPROX_QUANTILES(mbps, 101)[OFFSET(90)],2) AS q90,
ROUND(MAX(mbps),2) AS max,
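# NOTE: BETWEEN is inclusive on both ends, so a speed of exactly 3, 10, 30,
# or 100 Mbps is counted in two adjacent buckets below.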
ROUND(SAFE_DIVIDE(COUNTIF(mbps < 1), COUNT(uuid)),3) AS under_1,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 1 AND 3), COUNT(uuid)),3) AS _1_3,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 3 AND 10), COUNT(uuid)),3) AS _3_10,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 10 AND 30), COUNT(uuid)),3) AS _10_30,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 30 AND 100), COUNT(uuid)),3) AS _30_100,
ROUND(SAFE_DIVIDE(COUNTIF(mbps BETWEEN 100 AND 300), COUNT(uuid)),3) AS _100_300,
ROUND(SAFE_DIVIDE(COUNTIF(mbps > 300), COUNT(uuid)),3) AS over_300,
COUNTIF(appMinRTT > 50000000) AS far,
ROUND(EXP(AVG(IF(appMinRTT > 50000000 AND mbps > 0, LN(mbps), NULL))),3) AS logMeanFarMbps,
FROM good_downloads
GROUP BY metro, test_date, machine, site, complete, slow, error, protocol
)
SELECT * FROM stats
-- WHERE metro = "lga"
-- ORDER BY protocol, complete DESC, slow, error, test_date, site, machine
```
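A note on the log_mean_speed metric used throughout: EXP(AVG(LN(x))) is the geometric mean, which is much less sensitive to the heavy tail of the speed distribution than the arithmetic mean. A tiny self-contained example (synthetic values, for illustration only):

```sql
WITH samples AS (
  SELECT mbps FROM UNNEST([1.0, 10.0, 100.0]) AS mbps
)
SELECT
  ROUND(AVG(mbps), 1) AS arithmetic_mean,         -- 37.0, dominated by the tail
  ROUND(EXP(AVG(LN(mbps))), 1) AS log_mean_speed  -- 10.0, the geometric mean
FROM samples
```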
This is the saved query that likely produced the client_weekly_stats table used by the metro_stats query: https://console.cloud.google.com/bigquery?sq=581276032543:e0c015127c3b4fa09f7d0f45914d8212

```sql
# All metros, 7 dates takes about 1 slot hour, produces 2M rows of good clients.
# CROSS JOIN produces about 150M rows.
# Order:
# 2020-06-06 ndt5 per machine client stats, last 10 days
# 2020-06-07 Client stats
# 2020-06-10 Multi-date
CREATE OR REPLACE TABLE `mlab-sandbox.gfr.client_weekly_stats`
PARTITION BY date
CLUSTER BY metro, client
AS
# This is the raw metro summary data
WITH metros AS (
SELECT *
FROM `mlab-sandbox.gfr.oti_metro_client_summary`
),
# flatten the per metro data, so that we have a row for each machine/client/date
flatten AS (
SELECT test_date, metro, client, CONCAT(site, ".", machine) AS machine, tests
FROM metros JOIN UNNEST(metros.machines)
GROUP BY test_date, metro, client, site, machine, tests
),
# extract complete list of machines per metro/date
machines AS (
SELECT test_date, metro, machine
FROM flatten
GROUP BY test_date, metro, machine
),
# extract complete list of clients per metro/date
clients AS (
SELECT test_date, metro, client
FROM flatten
WHERE client != ""
GROUP BY test_date, metro, client
),
# create a complete list of machine/client pairs per metro/date
# This is quite large - about 100M pairs worldwide for a one week window.
product AS (
SELECT machines.test_date, machines.metro, machines.machine, clients.client
FROM machines CROSS JOIN clients
WHERE machines.metro = clients.metro AND machines.test_date = clients.test_date
),
# Now join the machine/client pairs with the original flattened data.
# This produces a full complement of rows for each client/metro/date.
joined AS (
SELECT product.test_date, product.metro, product.machine, product.client, IF(flatten.tests IS NULL, 0, flatten.tests) AS tests
FROM product LEFT JOIN flatten ON product.test_date = flatten.test_date AND product.metro = flatten.metro AND product.client = flatten.client AND product.machine = flatten.machine
),
---------------------------------------------------------
# Now aggregate over the past week, to produce a complete distribution of tests
# per client across all machines in each metro.
past_week AS (
SELECT
test_date AS date, metro, machine, client,
SUM(tests) OVER weekly_window AS weekly_tests,
MIN(test_date) OVER weekly_window AS min_date,
FROM joined
GROUP BY date, metro, client, machine, tests
WINDOW weekly_window AS (
PARTITION BY client, machine
ORDER BY UNIX_DATE(test_date) RANGE BETWEEN 6 PRECEDING AND CURRENT ROW
)
),
# Now summarize the data for each client/metro
weekly_summary AS (
SELECT
date, metro, client,
COUNTIF(weekly_tests > 0) AS test_machines,
COUNT(machine) AS machines,
SUM(weekly_tests) total_tests,
MIN(min_date) AS min_date,
# These count the number of machines with 0,1,2,3 or more tests
# These are useful for determining whether the client is statistically well behaved
MIN(weekly_tests) AS min,
MAX(weekly_tests) AS max,
COUNTIF(weekly_tests = 0) AS zeros,
COUNTIF(weekly_tests = 1) AS ones,
COUNTIF(weekly_tests = 2) AS twos,
COUNTIF(weekly_tests = 3) AS threes,
COUNTIF(weekly_tests > 3) AS more,
FROM past_week
GROUP BY date, metro, client
),
good_clients AS (
SELECT * FROM weekly_summary
WHERE date >= DATE_SUB(CURRENT_DATE(), INTERVAL 8 DAY)
# Exclude clients with more than twice as many tests as machines in the metro
AND total_tests < 2*machines
# Good clients will have similar counts across all machines
AND max <= min + 2
# If there are fewer tests than machines, we expect lots of singletons
AND (total_tests > machines OR ones >= twos)
ORDER BY machines DESC, metro
)
SELECT *
FROM weekly_summary
```
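The past_week CTE above depends on a RANGE window over UNIX_DATE, which sums a true trailing 7-calendar-day window even when a client/machine pair has gaps (a ROWS window would instead count the trailing 7 rows). A minimal self-contained example with synthetic dates and counts:

```sql
WITH daily AS (
  SELECT * FROM UNNEST([
    STRUCT(DATE '2020-06-01' AS test_date, 2 AS tests),
    STRUCT(DATE '2020-06-03', 1),
    STRUCT(DATE '2020-06-10', 4)
  ])
)
SELECT test_date,
  SUM(tests) OVER (
    ORDER BY UNIX_DATE(test_date)
    RANGE BETWEEN 6 PRECEDING AND CURRENT ROW
  ) AS weekly_tests
FROM daily
-- 2020-06-01 -> 2; 2020-06-03 -> 3 (06-01 is within 6 days);
-- 2020-06-10 -> 4 (06-03 is 7 days earlier, outside the window)
```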
This query appends recent per-day, per-machine test counts to the ndt5_stats table.

```sql
# Update mlab-sandbox.gfr.ndt5_stats
INSERT INTO `mlab-sandbox.gfr.ndt5_stats`
SELECT
DATE(TIMESTAMP_SECONDS(log_time)) AS test_date,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine,
_PARTITIONTIME AS pt,
ParseInfo.TaskFileName as archive,
COUNT(test_id) AS tests,
CURRENT_TIMESTAMP() AS query_time
FROM `mlab-oti.base_tables.ndt5`
WHERE DATE(TIMESTAMP_SECONDS(log_time)) > DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
AND DATE(_PARTITIONTIME) > DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
GROUP BY test_date, pt, metro, site, machine, archive
```
This is a scheduled query that updates metro_client_summary.

```sql
# Tests per client per machine per day, with metro summaries.
# Uses about 1 Slot Hour.
# Introduces dups if run more than once per day.
INSERT into `mlab-sandbox.gfr.metro_client_summary`
WITH hours_per_machine AS (
SELECT
DATE(log_time) AS test_date,
TIMESTAMP_TRUNC(log_time, HOUR) AS hour,
COUNT(log_time) AS tests,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine,
FROM `mlab-sandbox.raw_ndt.ndt5`
WHERE DATE(log_time) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
GROUP BY test_date, hour, machine, site, metro
),
hours_per_day_per_machine AS (
SELECT * EXCEPT(hour, tests),
COUNT(hour) AS hours,
SUM(tests) AS tests,
FROM hours_per_machine
GROUP BY test_date, machine, site, metro
),
good_machines_per_metro AS (
SELECT * EXCEPT(machine, hours, site, tests),
COUNT(machine) AS metro_machines,
SUM(hours) AS metro_hours,
COUNTIF(hours = 24) AS good_machines,
SUM(IF(hours = 24, hours, 0)) AS good_hours,
ARRAY_AGG(
STRUCT(
site, machine, tests, hours,
hours = 24 AS good
) ORDER BY site, machine) AS machines,
FROM hours_per_day_per_machine
GROUP BY test_date, metro
),
---------------------------------------------------------------------------
tests_per_client AS (
SELECT
DATE(log_time) AS test_date,
log_time AS TestTime,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
ParseInfo,
IF(result.S2C IS NOT NULL, result.S2C.ClientIP, result.C2S.ClientIP) AS client,
IF(result.S2C IS NOT NULL, result.S2C.UUID, result.C2S.UUID) AS uuid,
IF(result.S2C IS NOT NULL, result.S2C.MeanThroughputMbps, result.C2S.MeanThroughputMbps) AS mbps,
IF(result.S2C IS NOT NULL,
TIMESTAMP_DIFF(result.S2C.EndTime, result.S2C.StartTime, MICROSECOND),
TIMESTAMP_DIFF(result.C2S.EndTime, result.C2S.StartTime, MICROSECOND)) AS test_duration
FROM `mlab-sandbox.raw_ndt.ndt5`
WHERE
(result.S2C.Error = "" OR result.C2S.Error = "")
),
---------------------------------------------------------------------
# Count tests per machine, and join with hours per machine
machine_summary AS (
SELECT
a.* EXCEPT(TestTime, ParseInfo, uuid, mbps, test_duration),
--test_date, metro, site, machine, client,
COUNT(DISTINCT uuid) AS tests,
b.* EXCEPT(metro, site, machine, test_date, tests)
FROM tests_per_client a LEFT JOIN hours_per_day_per_machine b ON (a.metro = b.metro AND a.site = b.site AND a.machine = b.machine AND a.test_date = b.test_date)
GROUP BY metro, site, machine, client, test_date, hours
),
# This will be very expensive.
# This should create a lot of empty rows for clients that appear in the metro but not on a machine.
with_hours AS (
SELECT
a.*, b.* EXCEPT(test_date, metro, machines)
FROM good_machines_per_metro b LEFT JOIN machine_summary a ON (a.metro = b.metro AND a.test_date = b.test_date)
),
metro_summary AS (
SELECT CURRENT_DATE() AS update_time,
test_date, metro, client,
ARRAY_AGG(
STRUCT(
site, machine, tests
) ORDER BY site, machine) AS machines,
metro_machines, metro_hours, good_machines, good_hours,
FROM with_hours
GROUP BY metro, client,
test_date, good_hours, metro_hours, good_machines, metro_machines
)
--------------------------------------------------------------
SELECT *
FROM metro_summary
WHERE test_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY)
```
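On the "introduces dups" caveat in the header: one way to make a re-run safe is to clear the window being recomputed just before the INSERT. This is a sketch of a possible fix, not part of the original schedule:

```sql
-- Run immediately before the INSERT above; the pair then behaves like an
-- upsert of the trailing 3-day window, so a second run on the same day
-- no longer duplicates rows.
DELETE FROM `mlab-sandbox.gfr.metro_client_summary`
WHERE test_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY)
```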
Here is the query that generates oti_metro_client_summary, saved as a gfr personal query: https://console.cloud.google.com/bigquery?sq=581276032543:5e697d8ec4bb4e249927976828b25341

```sql
# Tests per client per machine per day, with metro summaries.
# Uses < 2 Slot Hour to process 16 days, producing about 12 million rows.
CREATE OR REPLACE TABLE `mlab-sandbox.gfr.oti_metro_client_summary`
PARTITION BY test_date
CLUSTER BY metro, client
AS
WITH
primary AS (
SELECT DATE(TIMESTAMP_SECONDS(log_time)) AS test_date,
TIMESTAMP_SECONDS(log_time) AS log_time,
* EXCEPT(log_time)
FROM `mlab-oti.base_tables.ndt5`
WHERE DATE(_PARTITIONTIME) >= DATE_SUB(CURRENT_DATE(), INTERVAL 16 DAY) # PARTITIONED on this.
),
hours_per_machine AS (
SELECT
test_date,
TIMESTAMP_TRUNC(log_time, HOUR) AS hour,
COUNT(test_id) AS tests,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine,
FROM primary
# Without this, the query cost goes up dramatically.
WHERE test_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 16 DAY)
GROUP BY test_date, hour, machine, site, metro ),
hours_per_day_per_machine AS (
SELECT
* EXCEPT(hour, tests),
COUNT(hour) AS hours, SUM(tests) AS tests,
FROM hours_per_machine
GROUP BY test_date, machine, site, metro
),
good_machines_per_metro AS (
SELECT
* EXCEPT(machine, hours, site, tests),
COUNT(machine) AS metro_machines, SUM(hours) AS metro_hours,
COUNTIF(hours = 24) AS good_machines, SUM(IF(hours = 24, hours, 0)) AS good_hours,
ARRAY_AGG( STRUCT( site, machine, tests, hours, hours = 24 AS good ) ORDER BY site, machine) AS machines,
FROM hours_per_day_per_machine
GROUP BY test_date, metro ),
---------------------------------------------------------------------------
tests_per_client AS (
SELECT
test_date, log_time,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-(mlab[1-4])-.*") AS machine,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3}[0-9]{2}).*") AS site,
REGEXP_EXTRACT(ParseInfo.TaskFilename, ".*-mlab[1-4]-([a-z]{3})[0-9]{2}.*") AS metro,
ParseInfo,
IF(result.S2C IS NOT NULL, result.S2C.ClientIP, result.C2S.ClientIP) AS client,
IF(result.S2C IS NOT NULL, result.S2C.UUID, result.C2S.UUID) AS uuid,
IF(result.S2C IS NOT NULL, result.S2C.MeanThroughputMbps, result.C2S.MeanThroughputMbps) AS mbps,
IF(result.S2C IS NOT NULL, TIMESTAMP_DIFF(result.S2C.EndTime, result.S2C.StartTime, MICROSECOND), TIMESTAMP_DIFF(result.C2S.EndTime, result.C2S.StartTime, MICROSECOND)) AS test_duration
FROM primary
WHERE (result.S2C.Error = "" OR result.C2S.Error = "")
),
---------------------------------------------------------------------
# Count tests per machine, and join with hours per machine
machine_summary AS (
SELECT
a.* EXCEPT(log_time, ParseInfo, uuid, mbps, test_duration),
--test_date, metro, site, machine, client,
COUNT(DISTINCT uuid) AS tests,
b.* EXCEPT(metro, site, machine, test_date, tests)
FROM tests_per_client a LEFT JOIN hours_per_day_per_machine b
ON (a.metro = b.metro AND a.site = b.site AND a.machine = b.machine AND a.test_date = b.test_date)
GROUP BY metro, site, machine, client, test_date, hours
),
# This will be very expensive.
# This should create a lot of empty rows for clients that appear in the metro but not on a machine.
with_hours AS (
SELECT a.*, b.* EXCEPT(test_date, metro, machines)
FROM good_machines_per_metro b LEFT JOIN machine_summary a
ON (a.metro = b.metro AND a.test_date = b.test_date)
),
metro_summary AS (
SELECT
CURRENT_DATE() AS update_time, test_date, metro, client,
ARRAY_AGG( STRUCT( site, machine, tests ) ORDER BY site, machine) AS machines,
metro_machines, metro_hours, good_machines, good_hours,
FROM with_hours
GROUP BY metro, client, test_date, good_hours, metro_hours, good_machines, metro_machines
)
--------------------------------------------------------------
SELECT * FROM metro_summary
WHERE test_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 16 DAY)
```
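Because the summary stores per-machine counts as a nested ARRAY of STRUCTs, reading it back requires an UNNEST; this is the same pattern the flatten CTE in the client_weekly_stats query uses. A minimal sketch:

```sql
SELECT test_date, metro, client, m.site, m.machine, m.tests
FROM `mlab-sandbox.gfr.oti_metro_client_summary`,
  UNNEST(machines) AS m
ORDER BY test_date, metro, client
LIMIT 10
```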
This issue is intended to document the work on BigQuery queries for canary assessments.
The primary concern is compensating for, or avoiding, client bias, in order to increase sensitivity to changes in the platform, specifically between a machine running a new canary software stack and the other machines on the same site.
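One possible direction (a sketch, not an implemented query): client bias largely cancels if each client is compared against itself across machines. The sketch below assumes the good_downloads CTE from the metro_stats query above, and treats mlab1 as the canary purely for illustration:

```sql
WITH per_client AS (
  -- Per-client geometric mean download speed on the canary machine vs.
  -- its site siblings (assumes the good_downloads CTE defined above).
  SELECT site, client,
    EXP(AVG(IF(machine = "mlab1" AND mbps > 0, LN(mbps), NULL))) AS canary_speed,
    EXP(AVG(IF(machine != "mlab1" AND mbps > 0, LN(mbps), NULL))) AS sibling_speed
  FROM good_downloads
  GROUP BY site, client
)
SELECT site, COUNT(*) AS clients,
  -- Mean per-client log speed ratio; values near 0 suggest no canary effect.
  ROUND(AVG(LN(SAFE_DIVIDE(canary_speed, sibling_speed))), 3) AS mean_log_ratio
FROM per_client
WHERE canary_speed IS NOT NULL AND sibling_speed IS NOT NULL
GROUP BY site
```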