Skip to content

Commit

Permalink
More SQL Linting fixes (HTTPArchive#2305)
Browse files Browse the repository at this point in the history
* More SQL Linting fixes

* Fix parsing issue
tunetheweb authored Aug 11, 2021

Verified

This commit was signed with the committer’s verified signature. The key has expired.
NejcZdovc Nejc Zdovc
1 parent 9914a50 commit 127c73e
Showing 227 changed files with 1,889 additions and 1,921 deletions.
9 changes: 7 additions & 2 deletions sql/.sqlfluff
Original file line number Diff line number Diff line change
@@ -12,9 +12,9 @@ exclude_rules = L003,L007,L011,L014,L016,L020,L026,L027,L028,L029,L030,L031,L032
# L016 - We allow longer lines as some of our queries are complex. Maybe should limit in future?
# L020 - Asks for unique table aliases, so it complains when selecting from two 2021_07_01 tables because the implicit alias is the unqualified table name and is therefore the same for both.
# L026 - BigQuery uses arrays and functions which looks like incorrect references
# L027 - Asks for qualified columns for ambiguous. Really should enable this but a lot to clean up. TODO
# L027 - Asks for qualified columns when a reference is ambiguous. Really should enable this but there is a lot to clean up. TODO
# L028 - Insists on references in column names even if not ambiguous. Bit OTT.
# L029 - Avoids keywords as identifiers but has some common ones we use (e.g. count, element).
# L029 - Avoids keywords as identifiers but we use this a lot (e.g. AS count, AS max...etc.)
# L030 - Function names will be mixed case so don't enforce case
# L031 - Avoid aliases in from and join - why?
# L032 - Uses joins instead of USING - why?
@@ -37,6 +37,7 @@ sql_file_exts = .sql,.sql.j2,.dml,.ddl

[sqlfluff:indentation]
indented_joins = False
indented_using_on = False
template_blocks_indent = True

[sqlfluff:templater]
@@ -67,6 +68,9 @@ unquoted_identifiers_policy = all
[sqlfluff:rules:L003]
lint_templated_tokens = True

[sqlfluff:rules:L007]
operator_new_lines = before

[sqlfluff:rules:L010] # Keywords
capitalisation_policy = upper

@@ -94,3 +98,4 @@ forbid_subquery_in = join

[sqlfluff:rules:L047] # Consistent syntax to count all rows
prefer_count_1 = False
prefer_count_0 = True
3 changes: 1 addition & 2 deletions sql/2019/accessibility/09_03.sql
Original file line number Diff line number Diff line change
@@ -32,5 +32,4 @@ GROUP BY
ORDER BY
pages / total DESC,
client
LIMIT
10000
LIMIT 10000
3 changes: 1 addition & 2 deletions sql/2019/accessibility/09_05.sql
Original file line number Diff line number Diff line change
@@ -22,5 +22,4 @@ GROUP BY
role
ORDER BY
pages / total DESC
LIMIT
1000
LIMIT 1000
16 changes: 8 additions & 8 deletions sql/2019/cdn/17_01.sql
Original file line number Diff line number Diff line change
@@ -16,14 +16,14 @@ SELECT
SUM(COUNT(0)) OVER (PARTITION BY client) AS totalHits,
ROUND((COUNT(0) * 100 / (0.001 + SUM(COUNT(0)) OVER (PARTITION BY client))), 2) AS hitsPct
FROM
(
SELECT
client, page, url, firstHtml, respBodySize,
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn, # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry
IF(NET.HOST(url) = NET.HOST(page), TRUE, FALSE) AS sameHost,
IF(NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page), TRUE, FALSE) AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain
FROM `httparchive.almanac.requests3`
)
(
SELECT
client, page, url, firstHtml, respBodySize,
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn, # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry
IF(NET.HOST(url) = NET.HOST(page), TRUE, FALSE) AS sameHost,
IF(NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page), TRUE, FALSE) AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain
FROM `httparchive.almanac.requests3`
)
GROUP BY
client,
cdn
56 changes: 28 additions & 28 deletions sql/2019/cdn/17_02.sql
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
#standardSQL
# 17_02: Percentage of the sites which use a CDN for any resource
SELECT
client,
COUNTIF(firstHtml) AS htmlHits,
COUNTIF(NOT firstHtml AND sameHost) AS domainHits,
COUNTIF(NOT sameHost AND sameDomain) AS subdomainHits,
COUNTIF(NOT sameHost AND NOT sameDomain) AS thirdPartyHits,
COUNT(0) AS hits,
SUM(IF(firstHtml, respBodySize, 0)) AS htmlBytes,
SUM(IF(NOT firstHtml AND sameHost, respBodySize, 0)) AS domainBytes,
SUM(IF(NOT sameHost AND sameDomain, respBodySize, 0)) AS subdomainBytes,
SUM(IF(NOT sameHost AND NOT sameDomain, respBodySize, 0)) AS thirdPartyBytes,
SUM(respBodySize) AS bytes,
client,
COUNTIF(firstHtml) AS htmlHits,
COUNTIF(NOT firstHtml AND sameHost) AS domainHits,
COUNTIF(NOT sameHost AND sameDomain) AS subdomainHits,
COUNTIF(NOT sameHost AND NOT sameDomain) AS thirdPartyHits,
COUNT(0) AS hits,
SUM(IF(firstHtml, respBodySize, 0)) AS htmlBytes,
SUM(IF(NOT firstHtml AND sameHost, respBodySize, 0)) AS domainBytes,
SUM(IF(NOT sameHost AND sameDomain, respBodySize, 0)) AS subdomainBytes,
SUM(IF(NOT sameHost AND NOT sameDomain, respBodySize, 0)) AS thirdPartyBytes,
SUM(respBodySize) AS bytes,

COUNTIF(cdn != 'ORIGIN') AS cdnHits,
ROUND((COUNTIF(cdn != 'ORIGIN') * 100) / COUNT(0), 2) AS hitsPct,
SUM(CASE WHEN cdn != 'ORIGIN' THEN respBodySize ELSE 0 END) AS cdnBytes,
ROUND((SUM(CASE WHEN _cdn_provider != '' THEN respBodySize ELSE 0 END) * 100) / SUM(respBodySize), 2) AS bytesPct
FROM
(
SELECT
client, page, url, firstHtml, respBodySize,
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,
CASE WHEN NET.HOST(url) = NET.HOST(page) THEN TRUE ELSE FALSE END AS sameHost,
CASE WHEN NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page) THEN TRUE ELSE FALSE END AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain
FROM `httparchive.almanac.requests3`
--GROUP BY client, pageid, requestid, page, url, firstHtml, _cdn_provider, respBodySize
)
GROUP BY
client,
hits
COUNTIF(cdn != 'ORIGIN') AS cdnHits,
ROUND((COUNTIF(cdn != 'ORIGIN') * 100) / COUNT(0), 2) AS hitsPct,
SUM(CASE WHEN cdn != 'ORIGIN' THEN respBodySize ELSE 0 END) AS cdnBytes,
ROUND((SUM(CASE WHEN _cdn_provider != '' THEN respBodySize ELSE 0 END) * 100) / SUM(respBodySize), 2) AS bytesPct
FROM
(
SELECT
client, page, url, firstHtml, respBodySize,
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,
CASE WHEN NET.HOST(url) = NET.HOST(page) THEN TRUE ELSE FALSE END AS sameHost,
CASE WHEN NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page) THEN TRUE ELSE FALSE END AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain
FROM `httparchive.almanac.requests3`
--GROUP BY client, pageid, requestid, page, url, firstHtml, _cdn_provider, respBodySize
)
GROUP BY
client,
hits
42 changes: 21 additions & 21 deletions sql/2019/cdn/17_02f.sql
Original file line number Diff line number Diff line change
@@ -7,27 +7,27 @@ SELECT
COUNT(0) AS hits,
ROUND(100 * COUNTIF(jscdnHits > 0 ) / COUNT(0), 2) AS pct
FROM
(
SELECT
client,
page,
COUNTIF(
NET.HOST(url) IN ('unpkg.com',
'www.jsdelivr.net',
'cdnjs.cloudflare.com',
'ajax.aspnetcdn.com',
'ajax.googleapis.com',
'stackpath.bootstrapcdn.com',
'maxcdn.bootstrapcdn.com',
'use.fontawesome.com',
'code.jquery.com',
'fonts.googleapis.com')
) AS jscdnHits
FROM `httparchive.almanac.requests3`
GROUP BY
client,
page
)
(
SELECT
client,
page,
COUNTIF(
NET.HOST(url) IN ('unpkg.com',
'www.jsdelivr.net',
'cdnjs.cloudflare.com',
'ajax.aspnetcdn.com',
'ajax.googleapis.com',
'stackpath.bootstrapcdn.com',
'maxcdn.bootstrapcdn.com',
'use.fontawesome.com',
'code.jquery.com',
'fonts.googleapis.com')
) AS jscdnHits
FROM `httparchive.almanac.requests3`
GROUP BY
client,
page
)
GROUP BY
client
ORDER BY
20 changes: 10 additions & 10 deletions sql/2019/cdn/17_02g.sql
Original file line number Diff line number Diff line change
@@ -4,17 +4,17 @@ SELECT
*,
ROUND(100 * pageUseCount / totalPagesCount, 2) AS Pct # doing the Pct calc causes memory problems with bigquery
FROM
(
SELECT
client,
IF(respBodySize > 0 AND REGEXP_CONTAINS(resp_content_type, r'javascript|css|font'), NET.HOST(url), NULL) AS host,
COUNT(DISTINCT page) AS pageUseCount,
SUM(COUNTIF(firstHtml)) OVER (PARTITION BY client) AS totalPagesCount
(
SELECT
client,
IF(respBodySize > 0 AND REGEXP_CONTAINS(resp_content_type, r'javascript|css|font'), NET.HOST(url), NULL) AS host,
COUNT(DISTINCT page) AS pageUseCount,
SUM(COUNTIF(firstHtml)) OVER (PARTITION BY client) AS totalPagesCount
FROM `httparchive.almanac.requests3`
GROUP BY
client,
host
)
GROUP BY
client,
host
)
WHERE host IS NOT NULL AND pageUseCount > 1000
ORDER BY
client DESC,
38 changes: 19 additions & 19 deletions sql/2019/cdn/17_02h.sql
Original file line number Diff line number Diff line change
@@ -5,26 +5,26 @@ SELECT
*,
ROUND(100 * pageUseCount / totalPagesCount, 2) AS Pct
FROM
(
SELECT
client,
IF(NET.HOST(url) IN ('unpkg.com',
'cdn.jsdelivr.net',
'cdnjs.cloudflare.com',
'ajax.aspnetcdn.com',
'ajax.googleapis.com',
'stackpath.bootstrapcdn.com',
'maxcdn.bootstrapcdn.com',
'use.fontawesome.com',
'code.jquery.com',
'fonts.googleapis.com'), NET.HOST(url), 'OTHER') AS jsCDN,
COUNT(DISTINCT page) AS pageUseCount,
SUM(COUNTIF(firstHtml)) OVER (PARTITION BY client) AS totalPagesCount
(
SELECT
client,
IF(NET.HOST(url) IN ('unpkg.com',
'cdn.jsdelivr.net',
'cdnjs.cloudflare.com',
'ajax.aspnetcdn.com',
'ajax.googleapis.com',
'stackpath.bootstrapcdn.com',
'maxcdn.bootstrapcdn.com',
'use.fontawesome.com',
'code.jquery.com',
'fonts.googleapis.com'), NET.HOST(url), 'OTHER') AS jsCDN,
COUNT(DISTINCT page) AS pageUseCount,
SUM(COUNTIF(firstHtml)) OVER (PARTITION BY client) AS totalPagesCount
FROM `httparchive.almanac.requests3`
GROUP BY
client,
jsCDN
)
GROUP BY
client,
jsCDN
)
ORDER BY
client DESC,
pageUseCount DESC
6 changes: 3 additions & 3 deletions sql/2019/cdn/17_13.sql
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@ FROM (
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn, # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry
CAST(JSON_EXTRACT(payload, "$.timings.ssl") AS INT64) AS tlstime,
ARRAY_LENGTH(split(JSON_EXTRACT(payload, '$._securityDetails.sanList'), "")) AS sanLength,
-- length(FROM_BASE64(REPLACE(REGEXP_REPLACE(JSON_EXTRACT_SCALAR(payload, '$._certificates[0]'), ""-----(BEGIN|END) CERTIFICATE-----"", """"), ""\n"", """"))) AS tlscertsize,
-- length(FROM_BASE64(REPLACE(REGEXP_REPLACE(JSON_EXTRACT_SCALAR(payload, '$._certificates[0]'), ""-----(BEGIN|END) CERTIFICATE-----"", """"), ""\n"", """"))) AS tlscertsize,
IF(NET.HOST(url) = NET.HOST(page), TRUE, FALSE) AS sameHost,
IF(NET.HOST(url) = NET.HOST(page) OR NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page), TRUE, FALSE) AS sameDomain # if toplevel reg_domain will return NULL so we group this as sameDomain
FROM `httparchive.almanac.requests`
@@ -27,8 +27,8 @@ WHERE
tlstime != -1 AND
sanLength IS NOT NULL
GROUP BY
client,
cdn,
client,
cdn,
firstHtml
ORDER BY
requests DESC
65 changes: 33 additions & 32 deletions sql/2019/cdn/17_19.sql
Original file line number Diff line number Diff line change
@@ -16,40 +16,41 @@ SELECT
ROUND(100 * COUNTIF(isSecure OR IFNULL(a.protocol, b.protocol) = 'HTTP/2') / COUNT(0), 2) AS tls_pct,
COUNT(0) AS total
FROM
(
SELECT
client, page, url, firstHtml,
# WPT is inconsistent with protocol population.
upper(IFNULL(JSON_EXTRACT_SCALAR(payload, '$._protocol'), IFNULL(NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'), NULLIF(concat("HTTP/", JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')))) AS protocol,
JSON_EXTRACT_SCALAR(payload, '$._tls_version') AS tlsVersion,
(
SELECT
client, page, url, firstHtml,
# WPT is inconsistent with protocol population.
upper(IFNULL(JSON_EXTRACT_SCALAR(payload, '$._protocol'), IFNULL(NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'), NULLIF(concat("HTTP/", JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')))) AS protocol,
JSON_EXTRACT_SCALAR(payload, '$._tls_version') AS tlsVersion,

# WPT joins CDN detection but we bias to the DNS detection which is the first entry
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,
CAST(JSON_EXTRACT(payload, "$.timings.ssl") AS INT64) AS tlstime,
# WPT joins CDN detection but we bias to the DNS detection which is the first entry
IFNULL(NULLIF(REGEXP_EXTRACT(_cdn_provider, r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,
CAST(JSON_EXTRACT(payload, "$.timings.ssl") AS INT64) AS tlstime,

# isSecure reports what the browser thought it was going to use, but it can get upgraded with STS OR UpgradeInsecure: 1
IF(STARTS_WITH(url, 'https') OR JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL OR CAST(JSON_EXTRACT(payload, '$._is_secure') AS INT64) = 1, TRUE, FALSE) AS isSecure,
CAST(jSON_EXTRACT(payload, "$._socket") AS INT64) AS socket
FROM
`httparchive.almanac.requests3`
WHERE
# WPT changes the response fields based on a redirect (url becomes the Location path instead of the original) causing inconsistencies in the counts, so we ignore them
resp_location = '' OR resp_location IS NULL
) a
LEFT JOIN (
SELECT
client, page,
CAST(jSON_EXTRACT(payload, "$._socket") AS INT64) AS socket,
ANY_VALUE(upper(IFNULL(JSON_EXTRACT_SCALAR(payload, '$._protocol'), IFNULL(NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'), NULLIF(concat("HTTP/", JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/'))))) AS protocol,
ANY_VALUE(JSON_EXTRACT_SCALAR(payload, '$._tls_version')) AS tlsVersion
FROM
`httparchive.almanac.requests3`
WHERE
JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL AND
IFNULL(JSON_EXTRACT_SCALAR(payload, '$._protocol'), IFNULL(NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'), NULLIF(concat("HTTP/", JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/'))) IS NOT NULL AND
jSON_EXTRACT(payload, "$._socket") IS NOT NULL
GROUP BY client, page, socket
) b ON (a.client = b.client AND a.page = b.page AND a.socket = b.socket)
# isSecure reports what the browser thought it was going to use, but it can get upgraded with STS OR UpgradeInsecure: 1
IF(STARTS_WITH(url, 'https') OR JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL OR CAST(JSON_EXTRACT(payload, '$._is_secure') AS INT64) = 1, TRUE, FALSE) AS isSecure,
CAST(jSON_EXTRACT(payload, "$._socket") AS INT64) AS socket
FROM
`httparchive.almanac.requests3`
WHERE
# WPT changes the response fields based on a redirect (url becomes the Location path instead of the original) causing inconsistencies in the counts, so we ignore them
resp_location = '' OR resp_location IS NULL
) a
LEFT JOIN
(
SELECT
client, page,
CAST(jSON_EXTRACT(payload, "$._socket") AS INT64) AS socket,
ANY_VALUE(upper(IFNULL(JSON_EXTRACT_SCALAR(payload, '$._protocol'), IFNULL(NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'), NULLIF(concat("HTTP/", JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/'))))) AS protocol,
ANY_VALUE(JSON_EXTRACT_SCALAR(payload, '$._tls_version')) AS tlsVersion
FROM
`httparchive.almanac.requests3`
WHERE
JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL AND
IFNULL(JSON_EXTRACT_SCALAR(payload, '$._protocol'), IFNULL(NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'), NULLIF(concat("HTTP/", JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/'))) IS NOT NULL AND
jSON_EXTRACT(payload, "$._socket") IS NOT NULL
GROUP BY client, page, socket
) b ON (a.client = b.client AND a.page = b.page AND a.socket = b.socket)

GROUP BY
client,
Loading

0 comments on commit 127c73e

Please sign in to comment.