-
Notifications
You must be signed in to change notification settings - Fork 170
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #222 from auxten/main
Add benchmark on Pandas DataFrame for Pandas, Polars, DuckDB, chDB
- Loading branch information
Showing
19 changed files
with
2,572 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
*.bak | ||
.idea | ||
.clickbench | ||
*.parquet | ||
hits.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/bin/bash
# Sets up and runs the chDB DataFrame benchmark: installs pip plus the pandas
# and chdb packages, fetches hits.parquet, then runs ./run.sh while copying
# everything it prints into log.txt.

# Step 1: install pip and the Python packages.
sudo apt-get update
sudo apt-get install --yes python3-pip
pip install pandas chdb

# Step 2: fetch the dataset (quiet output; continue a partially downloaded file).
DATASET_URL='https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet'
wget -nv -c "$DATASET_URL"

# Step 3: run the queries; stdout and stderr both go to the terminal and log.txt.
./run.sh |& tee log.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
SELECT COUNT(*) FROM Python(hits); | ||
SELECT COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0; | ||
SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM Python(hits); | ||
SELECT AVG(UserID) FROM Python(hits); | ||
SELECT COUNT(DISTINCT UserID) FROM Python(hits); | ||
SELECT COUNT(DISTINCT SearchPhrase) FROM Python(hits); | ||
SELECT MIN(EventDate), MAX(EventDate) FROM Python(hits); | ||
SELECT AdvEngineID, COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; | ||
SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM Python(hits) GROUP BY RegionID ORDER BY u DESC LIMIT 10; | ||
SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM Python(hits) GROUP BY RegionID ORDER BY c DESC LIMIT 10; | ||
SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; | ||
SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; | ||
SELECT SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; | ||
SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; | ||
SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; | ||
SELECT UserID, COUNT(*) FROM Python(hits) GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; | ||
SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; | ||
SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase LIMIT 10; | ||
SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; | ||
SELECT UserID FROM Python(hits) WHERE UserID = 435090932899640449; | ||
SELECT COUNT(*) FROM Python(hits) WHERE URL LIKE '%google%'; | ||
SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM Python(hits) WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; | ||
SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM Python(hits) WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; | ||
SELECT * FROM Python(hits) WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; | ||
SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; | ||
SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; | ||
SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; | ||
SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM Python(hits) WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; | ||
SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM Python(hits) WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; | ||
SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), 
SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM Python(hits); | ||
SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; | ||
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; | ||
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; | ||
SELECT URL, COUNT(*) AS c FROM Python(hits) GROUP BY URL ORDER BY c DESC LIMIT 10; | ||
SELECT 1, URL, COUNT(*) AS c FROM Python(hits) GROUP BY 1, URL ORDER BY c DESC LIMIT 10; | ||
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM Python(hits) GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; | ||
SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; | ||
SELECT Title, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; | ||
SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; | ||
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; | ||
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; | ||
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; | ||
SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#!/usr/bin/env python3
"""Benchmark chDB running the queries from queries.sql against a Pandas DataFrame.

Steps:
  1. Load hits.parquet into a DataFrame named ``hits`` and record its memory size.
  2. Convert EventTime/EventDate to datetimes and object columns to ``str``.
  3. Run every non-blank line of queries.sql TRIES times through ``chdb.query``
     with the "Null" output format, timing each run.
  4. Write the timings as JSON into results/ (file chosen from /proc/cpuinfo).
"""

import datetime
import json
import os
import timeit

import chdb
import pandas as pd

# Number of timed runs per query.
TRIES = 3
RESULTS_DIR = "results"

start = timeit.default_timer()
# NOTE: this must stay a module-level variable called `hits`: the queries use
# `Python(hits)`, i.e. they refer to the DataFrame by its Python variable name.
hits = pd.read_parquet("hits.parquet")
end = timeit.default_timer()
# NOTE(review): load_time is measured but the result below records
# "load_time": 0 — confirm that this is intended.
load_time = end - start

# Size is taken before the type conversions below.
dataframe_size = hits.memory_usage().sum()

# print("Dataframe(numpy) size:", dataframe_size, "bytes")

# fix some types: EventTime is converted from seconds, EventDate from days
hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s")
hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D")

# fix all object columns to string
for col in hits.columns:
    if hits[col].dtype == "O":
        hits[col] = hits[col].astype(str)

# One query per line; blank lines (e.g. a trailing empty line) are skipped so
# they are not sent to chDB as empty queries.
with open("queries.sql") as f:
    queries = [line for line in f if line.strip()]

# queries_times[i] holds the TRIES wall-clock durations (seconds) of query i.
queries_times = []
for q in queries:
    times = []
    for _ in range(TRIES):
        start = timeit.default_timer()
        chdb.query(q, "Null")
        end = timeit.default_timer()
        times.append(end - start)
    queries_times.append(times)

result_json = {
    "system": "chDB (DataFrame)",
    "date": datetime.date.today().strftime("%Y-%m-%d"),
    "machine": "c6a.metal, 500gb gp2",
    "cluster_size": 1,
    "comment": "",
    "tags": [
        "C++",
        "column-oriented",
        "embedded",
        "stateless",
        "serverless",
        "dataframe",
        "ClickHouse derivative",
    ],
    "load_time": 0,
    "data_size": int(dataframe_size),
    "result": queries_times,
}

# Pick the output file from the CPU model: an "AMD EPYC 9654" machine gets its
# own machine label and result file, everything else is written as c6a.metal.
with open("/proc/cpuinfo") as cpuinfo:
    is_epyc_9654 = "AMD EPYC 9654" in cpuinfo.read()

if is_epyc_9654:
    result_json["machine"] = "EPYC 9654, 384G"
    result_name = "epyc-9654.json"
else:
    result_name = "c6a.metal.json"

# Create the output directory if needed so a finished run is not lost to a
# missing directory at the very last step.
os.makedirs(RESULTS_DIR, exist_ok=True)
with open(os.path.join(RESULTS_DIR, result_name), "w") as f:
    json.dump(result_json, f, indent=4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,235 @@ | ||
{ | ||
"system": "chDB (DataFrame)", | ||
"date": "2024-09-09", | ||
"machine": "c6a.metal, 500gb gp2", | ||
"cluster_size": 1, | ||
"comment": "", | ||
"tags": [ | ||
"C++", | ||
"column-oriented", | ||
"embedded", | ||
"stateless", | ||
"serverless", | ||
"dataframe", | ||
"ClickHouse derivative" | ||
], | ||
"load_time": 0, | ||
"data_size": 46998823722, | ||
"result": [ | ||
[ | ||
0.05626875200005088, | ||
0.03751970800021809, | ||
0.03710983100017984 | ||
], | ||
[ | ||
0.06512281899995287, | ||
0.05849275699983991, | ||
0.0585379379999722 | ||
], | ||
[ | ||
0.044165491000057955, | ||
0.041299934000107896, | ||
0.040226853000149276 | ||
], | ||
[ | ||
0.08134618700023566, | ||
0.040427908999845386, | ||
0.04047265000008338 | ||
], | ||
[ | ||
0.24362793399996008, | ||
0.2186527310004749, | ||
0.2025154820003081 | ||
], | ||
[ | ||
0.2937789709999379, | ||
0.29077527099980216, | ||
0.2973619129998042 | ||
], | ||
[ | ||
0.04860631699921214, | ||
0.044442590999096865, | ||
0.04185208899980353 | ||
], | ||
[ | ||
0.07148085600010745, | ||
0.087940534000154, | ||
0.07362535899983413 | ||
], | ||
[ | ||
0.32479551800133777, | ||
0.31021195799985435, | ||
0.3000845909991767 | ||
], | ||
[ | ||
0.3567556970001533, | ||
0.3498798099999476, | ||
0.36433636799984015 | ||
], | ||
[ | ||
0.1430718550000165, | ||
0.14249916199969448, | ||
0.13584852000030878 | ||
], | ||
[ | ||
0.13416774699999223, | ||
0.12045774299986078, | ||
0.126722227999835 | ||
], | ||
[ | ||
0.3129409140001371, | ||
0.39802245999999286, | ||
0.27576033200011807 | ||
], | ||
[ | ||
0.30749968300006003, | ||
0.30776443899958394, | ||
0.2933942620002199 | ||
], | ||
[ | ||
0.2840204679996532, | ||
0.29736796199995297, | ||
0.2656048210001245 | ||
], | ||
[ | ||
0.16236161799997717, | ||
0.14227574799997456, | ||
0.1577743770003508 | ||
], | ||
[ | ||
0.41774886599978345, | ||
0.45844158400041124, | ||
0.4236037320001742 | ||
], | ||
[ | ||
0.343798914999752, | ||
0.34234521600001244, | ||
0.3242654260002382 | ||
], | ||
[ | ||
0.8645628289996239, | ||
0.7159582540002702, | ||
0.701609888999883 | ||
], | ||
[ | ||
0.07303793199980646, | ||
0.0385745369999313, | ||
0.039382633000059286 | ||
], | ||
[ | ||
0.7685791720014095, | ||
0.7017175380005938, | ||
0.7004018799998448 | ||
], | ||
[ | ||
0.7086737089998678, | ||
0.7338668900001721, | ||
0.726541903999987 | ||
], | ||
[ | ||
1.0143638360000296, | ||
1.025565608999841, | ||
0.9419111859997429 | ||
], | ||
[ | ||
4.812288134000028, | ||
2.167691587000263, | ||
2.528547120999974 | ||
], | ||
[ | ||
0.155205888999717, | ||
0.14651630200023646, | ||
0.15077497899983427 | ||
], | ||
[ | ||
0.1408380660000148, | ||
0.14518392399986624, | ||
0.13928737500009447 | ||
], | ||
[ | ||
0.1470085519999884, | ||
0.15105727499985733, | ||
0.1438814180000918 | ||
], | ||
[ | ||
0.8403685159992165, | ||
0.7649107939996611, | ||
0.8209212080000725 | ||
], | ||
[ | ||
1.9075914280001598, | ||
1.8646036999998614, | ||
1.885373637000157 | ||
], | ||
[ | ||
0.07688073700001041, | ||
0.07453166799996325, | ||
0.07560217999980523 | ||
], | ||
[ | ||
0.20158837400003904, | ||
0.20706678800024747, | ||
0.19952737999983583 | ||
], | ||
[ | ||
0.2738630439998815, | ||
0.2761814330001471, | ||
0.2812519489998522 | ||
], | ||
[ | ||
0.7870267199996306, | ||
0.709162213000036, | ||
0.7544576690002032 | ||
], | ||
[ | ||
1.2057435320002696, | ||
1.1144656840000607, | ||
1.1528750570003467 | ||
], | ||
[ | ||
1.2744926979999036, | ||
1.227631830000064, | ||
1.2613582039998619 | ||
], | ||
[ | ||
0.1441288920000261, | ||
0.1581041039999036, | ||
0.1318917770004191 | ||
], | ||
[ | ||
0.7185039579999284, | ||
0.6852749829999993, | ||
0.6816314070001681 | ||
], | ||
[ | ||
0.49516889899996386, | ||
0.46560566200014364, | ||
0.4326922939999349 | ||
], | ||
[ | ||
0.7060461809996923, | ||
0.2074475480003457, | ||
0.2127349039997171 | ||
], | ||
[ | ||
0.8890890540001237, | ||
0.4336082350000652, | ||
0.7149529159999474 | ||
], | ||
[ | ||
0.09202534199994261, | ||
0.08829310200007967, | ||
0.08950286199979018 | ||
], | ||
[ | ||
0.08598820699990029, | ||
0.08675313599997025, | ||
0.08428598599994075 | ||
], | ||
[ | ||
0.08334934399999838, | ||
0.08340355500013175, | ||
0.0796592659999078 | ||
] | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/bash | ||
|
||
./query.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/bin/bash
# Sets up and runs the DuckDB DataFrame benchmark: installs pip plus the pandas
# and duckdb packages, fetches hits.parquet, then runs ./run.sh while copying
# everything it prints into log.txt.

# Step 1: install pip and the Python packages.
sudo apt-get update
sudo apt-get install --yes python3-pip
pip install pandas duckdb

# Step 2: fetch the dataset (quiet output; continue a partially downloaded file).
DATASET_URL='https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet'
wget -nv -c "$DATASET_URL"

# Step 3: run the queries; stdout and stderr both go to the terminal and log.txt.
./run.sh |& tee log.txt
Oops, something went wrong.