Skip to content

Commit

Permalink
Explicitly set writing of statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
milesgranger committed Nov 30, 2023
1 parent eafb5fc commit a95ab8b
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions tests/tpch/generate-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,12 @@ def _tpch_data_gen(
out_ = pathlib.Path(out)
out_.mkdir(exist_ok=True, parents=True)
out_ = str(out_ / file)
- pq.write_table(df, out_, compression=compression.value.lower())
+ pq.write_table(
+ df,
+ out_,
+ compression=compression.value.lower(),
+ write_statistics=True,
+ )
print(f"Finished exporting table {table}!")
print("Finished exporting all data!")

Expand All @@ -213,7 +218,9 @@ def rows_approx_mb(con, table_name, partition_size: str, compression: Compressio
tmp = pathlib.Path(tmpdir) / "tmp.parquet"
stmt = f"select * from {table_name} limit {sample_size}"
df = con.sql(stmt).arrow()
- pq.write_table(df, tmp, compression=compression.value.lower())
+ pq.write_table(
+ df, tmp, compression=compression.value.lower(), write_statistics=True
+ )
mb = tmp.stat().st_size
return int(
(len(table) * ((len(table) / sample_size) * partition_size)) / mb
Expand Down

0 comments on commit a95ab8b

Please sign in to comment.