chore: change default multiprocessing pool mode to spawn #128

Merged · 4 commits · Jun 4, 2024
Changes from all commits
CHANGELOG.md — 1 change: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Tqdm's `disable` parameter for non-TTY environments from `None` to `False`
 - Refactored final GeoParquet file saving logic to greatly reduce memory usage
 - Bumped minimal `pyarrow` version to 16.0
+- Default `multiprocessing.Pool` initialization mode from `fork` to `spawn`

 ## [0.8.1] - 2024-05-11
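For context: with `fork` (the previous default on Linux), each worker starts as a copy of the parent process and inherits its memory, locks, and open connections; with `spawn`, a fresh interpreter is started and the module is re-imported, so no such state is carried over. A minimal sketch of the behavioral difference, runnable on Linux or macOS where both start methods exist (the `GLOBAL_STATE` and `report_state` names are illustrative, not from this PR):

```python
import multiprocessing

GLOBAL_STATE = {"initialized": False}

def report_state(_: int) -> bool:
    # A forked worker sees the parent's mutated global; a spawned worker
    # re-imports this module and sees the default value again.
    return GLOBAL_STATE["initialized"]

if __name__ == "__main__":
    GLOBAL_STATE["initialized"] = True
    for mode in ("fork", "spawn"):
        with multiprocessing.get_context(mode).Pool(processes=2) as pool:
            print(mode, pool.map(report_state, [0, 1]))
    # Typically prints: fork [True, True], then spawn [False, False]
```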
quackosm/pbf_file_reader.py — 27 changes: 15 additions & 12 deletions

@@ -7,14 +7,14 @@
 import hashlib
 import itertools
 import json
+import multiprocessing
 import secrets
 import shutil
 import tempfile
 import time
 import warnings
 from collections.abc import Iterable
 from math import floor
-from multiprocessing import Pool
 from pathlib import Path
 from time import sleep
 from typing import Any, Callable, Literal, NamedTuple, Optional, Union, cast
@@ -186,15 +186,8 @@ def __init__(
         self.debug_memory = debug_memory
         self.debug_times = debug_times
         self.task_progress_tracker: TaskProgressTracker = None
+        self.rows_per_group: int = 0

-        self.rows_per_group = PbfFileReader.ROWS_PER_GROUP_MEMORY_CONFIG[0]
-        actual_memory = psutil.virtual_memory()
-        # If more than 8 / 16 / 24 GB total memory, increase the number of rows per group
-        for memory_gb, rows_per_group in PbfFileReader.ROWS_PER_GROUP_MEMORY_CONFIG.items():
-            if actual_memory.total >= (memory_gb * MEMORY_1GB):
-                self.rows_per_group = rows_per_group
-            else:
-                break

         self.parquet_compression = parquet_compression
@@ -814,6 +807,16 @@ def _parse_pbf_file(
             )
             return result_file_path.with_suffix(".geoparquet")

+        self.encountered_query_exception = False
+        self.rows_per_group = PbfFileReader.ROWS_PER_GROUP_MEMORY_CONFIG[0]
+        actual_memory = psutil.virtual_memory()
+        # If more than 8 / 16 / 24 GB total memory, increase the number of rows per group
+        for memory_gb, rows_per_group in PbfFileReader.ROWS_PER_GROUP_MEMORY_CONFIG.items():
+            if actual_memory.total >= (memory_gb * MEMORY_1GB):
+                self.rows_per_group = rows_per_group
+            else:
+                break
+
         elements = self.connection.sql(f"SELECT * FROM ST_READOSM('{Path(pbf_path)}');")

         if self.tags_filter is None:
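The relocated block picks the largest configured row-group size whose RAM threshold the machine meets, falling back to the entry for key `0`. A standalone sketch of that selection logic, with hypothetical threshold values (the real `ROWS_PER_GROUP_MEMORY_CONFIG` contents are not visible in this diff):

```python
import psutil

MEMORY_1GB = 1024**3
# Hypothetical mapping: minimum total RAM in GB -> rows per row group.
ROWS_PER_GROUP_MEMORY_CONFIG = {0: 100_000, 8: 500_000, 16: 1_000_000, 24: 5_000_000}

def pick_rows_per_group() -> int:
    rows_per_group = ROWS_PER_GROUP_MEMORY_CONFIG[0]
    total_memory = psutil.virtual_memory().total
    # Dicts preserve insertion order, so thresholds are visited ascending;
    # stop upgrading at the first threshold the machine does not meet.
    for memory_gb, candidate in ROWS_PER_GROUP_MEMORY_CONFIG.items():
        if total_memory >= memory_gb * MEMORY_1GB:
            rows_per_group = candidate
        else:
            break
    return rows_per_group

if __name__ == "__main__":
    print(pick_rows_per_group())
```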
@@ -1598,7 +1601,7 @@ def _run_query(
             sql_queries = [sql_queries]

         if run_in_separate_process:
-            with Pool() as pool:
+            with multiprocessing.get_context("spawn").Pool() as pool:
                 r = pool.apply_async(
                     _run_query, args=(sql_queries, tmp_dir_path or self.tmp_dir_path)
                 )
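`spawn` requires the pool's target to be importable from the fresh worker process, which is why `_run_query` is a module-level function (see the last hunk below). A minimal sketch of the same pattern outside the class (the `square` worker is illustrative, not from the PR):

```python
import multiprocessing

def square(x: int) -> int:
    # Must live at module level so a spawned worker can import it by name.
    return x * x

if __name__ == "__main__":
    # Start a fresh interpreter per worker instead of forking the parent.
    with multiprocessing.get_context("spawn").Pool() as pool:
        result = pool.apply_async(square, args=(7,))
        # .get() blocks and re-raises any worker-side exception in the parent.
        print(result.get(timeout=30))  # 49
```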
@@ -2571,7 +2574,7 @@ def _save_final_parquet_file(
             for col in features_table.columns
             if col not in (FEATURES_INDEX, "geometry_wkb")
         ]
-        columns_to_test_result = duckdb.sql(
+        columns_to_test_result = self.connection.sql(
             f"SELECT {', '.join(columns_to_test)} FROM '{input_file}/*.parquet'"
         ).to_df()
@@ -2726,7 +2729,7 @@ def _run_query(sql_queries: Union[str, list[str]], tmp_dir_path: Path) -> None:

 def _run_in_multiprocessing_pool(function: Callable[..., None], args: Any) -> None:
     try:
-        with Pool() as pool:
+        with multiprocessing.get_context("spawn").Pool() as pool:
             r = pool.apply_async(
                 func=function,
                 args=args,