chore: change default multiprocessing pool mode to spawn #128

Merged · 4 commits · Jun 4, 2024
Changes from all commits
CHANGELOG.md — 1 change: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Tqdm's `disable` parameter for non-TTY environments from `None` to `False`
 - Refactored final GeoParquet file saving logic to greatly reduce memory usage
 - Bumped minimal `pyarrow` version to 16.0
+- Default `multiprocessing.Pool` initialization mode from `fork` to `spawn`

 ## [0.8.1] - 2024-05-11
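For context: with `fork` (the previous default on Linux), each worker starts as a copy of the parent process and inherits its memory, locks, and open connections; with `spawn`, a fresh interpreter is started and the module is re-imported, so no such state is carried over. A minimal sketch of the behavioral difference, runnable on Linux or macOS where both start methods exist (the `GLOBAL_STATE` and `report_state` names are illustrative, not from this PR):

```python
import multiprocessing

GLOBAL_STATE = {"initialized": False}

def report_state(_: int) -> bool:
    # A forked worker sees the parent's mutated global; a spawned worker
    # re-imports this module and sees the default value again.
    return GLOBAL_STATE["initialized"]

if __name__ == "__main__":
    GLOBAL_STATE["initialized"] = True
    for mode in ("fork", "spawn"):
        with multiprocessing.get_context(mode).Pool(processes=2) as pool:
            print(mode, pool.map(report_state, [0, 1]))
    # Typically prints: fork [True, True], then spawn [False, False]
```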
quackosm/pbf_file_reader.py — 27 changes: 15 additions & 12 deletions

@@ -7,14 +7,14 @@
 import hashlib
 import itertools
 import json
+import multiprocessing
 import secrets
 import shutil
 import tempfile
 import time
 import warnings
 from collections.abc import Iterable
 from math import floor
-from multiprocessing import Pool
 from pathlib import Path
 from time import sleep
 from typing import Any, Callable, Literal, NamedTuple, Optional, Union, cast
@@ -186,15 +186,8 @@ def __init__(
         self.debug_memory = debug_memory
         self.debug_times = debug_times
         self.task_progress_tracker: TaskProgressTracker = None
+        self.rows_per_group: int = 0

-        self.rows_per_group = PbfFileReader.ROWS_PER_GROUP_MEMORY_CONFIG[0]
-        actual_memory = psutil.virtual_memory()
-        # If more than 8 / 16 / 24 GB total memory, increase the number of rows per group
-        for memory_gb, rows_per_group in PbfFileReader.ROWS_PER_GROUP_MEMORY_CONFIG.items():
-            if actual_memory.total >= (memory_gb * MEMORY_1GB):
-                self.rows_per_group = rows_per_group
-            else:
-                break

         self.parquet_compression = parquet_compression
@@ -814,6 +807,16 @@ def _parse_pbf_file(
             )
             return result_file_path.with_suffix(".geoparquet")

+        self.encountered_query_exception = False
+        self.rows_per_group = PbfFileReader.ROWS_PER_GROUP_MEMORY_CONFIG[0]
+        actual_memory = psutil.virtual_memory()
+        # If more than 8 / 16 / 24 GB total memory, increase the number of rows per group
+        for memory_gb, rows_per_group in PbfFileReader.ROWS_PER_GROUP_MEMORY_CONFIG.items():
+            if actual_memory.total >= (memory_gb * MEMORY_1GB):
+                self.rows_per_group = rows_per_group
+            else:
+                break
+
         elements = self.connection.sql(f"SELECT * FROM ST_READOSM('{Path(pbf_path)}');")

         if self.tags_filter is None:
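The relocated block picks the largest configured row-group size whose RAM threshold the machine meets, falling back to the entry for key `0`. A standalone sketch of that selection logic, with hypothetical threshold values (the real `ROWS_PER_GROUP_MEMORY_CONFIG` contents are not visible in this diff):

```python
import psutil

MEMORY_1GB = 1024**3
# Hypothetical mapping: minimum total RAM in GB -> rows per row group.
ROWS_PER_GROUP_MEMORY_CONFIG = {0: 100_000, 8: 500_000, 16: 1_000_000, 24: 5_000_000}

def pick_rows_per_group() -> int:
    rows_per_group = ROWS_PER_GROUP_MEMORY_CONFIG[0]
    total_memory = psutil.virtual_memory().total
    # Dicts preserve insertion order, so thresholds are visited ascending;
    # stop upgrading at the first threshold the machine does not meet.
    for memory_gb, candidate in ROWS_PER_GROUP_MEMORY_CONFIG.items():
        if total_memory >= memory_gb * MEMORY_1GB:
            rows_per_group = candidate
        else:
            break
    return rows_per_group

if __name__ == "__main__":
    print(pick_rows_per_group())
```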
@@ -1598,7 +1601,7 @@ def _run_query(
             sql_queries = [sql_queries]

         if run_in_separate_process:
-            with Pool() as pool:
+            with multiprocessing.get_context("spawn").Pool() as pool:
                 r = pool.apply_async(
                     _run_query, args=(sql_queries, tmp_dir_path or self.tmp_dir_path)
                 )
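`spawn` requires the pool's target to be importable from the fresh worker process, which is why `_run_query` is a module-level function (see the last hunk below). A minimal sketch of the same pattern outside the class (the `square` worker is illustrative, not from the PR):

```python
import multiprocessing

def square(x: int) -> int:
    # Must live at module level so a spawned worker can import it by name.
    return x * x

if __name__ == "__main__":
    # Start a fresh interpreter per worker instead of forking the parent.
    with multiprocessing.get_context("spawn").Pool() as pool:
        result = pool.apply_async(square, args=(7,))
        # .get() blocks and re-raises any worker-side exception in the parent.
        print(result.get(timeout=30))  # 49
```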
@@ -2571,7 +2574,7 @@ def _save_final_parquet_file(
             for col in features_table.columns
             if col not in (FEATURES_INDEX, "geometry_wkb")
         ]
-        columns_to_test_result = duckdb.sql(
+        columns_to_test_result = self.connection.sql(
             f"SELECT {', '.join(columns_to_test)} FROM '{input_file}/*.parquet'"
         ).to_df()
@@ -2726,7 +2729,7 @@ def _run_query(sql_queries: Union[str, list[str]], tmp_dir_path: Path) -> None:

 def _run_in_multiprocessing_pool(function: Callable[..., None], args: Any) -> None:
     try:
-        with Pool() as pool:
+        with multiprocessing.get_context("spawn").Pool() as pool:
             r = pool.apply_async(
                 func=function,
                 args=args,