PyArrow: Don't enforce the schema #902
@@ -1047,8 +1047,10 @@ def _task_to_record_batches(
     fragment_scanner = ds.Scanner.from_fragment(
         fragment=fragment,
-        # We always use large types in memory as it uses larger offsets
-        # That can chunk more row values into the buffers
+        # With PyArrow 16.0.0 there is an issue with casting record-batches:
+        # https://github.com/apache/arrow/issues/41884
+        # https://github.com/apache/arrow/issues/43183
+        # Would be good to remove this later on
         schema=_pyarrow_schema_ensure_large_types(physical_schema),
         # This will push down the query to Arrow.
         # But in case there are positional deletes, we have to apply them first
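As background, here is a minimal sketch, not pyiceberg's actual helper, of what a function like _pyarrow_schema_ensure_large_types does for a flat schema: string and binary fields are mapped to their large_* counterparts, whose 64-bit offsets let a single buffer hold more row values. The helper name ensure_large_types and the sample schema below are illustrative only.

import pyarrow as pa

def ensure_large_types(schema: pa.Schema) -> pa.Schema:
    # Map variable-length types to their 64-bit-offset variants; leave everything else alone.
    def to_large(dtype: pa.DataType) -> pa.DataType:
        if pa.types.is_string(dtype):
            return pa.large_string()
        if pa.types.is_binary(dtype):
            return pa.large_binary()
        return dtype

    return pa.schema([pa.field(f.name, to_large(f.type), nullable=f.nullable) for f in schema])

physical_schema = pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.string())])
print(ensure_large_types(physical_schema))  # name becomes large_string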
@@ -1084,11 +1086,17 @@ def _task_to_table(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-) -> pa.Table:
-    batches = _task_to_record_batches(
-        fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping
+) -> Optional[pa.Table]:
+    batches = list(
+        _task_to_record_batches(
+            fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping
+        )
     )
-    return pa.Table.from_batches(batches, schema=schema_to_pyarrow(projected_schema, include_field_ids=False))
+
+    if len(batches) > 0:
+        return pa.Table.from_batches(batches)
+    else:
+        return None


 def _read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]:
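A small illustration of the new behaviour with made-up data: a task that yields no record batches now returns None instead of an empty table with an enforced schema, and pa.Table.from_batches infers the schema from the batches themselves. The helper below is a sketch, not pyiceberg code.

import pyarrow as pa
from typing import List, Optional

def batches_to_optional_table(batches: List[pa.RecordBatch]) -> Optional[pa.Table]:
    if len(batches) > 0:
        # The schema comes from the batches, so small or large types are kept as read.
        return pa.Table.from_batches(batches)
    else:
        return None

batch = pa.RecordBatch.from_pydict({"id": [1, 2, 3]})
print(batches_to_optional_table([batch]).num_rows)  # 3
print(batches_to_optional_table([]))                # None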
@@ -1192,7 +1200,7 @@ def project_table(
     if len(tables) < 1:
         return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema, include_field_ids=False))

-    result = pa.concat_tables(tables)
+    result = pa.concat_tables(tables, promote_options="permissive")

     if limit is not None:
         return result.slice(0, limit)
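Why promote_options="permissive" is needed (PyArrow 14.0 or newer), shown with two hand-built tables: per-file tables can come back with string in one file and large_string in another, and permissive promotion lets concat_tables unify them instead of raising on the schema mismatch.

import pyarrow as pa

small = pa.table({"name": pa.array(["a", "b"], type=pa.string())})
large = pa.table({"name": pa.array(["c"], type=pa.large_string())})

# A plain concat raises because the schemas differ only in string width;
# permissive promotion unifies them to the wider type.
result = pa.concat_tables([small, large], promote_options="permissive")
print(result.schema)  # name: large_string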
@@ -1271,54 +1279,62 @@ def project_batches(


 def to_requested_schema(
-    requested_schema: Schema, file_schema: Schema, batch: pa.RecordBatch, downcast_ns_timestamp_to_us: bool = False
+    requested_schema: Schema,
+    file_schema: Schema,
+    batch: pa.RecordBatch,
+    downcast_ns_timestamp_to_us: bool = False,
+    include_field_ids: bool = False,
 ) -> pa.RecordBatch:
     # We could re-use some of these visitors
     struct_array = visit_with_partner(
-        requested_schema, batch, ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us), ArrowAccessor(file_schema)
+        requested_schema,
+        batch,
+        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids),
+        ArrowAccessor(file_schema),
     )

-    arrays = []
-    fields = []
-    for pos, field in enumerate(requested_schema.fields):
-        array = struct_array.field(pos)
-        arrays.append(array)
-        fields.append(pa.field(field.name, array.type, field.optional))
-    return pa.RecordBatch.from_arrays(arrays, schema=pa.schema(fields))
+    return pa.RecordBatch.from_struct_array(struct_array)


 class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Array]]):
     file_schema: Schema
+    _include_field_ids: bool

-    def __init__(self, file_schema: Schema, downcast_ns_timestamp_to_us: bool = False):
+    def __init__(self, file_schema: Schema, downcast_ns_timestamp_to_us: bool = False, include_field_ids: bool = False) -> None:
         self.file_schema = file_schema
+        self._include_field_ids = include_field_ids
         self.downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us

     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self.file_schema.find_field(field.field_id)

         if field.field_type.is_primitive:
             if field.field_type != file_field.field_type:
-                return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=False))
-            elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=False)) != values.type:
-                # if file_field and field_type (e.g. String) are the same
-                # but the pyarrow type of the array is different from the expected type
-                # (e.g. string vs larger_string), we want to cast the array to the larger type
-                safe = True
+                return values.cast(
+                    schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids)
+                )
+            elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
                 # Downcasting of nanoseconds to microseconds
                 if (
                     pa.types.is_timestamp(target_type)
                     and target_type.unit == "us"
                     and pa.types.is_timestamp(values.type)
                     and values.type.unit == "ns"
                 ):
-                    safe = False
-                return values.cast(target_type, safe=safe)
+                    return values.cast(target_type, safe=False)
         return values

     def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Field:
+        metadata = {}
+        if field.doc:
+            metadata[PYARROW_FIELD_DOC_KEY] = field.doc
+        if self._include_field_ids:
+            metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
+
         return pa.field(
             name=field.name,
             type=arrow_type,
             nullable=field.optional,
-            metadata={DOC: field.doc} if field.doc is not None else None,
+            metadata=metadata,
         )

     def schema(self, schema: Schema, schema_partner: Optional[pa.Array], struct_result: Optional[pa.Array]) -> Optional[pa.Array]:

Review comment on the _construct_field change: Ah, good catch on this one as well 👍
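Two pieces of the new projection path, sketched with illustrative field names and ids: the visitor output is turned back into a batch with pa.RecordBatch.from_struct_array instead of being reassembled field by field, and when include_field_ids is enabled each Iceberg field id travels as field metadata under the standard "PARQUET:field_id" key that the Parquet writer picks up. The nanosecond-to-microsecond downcast remains the one unsafe cast.

import pyarrow as pa

struct_array = pa.StructArray.from_arrays(
    [pa.array([1, 2]), pa.array(["a", "b"])],
    fields=[
        pa.field("id", pa.int64(), nullable=False, metadata={"PARQUET:field_id": "1"}),
        pa.field("name", pa.string(), nullable=True, metadata={"PARQUET:field_id": "2"}),
    ],
)
batch = pa.RecordBatch.from_struct_array(struct_array)
print(batch.schema)  # field ids are carried in the field metadata

# Downcasting nanoseconds to microseconds has to be unsafe because it truncates:
ns = pa.array([1_000_000_001], type=pa.timestamp("ns"))
print(ns.cast(pa.timestamp("us"), safe=False))  # sub-microsecond digits are dropped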
@@ -1960,14 +1976,15 @@ def write_parquet(task: WriteTask) -> DataFile:
                 file_schema=table_schema,
                 batch=batch,
                 downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
+                include_field_ids=True,
             )
             for batch in task.record_batches
         ]
         arrow_table = pa.Table.from_batches(batches)
         file_path = f'{table_metadata.location}/data/{task.generate_data_file_path("parquet")}'
         fo = io.new_output(file_path)
         with fo.create(overwrite=True) as fos:
-            with pq.ParquetWriter(fos, schema=file_schema.as_arrow(), **parquet_writer_kwargs) as writer:
+            with pq.ParquetWriter(fos, schema=arrow_table.schema, **parquet_writer_kwargs) as writer:
                 writer.write(arrow_table, row_group_size=row_group_size)
         statistics = data_file_statistics_from_parquet_metadata(
             parquet_metadata=writer.writer.metadata,
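A sketch of the writer-side effect with a local path and toy data rather than pyiceberg's FileIO: the ParquetWriter is now opened with arrow_table.schema, i.e. whatever types the projected batches actually carry, because giving the writer a schema that disagrees with the table being written raises an error.

import pyarrow as pa
import pyarrow.parquet as pq

arrow_table = pa.table({
    "id": pa.array([1, 2, 3], type=pa.int64()),
    "name": pa.array(["a", "b", "c"], type=pa.large_string()),
})

# Using the table's own schema always matches; a small-string schema here would be rejected.
with pq.ParquetWriter("/tmp/example.parquet", schema=arrow_table.schema) as writer:
    writer.write(arrow_table, row_group_size=1_000)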
The second file in the diff:
@@ -2053,8 +2053,9 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:

         from pyiceberg.io.pyarrow import project_batches, schema_to_pyarrow

+        target_schema = schema_to_pyarrow(self.projection())
         return pa.RecordBatchReader.from_batches(
-            schema_to_pyarrow(self.projection()),
+            target_schema,
             project_batches(
                 self.plan_files(),
                 self.table_metadata,

Review discussion on this change:

Comment: Here we are making an opinionated decision on whether to use large or small types as the PyArrow schema when reading the Iceberg table as a RecordBatchReader. Is there a reason why we don't want to do the same for the table API? I've noticed that we've changed the return type of the table path to Optional[pa.Table]. Similarly, other libraries like Polars take the approach of choosing one type over the other (large types in the case of Polars).

Reply: My preference would be to let Arrow decide. For Polars it is different because they are also the query engine. Casting the types will recompute the buffers, consuming additional memory and CPU, which I would rather avoid. For the table, we first materialize all the batches in memory, so if one of them uses large types it will automatically upcast; otherwise it will keep the small types.

Reply: My knowledge of Parquet-to-Arrow buffer conversion is less versed, so please do check me if I am not making much sense 🙂 But are we actually casting the types on read? We decide whether to read with large or small types when instantiating the fragment scanner, which loads the Parquet data into the Arrow buffers. I think the only time we cast the types is on write, where we may want to downcast for forward compatibility. It looks like we have to choose a schema on write anyway, because using a schema for the ParquetWriter that isn't consistent with the schema of the dataframe results in an exception.

Reply: +1. Currently we use "large_*" types during write. I think it would be better if we could write the file based on the input PyArrow dataframe schema: if the dataframe is …
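For reference, a minimal stand-alone use of the RecordBatchReader contract with fabricated batches: the reader is constructed with an explicit target schema (mirroring schema_to_pyarrow on the projection, here with plain small types), and every batch it yields has to match that schema already.

import pyarrow as pa

target_schema = pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.string())])

def generate_batches():
    # In pyiceberg this would be project_batches(...); here we just make up two batches.
    yield pa.RecordBatch.from_pydict({"id": [1], "name": ["a"]}, schema=target_schema)
    yield pa.RecordBatch.from_pydict({"id": [2], "name": ["b"]}, schema=target_schema)

reader = pa.RecordBatchReader.from_batches(target_schema, generate_batches())
print(reader.read_all().num_rows)  # 2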
Review comment: docs for promote_options="permissive": https://arrow.apache.org/docs/python/generated/pyarrow.concat_tables.html