-
Notifications
You must be signed in to change notification settings - Fork 603
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: support read_parquet for backend with no native support #9744
Changes from 10 commits
ab2ad16
661f50d
e16f1bb
eaec7a2
9106ad8
27d7a08
ac6117f
3ce9674
24530ca
bb238af
12cfc7d
2cf597a
b4cf0ea
2ba5002
6f2c754
24bfe38
6a50c46
4579bff
d1ed444
b01bc6a
e70de2f
413ada7
c3fba44
8b6b3c6
0d55190
fda5493
71ebb8e
2473c02
3ab60a8
59c03e0
c0c1fd1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,10 +3,13 @@ | |
import abc | ||
import collections.abc | ||
import functools | ||
import glob | ||
import importlib.metadata | ||
import keyword | ||
import re | ||
import urllib.parse | ||
import urllib.request | ||
from io import BytesIO | ||
from pathlib import Path | ||
from typing import TYPE_CHECKING, Any, ClassVar | ||
|
||
|
@@ -1199,6 +1202,61 @@ def has_operation(cls, operation: type[ops.Value]) -> bool: | |
f"{cls.name} backend has not implemented `has_operation` API" | ||
) | ||
|
||
def read_parquet(
    self, path: str | Path, table_name: str | None = None, **kwargs: Any
) -> ir.Table:
    """Register a parquet file as a table in the current backend.

    The data is first loaded into a pyarrow table by
    `_get_pyarrow_table_from_path` and then handed to `create_table`.

    Parameters
    ----------
    path
        The data source. May be a path to a file, a glob pattern
        matching local files, a directory of parquet files, or a URL.
        The value is converted with `str()` before use, so an iterable
        of paths is not accepted.
    table_name
        An optional name to use for the created table. This defaults to
        a sequentially generated name.
    **kwargs
        Additional keyword arguments passed to the pyarrow loading function.
        See https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
        for more information. When `path` is a URL, an optional `headers`
        mapping is removed from `kwargs` and sent with the request.

    Returns
    -------
    ir.Table
        The just-registered table

    """
    table = self._get_pyarrow_table_from_path(path, **kwargs)
    # An empty string is treated like a missing name and replaced as well.
    table_name = table_name or util.gen_name("read_parquet")
    self.create_table(table_name, table)
    return self.table(table_name)
|
||
def _get_pyarrow_table_from_path(self, path: str | Path, **kwargs) -> pa.Table:
    """Load parquet data located at `path` into a pyarrow table.

    Parameters
    ----------
    path
        A local file path, a glob pattern, or a URL. It is converted with
        `str()` before being inspected.
    **kwargs
        Keyword arguments forwarded to `pyarrow.parquet.read_table`. For
        URL sources an optional `headers` mapping is popped and attached
        to the request instead of being forwarded.

    Returns
    -------
    pa.Table
        The loaded table.

    Raises
    ------
    ValueError
        If `path` is treated as a local path or pattern and matches no file.

    """
    # NOTE(review): a bare `pq.read_table(path, **kwargs)` was tried first
    # but, per the review discussion, it cannot handle every case, such as
    # glob patterns and parquet files hosted behind a URI (e.g. HTTPS).
    pq = util.import_object("pyarrow.parquet")

    path = str(path)

    # Plain URL: download the whole payload, then parse it from memory.
    if util.is_url(path):
        # `headers` belongs to the request, not to pyarrow.
        headers = kwargs.pop("headers", {})
        request = urllib.request.Request(path, headers=headers)  # noqa: S310

        with urllib.request.urlopen(request) as response:  # noqa: S310
            payload = response.read()

        with BytesIO(payload) as reader:
            # Forward the remaining options so that URL sources honour the
            # same keyword arguments as every other source.
            return pq.read_table(reader, **kwargs)

    # fsspec-compatible URL: pyarrow receives the URL string unchanged.
    if util.is_fsspec_url(path):
        return pq.read_table(path, **kwargs)

    # Local file path or glob pattern. `glob.glob` returns matches in
    # arbitrary order, so sort them to keep multi-file reads deterministic.
    matches = sorted(glob.glob(path))
    if not matches:
        raise ValueError(f"No files found matching pattern: {path!r}")

    # A single match is passed as a plain string rather than a list.
    source = matches[0] if len(matches) == 1 else matches
    return pq.read_table(source, **kwargs)
|
||
def _cached(self, expr: ir.Table): | ||
jitingxu1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"""Cache the provided expression. | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,13 @@ | |
|
||
import pytest | ||
|
||
from ibis.util import PseudoHashable, flatten_iterable, import_object | ||
from ibis.util import ( | ||
PseudoHashable, | ||
flatten_iterable, | ||
import_object, | ||
is_fsspec_url, | ||
is_url, | ||
) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
|
@@ -51,6 +57,65 @@ def test_import_object(): | |
import_object("collections.this_attribute_doesnt_exist") | ||
|
||
|
||
@pytest.mark.parametrize(
    ("url", "expected"),
    [
        ("http://example.com", True),  # Valid http URL
        ("https://example.com", True),  # Valid https URL
        ("ftp://example.com", True),  # Valid ftp URL
        ("sftp://example.com", True),  # Valid sftp URL
        ("ws://example.com", True),  # Valid WebSocket URL
        ("wss://example.com", True),  # Valid WebSocket Secure URL
        ("file:///home/user/file.txt", True),  # Valid file URL
        # NOTE(review): address reconstructed from an obfuscated page
        # rendering -- confirm against the repository.
        ("mailto:user@example.com", False),  # Invalid URL with non-supported scheme
        ("http://localhost:8000", True),  # Valid URL with port
        ("ftp://192.168.1.1", True),  # Valid URL with IP address
        ("https://example.com/path/to/resource", True),  # Valid URL with path
        # NOTE(review): credentials reconstructed from an obfuscated page
        # rendering -- confirm against the repository.
        ("http://user:pass@example.com", True),  # Valid URL with credentials
        ("ftp://example.com/resource", True),  # Valid FTP URL with resource
        ("telnet://example.com", True),  # Valid Telnet URL
        ("git://example.com/repo.git", True),  # Valid Git URL
        ("sip://example.com", True),  # Valid SIP URL
        ("sips://example.com", True),  # Valid SIPS URL
        ("invalid://example.com", False),  # Invalid URL with unknown scheme
    ],
)
def test_is_url(url, expected):
    """Check `is_url` against the expected verdict for each sample URL."""
    assert is_url(url) == expected
|
||
|
||
@pytest.mark.parametrize(
    ("url", "expected"),
    [
        ("s3://bucket/path/to/file", True),  # Valid fsspec URL
        ("ftp://example.com/file.txt", True),  # Valid fsspec URL
        ("gs://bucket/path/to/file", True),  # Valid fsspec URL
        ("http://example.com/file.txt", False),  # Invalid URL (HTTP)
        ("https://example.com/file.txt", False),  # Invalid URL (HTTPS)
        ("file://localhost/path/to/file", True),  # Valid fsspec URL
        # NOTE(review): address reconstructed from an obfuscated page
        # rendering -- confirm against the repository.
        ("mailto:user@example.com", False),  # Invalid URL
        (
            # NOTE(review): credentials reconstructed from an obfuscated
            # page rendering -- confirm against the repository.
            "ftp://user:pass@example.com/path/to/file",
            True,
        ),  # Valid fsspec URL with credentials
        ("ftp://example.com", True),  # Valid fsspec URL without file
        ("", False),  # Empty string (invalid URL)
        ("invalid://path/to/file", True),  # Invalid scheme but valid format
        ("http://localhost:8000", False),  # Invalid URL (HTTP with port)
        (
            "https://192.168.1.1/path/to/file",
            False,
        ),  # Invalid URL (HTTPS with IP address)
        (
            "file:/path/to/file",
            False,
        ),  # Invalid URL (missing double slashes after file:)
    ],
)
def test_is_fsspec_url(url, expected):
    """Check `is_fsspec_url` against the expected verdict for each sample."""
    assert is_fsspec_url(url) == expected
|
||
|
||
# TODO(kszucs): add tests for promote_list and promote_tuple | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of BytesIO, I could pass the fsspec object; it would be an HTTPFile if we pass an HTTP URL. I'm not sure what the best way is to handle the type of
path
@gforsyth any suggestion?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think
fsspec
is a good option.