Add more docstrings #11

Open · wants to merge 1 commit into base: ARROW-6341
119 changes: 117 additions & 2 deletions python/pyarrow/_dataset.pyx
@@ -130,6 +130,31 @@ cdef class DefaultPartitionScheme(PartitionScheme):


cdef class SchemaPartitionScheme(PartitionScheme):
"""
A PartitionScheme based on a specified Schema.

The SchemaPartitionScheme expects one segment in the file path for each
field in the schema (all fields are required to be present).
For example, given schema<year:int16, month:int8>, the path "/2009/11" would
be parsed to ("year"_ == 2009 and "month"_ == 11).

Parameters
----------
schema : Schema
The schema that describes the partitions present in the file path.

Returns
-------
SchemaPartitionScheme

Examples
--------
>>> from pyarrow.dataset import SchemaPartitionScheme
>>> scheme = SchemaPartitionScheme(
... pa.schema([("year", pa.int16()), ("month", pa.int8())]))
>>> print(scheme.parse("/2009/11"))
((year == 2009:int16) and (month == 11:int8))
"""

cdef:
CSchemaPartitionScheme* schema_scheme  # typed pointer to the wrapped C++ scheme
@@ -147,10 +172,42 @@ cdef class SchemaPartitionScheme(PartitionScheme):

@property
def schema(self):
"""The arrow Schema describing the partition scheme."""
return pyarrow_wrap_schema(self.schema_scheme.schema())


cdef class HivePartitionScheme(PartitionScheme):
"""
A PartitionScheme for "/$key=$value/" nested directories as found in
Apache Hive.

Multi-level, directory based partitioning scheme originating from
Apache Hive with all data files stored in the leaf directories. Data is
partitioned by static values of a particular column in the schema.
Partition keys are represented in the form $key=$value in directory names.
Field order is ignored, as are missing or unrecognized field names.

For example, given schema<year:int16, month:int8, day:int8>, a possible
path would be "/year=2009/month=11/day=15".

Parameters
----------
schema : Schema
The schema that describes the partitions present in the file path.

Returns
-------
HivePartitionScheme

Examples
--------
>>> from pyarrow.dataset import HivePartitionScheme
>>> scheme = HivePartitionScheme(
... pa.schema([("year", pa.int16()), ("month", pa.int8())]))
>>> print(scheme.parse("/year=2009/month=11"))
((year == 2009:int16) and (month == 11:int8))

"""

cdef:
CHivePartitionScheme* hive_scheme
@@ -168,6 +225,7 @@ cdef class HivePartitionScheme(PartitionScheme):

@property
def schema(self):
"""The arrow Schema describing the partition scheme."""
return pyarrow_wrap_schema(self.hive_scheme.schema())


@@ -240,6 +298,10 @@ cdef class DataSourceDiscovery:

@property
def partition_scheme(self):
"""
Get or set the PartitionScheme for the data source that is being
discovered.
""""
cdef shared_ptr[CPartitionScheme] scheme
scheme = self.discovery.partition_scheme()
if scheme.get() == nullptr:
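A minimal usage sketch for this property; the setter call below follows the
docstring's "Get or set" wording, and the `discovery` object is assumed to be
constructed as shown further down:

>>> from pyarrow.dataset import HivePartitionScheme
>>> scheme = HivePartitionScheme(
...     pa.schema([("year", pa.int16()), ("month", pa.int8())]))
>>> discovery.partition_scheme = scheme  # declare the partition layout
>>> discovery.partition_scheme           # returns the wrapped scheme, or None if unset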
@@ -264,19 +326,48 @@ cdef class DataSourceDiscovery:
check_status(self.discovery.SetRootPartition(expr.unwrap()))

def inspect(self):
"""
Inspects all data fragments and returns a common Schema.

Returns
-------
Schema
"""
cdef CResult[shared_ptr[CSchema]] result
with nogil:
result = self.discovery.Inspect()
return pyarrow_wrap_schema(GetResultValue(result))

def finish(self):
"""
Create a DataSource from this discovery.

Returns
-------
DataSource
"""
cdef CResult[shared_ptr[CDataSource]] result
with nogil:
result = self.discovery.Finish()
return DataSource.wrap(GetResultValue(result))
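Taken together, `inspect` and `finish` form a two-step flow; a short sketch,
assuming `discovery` is a FileSystemDataSourceDiscovery like the one
constructed below:

>>> schema = discovery.inspect()  # unified Schema across all fragments
>>> source = discovery.finish()   # materialize the DataSource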


cdef class FileSystemDataSourceDiscovery(DataSourceDiscovery):
"""
Creates a DataSource from a list of paths with schema inspection.

DataSourceDiscovery is used to create a DataSource, inspect the Schema
of the fragments contained in it, and declare a partition scheme.

Parameters
----------
filesystem : pyarrow.fs.FileSystem
    The filesystem holding the files to discover.
paths_or_selector : pyarrow.fs.Selector or list of path-likes
    Either a Selector object or a list of path-like objects.
format : FileFormat
    The format of the files to discover.
options : FileSystemDiscoveryOptions, optional
    Additional options for the discovery process.

"""

cdef:
CFileSystemDataSourceDiscovery* filesystem_discovery
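A construction sketch for this class; the local filesystem, the "/data" base
path, and the `file_format` variable are illustrative assumptions, not taken
from this diff:

>>> from pyarrow import fs
>>> from pyarrow.dataset import FileSystemDataSourceDiscovery
>>> filesystem = fs.LocalFileSystem()
>>> selector = fs.Selector("/data", recursive=True)  # hypothetical base path
>>> discovery = FileSystemDataSourceDiscovery(
...     filesystem, selector, file_format)  # file_format: some FileFormat instance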
@@ -346,6 +437,10 @@ cdef class DataSource:

@property
def partition_expression(self):
"""
An expression which evaluates to true for all data viewed by this
DataSource.
"""
cdef shared_ptr[CExpression] expression
expression = self.source.partition_expression()
if expression.get() == nullptr:
@@ -460,7 +555,13 @@ cdef class FileSystemDataSource(DataSource):


cdef class Dataset:
"""Collection of data fragments coming from possibly multiple sources."""
"""
Collection of data fragments coming from possibly multiple sources.

Arrow Datasets allow you to query against data that has been split across
multiple files. This sharding of data may indicate partitioning, which
can accelerate queries that only touch some partitions (files).
"""

cdef:
shared_ptr[CDataset] wrapped
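A sketch of assembling a Dataset from discovered sources; the constructor
arguments here (a list of sources plus a common schema) are an assumption
inferred from the `sources` and `schema` properties below, not confirmed by
this diff:

>>> source = discovery.finish()
>>> dataset = Dataset([source], discovery.inspect())  # hypothetical signature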
@@ -500,7 +601,19 @@ cdef class Dataset:
return self.wrapped

def new_scan(self, MemoryPool memory_pool=None):
"""Begin to build a new Scan operation against this Dataset."""
"""
Begin to build a new Scan operation against this Dataset.

Parameters
----------
memory_pool : MemoryPool, default None
For memory allocations, if required. If not specified, uses the
default pool.

Returns
-------
ScannerBuilder
"""
cdef:
shared_ptr[CScanContext] context = make_shared[CScanContext]()
CResult[shared_ptr[CScannerBuilder]] result
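A brief usage sketch; `pa.default_memory_pool()` is the standard pyarrow
accessor, while the returned ScannerBuilder's own methods are outside this
diff:

>>> builder = dataset.new_scan()                          # default pool
>>> builder = dataset.new_scan(pa.default_memory_pool())  # explicit pool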
@@ -510,11 +623,13 @@

@property
def sources(self):
"""List of the data sources"""
cdef vector[shared_ptr[CDataSource]] sources = self.dataset.sources()
return [DataSource.wrap(source) for source in sources]

@property
def schema(self):
"""The common schema of the full Dataset"""
return pyarrow_wrap_schema(self.dataset.schema())

