-
Notifications
You must be signed in to change notification settings - Fork 175
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[PERF] Remove calls to remote_len_partition
#1660
Changes from 6 commits
4026d56
115d379
7a2a784
f0119a8
58e7e7c
6f890d8
30ea7ee
2843588
386f912
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -198,8 +198,9 @@ def iter_partitions(self) -> Iterator[Union[Table, "RayObjectRef"]]: | |
else: | ||
# Execute the dataframe in a streaming fashion. | ||
context = get_context() | ||
partitions_iter = context.runner().run_iter(self._builder) | ||
yield from partitions_iter | ||
results_iter = context.runner().run_iter(self._builder) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you also may have to handle the
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
for result in results_iter: | ||
yield result.partition() | ||
|
||
@DataframePublicAPI | ||
def __repr__(self) -> str: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
import pathlib | ||
import sys | ||
from dataclasses import dataclass, field | ||
from typing import Generic, TypeVar | ||
from typing import Generic | ||
|
||
if sys.version_info < (3, 8): | ||
from typing_extensions import Protocol | ||
|
@@ -26,14 +26,15 @@ | |
from daft.logical.map_partition_ops import MapPartitionOp | ||
from daft.logical.schema import Schema | ||
from daft.runners.partitioning import ( | ||
MaterializedResult, | ||
PartialPartitionMetadata, | ||
PartitionMetadata, | ||
PartitionT, | ||
TableParseCSVOptions, | ||
TableReadOptions, | ||
) | ||
from daft.table import Table, table_io | ||
|
||
PartitionT = TypeVar("PartitionT") | ||
ID_GEN = itertools.count() | ||
|
||
|
||
|
@@ -251,35 +252,6 @@ def __repr__(self) -> str: | |
return super().__str__() | ||
|
||
|
||
class MaterializedResult(Protocol[PartitionT]): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. "Promoted" and moved to |
||
"""A protocol for accessing the result partition of a PartitionTask. | ||
|
||
Different Runners can fill in their own implementation here. | ||
""" | ||
|
||
def partition(self) -> PartitionT: | ||
"""Get the partition of this result.""" | ||
... | ||
|
||
def vpartition(self) -> Table: | ||
"""Get the vPartition of this result.""" | ||
... | ||
|
||
def metadata(self) -> PartitionMetadata: | ||
"""Get the metadata of the partition in this result.""" | ||
... | ||
|
||
def cancel(self) -> None: | ||
"""If possible, cancel execution of this PartitionTask.""" | ||
... | ||
|
||
def _noop(self, _: PartitionT) -> None: | ||
"""Implement this as a no-op. | ||
https://peps.python.org/pep-0544/#overriding-inferred-variance-of-protocol-classes | ||
""" | ||
... | ||
|
||
|
||
class Instruction(Protocol): | ||
"""An instruction is a function to run over a list of partitions. | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,19 +38,22 @@ | |
) | ||
from daft.expressions import ExpressionsProjection | ||
from daft.logical.schema import Schema | ||
from daft.runners.partitioning import PartialPartitionMetadata | ||
from daft.runners.partitioning import ( | ||
MaterializedResult, | ||
PartialPartitionMetadata, | ||
PartitionT, | ||
) | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
PartitionT = TypeVar("PartitionT") | ||
T = TypeVar("T") | ||
|
||
|
||
# A PhysicalPlan that is still being built - may yield both PartitionTaskBuilders and PartitionTasks. | ||
InProgressPhysicalPlan = Iterator[Union[None, PartitionTask[PartitionT], PartitionTaskBuilder[PartitionT]]] | ||
|
||
# A PhysicalPlan that is complete and will only yield PartitionTasks or final PartitionTs. | ||
MaterializedPhysicalPlan = Iterator[Union[None, PartitionTask[PartitionT], PartitionT]] | ||
MaterializedPhysicalPlan = Iterator[Union[None, PartitionTask[PartitionT], MaterializedResult[PartitionT]]] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. NOTE this important change: instead of yielding |
||
|
||
|
||
def _stage_id_counter(): | ||
|
@@ -738,7 +741,7 @@ def materialize( | |
# Check if any inputs finished executing. | ||
while len(materializations) > 0 and materializations[0].done(): | ||
done_task = materializations.popleft() | ||
yield done_task.partition() | ||
yield done_task._result | ||
jaychia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# Materialize a single dependency. | ||
try: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -92,6 +92,40 @@ | |
PartitionT = TypeVar("PartitionT") | ||
|
||
|
||
class MaterializedResult(Generic[PartitionT]): | ||
"""A protocol for accessing the result partition of a PartitionTask. | ||
|
||
Different Runners can fill in their own implementation here. | ||
""" | ||
|
||
@abstractmethod | ||
def partition(self) -> PartitionT: | ||
"""Get the partition of this result.""" | ||
... | ||
|
||
@abstractmethod | ||
def vpartition(self) -> Table: | ||
"""Get the vPartition of this result.""" | ||
... | ||
|
||
@abstractmethod | ||
def metadata(self) -> PartitionMetadata: | ||
"""Get the metadata of the partition in this result.""" | ||
... | ||
|
||
@abstractmethod | ||
def cancel(self) -> None: | ||
"""If possible, cancel execution of this PartitionTask.""" | ||
... | ||
|
||
@abstractmethod | ||
def _noop(self, _: PartitionT) -> None: | ||
"""Implement this as a no-op. | ||
https://peps.python.org/pep-0544/#overriding-inferred-variance-of-protocol-classes | ||
""" | ||
... | ||
|
||
|
||
class PartitionSet(Generic[PartitionT]): | ||
def _get_merged_vpartition(self) -> Table: | ||
raise NotImplementedError() | ||
|
@@ -126,7 +160,7 @@ | |
raise NotImplementedError() | ||
|
||
@abstractmethod | ||
def set_partition(self, idx: PartID, part: PartitionT) -> None: | ||
def set_partition(self, idx: PartID, part: MaterializedResult[PartitionT]) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
raise NotImplementedError() | ||
|
||
@abstractmethod | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this was just typed wrongly from the beginning; I'm not sure why our type checks didn't catch it.