-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft strawman data frame "__dataframe__" interchange / data export protocol for discussion #1
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,269 @@ | ||||||||||||||||||||||
from abc import ABC, abstractmethod | ||||||||||||||||||||||
from collections.abc import Mapping | ||||||||||||||||||||||
from typing import Any, Iterable, Sequence | ||||||||||||||||||||||
|
||||||||||||||||||||||
# ---------------------------------------------------------------------- | ||||||||||||||||||||||
# A simple data type class hierarchy for illustration | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class DataType(ABC): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
A metadata object representing the logical value type of a cell in a data | ||||||||||||||||||||||
frame column. This metadata does not guarantee an specific underlying data | ||||||||||||||||||||||
representation | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
def __eq__(self, other: 'DataType'): | ||||||||||||||||||||||
return self.equals(other) | ||||||||||||||||||||||
|
||||||||||||||||||||||
def __str__(self): | ||||||||||||||||||||||
return self.to_string() | ||||||||||||||||||||||
|
||||||||||||||||||||||
def __repr__(self): | ||||||||||||||||||||||
return str(self) | ||||||||||||||||||||||
|
||||||||||||||||||||||
@abstractmethod | ||||||||||||||||||||||
def to_string(self) -> str: | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Return human-readable representation of the data type | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
|
||||||||||||||||||||||
@abstractmethod | ||||||||||||||||||||||
def equals(self, other: 'DataType') -> bool: | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Return true if other DataType contains the same metadata as this | ||||||||||||||||||||||
DataType | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class PrimitiveType(DataType): | ||||||||||||||||||||||
|
||||||||||||||||||||||
def equals(self, other: DataType) -> bool: | ||||||||||||||||||||||
return type(self) == type(other) | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class NullType(PrimitiveType): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
A data type whose values are always null | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return "null" | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class Boolean(PrimitiveType): | ||||||||||||||||||||||
|
||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return "bool" | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class NumberType(PrimitiveType): | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class IntegerType(NumberType): | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class SignedIntegerType(IntegerType): | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class Int8(SignedIntegerType): | ||||||||||||||||||||||
|
||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return "int8" | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class Int16(SignedIntegerType): | ||||||||||||||||||||||
|
||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return "int16" | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class Int32(SignedIntegerType): | ||||||||||||||||||||||
|
||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return "int32" | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class Int64(SignedIntegerType): | ||||||||||||||||||||||
|
||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return "int64" | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class Binary(PrimitiveType): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
A variable-size binary (bytes) value | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return "binary" | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class String(PrimitiveType): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
A UTF8-encoded string value | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return "string" | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class Object(PrimitiveType): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Any PyObject value | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return "object" | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class Categorical(DataType): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
A categorical value is an ordinal (integer) value that references a | ||||||||||||||||||||||
sequence of category values of an arbitrary data type | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
|
||||||||||||||||||||||
def __init__(self, index_type: IntegerType, category_type: DataType, | ||||||||||||||||||||||
ordered: bool = False): | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd like to understand better the reason to have the types of the data and the categories in the type. With an example, I think a category column could be represented with something equivalent to: class CategoryColumn:
dtype = 'category'
index = pyarrow.array([0, 1, 0, 2, 0, 1], type='int8')
categories = pyarrow.array(['red', 'green', 'blue'], type='string') So, when accessing the data, I can imagine that we may be interested in one of two options:
For the first option, an example of code copying the data to a numpy array could be: for source_column in dataframe:
if source_column.dtype == 'category':
copy_of_index = numpy.array(source_column.index, dtype=source_column.index.dtype) The example is very simplistic, but it shows that we can get the type of the index ( What would be the main advantage of this implementation, as opposed to have just a categorical type with only the Am I missing something? Does it make sense what I say? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's pretty common for users to compare dtypes across columns. With this proposed change it would have a category column with I'm a +1 to including the index / categories types in the dtype as it makes things more explicit for end users. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, that's useful to know. Just to understand better, do you have any example on why users compare dtypes across columns? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's lots of reasons to have access to the complete schema, from preallocating space for data (in-memory or on disk) to building query plans (including kernel selection) before data arrives. Not having access to the index type and category type would (IMHO) make a lot of things much more difficult. That said, if you have an application where the index or category type is simply not known, you could use an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Unfortunately our users don't share the "why" too often, but one use case is checking the output of dtype inference from something like a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Thanks @wesm, that makes sense. But I don't fully understand this part. Given a categorical column, there is access to the index and category types in my example (there was a typo in my previous code, may be that's where I didn't make myself clear). It's just the place where it is found that is different (in the array instead of in the dtype). In your proposal: my_column.dtype.index_type In my example would be something like: my_column.underlying_categorical_data.index.dtype I understand that in my example there wouldn't be access to the underlying types if a dataframe schema is used without the actual dataframe. Then, with your proposal we could still know the internal types of the categorical, while in my example those will be undefined, since they should be in the data, which doesn't exist. Is this what you have in mind? If it is, in what case are we considering that a schema can live without the data? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Accessing the actual data is potentially costly, I think? So that would make getting the type of the categories costly, while I assume accessing it from the dtype itself would not. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Good point. In the PR, the class Column:
@property
def type(self):
pass
my_column.type.index_type The data is not specified, may be that's the problem, but if we agree the categorical data as two subcolumns, one for the index, one for the categories, then accessing the index type with an unparametrized categorical dtype would be:
I may be missing something, but I would say in both cases the type is metadata of the I'm not sure what would imply if we consider implementations using Arrow's |
||||||||||||||||||||||
self.index_type = index_type | ||||||||||||||||||||||
self.category_type = category_type | ||||||||||||||||||||||
self.ordered = ordered | ||||||||||||||||||||||
|
||||||||||||||||||||||
def equals(self, other: DataType) -> bool: | ||||||||||||||||||||||
return (isinstance(other, Categorical) and | ||||||||||||||||||||||
self.index_type == other.index_type and | ||||||||||||||||||||||
self.category_type == other.category_type and | ||||||||||||||||||||||
self.ordered == other.ordered) | ||||||||||||||||||||||
|
||||||||||||||||||||||
def to_string(self): | ||||||||||||||||||||||
return ("categorical(indices={}, categories={}, ordered={})" | ||||||||||||||||||||||
.format(str(self.index_type), str(self.category_type), | ||||||||||||||||||||||
self.ordered)) | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
# ---------------------------------------------------------------------- | ||||||||||||||||||||||
# Classes representing a column in a DataFrame | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class Column(ABC): | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the benefit of having a column-specific interface vs just a DataFrame with one column? In Modin, underneath everything is a DataFrame and Column/Series objects are views into the dataframe implemented at a higher layer. I'm interested to hear the motivation for a column-specific interface. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The semantics that most existing applications / users when they do
In all the Apache Arrow implementations there's a logical / semantic distinction between a RecordBatch / Table and a field of a RecordBatch / Table (which yield an Array or ChunkedArray, respectively) Secondly, the semantics and API of conversion methods at the column level vs. at the data frame level may differ. For example, suppose we wanted to select some number of columns from a data frame and then convert them to a dict of NumPy arrays, it might make sense to write
or similar. I'm interested to see what scikit-learn and matplotlib developers think. |
||||||||||||||||||||||
|
||||||||||||||||||||||
@property | ||||||||||||||||||||||
@abstractmethod | ||||||||||||||||||||||
def name(self) -> Any: | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
@property | ||||||||||||||||||||||
@abstractmethod | ||||||||||||||||||||||
def type(self) -> DataType: | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Return the logical type of each column cell value | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
@property | ||||||||||||||||||||||
def attrs(self) -> Mapping: | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be something more precise, e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At least in xarray attrs can be arbitrary metadata so it should probably be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've seen keys as bytes (e.g. Arrow's metadata is bytes -- though UTF-8 bytes would be preferred) so |
||||||||||||||||||||||
""" | ||||||||||||||||||||||
Metadata for this column. Default implementation returns empty dict | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
return {} | ||||||||||||||||||||||
|
||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Proposing an additional API here for a more generic array API. I think that the constraint of supporting
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will open another suggestion around There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alright. From my perspective a critical matter is making sure that this doesn't get mixed up with APIs whose purpose is to provide data in a particular memory representation. So I think the term "array" in this context is may confuse, so There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Agreed.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That's... incredibly cumbersome :( If There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason that I'm against
What is "array" and what is "array_like"? It's too ambiguous. IMHO this project is first and foremost about data export at library boundaries (without one library having knowledge of the internals of the other). If you want to also expose object protocols (like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this goes beyond the scope of what problem is trying to be solved in this first project iteration. There is exporting "data" and exporting "interfaces". If we can't first agree about exporting "data" then I think this project will be kaputt and we should all disengage. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
From my perspective, in addition to exporting to a concrete memory representation this takes a step too far and exports to a specific object container type. It would be great if we could generalize this to exporting any object that follows a concrete memory representation as opposed to tightly confining it to a specific object. I.E. (ignore the cumbersome names for now)
These protocols are specifically defined surrounding memory representation so it makes sense to use them from my perspective. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm reached the limit of how much time I can invest in this for now so I'll wait a few weeks or a month for others to express their opinions. For me the things I care most about are:
Per my comment on the Discourse thread, I do think taking a step back and writing a requirements doc might be helpful to avoid diversions into discussions of non-goals that take up a lot of time and energy. |
||||||||||||||||||||||
def to_numpy(self): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Access column's data as a NumPy array. Recommended to return a view if | ||||||||||||||||||||||
able but not required | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
raise NotImplementedError("Conversion to NumPy not available") | ||||||||||||||||||||||
|
||||||||||||||||||||||
def to_arrow(self, **kwargs): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Access column's data in the Apache Arrow format as pyarrow.Array or | ||||||||||||||||||||||
ChunkedArray. Recommended to return a view if able but not required | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
raise NotImplementedError("Conversion to Arrow not available") | ||||||||||||||||||||||
Comment on lines
+169
to
+181
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could these APIs be generalized more to something like For example, Pandas uses There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand your comment. The idea of this interface is that the data representation is not known. So we will add various methods to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess my point is if we have Maybe I was unclear as to whether There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. There is no "standard" here, and this effort is not trying to establish a standard representation. The idea is that some projects are implemented against a certain data representation (e.g. NumPy arrays) -- so indeed these methods are here so that a producer can return their data to the consumer in the representation that they are looking for. matplotlib is an example -- it deals almost exclusively in NumPy arrays so rather than have matplotlib take responsibility for the details of converting from some foreign data frame object to a NumPy array, this conversion is the responsibility of the producer of this interface. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Also for clarity, efforts to achieve duck-typing between array libraries is pretty orthogonal to this project -- if such a duck-typed array standard is developed and adopted / normalized, then this interface can add it. The use case here is strictly abstract data interchange that does not force producers to convert to a particular intermediate format like pandas. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Understood, that makes sense. My fear is if protocols can't be relied on entirely, the default path consumers will take is to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. I don't know that it makes sense to be opinionated in this way at this level. We can provide people with choices and they can decide what makes sense for them. I don't see a problem with having a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem is if the "NumPy" path is included in the interface whereas the "general array like" path isn't, most producers will likely only implement the "NumPy" path which will make almost all consumers consume the "NumPy" path. This then becomes a problem for us trying to integrate efficiently of trying to convince everyone to change to the "general array like" path as opposed to that being the typical path. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you like to propose an API to add? In the PR description I wrote "The APIs proposed here are not intended to be to the exclusion of others -- more APIs can be proposed and added later." What is written here is not exhaustive. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Apologies, I missed that. Will add a suggestion to kickoff discussions on adding the generic API I was talking about. |
||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
# ---------------------------------------------------------------------- | ||||||||||||||||||||||
# DataFrame: the main public API | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class DataFrame(ABC): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
An abstract data frame base class. | ||||||||||||||||||||||
|
||||||||||||||||||||||
A "data frame" represents an ordered collection of named columns. A | ||||||||||||||||||||||
column's "name" is permitted to be any Python value, but strings are | ||||||||||||||||||||||
common. Names are not required to be unique. Columns may be accessed by | ||||||||||||||||||||||
name (when the name is unique) or by position. | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
|
||||||||||||||||||||||
def __dataframe__(self): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Idempotence of data frame protocol | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
return self | ||||||||||||||||||||||
|
||||||||||||||||||||||
@property | ||||||||||||||||||||||
@abstractmethod | ||||||||||||||||||||||
def num_columns(self): | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Personally I would rather use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I disagree strongly here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW, I still have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kkraus14 do you mind expanding on the use cases you have in mind? I don't have a strong opinion on not having the number of columns as a property or method. It's more a preference on not making users remember on more method name in an API, if it's not needed. So if this is useful for some cases, happy to have it. What I've got in mind is that dataframes would have a known and somehow reduced number of columns (let's say in the largest examples I can think of we're talking about few thousands). If that was the case, I think But for your comment, I think I'm wrong in my assumptions. So, would be great to know better what you have in mind, and which use cases have you seen where having the columns name as a Python list is a bad idea. If you have some code examples on how the protocol dataframe would be used, that's even better. Thanks! |
||||||||||||||||||||||
""" | ||||||||||||||||||||||
Return the number of columns in the DataFrame | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
@property | ||||||||||||||||||||||
@abstractmethod | ||||||||||||||||||||||
def num_rows(self): | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same, I don't think we need an alias for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a certain ambiguity to In R > df <- data.frame(a=c(1,2,3,4), b=c(5,6,7,8))
> length(df)
[1] 2
> nrow(df)
[1] 4
> ncol(df)
[1] 2 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree on that. To me it makes more sense that But my point here was more to have just one way of doing things, and not repeat There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree with @wesm here that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMHO ...or just punt on it, let There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Coming back to this, couple of reasons why CPython limits the dunder method Python 3.7.6 | packaged by conda-forge | (default, Jun 1 2020, 18:57:50)
[GCC 7.5.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> class LenTooBig:
... def __len__(self):
... return 2 ** 64 + 1
...
>>> len(LenTooBig())
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
OverflowError: cannot fit 'int' into an index-sized integer That is many trillions of data points, that not sure if anyone will ever reach (surely not in-memory representations), but still worth mentioning that the limitation doesn't exist in regular methods. Another thing, with distributed systems in mind, does it make sense that this is a property (or df.num_rows # also applies to `len(df)` I'm mostly assuming that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Making it a function makes sense to me for the reasons you listed. It's probably best in this utilitarian interface to not have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This returns the number of rows in the DataFrame (if known)
Suggested change
should the api use Optional types or raise (maybe custom Exception) if not known? |
||||||||||||||||||||||
""" | ||||||||||||||||||||||
Return the number of rows in the DataFrame (if known) | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
@property | ||||||||||||||||||||||
@abstractmethod | ||||||||||||||||||||||
def column_names(self) -> Iterable[Any]: | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Return the column names as a materialized sequence | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
@property | ||||||||||||||||||||||
def row_names(self) -> Sequence[Any]: | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm -1 on row names / indices. IMO that shouldn't be part of the standard (of course pandas and any other project can have this as a feature, but shouldn't be here). Of course we could have this optional, but I think it's more a pandas thing, and we shouldn't add complexity to the standard. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. R's DataFrame also has row labels, so it's not specific to pandas. Are there any dataframe implementations that don't have row labels? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The way I look at it if some implementations return As one can of worms, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think most dataframe implementations have row labels. But in the Python world I think that is because everybody is trying to replicate the pandas API for compatibility. Having row labels is surely a good idea for many dataframe implementations. My point is that not having them is also a good idea to me. So, I would leave them out of this proposal, at least initially. IIRC @maartenbreddels mentioned that he'd make Vaex behave more like SQL regarding indexing. If I had to implement a dataframe library from scratch I'd also have indices as regular columns that can be indexed. There would surely be some drawbacks compared to pandas, but I think the API for users would be much simpler, and I prefer that. In any case, not trying to say that projects shouldn't have row labels. Just that my preference is to not include them here. It narrows the discussion, and it avoids biasing towards the R/pandas approach. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want any kind of invariant around But if you're storing your row labels "in band" (as part of the columns in the dataframe) then your row labels will be mixed with the values. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
While it isn't required from the protocol perspective, people will end up writing code using these optional features unnecessarily, i.e. in Pandas using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If a consumer writes code that assumes there are always row names, even though the interface says that they are optional, isn't that the consumer's problem? The abstract API is a contract for producer and consumer, so it wouldn't be appropriate for consumers to de facto violate the contract by making an optional feature required. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's the consumer's problem until there's enough people with the same problem (not wanting to rewrite code that assumes row names) that it becomes a problem for producers. For what it's worth, I fought against having row indices in cuDF and ultimately lost the battle from the overwhelming amount of libraries we wanted to integrate with and user code assuming row indices purely because they were built against Pandas originally and it became too large of a battle to continue fighting. I'm still against row indices, but would rather them be required than optional. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, I guess that one's mostly my fault. In R, row names are optional and it doesn't seem to cause problems. In pandas the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
In our case an Index is just a Column that we provide the syntactic sugar expected from an Index around it. If someone wants to grab rows by index value, we currently build a hash table and then probe that hash table each time (we have plans to memoize it in the future). No one has pointed this out or complained about performance of this thus far (outside of CUDA developers writing the libcudf C++ library 😆), so I don't think it has to do with computational functionality and is purely the data transfer aspects. |
||||||||||||||||||||||
""" | ||||||||||||||||||||||
Return the row names (if any) as a materialized sequence. It is not | ||||||||||||||||||||||
necessary to implement this method | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
raise NotImplementedError("This DataFrame has no row names") | ||||||||||||||||||||||
|
||||||||||||||||||||||
@abstractmethod | ||||||||||||||||||||||
def get_column(self, i: int) -> Column: | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Return the column at the indicated position | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From a Python dev perspective (who would like to use this functionality) the get/select methods seem a bit awkward. If we instead extend the In [79]: df = DataFrame('A', 'B', 'C')
In [80]: df
Out[80]: DataFrame(['A', 'B', 'C'])
In [81]: df.columns
Out[81]: ColumnCollection(['A', 'B', 'C'])
In [82]: df.columns.idx(0)
Out[82]: Column(name='A')
In [83]: df.columns.idx(0, 2)
Out[83]: DataFrame(['A', 'C'])
In [84]: df.columns['A']
Out[84]: Column(name='A')
In [85]: df.columns['A','C']
Out[85]: DataFrame(['A', 'C']) ...which, at least to me, seems a lot more natural. Code:from typing import List
from dataclasses import dataclass
@dataclass
class Column(object):
name: str
class ColumnCollection(object):
_columns: List[Column]
def __init__(self, *columns: Column):
self._columns = list(columns)
@property
def names(self) -> List[str]:
return [col.name for col in self._columns]
def idx(self, *indices: int) -> ColumnCollection:
columns = [self._columns[idx] for idx in indices]
if len(columns) == 1:
return columns[0]
return DataFrame(
*(col.name for col in columns)
)
def __getitem__(self, names: str) -> Union[Column, ColumnCollection]:
columns = dict(zip(self.names, self._columns))
if len(names) == 1:
return columns[names[0]]
return DataFrame(*names)
def __len__(self):
return len(self._columns)
def __iter__(self):
for col in self._columns:
yield col
def __repr__(self):
columns = "', '".join(self.names)
return f"ColumnCollection(['{columns}'])"
class DataFrame(object):
_columns: ColumnCollection
def __init__(self, *columns):
self._columns = ColumnCollection(
*(Column(col) for col in columns)
)
def __repr__(self):
columns = "', '".join(self.columns.names)
return f"DataFrame(['{columns}'])"
@property
def columns(self):
return self._columns There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I strongly feel that syntactic sugar / conveniences (which get non-objective very quickly) belong at a higher level than this API, which is about library developers implementing unambiguous data export APIs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm interested in this as a potential way to write analytics functions which are DataFrame implementation agnostic. i.e. instead of coding to the pandas API I would code to this API hence its usage would be front-and-centre in my code in the same way that pandas now is. Maybe this isn't supposed to cater to that end-user concern though? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not. This was discussed at length in https://discuss.ossdata.org/t/a-dataframe-protocol-for-the-pydata-ecosystem/267 |
||||||||||||||||||||||
|
||||||||||||||||||||||
@abstractmethod | ||||||||||||||||||||||
def get_column_by_name(self, name: Any) -> Column: | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Return the column whose name is the indicated name. If the column names | ||||||||||||||||||||||
are not unique, may raise an exception. | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
pass | ||||||||||||||||||||||
|
||||||||||||||||||||||
def select_columns(self, indices: Sequence[int]): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Create a new DataFrame by selecting a subset of columns by index | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
raise NotImplementedError("select_columns") | ||||||||||||||||||||||
|
||||||||||||||||||||||
def select_columns_by_name(self, names: Sequence[Any]): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Create a new DataFrame by selecting a subset of columns by name. If the | ||||||||||||||||||||||
column names are not unique, may raise an exception. | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
raise NotImplementedError("select_columns_by_name") | ||||||||||||||||||||||
|
||||||||||||||||||||||
def to_dict_of_numpy(self): | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, using type hints could avoid cumbersome method names: def to_dict(self) -> Dict[str, np.ndarray]:
... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. -1, I think the method name needs to unambiguously describe what it's doing |
||||||||||||||||||||||
""" | ||||||||||||||||||||||
Convert DataFrame to a dict with column names as keys and values the | ||||||||||||||||||||||
corresponding columns converted to NumPy arrays | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
raise NotImplementedError("TODO") | ||||||||||||||||||||||
Comment on lines
+264
to
+269
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this be generalized to something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See my comment above |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getting a mypy error here...
should we follow the mypy recommedation?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see no reason not to