Skip to content

Commit

Permalink
add shuffle func (#451)
Browse files Browse the repository at this point in the history
* add sample function

* `eval_only` supports summarization

* add shuffle
  • Loading branch information
Dobiichi-Origami authored Apr 15, 2024
1 parent b4379c4 commit 53dff8d
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 4 deletions.
25 changes: 25 additions & 0 deletions python/qianfan/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1053,11 +1053,36 @@ def sample(
this value to True
**kwargs (Any):
other arguments
Returns:
Dataset: a sliced dataset
"""
return super().sample(
sample_number, start, end, should_create_new_obj, **kwargs
)

@_online_except_decorator
def shuffle(
    self,
    should_create_new_obj: bool = False,
    **kwargs: Any,
) -> Self:
    """
    make a shuffled Dataset by delegating to the parent class's `shuffle`

    Args:
        should_create_new_obj (bool):
            should a new object be created when shuffling terminates.
            Default to False. In some cases, you may want to set
            this value to True
        **kwargs (Any):
            other arguments, forwarded unchanged to the parent's `shuffle`
    Returns:
        Dataset: a shuffled dataset
    """
    return super().shuffle(should_create_new_obj, **kwargs)

def __getitem__(self, key: Any) -> Any:
if (
isinstance(key, int)
Expand Down
15 changes: 15 additions & 0 deletions python/qianfan/dataset/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,12 @@ def sample(
numbers = random.sample(range(start, end + 1), sample_number)
return self.table.take(numbers)

def shuffle(self) -> PyarrowTable:
    """
    return the rows of the underlying table in a random order

    Returns:
        PyarrowTable: result of `take` over a randomly permuted
            list of every row index of `self.table`
    """
    row_count = self.table.num_rows
    permutation = [*range(row_count)]
    # in-place permutation driven by the module-level `random` state
    random.shuffle(permutation)

    return self.table.take(permutation)


class _PyarrowColumnManipulator(BaseModel, Addable, Listable, Processable):
"""handler for processing of pyarrow table column"""
Expand Down Expand Up @@ -1238,6 +1244,15 @@ def sample(
result_ds = manipulator.sample(sample_number, start, end)
return self._create_new_obj(result_ds, should_create_new_obj)

def shuffle(
self,
should_create_new_obj: bool = False,
**kwargs: Any,
) -> Self:
manipulator = self._row_op()
result_ds = manipulator.shuffle()
return self._create_new_obj(result_ds, should_create_new_obj)

def col_map(
self,
op: Callable[[Any], Any],
Expand Down
4 changes: 0 additions & 4 deletions python/qianfan/evaluation/evaluation_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from qianfan.dataset.data_source.utils import (
_download_file_from_url_streamly,
)
from qianfan.dataset.schema import EvaluationSchema
from qianfan.errors import QianfanError
from qianfan.evaluation.consts import QianfanRefereeEvaluatorPromptTemplate
from qianfan.evaluation.evaluation_result import EvaluationResult
Expand Down Expand Up @@ -415,9 +414,6 @@ def eval_only(
{OldReferenceColumnName: [None for _ in range(len(dataset))]}
)

if not EvaluationSchema().validate(dataset):
raise ValueError("validate failed before evaluation")

tmp_ds = Dataset.create_from_pyobj(
self._run_evaluator_locally(dataset, **kwargs)
)
Expand Down

0 comments on commit 53dff8d

Please sign in to comment.