Skip to content

Commit

Permalink
No more Sequence name collision
Browse files: browse the repository at this point in the history
  • Loading branch information
Rocketknight1 committed May 20, 2022
1 parent c518d47 commit dbaf21a
Showing 1 changed file with 3 additions and 4 deletions.
7 changes: 3 additions & 4 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
Iterator,
List,
Optional,
-Sequence,
Tuple,
Union,
overload,
Expand All @@ -61,7 +60,7 @@
from . import config
from .arrow_reader import ArrowReader
from .arrow_writer import ArrowWriter, OptimizedTypedSequence
-from .features import Audio, ClassLabel, Features, Image, Value
+from .features import Audio, ClassLabel, Features, Image, Sequence, Value
from .features.features import FeatureType, decode_nested_example, pandas_types_mapper, require_decoding
from .filesystems import extract_path_from_uri, is_remote_filesystem
from .fingerprint import (
Expand Down Expand Up @@ -312,13 +311,13 @@ def _get_output_signature(

def to_tf_dataset(
self,
-columns: Optional[Union[str, Sequence[str]]] = None,
+columns: Optional[Union[str, List[str]]] = None,
batch_size: int = 8,
shuffle: bool = True,
collate_fn: Optional[Callable] = None,
drop_remainder: Optional[bool] = None,
collate_fn_args: Optional[Dict[str, Any]] = None,
-label_cols: Optional[Union[str, Sequence[str]]] = None,
+label_cols: Optional[Union[str, List[str]]] = None,
prefetch: bool = True,
):
"""Create a tf.data.Dataset from the underlying Dataset. This tf.data.Dataset will load and collate batches from
Expand Down

1 comment on commit dbaf21a

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008859 / 0.011353 (-0.002494) 0.004304 / 0.011008 (-0.006704) 0.029961 / 0.038508 (-0.008547) 0.034789 / 0.023109 (0.011679) 0.346662 / 0.275898 (0.070764) 0.360957 / 0.323480 (0.037477) 0.006714 / 0.007986 (-0.001272) 0.003860 / 0.004328 (-0.000468) 0.007583 / 0.004250 (0.003333) 0.041737 / 0.037052 (0.004685) 0.333490 / 0.258489 (0.075001) 0.368909 / 0.293841 (0.075068) 0.031772 / 0.128546 (-0.096774) 0.009971 / 0.075646 (-0.065676) 0.252186 / 0.419271 (-0.167085) 0.053086 / 0.043533 (0.009553) 0.335980 / 0.255139 (0.080841) 0.352429 / 0.283200 (0.069230) 0.103237 / 0.141683 (-0.038446) 1.878692 / 1.452155 (0.426537) 1.925481 / 1.492716 (0.432765)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.318018 / 0.018006 (0.300012) 0.683897 / 0.000490 (0.683408) 0.006812 / 0.000200 (0.006612) 0.000125 / 0.000054 (0.000071)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.026099 / 0.037411 (-0.011312) 0.105248 / 0.014526 (0.090722) 0.113767 / 0.176557 (-0.062790) 0.160253 / 0.737135 (-0.576882) 0.117146 / 0.296338 (-0.179192)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.424693 / 0.215209 (0.209484) 4.242622 / 2.077655 (2.164967) 1.823218 / 1.504120 (0.319099) 1.613396 / 1.541195 (0.072201) 1.736582 / 1.468490 (0.268092) 0.450815 / 4.584777 (-4.133962) 4.537313 / 3.745712 (0.791601) 2.369958 / 5.269862 (-2.899903) 0.933313 / 4.565676 (-3.632363) 0.053836 / 0.424275 (-0.370439) 0.011999 / 0.007607 (0.004392) 0.530707 / 0.226044 (0.304663) 5.302787 / 2.268929 (3.033859) 2.254077 / 55.444624 (-53.190547) 1.920628 / 6.876477 (-4.955849) 2.040374 / 2.142072 (-0.101698) 0.571118 / 4.805227 (-4.234109) 0.125941 / 6.500664 (-6.374723) 0.063445 / 0.075469 (-0.012024)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.651096 / 1.841788 (-0.190691) 14.708835 / 8.074308 (6.634526) 27.169067 / 10.191392 (16.977675) 0.925355 / 0.680424 (0.244931) 0.550377 / 0.534201 (0.016176) 0.493256 / 0.579283 (-0.086027) 0.496570 / 0.434364 (0.062206) 0.316770 / 0.540337 (-0.223567) 0.326791 / 1.386936 (-1.060145)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008421 / 0.011353 (-0.002932) 0.004035 / 0.011008 (-0.006973) 0.029849 / 0.038508 (-0.008659) 0.034537 / 0.023109 (0.011428) 0.317174 / 0.275898 (0.041276) 0.347780 / 0.323480 (0.024300) 0.006427 / 0.007986 (-0.001559) 0.003611 / 0.004328 (-0.000717) 0.007408 / 0.004250 (0.003158) 0.040102 / 0.037052 (0.003049) 0.300972 / 0.258489 (0.042483) 0.342618 / 0.293841 (0.048777) 0.031785 / 0.128546 (-0.096762) 0.009785 / 0.075646 (-0.065861) 0.251641 / 0.419271 (-0.167630) 0.051006 / 0.043533 (0.007473) 0.308701 / 0.255139 (0.053562) 0.334607 / 0.283200 (0.051407) 0.093992 / 0.141683 (-0.047690) 1.838036 / 1.452155 (0.385882) 1.888202 / 1.492716 (0.395485)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.387126 / 0.018006 (0.369120) 0.540804 / 0.000490 (0.540314) 0.041496 / 0.000200 (0.041296) 0.000484 / 0.000054 (0.000430)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.026837 / 0.037411 (-0.010574) 0.106187 / 0.014526 (0.091661) 0.116571 / 0.176557 (-0.059985) 0.162804 / 0.737135 (-0.574331) 0.117986 / 0.296338 (-0.178353)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.426243 / 0.215209 (0.211034) 4.256744 / 2.077655 (2.179089) 1.902900 / 1.504120 (0.398780) 1.692138 / 1.541195 (0.150943) 1.823067 / 1.468490 (0.354577) 0.445333 / 4.584777 (-4.139444) 4.614765 / 3.745712 (0.869053) 2.173461 / 5.269862 (-3.096400) 0.939729 / 4.565676 (-3.625947) 0.053619 / 0.424275 (-0.370656) 0.012236 / 0.007607 (0.004629) 0.531891 / 0.226044 (0.305846) 5.294429 / 2.268929 (3.025500) 2.346761 / 55.444624 (-53.097863) 2.037467 / 6.876477 (-4.839009) 2.160973 / 2.142072 (0.018901) 0.558648 / 4.805227 (-4.246580) 0.123142 / 6.500664 (-6.377522) 0.061480 / 0.075469 (-0.013990)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.606774 / 1.841788 (-0.235014) 14.741381 / 8.074308 (6.667073) 26.631792 / 10.191392 (16.440400) 0.894447 / 0.680424 (0.214023) 0.523815 / 0.534201 (-0.010386) 0.489031 / 0.579283 (-0.090252) 0.506984 / 0.434364 (0.072620) 0.321927 / 0.540337 (-0.218410) 0.342936 / 1.386936 (-1.044000)

CML watermark

Please sign in to comment.