Skip to content

Commit

Permalink
minor fix according to zhijian's suggestion
Browse files Browse the repository at this point in the history
  • Loading branch information
yxdyc committed Nov 14, 2023
1 parent df4911c commit f355d00
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 6 deletions.
4 changes: 4 additions & 0 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ process:
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_size_filter: # filter samples according to the size of images (in bytes) within them
min_size: "0" # the min size of filter range
max_size: "1TB" # the max size of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- language_id_score_filter: # filter text in specific language with language scores larger than a specific min value
lang: en # keep text in what language
min_score: 0.8 # the min language scores to filter text
Expand Down
10 changes: 5 additions & 5 deletions data_juicer/ops/filter/image_size_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class ImageSizeFilter(Filter):

def __init__(self,
min_size: str = '0',
max_size: str = '1Tb',
max_size: str = '1TB',
any_or_all: str = 'any',
*args,
**kwargs):
Expand All @@ -35,8 +35,8 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
self.min_size = min_size
self.max_size = max_size
self.min_size = size_to_bytes(min_size)
self.max_size = size_to_bytes(max_size)
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
f'Can only be one of ["any", "all"].')
Expand All @@ -63,8 +63,8 @@ def compute_stats(self, sample, context=False):
def process(self, sample):
image_sizes = sample[Fields.stats][StatsKeys.image_sizes]
keep_bools = np.array([
size_to_bytes(self.min_size) <= image_size <= size_to_bytes(
self.max_size) for image_size in image_sizes
self.min_size <= image_size <= self.max_size
for image_size in image_sizes
])
if len(keep_bools) <= 0:
return True
Expand Down
3 changes: 2 additions & 1 deletion data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,5 @@ def size_to_bytes(size):
else:
raise ValueError(f'You specified unidentifiable unit: {suffix}, '
f'expected in [KB, MB, GB, TB, PB, EB, ZB, YB, '
f'KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB]')
f'KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB], '
f'(case insensitive, counted by *Bytes*).')

0 comments on commit f355d00

Please sign in to comment.