From 75e328f323b2778490b443d4d1c5f8d9fc7089ac Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 22 Feb 2024 14:51:03 +0800 Subject: [PATCH] Fix typos and unit capitalization in docstrings --- data_juicer/ops/common/helper_func.py | 4 ++-- data_juicer/ops/filter/image_size_filter.py | 4 ++-- data_juicer/ops/mapper/remove_non_chinese_character_mapper.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/data_juicer/ops/common/helper_func.py b/data_juicer/ops/common/helper_func.py index fdf38d153..3e23c3948 100644 --- a/data_juicer/ops/common/helper_func.py +++ b/data_juicer/ops/common/helper_func.py @@ -32,7 +32,7 @@ def strip(document, strip_characters): emojis). :param document: document to be processed - :param strip_characters: characters uesd for stripping document + :param strip_characters: characters used for stripping document :return: stripped document """ if not document: @@ -76,7 +76,7 @@ def split_on_newline_tab_whitespace(document): First split on "\\\\n", then on "\\\\t", then on " ". :param document: document to be splited - :return: setence list obtained after splitting document + :return: sentence list obtained after splitting document """ sentences = document.split('\n') sentences = [sentence.split('\t') for sentence in sentences] diff --git a/data_juicer/ops/filter/image_size_filter.py b/data_juicer/ops/filter/image_size_filter.py index 76d537cdc..da7488fc4 100644 --- a/data_juicer/ops/filter/image_size_filter.py +++ b/data_juicer/ops/filter/image_size_filter.py @@ -8,7 +8,7 @@ @OPERATORS.register_module('image_size_filter') class ImageSizeFilter(Filter): - """Keep data samples whose image size (in bytes/kb/MB/...) within a + """Keep data samples whose image size (in Bytes/KB/MB/...) within a specific range. """ @@ -24,7 +24,7 @@ def __init__(self, :param min_size: The min image size to keep samples. set to be "0" by default for no size constraint :param max_size: The max image size to keep samples. 
set to be - "1Tb" by default, an approximate for un-limited case + "1TB" by default, an approximate for un-limited case :param any_or_all: keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the diff --git a/data_juicer/ops/mapper/remove_non_chinese_character_mapper.py b/data_juicer/ops/mapper/remove_non_chinese_character_mapper.py index 14b799452..3e6cd494d 100644 --- a/data_juicer/ops/mapper/remove_non_chinese_character_mapper.py +++ b/data_juicer/ops/mapper/remove_non_chinese_character_mapper.py @@ -16,7 +16,7 @@ def __init__(self, """ Initialization method. - :param keep_alphabet: whether to keep alpabet + :param keep_alphabet: whether to keep alphabet :param keep_number: whether to keep number :param keep_punc: whether to keep punctuation :param args: extra args