Merge pull request #152 from artefactory/feature/dataloader
Feature/dataloader
rafaelleaygalenq authored Jun 21, 2021
2 parents 8c0234b + fb6acf0 commit 8094379
Showing 9 changed files with 556 additions and 281 deletions.
65 changes: 64 additions & 1 deletion README.md
@@ -52,6 +52,11 @@ pip install https://github.com/explosion/spacy-models/releases/download/en_core_
pip install https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
```

To use our TextLoader class, you'll need to install Dask as well:
```bash
pip install dask[complete]==2021.3.0
```

# Preprocessing pipeline

## Default pipeline <a name="default_pipeline"></a>
@@ -64,7 +69,7 @@ text = "I just got the best dinner in my life @latourdargent !!! I recommend
preprocessor = Preprocessor()
text = preprocessor.run(text)
print(text)
# "I just got the best dinner in my life !!! I recommend"
# "I just got the best dinner in my life!!! I recommend"
```

## Create your custom pipeline <a name="custom_pipeline"></a>
@@ -94,6 +99,64 @@ print(text)
Take a look at all the functions that are available [here](https://github.com/artefactory/NLPretext/tree/master/nlpretext) in the ```preprocess.py``` scripts in the different folders: basic, social, token.


# Load text data

Pre-processing text data is useful only if you have loaded data to process! Importing text data as strings in your code is straightforward when you have a few short texts in a local .txt file, but it quickly becomes difficult when you need to load many texts, stored in multiple formats and split across multiple files. Fortunately, you can use NLPretext's TextLoader class to easily import text data.
The TextLoader class relies on Dask, so be sure to install the library as mentioned above.

```python
from nlpretext.textloader import TextLoader
files_path = "local_folder/texts/text.txt"
text_loader = TextLoader()
text_dataframe = text_loader.read_text(files_path)
print(text_dataframe.text.values.tolist())
# ["I just got the best dinner in my life!!!", "I recommend", "It was awesome"]
```

As TextLoader uses Dask to load data, file paths can be provided as a single string or a list of strings, with or without wildcards. It also supports imports from cloud providers, provided your machine is authenticated on the project.

```python
text_loader = TextLoader(text_column="name_of_text_column_in_your_data")

local_file_path = "local_folder/texts/text.csv" # File from local folder
local_corpus_path = ["local_folder/texts/text_1.csv", "local_folder/texts/text_2.csv", "local_folder/texts/text_3.csv"] # Multiple files from local folder

gcs_file_path = "gs://my-bucket/texts/text.json" # File from GCS
s3_file_path = "s3://my-bucket/texts/text.json" # File from S3
hdfs_file_path = "hdfs://folder/texts/text.txt" # File from HDFS
azure_file_path = "az://my-bucket/texts/text.parquet" # File from Azure

gcs_corpus_path = "gs://my-bucket/texts/text_*.json" # Multiple files from GCS with wildcard

text_dataframe_1 = text_loader.read_text(local_file_path)
text_dataframe_2 = text_loader.read_text(local_corpus_path)
text_dataframe_3 = text_loader.read_text(gcs_file_path)
text_dataframe_4 = text_loader.read_text(s3_file_path)
text_dataframe_5 = text_loader.read_text(hdfs_file_path)
text_dataframe_6 = text_loader.read_text(azure_file_path)
text_dataframe_7 = text_loader.read_text(gcs_corpus_path)

```

You can also specify a Preprocessor if you want your data to be pre-processed directly as it is loaded.
```python
text_loader = TextLoader(text_column="text_col")
preprocessor = Preprocessor()

local_file_path = "local_folder/texts/text.csv" # File from local folder

raw_text_dataframe = text_loader.read_text(local_file_path)
preprocessed_text_dataframe = text_loader.read_text(local_file_path, preprocessor=preprocessor)

print(raw_text_dataframe.text_col.values.tolist())
# ["These texts are not preprocessed", "This is bad ## "]

print(preprocessed_text_dataframe.text_col.values.tolist())
# ["These texts are not preprocessed", "This is bad"]
```
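
Since ```read_text``` accepts any Preprocessor, this also works with a custom pipeline built with ```pipe``` (see the custom pipeline section above). A minimal sketch, assuming the ```lower_text```, ```remove_hashtag``` and ```normalize_whitespace``` functions and a hypothetical ```text_col``` column:
```python
from nlpretext import Preprocessor
from nlpretext.basic.preprocess import lower_text, normalize_whitespace
from nlpretext.social.preprocess import remove_hashtag
from nlpretext.textloader import TextLoader

# Build a custom pipeline, as described in the custom pipeline section above
preprocessor = Preprocessor()
preprocessor.pipe(lower_text)
preprocessor.pipe(remove_hashtag)
preprocessor.pipe(normalize_whitespace)

# The custom pipeline is applied to the text column at loading time
text_loader = TextLoader(text_column="text_col")
clean_dataframe = text_loader.read_text("local_folder/texts/text.csv", preprocessor=preprocessor)
```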



# Individual Functions

## Replacing emails <a name="replace_emails"></a>
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.0.4
1.0.5
3 changes: 3 additions & 0 deletions nlpretext/_config/constants.py
@@ -227,3 +227,6 @@
HASHTAG_PATTERN = re.compile(r'#\w*')
AT_PATTERN = re.compile(r'@\w*')
HTML_TAG_PATTERN = re.compile(r'<.*?>')

# TEXT LOADER
TEXT_FILE_FORMATS_PATTERN = re.compile(r"^.*\.(json|csv|txt|parquet)(\.gz|\.zip)*$")
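
For reference, a minimal sketch of what this pattern accepts: plain or gzip-/zip-compressed files in the four supported formats, with capture group 1 holding the base format.
```python
import re

TEXT_FILE_FORMATS_PATTERN = re.compile(r"^.*\.(json|csv|txt|parquet)(\.gz|\.zip)*$")

print(bool(TEXT_FILE_FORMATS_PATTERN.match("texts/corpus.csv")))      # True
print(bool(TEXT_FILE_FORMATS_PATTERN.match("texts/corpus.json.gz")))  # True (compressed)
print(bool(TEXT_FILE_FORMATS_PATTERN.match("texts/corpus.xml")))      # False (unsupported)
print(TEXT_FILE_FORMATS_PATTERN.match("texts/corpus.parquet").group(1))  # "parquet"
```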
202 changes: 20 additions & 182 deletions nlpretext/_utils/file_loader.py
@@ -15,23 +15,9 @@
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import codecs
import glob
import io
import json
import os
import re
import shutil
import warnings
from typing import Optional, Union, List

import chardet


def open_textfile(filepath: str, encoding="utf-8"):
with io.open(filepath, "r", encoding=encoding) as f:
string = f.read()
return string
from nlpretext._config import constants


def detect_encoding(file_path_or_string: str, n_lines: int = 100) -> str:
@@ -50,187 +36,39 @@ def detect_encoding(file_path_or_string: str, n_lines: int = 100) -> str:
dict
the detected encoding and its confidence score, as returned by chardet.detect
"""
if os.path.isfile(file_path_or_string):
if isinstance(file_path_or_string, bytes):
rawdata = file_path_or_string
else:
with open(file_path_or_string, "rb") as f:
rawdata = b"".join([f.readline() for _ in range(n_lines)])
elif isinstance(file_path_or_string, bytes):
rawdata = file_path_or_string
return chardet.detect(rawdata)
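
As a usage sketch (hypothetical file path): the helper accepts either a path or raw bytes, reads up to ```n_lines``` lines, and returns chardet's result dict.
```python
from nlpretext._utils.file_loader import detect_encoding

result = detect_encoding("local_folder/texts/text.txt")  # reads the first 100 lines by default
print(result["encoding"], result["confidence"])  # e.g. "utf-8", 0.99
```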


def text_loader(
filepath: str, encoding: Optional[str] = None, detectencoding: bool = True
) -> str:
"""
This util loads a file. If the encoding is specified, will use the specified
encoding to load the text file.
If not specified, this function tries to open the doc as UTF-U, and if
it fails it will try to detect the encoding using **detect_encoding**
Parameters
----------
filepath : str
encoding : str, optional
If the encoding is specified, will use the specified encoding to load the text file.
detectencoding : bool
If file is not encoded into UTF-8, try to detect encoding using the chardet library.
Returns
-------
string
Raises
------
UnicodeDecodeError
if document can't be loaded with utf-8 encoding.
"""
if encoding is not None:
return open_textfile(filepath, encoding=encoding)
try:
return open_textfile(filepath, encoding="utf-8")
except UnicodeDecodeError:
warnings.warn(f"Encoding for {filepath} is not UTF-8.")
if detectencoding is True:
detected_encoding = detect_encoding(filepath)
warnings.warn(
f'{filepath}: detected encoding is {detected_encoding["encoding"]},\
with a confidence rate of {detected_encoding["confidence"]}'
)
return open_textfile(filepath, encoding=detected_encoding["encoding"])
raise UnicodeDecodeError(
"Cannot load document using utf-8. "
"Try to detect encoding using detectencoding=True"
)


def get_subfolders_path(folder: str) -> list:
def check_text_file_format(filepath) -> str:
"""
Get a list of all the subfolder for a folder path
"""
if not folder.endswith("/"):
folder = folder + "/"
return [
folder + f + "/"
for f in os.listdir(folder)
if os.path.isdir(os.path.join(folder, f)) and f != ".DS_Store"
]


def list_files_in_subdir(filepath: str) -> list:
"""
Get a list of all the filepath of files in directory and subdirectory.
"""
res = []
for path, _, files in os.walk(filepath):
for name in files:
res.append(os.path.join(path, name))
return res


def list_files(filepath: str) -> List[str]:
"""
List files within a given filepath.
Retrieve the format of a file path or list of file paths, among .csv, .json, .parquet and .txt
Parameters
----------
filepath : str
Supports wildcard "*" character.
Returns
-------
list
list of filepaths
"""

if os.path.isdir(filepath) and len(re.findall(r"[\w.]$", filepath)) > 0:
filepath = filepath + "/*"
if filepath.endswith("/"):
filepath = filepath + "*"
return [file for file in glob.glob(filepath) if os.path.isfile(file)]


def documents_loader(
filepath: str,
encoding: Optional[str] = None,
detectencoding: bool = True,
output_as: str = "dict",
) -> Union[str, list, dict]:
"""
Input a filepath, a filepath with wildcard (eg. *.txt),
or a list of filepaths. Output a string, or a dict of strings.
Parameters
----------
filepath : str
filepath : str | list(str)
A filepath with wildcard (eg. *.txt), or a list of filepaths.
encoding : str, optional
if not specified, will try to detect encoding except if detectencoding is false.
detectencoding : bool
if True and if encoding is not specified, will try to detect encoding using chardet.
output_as: str {list, dict}
If dict, key will be the filename.
Returns
-------
Union[string, list, dict]
The document loaded.
str
Format of the specified file path, among .json, .csv, .parquet or .txt
"""
if isinstance(filepath, str):
documents = list_files(filepath)
nb_of_documents = len(documents)
elif isinstance(filepath, list):
nb_of_documents = len(filepath)
documents = filepath
else:
raise TypeError("Please enter a valid filepath or a valid list of filepath")

if nb_of_documents == 1:
return text_loader(
documents[0], encoding=encoding, detectencoding=detectencoding
)
if nb_of_documents > 1:
if output_as == "list":
return [
text_loader(document, encoding=encoding, detectencoding=detectencoding)
for document in documents
]
if output_as == "dict":
return {
document: text_loader(
document, encoding=encoding, detectencoding=detectencoding
)
for document in documents
}
raise TypeError("Enter a valid output format between list or dict")
raise IOError("No files detected in {}".format(filepath))

pattern = constants.TEXT_FILE_FORMATS_PATTERN
if not isinstance(filepath, list):
filepath = [filepath]

## CSV Loader
format_re_list = [pattern.match(path) for path in filepath]
if None in format_re_list:
raise ValueError("Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted")

format_list = [format_re.group(1) for format_re in format_re_list]
if len(set(format_list)) > 1:
raise ValueError(f"Multiple file formats found in file path list: {format_list}")

def encode_columns(df, columns_to_encode: str):
"""
apply json.dumps on columns
"""
for col in columns_to_encode:
df[col] = df[col].apply(json.dumps)


def decode_columns(df, columns_to_encode: str):
"""
apply json.loads on columns
"""
for col in columns_to_encode:
df[col] = df[col].apply(json.loads)
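
For context, a brief sketch of what these two helpers do: they round-trip dataframe columns through JSON strings (hypothetical dataframe for illustration).
```python
import json
import pandas as pd

df = pd.DataFrame({"meta": [{"lang": "en"}, {"lang": "fr"}]})

# encode_columns: serialize each cell of the column to a JSON string
df["meta"] = df["meta"].apply(json.dumps)
print(df["meta"].tolist())  # ['{"lang": "en"}', '{"lang": "fr"}']

# decode_columns: parse the JSON strings back into Python objects
df["meta"] = df["meta"].apply(json.loads)
print(df["meta"].tolist())  # [{'lang': 'en'}, {'lang': 'fr'}]
```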


## Encoding functions


def convert_encoding(filepath: str, input_encoding: str, output_encoding: str):
"""
Encode a file according to a specified encoding.
"""
with codecs.open(filepath, encoding=input_encoding) as input_file:
with codecs.open("encoded_" + filepath, "w", encoding=output_encoding) as output_file:
shutil.copyfileobj(input_file, output_file)
file_format = format_list[0]
return file_format
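
A quick sketch of the new helper's behaviour, with hypothetical paths:
```python
from nlpretext._utils.file_loader import check_text_file_format

print(check_text_file_format("texts/corpus.csv"))  # "csv"
print(check_text_file_format(["part_1.json.gz", "part_2.json.gz"]))  # "json"

# Mixing formats, or passing an unsupported extension, raises a ValueError
check_text_file_format(["one.csv", "two.parquet"])  # ValueError: multiple file formats
```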