Merge pull request #152 from artefactory/feature/dataloader
Feature/dataloader
rafaelleaygalenq authored Jun 21, 2021
2 parents 8c0234b + fb6acf0 commit 8094379
Showing 9 changed files with 556 additions and 281 deletions.
65 changes: 64 additions & 1 deletion README.md
@@ -52,6 +52,11 @@ pip install https://github.com/explosion/spacy-models/releases/download/en_core_
pip install https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
```

To use our TextLoader class, you'll need to install Dask as well:
```bash
pip install dask[complete]==2021.3.0
```

# Preprocessing pipeline

## Default pipeline <a name="default_pipeline"></a>
@@ -64,7 +69,7 @@ text = "I just got the best dinner in my life @latourdargent !!! I recommend
preprocessor = Preprocessor()
text = preprocessor.run(text)
print(text)
# "I just got the best dinner in my life !!! I recommend"
# "I just got the best dinner in my life!!! I recommend"
```

## Create your custom pipeline <a name="custom_pipeline"></a>
@@ -94,6 +99,64 @@ print(text)
Take a look at all the functions that are available [here](https://github.com/artefactory/NLPretext/tree/master/nlpretext) in the ```preprocess.py``` scripts in the different folders: basic, social, token.


# Load text data

Pre-processing text data is useful only if you have loaded data to process! Importing text data as strings in your code is straightforward when you have a few short texts in a local .txt file, but it quickly becomes difficult when you need to load many texts, stored in multiple formats and split across multiple files. Fortunately, you can use NLPretext's TextLoader class to easily import text data.
The TextLoader class relies on Dask, so be sure to install the library as mentioned above.

```python
from nlpretext.textloader import TextLoader
files_path = "local_folder/texts/text.txt"
text_loader = TextLoader()
text_dataframe = text_loader.read_text(files_path)
print(text_dataframe.text.values.tolist())
# ["I just got the best dinner in my life!!!", "I recommend", "It was awesome"]
```

As TextLoader uses Dask to load data, file paths can be provided as a single string or a list of strings, with or without wildcards. It also supports imports from cloud providers, provided your machine is authenticated on the project.

```python
text_loader = TextLoader(text_column="name_of_text_column_in_your_data")

local_file_path = "local_folder/texts/text.csv" # File from local folder
local_corpus_path = ["local_folder/texts/text_1.csv", "local_folder/texts/text_2.csv", "local_folder/texts/text_3.csv"] # Multiple files from local folder

gcs_file_path = "gs://my-bucket/texts/text.json" # File from GCS
s3_file_path = "s3://my-bucket/texts/text.json" # File from S3
hdfs_file_path = "hdfs://folder/texts/text.txt" # File from HDFS
azure_file_path = "az://my-bucket/texts/text.parquet" # File from Azure

gcs_corpus_path = "gs://my-bucket/texts/text_*.json" # Multiple files from GCS with wildcard

text_dataframe_1 = text_loader.read_text(local_file_path)
text_dataframe_2 = text_loader.read_text(local_corpus_path)
text_dataframe_3 = text_loader.read_text(gcs_file_path)
text_dataframe_4 = text_loader.read_text(s3_file_path)
text_dataframe_5 = text_loader.read_text(hdfs_file_path)
text_dataframe_6 = text_loader.read_text(azure_file_path)
text_dataframe_7 = text_loader.read_text(gcs_corpus_path)

```

You can also specify a Preprocessor if you want your data to be pre-processed directly as it is loaded.
```python
text_loader = TextLoader(text_column="text_col")
preprocessor = Preprocessor()

local_file_path = "local_folder/texts/text.csv" # File from local folder

raw_text_dataframe = text_loader.read_text(local_file_path)
preprocessed_text_dataframe = text_loader.read_text(local_file_path, preprocessor=preprocessor)

print(raw_text_dataframe.text_col.values.tolist())
# ["These texts are not preprocessed", "This is bad ## "]

print(preprocessed_text_dataframe.text_col.values.tolist())
# ["These texts are not preprocessed", "This is bad"]
```
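
Since ```read_text``` accepts any Preprocessor, this also works with a custom pipeline built with ```pipe``` (see the custom pipeline section above). A minimal sketch, assuming the ```lower_text```, ```remove_hashtag``` and ```normalize_whitespace``` functions and a hypothetical ```text_col``` column:
```python
from nlpretext import Preprocessor
from nlpretext.basic.preprocess import lower_text, normalize_whitespace
from nlpretext.social.preprocess import remove_hashtag
from nlpretext.textloader import TextLoader

# Build a custom pipeline, as described in the custom pipeline section above
preprocessor = Preprocessor()
preprocessor.pipe(lower_text)
preprocessor.pipe(remove_hashtag)
preprocessor.pipe(normalize_whitespace)

# The custom pipeline is applied to the text column at loading time
text_loader = TextLoader(text_column="text_col")
clean_dataframe = text_loader.read_text("local_folder/texts/text.csv", preprocessor=preprocessor)
```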



# Individual Functions

## Replacing emails <a name="replace_emails"></a>
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.0.4
1.0.5
3 changes: 3 additions & 0 deletions nlpretext/_config/constants.py
@@ -227,3 +227,6 @@
HASHTAG_PATTERN = re.compile(r'#\w*')
AT_PATTERN = re.compile(r'@\w*')
HTML_TAG_PATTERN = re.compile(r'<.*?>')

# TEXT LOADER
TEXT_FILE_FORMATS_PATTERN = re.compile(r"^.*\.(json|csv|txt|parquet)(\.gz|\.zip)*$")
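
For reference, a minimal sketch of what this pattern accepts: plain or gzip-/zip-compressed files in the four supported formats, with capture group 1 holding the base format.
```python
import re

TEXT_FILE_FORMATS_PATTERN = re.compile(r"^.*\.(json|csv|txt|parquet)(\.gz|\.zip)*$")

print(bool(TEXT_FILE_FORMATS_PATTERN.match("texts/corpus.csv")))      # True
print(bool(TEXT_FILE_FORMATS_PATTERN.match("texts/corpus.json.gz")))  # True (compressed)
print(bool(TEXT_FILE_FORMATS_PATTERN.match("texts/corpus.xml")))      # False (unsupported)
print(TEXT_FILE_FORMATS_PATTERN.match("texts/corpus.parquet").group(1))  # "parquet"
```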
202 changes: 20 additions & 182 deletions nlpretext/_utils/file_loader.py
@@ -15,23 +15,9 @@
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import codecs
import glob
import io
import json
import os
import re
import shutil
import warnings
from typing import Optional, Union, List

import chardet


def open_textfile(filepath: str, encoding="utf-8"):
with io.open(filepath, "r", encoding=encoding) as f:
string = f.read()
return string
from nlpretext._config import constants


def detect_encoding(file_path_or_string: str, n_lines: int = 100) -> str:
@@ -50,187 +36,39 @@ def detect_encoding(file_path_or_string: str, n_lines: int = 100) -> str:
dict
the detected encoding and its confidence score, as returned by chardet.detect
"""
if os.path.isfile(file_path_or_string):
if isinstance(file_path_or_string, bytes):
rawdata = file_path_or_string
else:
with open(file_path_or_string, "rb") as f:
rawdata = b"".join([f.readline() for _ in range(n_lines)])
elif isinstance(file_path_or_string, bytes):
rawdata = file_path_or_string
return chardet.detect(rawdata)
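
As a usage sketch (hypothetical file path): the helper accepts either a path or raw bytes, reads up to ```n_lines``` lines, and returns chardet's result dict.
```python
from nlpretext._utils.file_loader import detect_encoding

result = detect_encoding("local_folder/texts/text.txt")  # reads the first 100 lines by default
print(result["encoding"], result["confidence"])  # e.g. "utf-8", 0.99
```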


def text_loader(
filepath: str, encoding: Optional[str] = None, detectencoding: bool = True
) -> str:
"""
This util loads a file. If the encoding is specified, will use the specified
encoding to load the text file.
If not specified, this function tries to open the doc as UTF-U, and if
it fails it will try to detect the encoding using **detect_encoding**
Parameters
----------
filepath : str
encoding : str, optional
If the encoding is specified, will use the specified encoding to load the text file.
detectencoding : bool
If file is not encoded into UTF-8, try to detect encoding using the chardet library.
Returns
-------
string
Raises
------
UnicodeDecodeError
if document can't be loaded with utf-8 encoding.
"""
if encoding is not None:
return open_textfile(filepath, encoding=encoding)
try:
return open_textfile(filepath, encoding="utf-8")
except UnicodeDecodeError:
warnings.warn(f"Encoding for {filepath} is not UTF-8.")
if detectencoding is True:
detected_encoding = detect_encoding(filepath)
warnings.warn(
f'{filepath}: detected encoding is {detected_encoding["encoding"]},\
with a confidence rate of {detected_encoding["confidence"]}'
)
return open_textfile(filepath, encoding=detected_encoding["encoding"])
raise UnicodeDecodeError(
"Cannot load document using utf-8. "
"Try to detect encoding using detectencoding=True"
)


def get_subfolders_path(folder: str) -> list:
def check_text_file_format(filepath) -> str:
"""
Get a list of all the subfolder for a folder path
"""
if not folder.endswith("/"):
folder = folder + "/"
return [
folder + f + "/"
for f in os.listdir(folder)
if os.path.isdir(os.path.join(folder, f)) and f != ".DS_Store"
]


def list_files_in_subdir(filepath: str) -> list:
"""
Get a list of all the filepath of files in directory and subdirectory.
"""
res = []
for path, _, files in os.walk(filepath):
for name in files:
res.append(os.path.join(path, name))
return res


def list_files(filepath: str) -> List[str]:
"""
List files within a given filepath.
Retrieve the format of a file path or list of file paths, among .csv, .json, .parquet and .txt
Parameters
----------
filepath : str
Supports wildcard "*" character.
Returns
-------
list
list of filepaths
"""

if os.path.isdir(filepath) and len(re.findall(r"[\w.]$", filepath)) > 0:
filepath = filepath + "/*"
if filepath.endswith("/"):
filepath = filepath + "*"
return [file for file in glob.glob(filepath) if os.path.isfile(file)]


def documents_loader(
filepath: str,
encoding: Optional[str] = None,
detectencoding: bool = True,
output_as: str = "dict",
) -> Union[str, list, dict]:
"""
Input a filepath, a filepath with wildcard (eg. *.txt),
or a list of filepaths. Output a string, or a dict of strings.
Parameters
----------
filepath : str
filepath : str | list(str)
A filepath with wildcard (eg. *.txt), or a list of filepaths.
encoding : str, optional
if not specified, will try to detect encoding except if detectencoding is false.
detectencoding : bool
if True and if encoding is not specified, will try to detect encoding using chardet.
output_as: str {list, dict}
If dict, key will be the filename.
Returns
-------
Union[string, list, dict]
The document loaded.
str
Format of the specified file path, among .json, .csv, .parquet or .txt
"""
if isinstance(filepath, str):
documents = list_files(filepath)
nb_of_documents = len(documents)
elif isinstance(filepath, list):
nb_of_documents = len(filepath)
documents = filepath
else:
raise TypeError("Please enter a valid filepath or a valid list of filepath")

if nb_of_documents == 1:
return text_loader(
documents[0], encoding=encoding, detectencoding=detectencoding
)
if nb_of_documents > 1:
if output_as == "list":
return [
text_loader(document, encoding=encoding, detectencoding=detectencoding)
for document in documents
]
if output_as == "dict":
return {
document: text_loader(
document, encoding=encoding, detectencoding=detectencoding
)
for document in documents
}
raise TypeError("Enter a valid output format between list or dict")
raise IOError("No files detected in {}".format(filepath))

pattern = constants.TEXT_FILE_FORMATS_PATTERN
if not isinstance(filepath, list):
filepath = [filepath]

## CSV Loader
format_re_list = [pattern.match(path) for path in filepath]
if None in format_re_list:
raise ValueError("Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted")

format_list = [format_re.group(1) for format_re in format_re_list]
if len(set(format_list)) > 1:
raise ValueError(f"Multiple file formats found in file path list: {format_list}")

def encode_columns(df, columns_to_encode: str):
"""
apply json.dumps on columns
"""
for col in columns_to_encode:
df[col] = df[col].apply(json.dumps)


def decode_columns(df, columns_to_encode: str):
"""
apply json.loads on columns
"""
for col in columns_to_encode:
df[col] = df[col].apply(json.loads)
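
For context, a brief sketch of what these two helpers do: they round-trip dataframe columns through JSON strings (hypothetical dataframe for illustration).
```python
import json
import pandas as pd

df = pd.DataFrame({"meta": [{"lang": "en"}, {"lang": "fr"}]})

# encode_columns: serialize each cell of the column to a JSON string
df["meta"] = df["meta"].apply(json.dumps)
print(df["meta"].tolist())  # ['{"lang": "en"}', '{"lang": "fr"}']

# decode_columns: parse the JSON strings back into Python objects
df["meta"] = df["meta"].apply(json.loads)
print(df["meta"].tolist())  # [{'lang': 'en'}, {'lang': 'fr'}]
```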


## Encoding functions


def convert_encoding(filepath: str, input_encoding: str, output_encoding: str):
"""
Encode a file according to a specified encoding.
"""
with codecs.open(filepath, encoding=input_encoding) as input_file:
with codecs.open("encoded_" + filepath, "w", encoding=output_encoding) as output_file:
shutil.copyfileobj(input_file, output_file)
file_format = format_list[0]
return file_format
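
A quick sketch of the new helper's behaviour, with hypothetical paths:
```python
from nlpretext._utils.file_loader import check_text_file_format

print(check_text_file_format("texts/corpus.csv"))  # "csv"
print(check_text_file_format(["part_1.json.gz", "part_2.json.gz"]))  # "json"

# Mixing formats, or passing an unsupported extension, raises a ValueError
check_text_file_format(["one.csv", "two.parquet"])  # ValueError: multiple file formats
```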