Skip to content

Commit

Permalink
Handle csv parsing errors.
Browse files (browse the repository at this point in the history)
  • Loading branch information
carschno committed Nov 5, 2024
1 parent acd6297 commit 968fdcf
Showing 1 changed file with 14 additions and 9 deletions.
23 changes: 14 additions & 9 deletions tempo_embeddings/io/corpus_reader.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
import csv
import json
import logging
from pathlib import Path
from typing import Any, Iterable, Optional

Expand Down Expand Up @@ -62,15 +64,18 @@ def build_corpora(

for file in tqdm(files[:max_files], desc=self.directory.name, unit="file"):
if self.loader_type == "csv":
yield Corpus.from_csv_file(
filepath=file,
text_columns=self.text_columns,
filter_terms=filter_terms,
encoding=self.encoding,
compression=self.compression,
delimiter=self.delimiter,
segmenter=segmenter,
)
try:
yield Corpus.from_csv_file(
filepath=file,
text_columns=self.text_columns,
filter_terms=filter_terms,
encoding=self.encoding,
compression=self.compression,
delimiter=self.delimiter,
segmenter=segmenter,
)
except csv.Error as e:
logging.error("Error reading file %s: %s", file, e)
else:
raise NotImplementedError(f"Unrecognized format '{self.file_type}'")

Expand Down

0 comments on commit 968fdcf

Please sign in to comment.