diff --git a/tempo_embeddings/io/corpus_reader.py b/tempo_embeddings/io/corpus_reader.py index efcd7ee..d537315 100644 --- a/tempo_embeddings/io/corpus_reader.py +++ b/tempo_embeddings/io/corpus_reader.py @@ -1,4 +1,6 @@ +import csv import json +import logging from pathlib import Path from typing import Any, Iterable, Optional @@ -62,15 +64,18 @@ def build_corpora( for file in tqdm(files[:max_files], desc=self.directory.name, unit="file"): if self.loader_type == "csv": - yield Corpus.from_csv_file( - filepath=file, - text_columns=self.text_columns, - filter_terms=filter_terms, - encoding=self.encoding, - compression=self.compression, - delimiter=self.delimiter, - segmenter=segmenter, - ) + try: + yield Corpus.from_csv_file( + filepath=file, + text_columns=self.text_columns, + filter_terms=filter_terms, + encoding=self.encoding, + compression=self.compression, + delimiter=self.delimiter, + segmenter=segmenter, + ) + except csv.Error as e: + logging.error("Error reading file %s: %s", file, e) else: raise NotImplementedError(f"Unrecognized format '{self.file_type}'")