diff --git a/config.json b/config.json
index 0b41b6f..399d141 100644
--- a/config.json
+++ b/config.json
@@ -12,6 +12,6 @@
     {
       "type": "threshold",
      "value": "0.02"
-    }
-
+    },
+  "output_unit": "paragraph"
 }
diff --git a/interest/__init__.py b/interest/__init__.py
index 5170041..e69de29 100644
--- a/interest/__init__.py
+++ b/interest/__init__.py
@@ -1,7 +0,0 @@
-# from interest.preprocessor.parser import XMLExtractor
-from interest.delpher_kranten import KrantenFile
-
-INPUT_FILE_TYPES = {
-    "delpher_kranten": KrantenFile
-
-}
diff --git a/interest/article_final_selection/__init__.py b/interest/article_final_selection/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/interest/article_final_selection/process_article.py b/interest/article_final_selection/process_article.py
index b3770ee..198c185 100644
--- a/interest/article_final_selection/process_article.py
+++ b/interest/article_final_selection/process_article.py
@@ -44,11 +44,11 @@ def __init__(self, gzip_file_path: str, article_id: int):
         self._file_path = gzip_file_path
         self._article_id = article_id
         self._title: Union[str, None] = ''
-        self._body: Union[str, None] = ''
+        self._body: Union[str, list, None] = ''
         self.selected: bool = False
 
-    def _read_article_from_gzip(self) -> Tuple[Union[str, None],
-                                               Union[str, None]]:
+    def read_article_from_gzip(self, in_paragraph: bool = False) -> (
+            Tuple)[Union[str, None], Union[str, list, None]]:
         """
         Read article content from a gzip file.
 
@@ -63,8 +63,7 @@ def _read_article_from_gzip(self) -> Tuple[Union[str, None],
             article = articles.get(str(self._article_id), {})
             title = article.get('title', {})
             body = article.get('body', {})
-            body_string = " ".join(body)
-            return title, body_string
+            return title, body if in_paragraph else " ".join(body)
         except Exception as e:  # pylint: disable=broad-except
             logging.error("Error reading article %s from %s: %s",
                           str(self._article_id), self._file_path, e)
@@ -80,7 +79,7 @@ def process_article(self, clean_keywords: List[str]) -> str:
         Returns:
             str: The processed article body.
""" - self._title, self._body = self._read_article_from_gzip() + self._title, self._body = self.read_article_from_gzip() if (self._title is None) or (self._body is None): return "" clean_title = clean(self._title) @@ -89,4 +88,6 @@ def process_article(self, clean_keywords: List[str]) -> str: if title_with_keyword: self.selected = True return "" - return clean(self._body) + if isinstance(self._body, str): + return clean(self._body) + return "" diff --git a/interest/filter/__init__.py b/interest/filter/__init__.py new file mode 100644 index 0000000..5618aa7 --- /dev/null +++ b/interest/filter/__init__.py @@ -0,0 +1,7 @@ +"""define input-file type""" +from interest.filter.delpher_kranten import KrantenFile + +INPUT_FILE_TYPES = { + "delpher_kranten": KrantenFile + +} diff --git a/interest/delpher_kranten.py b/interest/filter/delpher_kranten.py similarity index 97% rename from interest/delpher_kranten.py rename to interest/filter/delpher_kranten.py index 2e77575..ec2dc85 100644 --- a/interest/delpher_kranten.py +++ b/interest/filter/delpher_kranten.py @@ -8,8 +8,8 @@ import logging import os from typing import Optional -from interest.document import Document, Article -from interest.input_file import InputFile +from interest.filter.document import Document, Article +from interest.filter.input_file import InputFile class KrantenFile(InputFile): diff --git a/interest/document.py b/interest/filter/document.py similarity index 100% rename from interest/document.py rename to interest/filter/document.py diff --git a/interest/document_filter.py b/interest/filter/document_filter.py similarity index 99% rename from interest/document_filter.py rename to interest/filter/document_filter.py index db02309..19f5412 100644 --- a/interest/document_filter.py +++ b/interest/filter/document_filter.py @@ -4,7 +4,7 @@ """ from abc import ABC, abstractmethod from typing import List -from interest.document import Document, Article +from interest.filter.document import Document, Article class DocumentFilter(ABC): diff --git a/interest/input_file.py b/interest/filter/input_file.py similarity index 96% rename from interest/input_file.py rename to interest/filter/input_file.py index 7992ec6..dcb7504 100644 --- a/interest/input_file.py +++ b/interest/filter/input_file.py @@ -8,8 +8,8 @@ import logging from pathlib import Path from typing import Iterable, TextIO, cast, Optional -from interest.document import Document, Article -from interest.document_filter import DocumentFilter +from interest.filter.document import Document, Article +from interest.filter.document_filter import DocumentFilter class InputFile(abc.ABC): diff --git a/interest/utils.py b/interest/utils.py index 2c51f85..95ac80d 100644 --- a/interest/utils.py +++ b/interest/utils.py @@ -8,9 +8,12 @@ import json import spacy import spacy.cli -from interest.document_filter import YearFilter, TitleFilter, DocumentFilter -from interest.document_filter import (CompoundFilter, DecadeFilter, - KeywordsFilter) +from interest.filter.document_filter import (YearFilter, + TitleFilter, + DocumentFilter) +from interest.filter.document_filter import (CompoundFilter, + DecadeFilter, + KeywordsFilter) from interest.settings import ENCODING @@ -131,6 +134,34 @@ def get_article_selector_from_config(config_file: Path) -> dict: from exc +def get_output_unit_from_config(config_file: Path) -> dict: + """ + Get the article selector configuration from a JSON file. + + Args: + config_file (Path): The path to the JSON config file. 
+
+    Returns:
+        str: The output unit, e.g. "paragraph".
+
+    Raises:
+        KeyError: If the output unit
+        is not found in the config file.
+        FileNotFoundError: If the config file is not found.
+    """
+    try:
+        with open(config_file, 'r', encoding=ENCODING) as f:
+            config: str = json.load(f)["output_unit"]
+            if not config:
+                raise ValueError("Config is empty")
+            return config
+    except FileNotFoundError as exc:
+        raise FileNotFoundError("Config file not found") from exc
+    except KeyError as exc:
+        raise KeyError("Output unit not found in config file") \
+            from exc
+
+
 def save_filtered_articles(input_file: Any, article_id: str,
                            output_dir: str) -> None:
     """Save filtered articles data to a JSON file.
diff --git a/scripts/step1_filter_articles.py b/scripts/step1_filter_articles.py
index d1c46dd..6962405 100644
--- a/scripts/step1_filter_articles.py
+++ b/scripts/step1_filter_articles.py
@@ -9,8 +9,8 @@
 
 from tqdm import tqdm
 
-from interest import INPUT_FILE_TYPES
-from interest.input_file import InputFile
+from interest.filter import INPUT_FILE_TYPES
+from interest.filter.input_file import InputFile
 from interest.utils import load_filters_from_config
 from interest.utils import save_filtered_articles
 
diff --git a/scripts/step4_generate_output.py b/scripts/step4_generate_output.py
new file mode 100644
index 0000000..529ed8f
--- /dev/null
+++ b/scripts/step4_generate_output.py
@@ -0,0 +1,119 @@
+"""This script reads selected articles from CSV files,
+and saves their text for manual labeling"""
+import argparse
+import logging
+import os
+from pathlib import Path
+from typing import Union
+import pandas as pd
+from pandas import DataFrame
+from interest.article_final_selection.process_article import ArticleProcessor
+from interest.utils import get_output_unit_from_config
+
+FILE_PATH_FIELD = "file_path"
+ARTICLE_ID_FIELD = "article_id"
+TITLE_FIELD = "title"
+BODY_FIELD = "body"
+LABEL_FIELD = "label"
+SELECTED_FIELD = "selected"
+
+
+def read_article(row: pd.Series, in_paragraph: bool = False) -> DataFrame:
+    """
+    Read article from row and return DataFrame of articles.
+
+    Args:
+        row (pd.Series): A row from a DataFrame.
+        in_paragraph (bool, optional): Whether to read article in paragraphs.
+        Defaults to False.
+
+    Returns:
+        DataFrame: DataFrame containing article information.
+    """
+    file_path = row[FILE_PATH_FIELD]
+    article_id = row[ARTICLE_ID_FIELD]
+    article_processor = ArticleProcessor(file_path, article_id)
+    title, body = article_processor.read_article_from_gzip(in_paragraph)
+
+    titles = [title] * len(body) if in_paragraph and body is not None else [title]
+    files_path = [file_path] * len(body) if in_paragraph and body is not None else [file_path]
+    articles_id = [article_id] * len(body) if in_paragraph and body is not None else [article_id]
+    label = [''] * len(body) if in_paragraph and body is not None else ['']
+    return pd.DataFrame({FILE_PATH_FIELD: files_path,
+                         ARTICLE_ID_FIELD: articles_id,
+                         TITLE_FIELD: titles,
+                         BODY_FIELD: body,
+                         LABEL_FIELD: label})
+
+
+def find_articles_in_file(filepath: str, in_paragraph: bool) -> (
+        Union)[DataFrame, None]:
+    """
+    Find selected articles in a CSV file and return DataFrame of articles.
+
+    Args:
+        filepath (str): Path to the CSV file.
+        in_paragraph (bool): Whether to read articles in paragraphs.
+
+    Returns:
+        DataFrame: DataFrame containing selected articles information.
+ """ + try: + df_articles = pd.read_csv(filepath) + df_selected = df_articles.loc[df_articles[SELECTED_FIELD] == 1] + + result = pd.concat([read_article(row, in_paragraph=in_paragraph) + for _, row in df_selected.iterrows()], + axis=0, ignore_index=True) + return result + except Exception as e: # pylint: disable=W0718 + logging.error("Error reading selected indices in file: %s", e) + return None + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Select final articles.") + + parser.add_argument( + "--input_dir", + type=Path, + required=True, + help="Base directory for reading input files.", + ) + parser.add_argument( + "--glob", + type=str, + default="*.csv", + help="Glob pattern for find input files; e.g. '*.csv'.", + ) + parser.add_argument( + "--config_path", + type=Path, + default="config.json", + help="File path of config file.", + ) + parser.add_argument( + "--output_dir", + type=Path, + required=True, + help="The directory for storing output files.", + ) + + args = parser.parse_args() + + if not args.input_dir.is_dir(): + parser.error(f"Not a directory: '{str(args.input_dir.absolute())}'") + + args.output_dir.mkdir(parents=True, exist_ok=True) + config_output_unit = get_output_unit_from_config(args.config_path) + + result_df = pd.DataFrame(columns=[FILE_PATH_FIELD, ARTICLE_ID_FIELD, + TITLE_FIELD, BODY_FIELD, LABEL_FIELD]) + IN_PARAGRAPH = config_output_unit == "paragraph" + + for articles_filepath in args.input_dir.rglob(args.glob): + df = find_articles_in_file(articles_filepath, + in_paragraph=IN_PARAGRAPH) + result_df = pd.concat([result_df, df], ignore_index=True) + + result_df.to_csv(os.path.join(args.output_dir, 'articles_to_label.csv'))