UtrechtUniversity · parisa-zahedi · Apr 4, 2024 · Apr 3, 2024 · Apr 3, 2024 · Apr 3, 2024
diff --git a/config.json b/config.json
@@ -12,6 +12,6 @@
     {
       "type": "threshold",
       "value": "0.02"
-    }
-
+    },
+  "output_unit": "paragraph"
 }
diff --git a/interest/__init__.py b/interest/__init__.py
@@ -1,7 +0,0 @@
-# from interest.preprocessor.parser import XMLExtractor
-from interest.delpher_kranten import KrantenFile
-
-INPUT_FILE_TYPES = {
-    "delpher_kranten": KrantenFile
-
-}

diff --git a/interest/article_final_selection/__init__.py b/interest/article_final_selection/__init__.py
diff --git a/interest/article_final_selection/process_article.py b/interest/article_final_selection/process_article.py
@@ -44,11 +44,11 @@ def __init__(self, gzip_file_path: str, article_id: int):
         self._file_path = gzip_file_path
         self._article_id = article_id
         self._title: Union[str, None] = ''
-        self._body: Union[str, None] = ''
+        self._body: Union[str, list, None] = ''
         self.selected: bool = False
 
-    def _read_article_from_gzip(self) -> Tuple[Union[str, None],
-                                               Union[str, None]]:
+    def read_article_from_gzip(self, in_paragraph: bool = False) -> (
+            Tuple)[Union[str, None], Union[str, list, None]]:
         """
         Read article content from a gzip file.
 
@@ -63,8 +63,7 @@ def _read_article_from_gzip(self) -> Tuple[Union[str, None],
                 article = articles.get(str(self._article_id), {})
                 title = article.get('title', {})
                 body = article.get('body', {})
-                body_string = " ".join(body)
-                return title, body_string
+                return title, body if in_paragraph else " ".join(body)
         except Exception as e:  # pylint: disable=broad-except
             logging.error("Error reading article %s from %s: %s",
                           str(self._article_id), self._file_path, e)
@@ -80,7 +79,7 @@ def process_article(self, clean_keywords: List[str]) -> str:
         Returns:
             str: The processed article body.
         """
-        self._title, self._body = self._read_article_from_gzip()
+        self._title, self._body = self.read_article_from_gzip()
         if (self._title is None) or (self._body is None):
             return ""
         clean_title = clean(self._title)
@@ -89,4 +88,6 @@ def process_article(self, clean_keywords: List[str]) -> str:
         if title_with_keyword:
             self.selected = True
             return ""
-        return clean(self._body)
+        if isinstance(self._body, str):
+            return clean(self._body)
+        return ""
diff --git a/interest/filter/__init__.py b/interest/filter/__init__.py
@@ -0,0 +1,7 @@
+"""define input-file type"""
+from interest.filter.delpher_kranten import KrantenFile
+
+INPUT_FILE_TYPES = {
+    "delpher_kranten": KrantenFile
+
+}
diff --git a/interest/delpher_kranten.py → interest/filter/delpher_kranten.py b/interest/delpher_kranten.py → interest/filter/delpher_kranten.py
@@ -8,8 +8,8 @@
 import logging
 import os
 from typing import Optional
-from interest.document import Document, Article
-from interest.input_file import InputFile
+from interest.filter.document import Document, Article
+from interest.filter.input_file import InputFile
 
 
 class KrantenFile(InputFile):

diff --git a/interest/document.py → interest/filter/document.py b/interest/document.py → interest/filter/document.py
diff --git a/interest/document_filter.py → interest/filter/document_filter.py b/interest/document_filter.py → interest/filter/document_filter.py
@@ -4,7 +4,7 @@
 """
 from abc import ABC, abstractmethod
 from typing import List
-from interest.document import Document, Article
+from interest.filter.document import Document, Article
 
 
 class DocumentFilter(ABC):

diff --git a/interest/input_file.py → interest/filter/input_file.py b/interest/input_file.py → interest/filter/input_file.py
@@ -8,8 +8,8 @@
 import logging
 from pathlib import Path
 from typing import Iterable, TextIO, cast, Optional
-from interest.document import Document, Article
-from interest.document_filter import DocumentFilter
+from interest.filter.document import Document, Article
+from interest.filter.document_filter import DocumentFilter
 
 
 class InputFile(abc.ABC):

diff --git a/interest/utils.py b/interest/utils.py
@@ -8,9 +8,12 @@
 import json
 import spacy
 import spacy.cli
-from interest.document_filter import YearFilter, TitleFilter, DocumentFilter
-from interest.document_filter import (CompoundFilter, DecadeFilter,
-                                      KeywordsFilter)
+from interest.filter.document_filter import (YearFilter,
+                                             TitleFilter,
+                                             DocumentFilter)
+from interest.filter.document_filter import (CompoundFilter,
+                                             DecadeFilter,
+                                             KeywordsFilter)
 from interest.settings import ENCODING
 
 
@@ -131,6 +134,34 @@ def get_article_selector_from_config(config_file: Path) -> dict:
             from exc
 
 
+def get_output_unit_from_config(config_file: Path) -> str:
+    """
+        Get the output unit setting from a JSON config file.
+
+        Args:
+            config_file (Path): The path to the JSON config file.
+
+        Returns:
+            str: The value of the "output_unit" key (e.g. "paragraph").
+
+        Raises:
+            KeyError: If "output_unit" is missing from the config file.
+            ValueError: If the "output_unit" value is empty.
+            FileNotFoundError: If the config file is not found.
+    """
+    try:
+        with open(config_file, 'r', encoding=ENCODING) as f:
+            output_unit: str = json.load(f)["output_unit"]
+        if not output_unit:
+            raise ValueError("Config is empty")
+        return output_unit
+    except FileNotFoundError as exc:
+        raise FileNotFoundError("Config file not found") from exc
+    except KeyError as exc:
+        raise KeyError("Output unit not found in config file") \
+            from exc
+
+
 def save_filtered_articles(input_file: Any, article_id: str,
                            output_dir: str) -> None:
     """Save filtered articles data to a JSON file.

diff --git a/scripts/step1_filter_articles.py b/scripts/step1_filter_articles.py
@@ -9,8 +9,8 @@
 
 from tqdm import tqdm
 
-from interest import INPUT_FILE_TYPES
-from interest.input_file import InputFile
+from interest.filter import INPUT_FILE_TYPES
+from interest.filter.input_file import InputFile
 from interest.utils import load_filters_from_config
 from interest.utils import save_filtered_articles
 

diff --git a/scripts/step4_generate_output.py b/scripts/step4_generate_output.py
@@ -0,0 +1,119 @@
+"""This script reads selected articles from CSV files,
+and saves their text for manual labeling"""
+import argparse
+import logging
+import os
+from pathlib import Path
+from typing import Union
+import pandas as pd
+from pandas import DataFrame
+from interest.article_final_selection.process_article import ArticleProcessor
+from interest.utils import get_output_unit_from_config
+
+FILE_PATH_FIELD = "file_path"
+ARTICLE_ID_FIELD = "article_id"
+TITLE_FIELD = "title"
+BODY_FIELD = "body"
+LABEL_FIELD = "label"
+SELECTED_FIELD = "selected"
+
+
+def read_article(row: pd.Series, in_paragraph: bool = False) -> DataFrame:
+    """
+    Read article from row and return DataFrame of articles.
+
+    Args:
+        row (pd.Series): A row holding the file path and article id.
+        in_paragraph (bool, optional): Whether to emit one row per element
+        of the body instead of a single row. Defaults to False.
+
+    Returns:
+        DataFrame: File path, article id, title, body and an empty label.
+    """
+    file_path = row[FILE_PATH_FIELD]
+    article_id = row[ARTICLE_ID_FIELD]
+    article_processor = ArticleProcessor(file_path, article_id)
+    title, body = article_processor.read_article_from_gzip(in_paragraph)
+
+    # Metadata is repeated once per body element in paragraph mode; a
+    # single row is used otherwise, including when the body is None.
+    n_rows = len(body) if in_paragraph and body is not None else 1
+
+    return pd.DataFrame({FILE_PATH_FIELD: [file_path] * n_rows,
+                         ARTICLE_ID_FIELD: [article_id] * n_rows,
+                         TITLE_FIELD: [title] * n_rows,
+                         BODY_FIELD: body,
+                         LABEL_FIELD: [''] * n_rows})
+
+
+def find_articles_in_file(filepath: Union[str, Path], in_paragraph: bool
+                          ) -> Union[DataFrame, None]:
+    """
+    Find selected articles in a CSV file and return DataFrame of articles.
+
+    Args:
+        filepath (str or Path): Path to the CSV file.
+        in_paragraph (bool): Whether to read articles in paragraphs.
+
+    Returns:
+        DataFrame or None: The selected articles; None if none or on error.
+    """
+    try:
+        df_articles = pd.read_csv(filepath)
+        df_selected = df_articles.loc[df_articles[SELECTED_FIELD] == 1]
+        if df_selected.empty:  # pd.concat raises ValueError on an empty list
+            return None
+        return pd.concat([read_article(row, in_paragraph=in_paragraph)
+                          for _, row in df_selected.iterrows()],
+                         axis=0, ignore_index=True)
+    except Exception as e:  # pylint: disable=W0718
+        logging.error("Error reading selected indices in file: %s", e)
+        return None
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Save text of selected articles for manual labeling.")
+    parser.add_argument(
+        "--input_dir",
+        type=Path,
+        required=True,
+        help="Base directory for reading input files.",
+    )
+    parser.add_argument(
+        "--glob",
+        type=str,
+        default="*.csv",
+        help="Glob pattern for find input files; e.g. '*.csv'.",
+    )
+    parser.add_argument(
+        "--config_path",
+        type=Path,
+        default="config.json",
+        help="File path of config file.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=Path,
+        required=True,
+        help="The directory for storing output files.",
+    )
+
+    args = parser.parse_args()
+
+    if not args.input_dir.is_dir():
+        parser.error(f"Not a directory: '{str(args.input_dir.absolute())}'")
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+    config_output_unit = get_output_unit_from_config(args.config_path)
+    IN_PARAGRAPH = config_output_unit == "paragraph"
+
+    # Seed with an empty frame so the output always has the expected columns.
+    frames = [pd.DataFrame(columns=[FILE_PATH_FIELD, ARTICLE_ID_FIELD,
+                                    TITLE_FIELD, BODY_FIELD, LABEL_FIELD])]
+    for articles_filepath in args.input_dir.rglob(args.glob):
+        frames.append(find_articles_in_file(articles_filepath,
+                                            in_paragraph=IN_PARAGRAPH))
+    # Concatenate once; pd.concat silently drops the None entries.
+    result_df = pd.concat(frames, ignore_index=True)
+    result_df.to_csv(os.path.join(args.output_dir, 'articles_to_label.csv'))