Skip to content

Commit

Permalink
Output (#11)
Browse files Browse the repository at this point in the history
* save output file

* fix pylint flake8 mypy errors

* add filter folder and move related files to that

* fix flake8 error

* fix flake8 error

---------

Co-authored-by: parisa-zahedi <[email protected]>
  • Loading branch information
parisa-zahedi and parisa-zahedi authored Apr 4, 2024
1 parent ae75f8a commit a44cbc5
Show file tree
Hide file tree
Showing 12 changed files with 177 additions and 26 deletions.
4 changes: 2 additions & 2 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@
{
"type": "threshold",
"value": "0.02"
}

},
"output_unit": "paragraph"
}
7 changes: 0 additions & 7 deletions interest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +0,0 @@
# from interest.preprocessor.parser import XMLExtractor
from interest.delpher_kranten import KrantenFile

INPUT_FILE_TYPES = {
"delpher_kranten": KrantenFile

}
Empty file.
15 changes: 8 additions & 7 deletions interest/article_final_selection/process_article.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ def __init__(self, gzip_file_path: str, article_id: int):
self._file_path = gzip_file_path
self._article_id = article_id
self._title: Union[str, None] = ''
self._body: Union[str, None] = ''
self._body: Union[str, list, None] = ''
self.selected: bool = False

def _read_article_from_gzip(self) -> Tuple[Union[str, None],
Union[str, None]]:
def read_article_from_gzip(self, in_paragraph: bool = False) -> (
Tuple)[Union[str, None], Union[str, list, None]]:
"""
Read article content from a gzip file.
Expand All @@ -63,8 +63,7 @@ def _read_article_from_gzip(self) -> Tuple[Union[str, None],
article = articles.get(str(self._article_id), {})
title = article.get('title', {})
body = article.get('body', {})
body_string = " ".join(body)
return title, body_string
return title, body if in_paragraph else " ".join(body)
except Exception as e: # pylint: disable=broad-except
logging.error("Error reading article %s from %s: %s",
str(self._article_id), self._file_path, e)
Expand All @@ -80,7 +79,7 @@ def process_article(self, clean_keywords: List[str]) -> str:
Returns:
str: The processed article body.
"""
self._title, self._body = self._read_article_from_gzip()
self._title, self._body = self.read_article_from_gzip()
if (self._title is None) or (self._body is None):
return ""
clean_title = clean(self._title)
Expand All @@ -89,4 +88,6 @@ def process_article(self, clean_keywords: List[str]) -> str:
if title_with_keyword:
self.selected = True
return ""
return clean(self._body)
if isinstance(self._body, str):
return clean(self._body)
return ""
7 changes: 7 additions & 0 deletions interest/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""define input-file type"""
from interest.filter.delpher_kranten import KrantenFile

INPUT_FILE_TYPES = {
"delpher_kranten": KrantenFile

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
import logging
import os
from typing import Optional
from interest.document import Document, Article
from interest.input_file import InputFile
from interest.filter.document import Document, Article
from interest.filter.input_file import InputFile


class KrantenFile(InputFile):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""
from abc import ABC, abstractmethod
from typing import List
from interest.document import Document, Article
from interest.filter.document import Document, Article


class DocumentFilter(ABC):
Expand Down
4 changes: 2 additions & 2 deletions interest/input_file.py → interest/filter/input_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
import logging
from pathlib import Path
from typing import Iterable, TextIO, cast, Optional
from interest.document import Document, Article
from interest.document_filter import DocumentFilter
from interest.filter.document import Document, Article
from interest.filter.document_filter import DocumentFilter


class InputFile(abc.ABC):
Expand Down
37 changes: 34 additions & 3 deletions interest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@
import json
import spacy
import spacy.cli
from interest.document_filter import YearFilter, TitleFilter, DocumentFilter
from interest.document_filter import (CompoundFilter, DecadeFilter,
KeywordsFilter)
from interest.filter.document_filter import (YearFilter,
TitleFilter,
DocumentFilter)
from interest.filter.document_filter import (CompoundFilter,
DecadeFilter,
KeywordsFilter)
from interest.settings import ENCODING


Expand Down Expand Up @@ -131,6 +134,34 @@ def get_article_selector_from_config(config_file: Path) -> dict:
from exc


def get_output_unit_from_config(config_file: Path) -> str:
    """
    Get the output-unit setting from a JSON config file.

    The config file is expected to contain an "output_unit" key whose
    value is a string such as "paragraph" (see config.json) or "article";
    callers compare the returned value against "paragraph".

    Args:
        config_file (Path): The path to the JSON config file.

    Returns:
        str: The value of the "output_unit" key.

    Raises:
        ValueError: If the "output_unit" value is empty.
        KeyError: If the "output_unit" key is not found in the config file.
        FileNotFoundError: If the config file is not found.
    """
    try:
        with open(config_file, 'r', encoding=ENCODING) as f:
            # The value is a plain string (e.g. "paragraph"), not a dict.
            config: str = json.load(f)["output_unit"]
        if not config:
            raise ValueError("Config is empty")
        return config
    except FileNotFoundError as exc:
        raise FileNotFoundError("Config file not found") from exc
    except KeyError as exc:
        # Message previously said "Article selector" — a copy-paste from
        # get_article_selector_from_config; report the key actually missing.
        raise KeyError("Output unit not found in config file") \
            from exc


def save_filtered_articles(input_file: Any, article_id: str,
output_dir: str) -> None:
"""Save filtered articles data to a JSON file.
Expand Down
4 changes: 2 additions & 2 deletions scripts/step1_filter_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

from tqdm import tqdm

from interest import INPUT_FILE_TYPES
from interest.input_file import InputFile
from interest.filter import INPUT_FILE_TYPES
from interest.filter.input_file import InputFile
from interest.utils import load_filters_from_config
from interest.utils import save_filtered_articles

Expand Down
119 changes: 119 additions & 0 deletions scripts/step4_generate_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""This script reads selected articles from CSV files,
and saves their text for manual labeling"""
import argparse
import logging
import os
from pathlib import Path
from typing import Union
import pandas as pd
from pandas import DataFrame
from interest.article_final_selection.process_article import ArticleProcessor
from interest.utils import get_output_unit_from_config

FILE_PATH_FIELD = "file_path"
ARTICLE_ID_FIELD = "article_id"
TITLE_FIELD = "title"
BODY_FIELD = "body"
LABEL_FIELD = "label"
SELECTED_FIELD = "selected"


def read_article(row: pd.Series, in_paragraph: bool = False) -> DataFrame:
    """
    Read an article referenced by a CSV row and return it as a DataFrame.

    Args:
        row (pd.Series): A row from a DataFrame; must provide
            FILE_PATH_FIELD and ARTICLE_ID_FIELD.
        in_paragraph (bool, optional): Whether to read the article as a
            list of paragraphs, producing one output row per paragraph.
            Defaults to False (one row for the whole article body).

    Returns:
        DataFrame: DataFrame containing article information, with an
        empty LABEL_FIELD column for manual labeling.
    """
    file_path = row[FILE_PATH_FIELD]
    article_id = row[ARTICLE_ID_FIELD]
    article_processor = ArticleProcessor(file_path, article_id)
    title, body = article_processor.read_article_from_gzip(in_paragraph)

    # One row per paragraph when paragraphs were requested and present,
    # otherwise a single row.  Hoisted so the condition is evaluated once
    # instead of once per column.
    n_rows = len(body) if in_paragraph and body is not None else 1
    return pd.DataFrame({FILE_PATH_FIELD: [file_path] * n_rows,
                         ARTICLE_ID_FIELD: [article_id] * n_rows,
                         TITLE_FIELD: [title] * n_rows,
                         # `body` is a list (paragraphs) or a scalar
                         # string/None; pandas broadcasts the scalar.
                         BODY_FIELD: body,
                         LABEL_FIELD: [''] * n_rows})


def find_articles_in_file(filepath: str, in_paragraph: bool) -> (
        Union)[DataFrame, None]:
    """
    Find selected articles in a CSV file and return a DataFrame of them.

    Args:
        filepath (str): Path to the CSV file.
        in_paragraph (bool): Whether to expand each article into one row
            per paragraph.

    Returns:
        DataFrame: DataFrame containing the selected articles'
        information, or None when the file cannot be read or contains no
        selected articles.
    """
    try:
        df_articles = pd.read_csv(filepath)
        df_selected = df_articles.loc[df_articles[SELECTED_FIELD] == 1]

        # pd.concat raises ValueError on an empty sequence, which the
        # broad except below would log as an error.  A file with no
        # selected rows is a normal situation, so short-circuit it.
        if df_selected.empty:
            return None

        result = pd.concat([read_article(row, in_paragraph=in_paragraph)
                            for _, row in df_selected.iterrows()],
                           axis=0, ignore_index=True)
        return result
    except Exception as e:  # pylint: disable=W0718
        logging.error("Error reading selected indices in file: %s", e)
        return None


if __name__ == "__main__":
parser = argparse.ArgumentParser("Select final articles.")

parser.add_argument(
"--input_dir",
type=Path,
required=True,
help="Base directory for reading input files.",
)
parser.add_argument(
"--glob",
type=str,
default="*.csv",
help="Glob pattern for find input files; e.g. '*.csv'.",
)
parser.add_argument(
"--config_path",
type=Path,
default="config.json",
help="File path of config file.",
)
parser.add_argument(
"--output_dir",
type=Path,
required=True,
help="The directory for storing output files.",
)

args = parser.parse_args()

if not args.input_dir.is_dir():
parser.error(f"Not a directory: '{str(args.input_dir.absolute())}'")

args.output_dir.mkdir(parents=True, exist_ok=True)
config_output_unit = get_output_unit_from_config(args.config_path)

result_df = pd.DataFrame(columns=[FILE_PATH_FIELD, ARTICLE_ID_FIELD,
TITLE_FIELD, BODY_FIELD, LABEL_FIELD])
IN_PARAGRAPH = config_output_unit == "paragraph"

for articles_filepath in args.input_dir.rglob(args.glob):
df = find_articles_in_file(articles_filepath,
in_paragraph=IN_PARAGRAPH)
result_df = pd.concat([result_df, df], ignore_index=True)

result_df.to_csv(os.path.join(args.output_dir, 'articles_to_label.csv'))

0 comments on commit a44cbc5

Please sign in to comment.