Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Output #11

Merged
merged 5 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@
{
"type": "threshold",
"value": "0.02"
}

},
"output_unit": "paragraph"
}
7 changes: 0 additions & 7 deletions interest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +0,0 @@
# from interest.preprocessor.parser import XMLExtractor
from interest.delpher_kranten import KrantenFile

INPUT_FILE_TYPES = {
"delpher_kranten": KrantenFile

}
Empty file.
15 changes: 8 additions & 7 deletions interest/article_final_selection/process_article.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ def __init__(self, gzip_file_path: str, article_id: int):
self._file_path = gzip_file_path
self._article_id = article_id
self._title: Union[str, None] = ''
self._body: Union[str, None] = ''
self._body: Union[str, list, None] = ''
self.selected: bool = False

def _read_article_from_gzip(self) -> Tuple[Union[str, None],
Union[str, None]]:
def read_article_from_gzip(self, in_paragraph: bool = False) -> (
Tuple)[Union[str, None], Union[str, list, None]]:
"""
Read article content from a gzip file.

Expand All @@ -63,8 +63,7 @@ def _read_article_from_gzip(self) -> Tuple[Union[str, None],
article = articles.get(str(self._article_id), {})
title = article.get('title', {})
body = article.get('body', {})
body_string = " ".join(body)
return title, body_string
return title, body if in_paragraph else " ".join(body)
except Exception as e: # pylint: disable=broad-except
logging.error("Error reading article %s from %s: %s",
str(self._article_id), self._file_path, e)
Expand All @@ -80,7 +79,7 @@ def process_article(self, clean_keywords: List[str]) -> str:
Returns:
str: The processed article body.
"""
self._title, self._body = self._read_article_from_gzip()
self._title, self._body = self.read_article_from_gzip()
if (self._title is None) or (self._body is None):
return ""
clean_title = clean(self._title)
Expand All @@ -89,4 +88,6 @@ def process_article(self, clean_keywords: List[str]) -> str:
if title_with_keyword:
self.selected = True
return ""
return clean(self._body)
if isinstance(self._body, str):
return clean(self._body)
return ""
7 changes: 7 additions & 0 deletions interest/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""define input-file type"""
from interest.filter.delpher_kranten import KrantenFile

# Registry of supported input-file types: maps the type name to the
# InputFile subclass that reads files of that type.
INPUT_FILE_TYPES = {"delpher_kranten": KrantenFile}
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
import logging
import os
from typing import Optional
from interest.document import Document, Article
from interest.input_file import InputFile
from interest.filter.document import Document, Article
from interest.filter.input_file import InputFile


class KrantenFile(InputFile):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""
from abc import ABC, abstractmethod
from typing import List
from interest.document import Document, Article
from interest.filter.document import Document, Article


class DocumentFilter(ABC):
Expand Down
4 changes: 2 additions & 2 deletions interest/input_file.py → interest/filter/input_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
import logging
from pathlib import Path
from typing import Iterable, TextIO, cast, Optional
from interest.document import Document, Article
from interest.document_filter import DocumentFilter
from interest.filter.document import Document, Article
from interest.filter.document_filter import DocumentFilter


class InputFile(abc.ABC):
Expand Down
37 changes: 34 additions & 3 deletions interest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@
import json
import spacy
import spacy.cli
from interest.document_filter import YearFilter, TitleFilter, DocumentFilter
from interest.document_filter import (CompoundFilter, DecadeFilter,
KeywordsFilter)
from interest.filter.document_filter import (YearFilter,
TitleFilter,
DocumentFilter)
from interest.filter.document_filter import (CompoundFilter,
DecadeFilter,
KeywordsFilter)
from interest.settings import ENCODING


Expand Down Expand Up @@ -131,6 +134,34 @@ def get_article_selector_from_config(config_file: Path) -> dict:
from exc


def get_output_unit_from_config(config_file: Path) -> str:
    """
    Get the output unit setting from a JSON config file.

    Args:
        config_file (Path): The path to the JSON config file.

    Returns:
        str: The value stored under the "output_unit" key of the config
            (for example "paragraph").

    Raises:
        ValueError: If the "output_unit" value is empty.
        KeyError: If the "output_unit" key is missing from the config file.
        FileNotFoundError: If the config file is not found.
    """
    try:
        with open(config_file, 'r', encoding=ENCODING) as f:
            output_unit: str = json.load(f)["output_unit"]
    except FileNotFoundError as exc:
        raise FileNotFoundError("Config file not found") from exc
    except KeyError as exc:
        # The missing key here is "output_unit", not the article selector.
        raise KeyError("Output unit not found in config file") from exc
    if not output_unit:
        raise ValueError("Config is empty")
    return output_unit


def save_filtered_articles(input_file: Any, article_id: str,
output_dir: str) -> None:
"""Save filtered articles data to a JSON file.
Expand Down
4 changes: 2 additions & 2 deletions scripts/step1_filter_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

from tqdm import tqdm

from interest import INPUT_FILE_TYPES
from interest.input_file import InputFile
from interest.filter import INPUT_FILE_TYPES
from interest.filter.input_file import InputFile
from interest.utils import load_filters_from_config
from interest.utils import save_filtered_articles

Expand Down
119 changes: 119 additions & 0 deletions scripts/step4_generate_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""This script reads selected articles from CSV files,
and saves their text for manual labeling"""
import argparse
import logging
import os
from pathlib import Path
from typing import Union
import pandas as pd
from pandas import DataFrame
from interest.article_final_selection.process_article import ArticleProcessor
from interest.utils import get_output_unit_from_config

FILE_PATH_FIELD = "file_path"
ARTICLE_ID_FIELD = "article_id"
TITLE_FIELD = "title"
BODY_FIELD = "body"
LABEL_FIELD = "label"
SELECTED_FIELD = "selected"


def read_article(row: pd.Series, in_paragraph: bool = False) -> DataFrame:
    """
    Load the article referenced by a row and lay it out for labeling.

    Args:
        row (pd.Series): A row providing the file path and article id
            fields of the article to load.
        in_paragraph (bool, optional): Passed on to
            ``ArticleProcessor.read_article_from_gzip``; when True and a
            body was read, one output row is produced per body entry.
            Defaults to False.

    Returns:
        DataFrame: Columns for file path, article id, title, body and an
            empty label.
    """
    source_path = row[FILE_PATH_FIELD]
    identifier = row[ARTICLE_ID_FIELD]
    processor = ArticleProcessor(source_path, identifier)
    title, body = processor.read_article_from_gzip(in_paragraph)

    # Metadata columns are repeated once per body entry in paragraph mode;
    # otherwise (or when no body could be read) a single row is emitted.
    if in_paragraph and body is not None:
        row_count = len(body)
    else:
        row_count = 1

    columns = {
        FILE_PATH_FIELD: [source_path] * row_count,
        ARTICLE_ID_FIELD: [identifier] * row_count,
        TITLE_FIELD: [title] * row_count,
        BODY_FIELD: body,
        LABEL_FIELD: [''] * row_count,
    }
    return pd.DataFrame(columns)


def find_articles_in_file(filepath: str, in_paragraph: bool) -> Union[
        DataFrame, None]:
    """
    Find selected articles in a CSV file and return DataFrame of articles.

    Args:
        filepath (str): Path to the CSV file.
        in_paragraph (bool): Whether to read articles in paragraphs.

    Returns:
        Union[DataFrame, None]: DataFrame containing the selected articles'
            information, or None when the file has no selected articles or
            an error occurred while processing it.
    """
    try:
        df_articles = pd.read_csv(filepath)
        df_selected = df_articles.loc[df_articles[SELECTED_FIELD] == 1]

        if df_selected.empty:
            # pd.concat raises ValueError on an empty sequence; a file
            # without selected articles is not an error, so skip it quietly
            # instead of reporting it through the error handler below.
            logging.info("No selected articles in file: %s", filepath)
            return None

        return pd.concat([read_article(row, in_paragraph=in_paragraph)
                          for _, row in df_selected.iterrows()],
                         axis=0, ignore_index=True)
    except Exception as e:  # pylint: disable=W0718
        # Best effort: one unreadable file must not abort the whole run.
        logging.error("Error reading selected indices in file: %s", e)
        return None


if __name__ == "__main__":
    # Command-line entry point: gather the selected articles from every
    # matching CSV file and write them to a single CSV for manual labeling.
    parser = argparse.ArgumentParser("Select final articles.")

    parser.add_argument(
        "--input_dir",
        type=Path,
        required=True,
        help="Base directory for reading input files.",
    )
    parser.add_argument(
        "--glob",
        type=str,
        default="*.csv",
        help="Glob pattern for find input files; e.g. '*.csv'.",
    )
    parser.add_argument(
        "--config_path",
        type=Path,
        default="config.json",
        help="File path of config file.",
    )
    parser.add_argument(
        "--output_dir",
        type=Path,
        required=True,
        help="The directory for storing output files.",
    )

    args = parser.parse_args()

    if not args.input_dir.is_dir():
        parser.error(f"Not a directory: '{str(args.input_dir.absolute())}'")

    args.output_dir.mkdir(parents=True, exist_ok=True)
    config_output_unit = get_output_unit_from_config(args.config_path)
    IN_PARAGRAPH = config_output_unit == "paragraph"

    # The empty frame fixes the output columns even if nothing was selected.
    result_df = pd.DataFrame(columns=[FILE_PATH_FIELD, ARTICLE_ID_FIELD,
                                      TITLE_FIELD, BODY_FIELD, LABEL_FIELD])

    frames = [find_articles_in_file(articles_filepath,
                                    in_paragraph=IN_PARAGRAPH)
              for articles_filepath in args.input_dir.rglob(args.glob)]

    # Concatenate once instead of inside the loop: repeated pd.concat copies
    # the accumulated frame for every input file. Files that yielded nothing
    # (find_articles_in_file returned None) are skipped.
    result_df = pd.concat(
        [result_df, *(frame for frame in frames if frame is not None)],
        ignore_index=True)

    result_df.to_csv(os.path.join(args.output_dir, 'articles_to_label.csv'))
Loading