Commit message:

* add article_final_selection
* add select_final_articles.py
* add models
* add text_cleaner
* add article_selector to config
* check if there is a list with None value
* add functions to read config file and load spacy
* fix linting errors
* fix linting errors

Co-authored-by: parisa-zahedi <[email protected]>
Parent: 7528042. Commit: ae75f8a. 13 changed files with 677 additions and 5 deletions.
interest/article_final_selection/article_selector.py (new file)
@@ -0,0 +1,48 @@
"""Module containing the ArticleSelector class for selecting articles based on
similarity scores."""

from typing import List, Dict, Union


class ArticleSelector:
    """Class for selecting articles based on similarity scores and
    configuration parameters."""
    # pylint: disable=too-few-public-methods

    def __init__(self, similarity_scores: List[float],
                 config: Dict[str, Union[str, float, int]]):
        """Initializes the ArticleSelector object.
        Args:
            similarity_scores (List[float]): A list of similarity scores
                between keywords and articles.
            config (Dict[str, Union[str, float, int]]): A dictionary containing
                configuration parameters for selecting articles.
        """
        self.similarity_scores = similarity_scores
        self.config = config

    def select_articles(self) -> List[int]:
        """Selects articles based on the configured selection method and value.
        Returns:
            List[int]: A list of indices of selected articles.
        """
        sorted_indices = sorted(
            range(len(self.similarity_scores)),
            key=lambda i: self.similarity_scores[i],
            reverse=True
        )

        selected_indices: List[int] = []
        if self.config["type"] == "threshold":
            threshold = float(self.config["value"])
            selected_indices.extend(
                i for i, score in enumerate(self.similarity_scores)
                if score >= threshold
            )
        elif self.config["type"] == "num_articles":
            num_articles = int(self.config["value"])
            selected_indices.extend(sorted_indices[:num_articles])

        return selected_indices
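A minimal usage sketch for ArticleSelector. The similarity scores and the config values below (a 0.2 threshold, a top-2 count) are made-up examples, not values from the repository:

from interest.article_final_selection.article_selector import ArticleSelector

# Hypothetical similarity scores between the keyword set and five articles.
scores = [0.12, 0.45, 0.08, 0.31, 0.27]

# Keep every article whose score is at least the (example) threshold of 0.2.
by_threshold = ArticleSelector(scores, {"type": "threshold", "value": 0.2})
print(by_threshold.select_articles())  # [1, 3, 4]

# Keep only the two highest-scoring articles.
by_count = ArticleSelector(scores, {"type": "num_articles", "value": 2})
print(by_count.select_articles())      # [1, 3]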
interest/article_final_selection/process_article.py (new file)
@@ -0,0 +1,92 @@
""" Module for processing articles from gzip files."""
import gzip
import json
import logging
from typing import List, Union, Tuple
from interest.preprocessor.text_cleaner import TextCleaner

text_cleaner = TextCleaner()


def clean(text: str) -> str:
    """
    Clean the input text using TextCleaner.
    Args:
        text (str): The input text to clean.
    Returns:
        str: The cleaned text.
    """
    return text_cleaner.preprocess(text)

# pylint: disable=too-few-public-methods


class ArticleProcessor:
    """
    Process individual articles from gzip files.
    This class handles the processing of individual articles from
    gzip files. It reads the content of the article, cleans it using
    TextCleaner, and determines whether the article contains any
    keywords of interest in the title.
    """
    def __init__(self, gzip_file_path: str, article_id: int):
        """
        Initialize ArticleProcessor with the gzip file path and article ID.
        Args:
            gzip_file_path (str): The path to the gzip file.
            article_id (int): The ID of the article.
        """
        self._file_path = gzip_file_path
        self._article_id = article_id
        self._title: Union[str, None] = ''
        self._body: Union[str, None] = ''
        self.selected: bool = False

    def _read_article_from_gzip(self) -> Tuple[Union[str, None],
                                               Union[str, None]]:
        """
        Read article content from a gzip file.
        Returns:
            Tuple[Union[str, None], Union[str, None]]: A tuple containing
                the title and body of the article.
        """
        try:
            with gzip.open(self._file_path, 'rt') as f:
                data = json.load(f)
                articles = data.get('articles', {})
                article = articles.get(str(self._article_id), {})
                title = article.get('title', {})
                body = article.get('body', {})
                body_string = " ".join(body)
                return title, body_string
        except Exception as e:  # pylint: disable=broad-except
            logging.error("Error reading article %s from %s: %s",
                          str(self._article_id), self._file_path, e)
            return None, None

    def process_article(self, clean_keywords: List[str]) -> str:
        """
        Process the article content.
        Args:
            clean_keywords (List[str]): A list of clean keywords.
        Returns:
            str: The processed article body.
        """
        self._title, self._body = self._read_article_from_gzip()
        if (self._title is None) or (self._body is None):
            return ""
        clean_title = clean(self._title)
        title_with_keyword = any(keyword in clean_title
                                 for keyword in clean_keywords)
        if title_with_keyword:
            self.selected = True
            return ""
        return clean(self._body)
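A short end-to-end sketch of the input layout that _read_article_from_gzip appears to expect: a gzipped JSON file whose "articles" mapping goes from article id to a title string and a list of body paragraphs. The file name, titles, and keyword below are illustrative, not from the repository:

import gzip
import json

from interest.article_final_selection.process_article import ArticleProcessor, clean

# Illustrative input file; the structure is inferred from _read_article_from_gzip.
payload = {
    "articles": {
        "1": {"title": "Local news about bicycles",
              "body": ["First paragraph.", "Second paragraph."]}
    }
}
with gzip.open("example_batch.json.gz", "wt") as f:
    json.dump(payload, f)

processor = ArticleProcessor("example_batch.json.gz", 1)
body = processor.process_article([clean("bicycle")])
# If a keyword occurs in the cleaned title, `selected` becomes True and the
# body is not returned; otherwise the cleaned body text is returned.
print(processor.selected, repr(body))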
select_final_articles.py (new file)
@@ -0,0 +1,102 @@
"""
This module contains functions for selecting articles based on keywords
and similarity scores.
"""
from typing import List, Tuple, Dict, Union
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from interest.models.tfidf import TfidfEmbedder
from interest.article_final_selection.process_article import ArticleProcessor
from interest.article_final_selection.process_article import clean
from interest.article_final_selection.article_selector import ArticleSelector


def process_articles(articles_filepath: str,
                     clean_keywords: List[str]) -> Tuple[List[str], List[int]]:
    """
    Process articles from a CSV file.
    Args:
        articles_filepath (str): The path to the CSV file containing articles.
        clean_keywords (List[str]): A list of clean keywords.
    Returns:
        Tuple[List[str], List[int]]: A tuple containing the processed article
            bodies and selected indices.
    """
    articles_df = pd.read_csv(articles_filepath)
    article_bodies: List[str] = []
    selected_indices: List[int] = []
    for index, row in articles_df.iterrows():
        article_processor = ArticleProcessor(row['file_path'],
                                             row['article_id'])
        processed_article_body = article_processor.process_article(
            clean_keywords)
        if article_processor.selected:
            selected_indices.append(int(str(index)))
        elif processed_article_body != "":
            article_bodies.append(processed_article_body)
    return article_bodies, selected_indices


def apply_tfidf_similarity(documents: List[str],
                           keywords: List[str]) -> List[float]:
    """
    Apply TF-IDF similarity between documents and keywords.
    Args:
        documents (List[str]): A list of document bodies.
        keywords (List[str]): A list of keywords.
    Returns:
        List[float]: A list of similarity scores.
    """
    model = TfidfEmbedder(ngram_max=1, norm="l1", sublinear_tf=False, min_df=1,
                          max_df=1.0)
    keywords_list = [" ".join(keywords)]
    model.fit(documents)
    embeddings_documents = model.transform(documents).tocsr()
    embeddings_keywords = model.transform(keywords_list).tocsr()
    similarity_scores = cosine_similarity(embeddings_keywords,
                                          embeddings_documents)
    return similarity_scores[0]


def select_top_articles(similarity_scores: List[float],
                        config: Dict[str, Union[str, float, int]]
                        ) -> List[int]:
    """
    Select top articles based on similarity scores and configuration.
    Args:
        similarity_scores (List[float]): A list of similarity scores.
        config (Dict[str, Union[str, float, int]]): Configuration for
            selecting articles.
    Returns:
        List[int]: A list of selected article indices.
    """
    selector = ArticleSelector(similarity_scores, config)
    selected_indices = selector.select_articles()
    return selected_indices


def select_articles(articles_filepath: str, keywords: List[str],
                    config: Dict[str, Union[str, float, int]]) -> List[int]:
    """
    Select articles based on keywords, similarity scores, and configuration.
    Args:
        articles_filepath (str): The path to the CSV file containing articles.
        keywords (List[str]): A list of keywords.
        config (Dict[str, Union[str, float, int]]): Configuration for
            selecting articles.
    Returns:
        List[int]: A list of selected article indices.
    """
    clean_keywords = [clean(keyword) for keyword in keywords]
    article_bodies, selected_indices = process_articles(articles_filepath,
                                                        clean_keywords)
    similarity_scores = apply_tfidf_similarity(article_bodies, clean_keywords)
    indices = select_top_articles(similarity_scores, config)
    selected_indices.extend(indices)
    return selected_indices
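A sketch of how the module's select_articles entry point might be called. The import path, CSV path, keywords, and config values below are illustrative assumptions; the CSV is expected to provide 'file_path' and 'article_id' columns, as read in process_articles above:

# Assumed import path for select_final_articles.py; the commit does not show
# the module's full path.
from interest.article_final_selection.select_final_articles import select_articles

config = {"type": "num_articles", "value": 50}  # or {"type": "threshold", "value": 0.02}
selected = select_articles(
    articles_filepath="output/candidate_articles.csv",  # illustrative path
    keywords=["bicycle", "cycling"],                    # illustrative keywords
    config=config,
)
print(f"{len(selected)} articles selected")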
Base embedder in interest/models (new file)
@@ -0,0 +1,20 @@
"""Base class for document embeddings."""

from abc import ABC, abstractmethod
from typing import Union, Sequence
import scipy
from numpy import typing as npt
import numpy as np


class BaseEmbedder(ABC):
    """Base class for creating document embeddings."""

    @abstractmethod
    def fit(self, documents: Sequence[str]) -> None:
        """Train the model on documents."""

    @abstractmethod
    def transform(self, documents: Union[str, Sequence[str]]
                  ) -> Union[scipy.sparse.spmatrix, npt.NDArray[np.float_]]:
        """Get the embedding for a document."""