Commit
Tfidf (#10)
* add article_final_selection

* add select_final_articles.py

* add models

* add text_cleaner

* add article_selector to config

* check whether the body list contains a None value

* add functions to read config file and load spacy

* fix linting errors

* fix linting errors

---------

Co-authored-by: parisa-zahedi <[email protected]>
parisa-zahedi authored Apr 3, 2024
1 parent 7528042 commit ae75f8a
Showing 13 changed files with 677 additions and 5 deletions.
13 changes: 11 additions & 2 deletions config.json
@@ -2,7 +2,16 @@
"filters": [
{
"type": "KeywordsFilter",
"keywords": ["Article 1","Zweepen","spoorwegpersoneel"]
"keywords": ["windkracht", "windenergie", "windenergiebranche", "windturbine", "windstroom",
"zonne-energie", "zonnewarmte", "zonnestraling", "geothermische energie", "aardwarmte",
"waterkracht", "waterkrachtcentrale", "waterkrachtwerken", "waterstof", "waterstofenergie",
"hydroturbine", "getijden-energie", "ethanol-produktie", "ethanol"]
}
]
],
"article_selector":
{
"type": "threshold",
"value": "0.02"
}

}
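
As a quick orientation (not part of the diff): the new "article_selector" block can presumably be consumed alongside the existing "filters" block once the config-reading helpers added in this PR are in place. A minimal sketch, assuming the file sits at the repository root and keeping the key names from the diff above:

import json

with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

# Keyword list from the first (and only) filter entry shown above.
keywords = config["filters"][0]["keywords"]
# Selector settings, e.g. {"type": "threshold", "value": "0.02"}.
selector_config = config["article_selector"]
print(len(keywords), selector_config["type"], selector_config["value"])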
48 changes: 48 additions & 0 deletions interest/article_final_selection/article_selector.py
@@ -0,0 +1,48 @@
"""Module containing the ArticleSelector class for selecting articles based on
similarity scores."""

from typing import List, Dict, Union


class ArticleSelector:
"""Class for selecting articles based on similarity scores and
configuration parameters."""
# pylint: disable=too-few-public-methods

def __init__(self, similarity_scores: List[float],
config: Dict[str, Union[str, float, int]]):
"""Initializes the ArticleSelector object.
Args:
similarity_scores (List[float]): A list of similarity scores
between keywords and articles.
config (Dict[str, Union[str, float, int]]): A dictionary containing
configuration parameters for selecting articles.
"""
self.similarity_scores = similarity_scores
self.config = config

def select_articles(self) -> List[int]:
"""Selects articles based on the configured selection method and value.
Returns:
List[int]: A list of indices of selected articles.
"""
sorted_indices = sorted(
range(len(self.similarity_scores)),
key=lambda i: self.similarity_scores[i],
reverse=True
)

selected_indices: List[int] = []
if self.config["type"] == "threshold":
threshold = float(self.config["value"])
selected_indices.extend(
i for i, score in enumerate(self.similarity_scores)
if score >= threshold
)
elif self.config["type"] == "num_articles":
num_articles = int(self.config["value"])
selected_indices.extend(sorted_indices[:num_articles])

return selected_indices
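
A minimal usage sketch for ArticleSelector; the scores and selector configs below are made up, but the two config shapes match the branches in select_articles above:

from interest.article_final_selection.article_selector import ArticleSelector

scores = [0.01, 0.05, 0.02, 0.00]

# Threshold mode: keep every article whose score is at least 0.02.
by_threshold = ArticleSelector(scores, {"type": "threshold", "value": "0.02"})
print(by_threshold.select_articles())  # [1, 2]

# Count mode: keep the two highest-scoring articles.
by_count = ArticleSelector(scores, {"type": "num_articles", "value": "2"})
print(by_count.select_articles())  # [1, 2]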
92 changes: 92 additions & 0 deletions interest/article_final_selection/process_article.py
@@ -0,0 +1,92 @@
""" Module for processing articles from gzip files."""
import gzip
import json
import logging
from typing import List, Union, Tuple
from interest.preprocessor.text_cleaner import TextCleaner

text_cleaner = TextCleaner()


def clean(text: str) -> str:
"""
Clean the input text using TextCleaner.
Args:
text (str): The input text to clean.
Returns:
str: The cleaned text.
"""
return text_cleaner.preprocess(text)

# pylint: disable=too-few-public-methods


class ArticleProcessor:
"""
Process individual articles from gzip files.
This class handles the processing of individual articles from
gzip files.
It reads the content of the article, cleans it using TextCleaner, and
determines whether the article title contains any keywords of
interest.
"""
def __init__(self, gzip_file_path: str, article_id: int):
"""
Initialize ArticleProcessor with the gzip file path and article ID.
Args:
gzip_file_path (str): The path to the gzip file.
article_id (int): The ID of the article.
"""
self._file_path = gzip_file_path
self._article_id = article_id
self._title: Union[str, None] = ''
self._body: Union[str, None] = ''
self.selected: bool = False

def _read_article_from_gzip(self) -> Tuple[Union[str, None],
Union[str, None]]:
"""
Read article content from a gzip file.
Returns:
Tuple[Union[str, None], Union[str, None]]: A tuple containing
the title and body of the article.
"""
try:
with gzip.open(self._file_path, 'rt') as f:
data = json.load(f)
articles = data.get('articles', {})
article = articles.get(str(self._article_id), {})
title = article.get('title', {})
body = article.get('body', {})
body_string = " ".join(body)
return title, body_string
except Exception as e: # pylint: disable=broad-except
logging.error("Error reading article %s from %s: %s",
str(self._article_id), self._file_path, e)
return None, None

def process_article(self, clean_keywords: List[str]) -> str:
"""
Process the article content.
Args:
clean_keywords (List[str]): A list of clean keywords.
Returns:
str: The processed article body.
"""
self._title, self._body = self._read_article_from_gzip()
if (self._title is None) or (self._body is None):
return ""
clean_title = clean(self._title)
title_with_keyword = any(keyword in clean_title
for keyword in clean_keywords)
if title_with_keyword:
self.selected = True
return ""
return clean(self._body)
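
For illustration, a sketch of the gzip layout that _read_article_from_gzip appears to expect, inferred from the code above. The example file, titles, and keyword are invented, and the exact output depends on TextCleaner (not shown in this diff), which needs its spaCy model available:

import gzip
import json

from interest.article_final_selection.process_article import ArticleProcessor

data = {"articles": {
    "1": {"title": "Meer windenergie op zee",
          "body": ["Eerste alinea.", "Tweede alinea."]},
    "2": {"title": "Nieuwe spoorlijn geopend",
          "body": ["Alinea over treinen en stations."]},
}}
with gzip.open("example.json.gz", "wt", encoding="utf-8") as f:
    json.dump(data, f)

processor = ArticleProcessor("example.json.gz", 1)
body = processor.process_article(clean_keywords=["windenergie"])
print(processor.selected)  # expected True: the cleaned title should keep "windenergie"
print(repr(body))          # '' -- a title match short-circuits body cleaning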
102 changes: 102 additions & 0 deletions interest/article_final_selection/process_articles.py
@@ -0,0 +1,102 @@
"""
This module contains functions for selecting articles based on keywords
and similarity scores.
"""
from typing import List, Tuple, Dict, Union
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from interest.models.tfidf import TfidfEmbedder
from interest.article_final_selection.process_article import ArticleProcessor
from interest.article_final_selection.process_article import clean
from interest.article_final_selection.article_selector import ArticleSelector


def process_articles(articles_filepath: str, clean_keywords: List[str]) -> (
Tuple)[List[str], List[int]]:
"""
Process articles from a CSV file.
Args:
articles_filepath (str): The path to the CSV file containing articles.
clean_keywords (List[str]): A list of clean keywords.
Returns:
Tuple[List[str], List[int]]: A tuple containing the processed article
bodies and selected indices.
"""
articles_df = pd.read_csv(articles_filepath)
article_bodies: List[str] = []
selected_indices: List[int] = []
for index, row in articles_df.iterrows():
article_processor = ArticleProcessor(row['file_path'],
row['article_id'])
processed_article_body = article_processor.process_article(
clean_keywords)
if article_processor.selected:
selected_indices.append(int(str(index)))
elif processed_article_body != "":
article_bodies.append(processed_article_body)
return article_bodies, selected_indices


def apply_tfidf_similarity(documents: List[str], keywords: List[str]) -> (
List)[float]:
"""
Apply TF-IDF similarity between documents and keywords.
Args:
documents (List[str]): A list of document bodies.
keywords (List[str]): A list of keywords.
Returns:
List[float]: A list of similarity scores.
"""
model = TfidfEmbedder(ngram_max=1, norm="l1", sublinear_tf=False, min_df=1,
max_df=1.0)
keywords_list = [" ".join(keywords)]
model.fit(documents)
embeddings_documents = model.transform(documents).tocsr()
embeddings_keywords = model.transform(keywords_list).tocsr()
similarity_scores = cosine_similarity(embeddings_keywords,
embeddings_documents)
return similarity_scores[0]


def select_top_articles(similarity_scores: List[float],
config: Dict[str, Union[str, float, int]]) \
-> List[int]:
"""
Select top articles based on similarity scores and configuration.
Args:
similarity_scores (List[float]): A list of similarity scores.
config (Dict[str, Union[str, float, int]]): Configuration for selecting articles.
Returns:
List[int]: A list of selected article indices.
"""
selector = ArticleSelector(similarity_scores, config)
selected_indices = selector.select_articles()
return selected_indices


def select_articles(articles_filepath: str, keywords: List[str],
config: Dict[str, Union[str, float, int]]) -> List[int]:
"""
Select articles based on keywords, similarity scores, and configuration.
Args:
articles_filepath (str): The path to the CSV file containing articles.
keywords (List[str]): A list of keywords.
config (Dict[str, Union[str, float, int]]): Configuration for selecting articles.
Returns:
List[int]: A list of selected article indices.
"""
clean_keywords = [clean(keyword) for keyword in keywords]
article_bodies, selected_indices = process_articles(articles_filepath,
clean_keywords)
similarity_scores = apply_tfidf_similarity(article_bodies, clean_keywords)
indices = select_top_articles(similarity_scores, config)
selected_indices.extend(indices)
return selected_indices
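
An end-to-end sketch of select_articles, reusing the example.json.gz file from the ArticleProcessor sketch above; the CSV column names file_path and article_id follow process_articles:

import pandas as pd

from interest.article_final_selection.process_articles import select_articles

pd.DataFrame({
    "file_path": ["example.json.gz", "example.json.gz"],
    "article_id": [1, 2],
}).to_csv("articles.csv", index=False)

selected = select_articles(
    "articles.csv",
    keywords=["windenergie", "zonne-energie"],
    config={"type": "threshold", "value": "0.02"},
)
print(selected)  # indices of articles kept via title match plus TF-IDF selection

Note that, per the code above, the indices returned by the title-match step are DataFrame row indices, while the indices appended by the TF-IDF step refer to positions within the list of non-title-matched bodies.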
9 changes: 7 additions & 2 deletions interest/document.py
@@ -3,6 +3,7 @@
This module defines the Document class, which represents a document
containing articles.
"""
import logging
from typing import Optional, List, Union
from datetime import datetime

@@ -33,8 +34,12 @@ def __init__(self, article_id: str, title: str,
self.id = article_id
self.title = title
if isinstance(body, list):
article_body = '\n'.join(body)
self.text = article_body
if any(item is None for item in body):
logging.warning("There is a None value in body")
self.text = ""
else:
article_body = '\n'.join(body)
self.text = article_body
else:
self.text = body

20 changes: 20 additions & 0 deletions interest/models/base.py
@@ -0,0 +1,20 @@
"""Base class for document embeddings."""

from abc import ABC, abstractmethod
from typing import Union, Sequence
import scipy
from numpy import typing as npt
import numpy as np


class BaseEmbedder(ABC):
"""Base class for creating document embeddings."""

@abstractmethod
def fit(self, documents: Sequence[str]) -> None:
"""Train the model on documents."""

@abstractmethod
def transform(self, documents: Union[str, Sequence[str]]) -> (
Union)[scipy.sparse.spmatrix, npt.NDArray[np.float_]]:
"""Get the embedding for a document."""