Commit
Tfidf (#10)
* add article_final_selection

* add select_final_articles.py

* add models

* add text_cleaner

* add article_selector to config

* check whether the body list contains a None value

* add functions to read config file and load spacy

* fix linting errors

* fix linting errors

---------

Co-authored-by: parisa-zahedi <[email protected]>
parisa-zahedi authored Apr 3, 2024
1 parent 7528042 commit ae75f8a
Showing 13 changed files with 677 additions and 5 deletions.
13 changes: 11 additions & 2 deletions config.json
@@ -2,7 +2,16 @@
"filters": [
{
"type": "KeywordsFilter",
"keywords": ["Article 1","Zweepen","spoorwegpersoneel"]
"keywords": ["windkracht", "windenergie", "windenergiebranche", "windturbine", "windstroom",
"zonne-energie", "zonnewarmte", "zonnestraling", "geothermische energie", "aardwarmte",
"waterkracht", "waterkrachtcentrale", "waterkrachtwerken", "waterstof", "waterstofenergie",
"hydroturbine", "getijden-energie", "ethanol-produktie", "ethanol"]
}
]
],
"article_selector":
{
"type": "threshold",
"value": "0.02"
}

}
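
As a quick orientation (not part of the diff): the new "article_selector" block can presumably be consumed alongside the existing "filters" block once the config-reading helpers added in this PR are in place. A minimal sketch, assuming the file sits at the repository root and keeping the key names from the diff above:

import json

with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

# Keyword list from the first (and only) filter entry shown above.
keywords = config["filters"][0]["keywords"]
# Selector settings, e.g. {"type": "threshold", "value": "0.02"}.
selector_config = config["article_selector"]
print(len(keywords), selector_config["type"], selector_config["value"])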
48 changes: 48 additions & 0 deletions interest/article_final_selection/article_selector.py
@@ -0,0 +1,48 @@
"""Module containing the ArticleSelector class for selecting articles based on
similarity scores."""

from typing import List, Dict, Union


class ArticleSelector:
"""Class for selecting articles based on similarity scores and
configuration parameters."""
# pylint: disable=too-few-public-methods

def __init__(self, similarity_scores: List[float],
config: Dict[str, Union[str, float, int]]):
"""Initializes the ArticleSelector object.
Args:
similarity_scores (List[float]): A list of similarity scores
between keywords and articles.
config (Dict[str, Union[str, float, int]]): A dictionary containing
configuration parameters for selecting articles.
"""
self.similarity_scores = similarity_scores
self.config = config

def select_articles(self) -> List[int]:
"""Selects articles based on the configured selection method and value.
Returns:
List[int]: A list of indices of selected articles.
"""
sorted_indices = sorted(
range(len(self.similarity_scores)),
key=lambda i: self.similarity_scores[i],
reverse=True
)

selected_indices: List[int] = []
if self.config["type"] == "threshold":
threshold = float(self.config["value"])
selected_indices.extend(
i for i, score in enumerate(self.similarity_scores)
if score >= threshold
)
elif self.config["type"] == "num_articles":
num_articles = int(self.config["value"])
selected_indices.extend(sorted_indices[:num_articles])

return selected_indices
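
A minimal usage sketch for ArticleSelector; the scores and selector configs below are made up, but the two config shapes match the branches in select_articles above:

from interest.article_final_selection.article_selector import ArticleSelector

scores = [0.01, 0.05, 0.02, 0.00]

# Threshold mode: keep every article whose score is at least 0.02.
by_threshold = ArticleSelector(scores, {"type": "threshold", "value": "0.02"})
print(by_threshold.select_articles())  # [1, 2]

# Count mode: keep the two highest-scoring articles.
by_count = ArticleSelector(scores, {"type": "num_articles", "value": "2"})
print(by_count.select_articles())  # [1, 2]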
92 changes: 92 additions & 0 deletions interest/article_final_selection/process_article.py
@@ -0,0 +1,92 @@
""" Module for processing articles from gzip files."""
import gzip
import json
import logging
from typing import List, Union, Tuple
from interest.preprocessor.text_cleaner import TextCleaner

text_cleaner = TextCleaner()


def clean(text: str) -> str:
"""
Clean the input text using TextCleaner.
Args:
text (str): The input text to clean.
Returns:
str: The cleaned text.
"""
return text_cleaner.preprocess(text)

# pylint: disable=too-few-public-methods


class ArticleProcessor:
"""
Process individual articles from gzip files.
This class handles the processing of individual articles from
gzip files.
It reads the content of the article, cleans it using TextCleaner, and
determines whether the article title contains any keywords of
interest.
"""
def __init__(self, gzip_file_path: str, article_id: int):
"""
Initialize ArticleProcessor with the gzip file path and article ID.
Args:
gzip_file_path (str): The path to the gzip file.
article_id (int): The ID of the article.
"""
self._file_path = gzip_file_path
self._article_id = article_id
self._title: Union[str, None] = ''
self._body: Union[str, None] = ''
self.selected: bool = False

def _read_article_from_gzip(self) -> Tuple[Union[str, None],
Union[str, None]]:
"""
Read article content from a gzip file.
Returns:
Tuple[Union[str, None], Union[str, None]]: A tuple containing
the title and body of the article.
"""
try:
with gzip.open(self._file_path, 'rt') as f:
data = json.load(f)
articles = data.get('articles', {})
article = articles.get(str(self._article_id), {})
title = article.get('title', {})
body = article.get('body', {})
body_string = " ".join(body)
return title, body_string
except Exception as e: # pylint: disable=broad-except
logging.error("Error reading article %s from %s: %s",
str(self._article_id), self._file_path, e)
return None, None

def process_article(self, clean_keywords: List[str]) -> str:
"""
Process the article content.
Args:
clean_keywords (List[str]): A list of clean keywords.
Returns:
str: The processed article body.
"""
self._title, self._body = self._read_article_from_gzip()
if (self._title is None) or (self._body is None):
return ""
clean_title = clean(self._title)
title_with_keyword = any(keyword in clean_title
for keyword in clean_keywords)
if title_with_keyword:
self.selected = True
return ""
return clean(self._body)
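
For illustration, a sketch of the gzip layout that _read_article_from_gzip appears to expect, inferred from the code above. The example file, titles, and keyword are invented, and the exact output depends on TextCleaner (not shown in this diff), which needs its spaCy model available:

import gzip
import json

from interest.article_final_selection.process_article import ArticleProcessor

data = {"articles": {
    "1": {"title": "Meer windenergie op zee",
          "body": ["Eerste alinea.", "Tweede alinea."]},
    "2": {"title": "Nieuwe spoorlijn geopend",
          "body": ["Alinea over treinen en stations."]},
}}
with gzip.open("example.json.gz", "wt", encoding="utf-8") as f:
    json.dump(data, f)

processor = ArticleProcessor("example.json.gz", 1)
body = processor.process_article(clean_keywords=["windenergie"])
print(processor.selected)  # expected True: the cleaned title should keep "windenergie"
print(repr(body))          # '' -- a title match short-circuits body cleaning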
102 changes: 102 additions & 0 deletions interest/article_final_selection/process_articles.py
@@ -0,0 +1,102 @@
"""
This module contains functions for selecting articles based on keywords
and similarity scores.
"""
from typing import List, Tuple, Dict, Union
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from interest.models.tfidf import TfidfEmbedder
from interest.article_final_selection.process_article import ArticleProcessor
from interest.article_final_selection.process_article import clean
from interest.article_final_selection.article_selector import ArticleSelector


def process_articles(articles_filepath: str, clean_keywords: List[str]) -> (
Tuple)[List[str], List[int]]:
"""
Process articles from a CSV file.
Args:
articles_filepath (str): The path to the CSV file containing articles.
clean_keywords (List[str]): A list of clean keywords.
Returns:
Tuple[List[str], List[int]]: A tuple containing the processed article
bodies and selected indices.
"""
articles_df = pd.read_csv(articles_filepath)
article_bodies: List[str] = []
selected_indices: List[int] = []
for index, row in articles_df.iterrows():
article_processor = ArticleProcessor(row['file_path'],
row['article_id'])
processed_article_body = article_processor.process_article(
clean_keywords)
if article_processor.selected:
selected_indices.append(int(str(index)))
elif processed_article_body != "":
article_bodies.append(processed_article_body)
return article_bodies, selected_indices


def apply_tfidf_similarity(documents: List[str], keywords: List[str]) -> (
List)[float]:
"""
Apply TF-IDF similarity between documents and keywords.
Args:
documents (List[str]): A list of document bodies.
keywords (List[str]): A list of keywords.
Returns:
List[float]: A list of similarity scores.
"""
model = TfidfEmbedder(ngram_max=1, norm="l1", sublinear_tf=False, min_df=1,
max_df=1.0)
keywords_list = [" ".join(keywords)]
model.fit(documents)
embeddings_documents = model.transform(documents).tocsr()
embeddings_keywords = model.transform(keywords_list).tocsr()
similarity_scores = cosine_similarity(embeddings_keywords,
embeddings_documents)
return similarity_scores[0]


def select_top_articles(similarity_scores: List[float],
config: Dict[str, Union[str, float, int]]) \
-> List[int]:
"""
Select top articles based on similarity scores and configuration.
Args:
similarity_scores (List[float]): A list of similarity scores.
config (Dict[str, Union[str, float, int]]): Configuration for selecting articles.
Returns:
List[int]: A list of selected article indices.
"""
selector = ArticleSelector(similarity_scores, config)
selected_indices = selector.select_articles()
return selected_indices


def select_articles(articles_filepath: str, keywords: List[str],
config: Dict[str, Union[str, float, int]]) -> List[int]:
"""
Select articles based on keywords, similarity scores, and configuration.
Args:
articles_filepath (str): The path to the CSV file containing articles.
keywords (List[str]): A list of keywords.
config (Dict[str, Union[str, float, int]]): Configuration for selecting articles.
Returns:
List[int]: A list of selected article indices.
"""
clean_keywords = [clean(keyword) for keyword in keywords]
article_bodies, selected_indices = process_articles(articles_filepath,
clean_keywords)
similarity_scores = apply_tfidf_similarity(article_bodies, clean_keywords)
indices = select_top_articles(similarity_scores, config)
selected_indices.extend(indices)
return selected_indices
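
An end-to-end sketch of select_articles, reusing the example.json.gz file from the ArticleProcessor sketch above; the CSV column names file_path and article_id follow process_articles:

import pandas as pd

from interest.article_final_selection.process_articles import select_articles

pd.DataFrame({
    "file_path": ["example.json.gz", "example.json.gz"],
    "article_id": [1, 2],
}).to_csv("articles.csv", index=False)

selected = select_articles(
    "articles.csv",
    keywords=["windenergie", "zonne-energie"],
    config={"type": "threshold", "value": "0.02"},
)
print(selected)  # indices of articles kept via title match plus TF-IDF selection

Note that, per the code above, the indices returned by the title-match step are DataFrame row indices, while the indices appended by the TF-IDF step refer to positions within the list of non-title-matched bodies.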
9 changes: 7 additions & 2 deletions interest/document.py
@@ -3,6 +3,7 @@
This module defines the Document class, which represents a document
containing articles.
"""
import logging
from typing import Optional, List, Union
from datetime import datetime

@@ -33,8 +34,12 @@ def __init__(self, article_id: str, title: str,
self.id = article_id
self.title = title
if isinstance(body, list):
article_body = '\n'.join(body)
self.text = article_body
if any(item is None for item in body):
logging.warning("There is a None value in body")
self.text = ""
else:
article_body = '\n'.join(body)
self.text = article_body
else:
self.text = body

20 changes: 20 additions & 0 deletions interest/models/base.py
@@ -0,0 +1,20 @@
"""Base class for document embeddings."""

from abc import ABC, abstractmethod
from typing import Union, Sequence
import scipy
from numpy import typing as npt
import numpy as np


class BaseEmbedder(ABC):
"""Base class for creating document embeddings."""

@abstractmethod
def fit(self, documents: Sequence[str]) -> None:
"""Train the model on documents."""

@abstractmethod
def transform(self, documents: Union[str, Sequence[str]]) -> (
Union)[scipy.sparse.spmatrix, npt.NDArray[np.float_]]:
"""Get the embedding for a document."""