Skip to content

Commit

Permalink
Categorize timestamped (#9)
Browse files Browse the repository at this point in the history
* categorize filtered articles by timestamp

* fix pylint, flake8 and mypy errors

* fix flake8 error

* aggregate the article info for each timestamp into a CSV file

* fix flake8 error

* remove extra indentation

---------

Co-authored-by: parisa-zahedi <[email protected]>
  • Loading branch information
parisa-zahedi and parisa-zahedi authored Mar 25, 2024
1 parent 2de5d28 commit 7528042
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 16 deletions.
9 changes: 9 additions & 0 deletions interest/temporal_categorization/timestamped_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@ def _load_data(self):
with open(self._filename, 'r', encoding='utf-8') as file:
return json.load(file)

def data(self):
    """
    Return the JSON data held by this object.

    This is a plain accessor: it hands back the object stored in
    ``self._data`` itself, not a copy, so mutations made by the caller
    are visible through later calls.

    Returns:
        dict: The loaded JSON data.
    """
    # NOTE(review): presumably ``self._data`` is populated from
    # ``_load_data`` during construction -- confirm against ``__init__``.
    return self._data

def _get_timestamp(self):
"""
Extracts the timestamp from the data.
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ classifiers = [
"License :: OSI Approved :: MIT License",
]
dynamic = ["version"]
dependencies = ["tqdm","spacy"

dependencies = ["tqdm","pandas","pandas-stubs", "types-tqdm","spacy"
]

[project.optional-dependencies]
Expand Down
49 changes: 34 additions & 15 deletions scripts/categorize_by_timestamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
on their timestamps.
"""
import os
from shutil import move
import argparse
import logging
from typing import Iterable
from pathlib import Path
import pandas as pd
from tqdm import tqdm # type: ignore
from interest.temporal_categorization import PERIOD_TYPES
from interest.temporal_categorization.timestamped_data import TimestampedData

OUTPUT_FILE_NAME = 'articles'
FILENAME_COLUMN = 'file_path'
ARTICLE_ID_COLUMN = 'article_id'

if __name__ == "__main__":
parser = argparse.ArgumentParser("Categorize articles by timestamp.")

Expand Down Expand Up @@ -54,18 +58,33 @@

args.output_dir.mkdir(parents=True, exist_ok=True)

for timestamped_object in tqdm(timestamped_objects,
desc="Categorize by timestamp",
unit="file"):
timestamp = timestamped_object.categorize()
timestamp_folder = os.path.join(args.output_dir, str(timestamp))
if not os.path.exists(timestamp_folder):
os.makedirs(timestamp_folder)
try:
for timestamped_object in tqdm(timestamped_objects,
desc="Categorize by timestamp",
unit="file"):
try:
timestamp = timestamped_object.categorize()

timestamp_file_name = os.path.join(args.output_dir,
OUTPUT_FILE_NAME+'_' +
str(timestamp)+'.csv')
if os.path.isfile(timestamp_file_name):
df = pd.read_csv(timestamp_file_name)
else:
df = pd.DataFrame(columns=[FILENAME_COLUMN,
ARTICLE_ID_COLUMN])

new_row = {FILENAME_COLUMN: str(
timestamped_object.data()[FILENAME_COLUMN]),
ARTICLE_ID_COLUMN: str(
timestamped_object.data()[ARTICLE_ID_COLUMN])}
df = pd.concat([df, pd.DataFrame([new_row])],
ignore_index=True)

df.to_csv(timestamp_file_name, index=False)
except Exception as e: # pylint: disable=broad-except
logging.error("Error processing timestamped object: %s",
str(e))
except Exception as e: # pylint: disable=broad-except
logging.error("Error occurred in main loop: %s", str(e))

try:
move(timestamped_object.filename, timestamp_folder)
logging.warning("Moved %s to %s", timestamped_object.filename,
timestamp_folder)
except Exception as e: # pylint: disable=broad-except
logging.error("Error moving %s to %s : %s",
timestamped_object.filename, timestamp_folder, e)

0 comments on commit 7528042

Please sign in to comment.