Skip to content

Commit

Permalink
Merge pull request #5 from ilias-ant/feature/ia/v2
Browse files Browse the repository at this point in the history
v2 dataset
  • Loading branch information
ilias-ant authored Nov 14, 2022
2 parents e32c120 + dcb71d3 commit 33396ab
Show file tree
Hide file tree
Showing 13 changed files with 57,342 additions and 15,304 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ The project consists of the following components:
- **collectors**: a [Scrapy](https://scrapy.org/) project, responsible for scraping articles from [https://publications.americanalpineclub.org](https://publications.americanalpineclub.org/).
- **opensearch-cluster**: an [OpenSearch](https://opensearch.org/) cluster, where the scraped articles are indexed.
- **publishers**: functionality responsible for the publication of the articles index (e.g. as a Kaggle dataset).
- **notebooks**: a collection of Jupyter notebooks, for various dataset-based explorations and applications.
- **dataset**: the raw dataset, in CSV format.

## Citation
Expand Down
14 changes: 9 additions & 5 deletions aac_articles/collectors/collectors/item_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from scrapy.loader import ItemLoader

from . import items
from .processors import TakeLast

TRAILING_DOT_REGEX = r"\.$"


class ArticleLoader(ItemLoader):
Expand All @@ -13,24 +16,25 @@ class ArticleLoader(ItemLoader):
default_item_class = items.Article

body_out = Compose(Join())
author_out = Compose(TakeFirst(), str.strip, lambda x: re.sub(r".$", "", x))
author_out = Compose(TakeLast(), str.strip, lambda x: re.sub(TRAILING_DOT_REGEX, "", x))
climb_year_out = Compose(
lambda years: [y for y in years if "N/A" not in y],
TakeFirst(),
TakeLast(),
str.strip,
lambda x: re.sub(r".$", "", x),
lambda x: re.sub(TRAILING_DOT_REGEX, "", x),
)
publication_year_out = Compose(
lambda years: [y for y in years if "N/A" not in y],
TakeFirst(),
TakeLast(),
lambda x: x.replace("\n", ""),
lambda x: x.replace("|", ""),
str.strip,
lambda x: re.sub(r".$", "", x),
lambda x: re.sub(TRAILING_DOT_REGEX, "", x),
)
link_to_pdf_out = Compose(
TakeFirst(),
lambda url: parse.urljoin("https://publications.americanalpineclub.org/", url)
if not url.startswith("http")
else url,
)
referer_out = Compose(TakeFirst(), lambda x: x.decode("utf-8"))
1 change: 1 addition & 0 deletions aac_articles/collectors/collectors/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ class Article(TimestampedItem):
climb_year = scrapy.Field()
publication_year = scrapy.Field()
link_to_pdf = scrapy.Field()
referer = scrapy.Field()
2 changes: 1 addition & 1 deletion aac_articles/collectors/collectors/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def process_item(self, item, spider):

expectations = (
item["publication"] in ("ANAM", "AAJ"),
"publication_year" in item,
("publication_year" in item) and (len(item["publication_year"]) == 4),
)

if not all(expectations):
Expand Down
17 changes: 17 additions & 0 deletions aac_articles/collectors/collectors/processors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
class TakeLast:
    """Return the last non-null, non-empty value from the values received.

    Typically used as an output processor for single-valued fields.
    It doesn't receive any ``__init__`` method arguments, nor does it
    accept Loader contexts.

    Example:
        >>> proc = TakeLast()
        >>> proc(['one', 'two', 'three', ''])
        'three'
    """

    def __call__(self, values):
        """Scan ``values`` from the end and return the first usable entry.

        Args:
            values: The collected values. Sequences are walked backwards
                without being copied; one-shot iterables such as
                generators are materialized first.

        Returns:
            The last value that is neither ``None`` nor the empty string,
            or ``None`` when no such value exists (including empty input).
        """
        try:
            candidates = reversed(values)
        except TypeError:
            # Not reversible in place (e.g. a generator): materialize it so
            # that it can still be scanned from the end.
            candidates = reversed(list(values))

        for value in candidates:
            # Only ``None`` and "" count as missing; other falsy values
            # such as 0 are legitimate results.
            if value is not None and value != "":
                return value
        return None
10 changes: 9 additions & 1 deletion aac_articles/collectors/collectors/spiders/articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def parse_article(response, metadata):
article = item_loaders.ArticleLoader(response=response)

article.add_value("url", response.url)
article.add_value("referer", response.request.headers.get("Referer"))
article.add_value("type", metadata["type"])
article.add_value("publication", metadata["publication"])
article.add_xpath("title", '//div[contains(@class, "article-body")]//h2[@class="title"]/text()')
Expand Down Expand Up @@ -87,7 +88,14 @@ def parse_article(response, metadata):
article.add_xpath(
"publication_year",
'//div[contains(@class, "article-body")]/div/'
'span[contains(., "Publication Year") and contains(., "Author") and contains(., "Climb Year")]/i[3]/text()',
'span[contains(., "Publication Year") and contains(., "Author") and contains(., "Climb Year")]'
"/i[3]/text()",
)
article.add_xpath(
"publication_year",
'//div[contains(@class, "article-body")]/div/'
'span[contains(., "Publication Year") and not(contains(., "Author")) and contains(., "Climb Year")]'
"/i[2]/text()",
)

yield article.load_item()
71,021 changes: 55,740 additions & 15,281 deletions aac_articles/dataset/kaggle/articles.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion aac_articles/dataset/kaggle/dataset-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -95,5 +95,5 @@
],
"name": "articles",
"homepage": "https://www.kaggle.com/datasets/iantonopoulos/american-alpine-club-articles",
"version": "1.0.0"
"version": "2.0.0"
}
Empty file.
251 changes: 251 additions & 0 deletions aac_articles/notebooks/eda.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "721fb2a0",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "a8080c04",
"metadata": {},
"outputs": [],
"source": [
"articles = pd.read_csv(\"../dataset/kaggle/articles.csv\", dtype={\"publication_year\": str})"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "39526889",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>type</th>\n",
" <th>publication</th>\n",
" <th>title</th>\n",
" <th>location</th>\n",
" <th>body</th>\n",
" <th>climb_year</th>\n",
" <th>link_to_pdf</th>\n",
" <th>author</th>\n",
" <th>publication_year</th>\n",
" <th>scraped_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Accident Reports</td>\n",
" <td>ANAM</td>\n",
" <td>Large Group Caught in Slide</td>\n",
" <td>Washington, Silver Basic (Crystal Mountain)</td>\n",
" <td>On the morning of Saturday, December 11, the f...</td>\n",
" <td>2021</td>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Northwest Avalanche Cente</td>\n",
" <td>202</td>\n",
" <td>2022-11-05T20:12:06.963097</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Accident Reports</td>\n",
" <td>ANAM</td>\n",
" <td>Deeply Buried Weak Layer — Familiarity Heuristic</td>\n",
" <td>Colorado, Park Range, North Fork of Fish Creek</td>\n",
" <td>On March 19, 2022, two backcountry skiers plan...</td>\n",
" <td>2022</td>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Colorado Avalanche Information Cente</td>\n",
" <td>202</td>\n",
" <td>2022-11-05T20:12:23.675110</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Accident Reports</td>\n",
" <td>ANAM</td>\n",
" <td>Long Fall on Rock — Ledge Collapsed</td>\n",
" <td>Montana, Gallatin Canyon, The Watchtower</td>\n",
" <td>On July 16, Greg Sievers (63) and Rob Meshew (...</td>\n",
" <td>2021</td>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Greg Sievers and Mountain Projec</td>\n",
" <td>202</td>\n",
" <td>2022-11-05T20:14:12.658879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Accident Reports</td>\n",
" <td>ANAM</td>\n",
" <td>Rappel Anchor Failure</td>\n",
" <td>Minnesota, Taylors Falls</td>\n",
" <td>On March 25, Climber 1 (31) and Climber 2 (23)...</td>\n",
" <td>2021</td>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Climber 1 and The Editor</td>\n",
" <td>202</td>\n",
" <td>2022-11-05T20:14:01.360572</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Accident Reports</td>\n",
" <td>ANAM</td>\n",
" <td>Triggered Avalanche Catches Two Parties</td>\n",
" <td>New Hampshire, Mt. Washington, Tuckerman Ravine</td>\n",
" <td>On December 5, 2021, winds were light in Tucke...</td>\n",
" <td>2021</td>\n",
" <td>https://publications.americanalpineclub.org/ar...</td>\n",
" <td>Mt. Washington Avalanche Cente</td>\n",
" <td>202</td>\n",
" <td>2022-11-05T20:12:26.421852</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url type \\\n",
"0 https://publications.americanalpineclub.org/ar... Accident Reports \n",
"1 https://publications.americanalpineclub.org/ar... Accident Reports \n",
"2 https://publications.americanalpineclub.org/ar... Accident Reports \n",
"3 https://publications.americanalpineclub.org/ar... Accident Reports \n",
"4 https://publications.americanalpineclub.org/ar... Accident Reports \n",
"\n",
" publication title \\\n",
"0 ANAM Large Group Caught in Slide \n",
"1 ANAM Deeply Buried Weak Layer — Familiarity Heuristic \n",
"2 ANAM Long Fall on Rock — Ledge Collapsed \n",
"3 ANAM Rappel Anchor Failure \n",
"4 ANAM Triggered Avalanche Catches Two Parties \n",
"\n",
" location \\\n",
"0 Washington, Silver Basic (Crystal Mountain) \n",
"1 Colorado, Park Range, North Fork of Fish Creek \n",
"2 Montana, Gallatin Canyon, The Watchtower \n",
"3 Minnesota, Taylors Falls \n",
"4 New Hampshire, Mt. Washington, Tuckerman Ravine \n",
"\n",
" body climb_year \\\n",
"0 On the morning of Saturday, December 11, the f... 2021 \n",
"1 On March 19, 2022, two backcountry skiers plan... 2022 \n",
"2 On July 16, Greg Sievers (63) and Rob Meshew (... 2021 \n",
"3 On March 25, Climber 1 (31) and Climber 2 (23)... 2021 \n",
"4 On December 5, 2021, winds were light in Tucke... 2021 \n",
"\n",
" link_to_pdf \\\n",
"0 https://publications.americanalpineclub.org/ar... \n",
"1 https://publications.americanalpineclub.org/ar... \n",
"2 https://publications.americanalpineclub.org/ar... \n",
"3 https://publications.americanalpineclub.org/ar... \n",
"4 https://publications.americanalpineclub.org/ar... \n",
"\n",
" author publication_year \\\n",
"0 Northwest Avalanche Cente 202 \n",
"1 Colorado Avalanche Information Cente 202 \n",
"2 Greg Sievers and Mountain Projec 202 \n",
"3 Climber 1 and The Editor 202 \n",
"4 Mt. Washington Avalanche Cente 202 \n",
"\n",
" scraped_at \n",
"0 2022-11-05T20:12:06.963097 \n",
"1 2022-11-05T20:12:23.675110 \n",
"2 2022-11-05T20:14:12.658879 \n",
"3 2022-11-05T20:14:01.360572 \n",
"4 2022-11-05T20:12:26.421852 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"articles.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "eed296f5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['url',\n",
" 'type',\n",
" 'publication',\n",
" 'title',\n",
" 'location',\n",
" 'body',\n",
" 'climb_year',\n",
" 'link_to_pdf',\n",
" 'author',\n",
" 'publication_year',\n",
" 'scraped_at']"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(articles.columns)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 2 additions & 2 deletions aac_articles/publishers/kaggle/publisher.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def __init__(self, host: str, port: str, auth: tuple):

self.data_dir = "aac_articles/dataset/kaggle"
self.data_file = f"articles.csv"
self.no_publish_cols = ["scraped_at", "referer"]
self.client = OpenSearch(
hosts=[{"host": host, "port": port}],
http_compress=True, # enables gzip compression for request bodies
Expand Down Expand Up @@ -56,8 +57,7 @@ def publish(self, new_version: bool = False):

dataset = pd.DataFrame.from_records(dataset)

if dataset["url"].unique().size < len(dataset.index):
logging.warning("duplicate article URLs found in dataset - aborting publishing.")
dataset.drop(self.no_publish_cols, axis=1, inplace=True)

dataset.to_csv(f"{self.data_dir}/{self.data_file}", index=False)

Expand Down
Loading

0 comments on commit 33396ab

Please sign in to comment.