Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pull in more fields from mods, drop milstein.csv #171

Merged
merged 16 commits into from
Nov 14, 2024
7,324 changes: 3,662 additions & 3,662 deletions data/images.ndjson

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data/lat-lon-to-ids.json

Large diffs are not rendered by default.

247,826 changes: 247,826 additions & 0 deletions data/mods-details.json

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions data/originals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

Files in this folder are "originals"—not derived from some other data source.

- `data/originals/milstein.csv`: the vintage 2013 CSV file from the NYPL that started it all
- `data/originals/Milstein_data_for_DV.csv`: the 2024 update to the CSV
- `data/originals/Milstein_data_for_DV.csv`: CSV from the NYPL (2024)
- Street listings
- `manhattan-streets.txt`: https://geographic.org/streetview/usa/ny/new_york/new_york.html
- `brooklyn-streets.txt`: https://geographic.org/streetview/usa/ny/kings/brooklyn.html
Expand Down
63,092 changes: 0 additions & 63,092 deletions data/originals/milstein.csv

This file was deleted.

3 changes: 1 addition & 2 deletions oldnyc/geocode/coders/title_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from oldnyc.geocode.boroughs import boroughs_pat, guess_borough, point_to_borough
from oldnyc.geocode.coders.coder_utils import get_lat_lng_from_geocode
from oldnyc.geocode.geocode_types import Coder, Locatable
from oldnyc.geocode.record import clean_title
from oldnyc.item import Item

# Borough: str1 - str2
Expand Down Expand Up @@ -53,7 +52,7 @@ def strip_trivia(txt: str) -> str:


def clean_and_strip_title(title: str) -> str:
title = clean_title(title)
title = title.replace("[", "").replace("]", "")
title = re.sub(r" +:", ":", title)
# east side
# west corner
Expand Down
17 changes: 13 additions & 4 deletions oldnyc/geocode/generate_js.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import json
import sys
from collections import defaultdict
from datetime import date
from json import encoder
from typing import Sequence

from oldnyc.geocode import record
from oldnyc.geocode.geocode_types import Locatable
from oldnyc.ingest.dates import extract_years
from oldnyc.item import Item

encoder.FLOAT_REPR = lambda o: format(o, ".6f") # type: ignore
Expand All @@ -18,6 +19,15 @@
LocatedRecord = tuple[Item, str | None, Locatable | None]


def get_date_range(date_str: str) -> tuple[date, date]:
    """Return the (earliest, latest) dates covered by the years in ``date_str``.

    Every year produced by ``extract_years`` is mapped to January 1 of that
    year. When no usable year is extracted, the catch-all range
    1850-01-01 through 1999-12-31 is returned instead.
    """
    # TODO: this is a bit wonky; could use clean_date more directly.
    extracted = extract_years(date_str)
    if extracted and extracted != [""]:
        year_starts = [date(int(year), 1, 1) for year in extracted]
        return min(year_starts), max(year_starts)
    # Nothing usable was extracted: fall back to the widest plausible range.
    return date(1850, 1, 1), date(1999, 12, 31)


def _generateJson(located_recs: Sequence[LocatedRecord], lat_lon_map: dict[str, str]):
out: dict[str, list[str]] = {}
# "lat,lon" -> list of items
Expand Down Expand Up @@ -45,10 +55,9 @@ def _generateJson(located_recs: Sequence[LocatedRecord], lat_lon_map: dict[str,
points = 0
photos = 0
for lat_lon, recs in ll_to_id.items():
rec_dates = [(r, record.get_date_range(r.date or "")) for r in recs]
# XXX the "if" filter here probably doesn't do anything
rec_dates = [(r, get_date_range(r.date or "")) for r in recs]
sorted_recs = sorted(
[rdr for rdr in rec_dates if rdr[1] and rdr[1][1]],
rec_dates,
key=lambda rdr: rdr[1][1],
)
no_date += len(recs) - len(sorted_recs)
Expand Down
6 changes: 0 additions & 6 deletions oldnyc/geocode/geocode_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,6 @@
from oldnyc.item import Item


class Location(TypedDict):
address: str
lat: float
lon: float


class Locatable(TypedDict):
address: str
"""Can be either a geolocatable address or @lat,lng"""
Expand Down
43 changes: 0 additions & 43 deletions oldnyc/geocode/record.py

This file was deleted.

33 changes: 31 additions & 2 deletions oldnyc/ingest/collect_mods.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@ def as_list(dict_or_list: dict | list) -> list:
return [dict_or_list] if isinstance(dict_or_list, dict) else dict_or_list


def is_photographer(name: dict) -> bool:
    """Return True if a MODS ``name`` entry carries a creator-like role code.

    The accepted role codes are ``pht`` (Photographer), ``art`` (Artist),
    ``ltg`` (Lithographer) and ``cre`` (Creator).

    Args:
        name: A MODS ``name`` object; it must contain a ``role`` entry whose
            ``roleTerm`` value(s) each have a ``"$"`` text field.

    Raises:
        KeyError: If ``role``, ``roleTerm`` or ``"$"`` is missing.
    """
    creator_codes = ("pht", "art", "ltg", "cre")

    # The JSON encoding collapses a single-element list into a bare dict, so
    # both ``role`` and ``roleTerm`` may be either a dict or a list of dicts.
    roles = name["role"]
    if isinstance(roles, dict):
        roles = [roles]

    for role in roles:
        terms = role["roleTerm"]
        if isinstance(terms, dict):
            # Iterating a bare dict would yield its string keys and make
            # term["$"] fail with TypeError.
            terms = [terms]
        if any(term["$"] in creator_codes for term in terms):
            return True
    return False


def extract_date(origin_info: dict | list) -> str | None:
    """Return the date text of the first origin entry that has one.

    Origins are scanned in order. Within each origin, ``dateIssued`` is
    checked before ``dateCreated``. The ``"$"`` texts of the first non-empty
    field found are joined with ``", "``. Returns None when no origin has
    either field.
    """
    for origin in as_list(origin_info):
        for field in ("dateIssued", "dateCreated"):
            raw = origin.get(field)
            if not raw:
                continue
            # A field may hold one date object or several (e.g. a range).
            parts = [entry["$"] for entry in as_list(raw)]
            return ", ".join(parts)
    return None


if __name__ == "__main__":
mods_dir, item_details_dir = sys.argv[1:]

Expand All @@ -30,9 +44,24 @@ def as_list(dict_or_list: dict | list) -> list:
mods = resp["mods"]
titles = as_list(mods["titleInfo"])
assert titles[0]["usage"] == "primary"

title_strs = [t["title"]["$"] for t in titles]
mapping[uuid] = {"titles": title_strs}

# TODO: output as a list
names = as_list(mods.get("name", []))
creators = [name["namePart"]["$"] for name in names if is_photographer(name)]
creator = ";".join(creators) if creators else None

origin = mods.get("originInfo")
date = extract_date(origin) if origin else None

sources = []
relatedItem = mods["relatedItem"]
while relatedItem:
assert relatedItem["type"] == "host"
sources = [relatedItem["titleInfo"]["title"]["$"]] + sources
relatedItem = relatedItem.get("relatedItem")

mapping[uuid] = {"titles": title_strs, "creator": creator, "sources": sources, "date": date}

# 104425.json: no back image
# 1552839.json: has back image
Expand Down
163 changes: 73 additions & 90 deletions oldnyc/ingest/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,47 @@ def sort_uniq(xs: list[str]) -> list[str]:
}


# Creator-name overrides applied in run(): the key is a creator string as
# returned by clean_creator(), the value is the string stored on the Item
# instead. Creators without an entry are kept unchanged.
# NOTE(review): the values look like the spellings used by the older CSV
# data, presumably kept so the output stays stable — confirm.
CREATOR_PATCHES = {
"Welles, Burton F. (Burton Frederick), 1872-": "Welles & Co.--Publisher",
"Sperr, Percy Loomis, 1890-1964": "Sperr, Percy Loomis",
"Wurts Bros. (New York, N.Y.)": "Wurts Brothers",
"Ewing Galloway (Agency)": "Galloway, Ewing",
"Underhill, Irving, -1960": "Underhill, Irving,d. 1960",
"Tiemann, Hermann Newell (1863-1957)": "Tiemann, Hermann Newell",
"Fass, John S. (John Stroble), 1890-1973": "Fass, John S. (John Stroble),b. 1890",
"Van der Weyde, William M. (William Manley), 1870-1928": "Van der Weyde, William M. (William Manley)",
"Abbott, Berenice, 1898-1991": "Abbott, Berenice",
"Fairchild Aerial Surveys, inc.": "Fairchild Aerial Surveys, Inc.",
"Armbruster, Eugene L., 1865-1943": "Armbruster, Eugene L.",
}

# Source-title overrides consulted by patch_source(): the key is a source
# string after patch_source()'s own normalization, the value is the string
# returned instead. Sources without an entry pass through unchanged.
# NOTE(review): the values look like the source strings used by the older
# CSV data (trailing periods, bracketed titles) — confirm.
SOURCE_PATCHES = {
"Fifth Avenue, New York, from start to finish": "Fifth Avenue, New York, from start to finish.",
"Itineraire pittoresque du fleuve Hudson et des parties laterales de l'Amerique du Nord, d'apres les dessins originaux pris sur les lieux. Atlas": "Itineraire pittoresque du fleuve Hudson et des parties laterales de l'Amerique du Nord, d'apres les dessins originaux pris sur les lieux. Atlas.",
"Apartment houses of the metropolis": "Apartment houses of the metropolis.",
"Amerique septentrionale : vues des chutes du Niagara": "Amerique septentrionale : vues des chutes du Niagara.",
"Photographic views of New York City, 1870's-1970's. Supplement. / Manhattan": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Manhattan",
"Photographic views of New York City, 1870's-1970's. Supplement. / Brooklyn": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Brooklyn",
"Photographic views of New York City, 1870's-1970's. Supplement. / Queens": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Queens",
"Photographic views of New York City, 1870's-1970's. Supplement. / Bronx": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Bronx",
"Photographic views of New York City, 1870's-1970's. Supplement. / Topics": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Topics",
"Collection of photographs of New York City / Manhattan": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Manhattan",
"Collection of photographs of New York City / Brooklyn": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Brooklyn",
"Collection of photographs of New York City / Bronx": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Bronx",
"Collection of photographs of New York City / Queens": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Queens",
"Collection of photographs of New York City / Subjects": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Subjects",
"Collection of photographs of New York City, 1931-1942": "[Collection of photographs of New York City, 1931-1942.]",
"Photographic negatives of the New York City Tenement House Department": "Photographic negatives of the New York City Tenement House Department, 1902-1914",
"A Pictorial description of Broadway": "A Pictorial description of Broadway / by the Mail & Express.",
"The World's loose leaf album of apartment houses: containing views and ground plans of the principal high class apartment houses in New York City, together with a map showing the situation of these houses, transportation facilities, etc.": "The World's loose leaf album of apartment houses, containing views and ground plans of the principal high class apartment houses in New York City, together with a map showing the situation of these houses, transportation facilities, etc.",
"[Collection of photographs of New York City, 1931-1942]": "[Collection of photographs of New York City, 1931-1942.]",
"Photographs of Madison Square Garden": "[Photographs of Madison Square Garden. New York, 1925]",
"Forty etchings, from sketches made with the camera lucida, in North America, in 1827 and 1828": "Forty etchings, from sketches made with the camera lucida, in North America, in 1827 and 1828.",
"Photographic views of the construction of the New York City subway system, 1901-1905": "Photographic views of the construction of the New York City subway system, 1901-1905.",
"Supplement to Apartment houses of the metropolis": "Supplement to Apartment houses of the metropolis.",
}


def outside_nyc(geographics: list[str]) -> bool:
for g in geographics:
if (g in STATES and g not in TRISTATE) or g in OTHER_OUTSIDE:
Expand All @@ -55,11 +96,21 @@ def strip_punctuation(s: str) -> str:
return re.sub(r"[^\w]", "", s)


def patch_source(source: str) -> str:
    """Normalize a source string and apply any override from SOURCE_PATCHES.

    An empty source is returned as-is. Otherwise the NYPL "from the
    collections of" phrase is removed, Daniel B. Austin collection titles
    are wrapped in square brackets (opening bracket prepended, closing
    bracket added after "1914"), and the result is looked up in
    SOURCE_PATCHES, falling back to the normalized string itself.
    """
    if source == "":
        return ""

    nypl_phrase = ", from the collections of the New York Public Library"
    patched = source.replace(nypl_phrase, "")

    if patched.startswith("Collection of photographs taken by Daniel B. Austin"):
        patched = ("[" + patched).replace("1914", "1914]")

    return SOURCE_PATCHES.get(patched, patched)


# These are sometimes used as placeholders for unknown dates.
GENERIC_DATES = {"1887, 1986", "1870, 1970", "1887, 1964", "1900, 1999", "1960, 1990"}


def run():
csv2013 = {
row["DIGITAL_ID"]: row
for row in csv.DictReader(open("data/originals/milstein.csv", encoding="latin-1"))
}
csv2024 = {
row["image_id"].lower(): row
for row in csv.DictReader(open("data/originals/Milstein_data_for_DV_2.csv"))
Expand All @@ -76,108 +127,34 @@ def run():

counters = Counter[str]()
out = open("data/images.ndjson", "w")
ids = [*sorted(csv2013.keys())]
ids = [*sorted(csv2024.keys())]
for id in tqdm(ids):
counters["num records"] += 1
row = csv2013[id]

date_str = row["CREATED_DATE"]

title = row["IMAGE_TITLE"].strip()
assert title

alt_title = row["ALTERNATE_TITLE"].strip()
if not alt_title:
alt_title = None
source = row["SOURCE"].strip()

creator = row["CREATOR"].strip()

row2 = csv2024[id]

uuid = row2["item_uuid"]
url = row2["digital_collections_url"]
title2 = row2["title"].strip()
date2 = row2["date"]
if date2 == "1887, 1986" or date2 == "1870, 1970":
date2 = "" # 1887-1986 is used as "unknown"
counters["date2: generic"] += 1

topics = sort_uniq(json.loads(row2["subject/topic"]))
geographics = sort_uniq(json.loads(row2["subject/geographic"]))
names = sort_uniq(json.loads(row2["subject/name"]))
temporals = sort_uniq(json.loads(row2["subject/temporal"]))
mods_detail = mods_details.get(uuid)

dates = [date_str, date2]
dates = [clean_date(normalize_whitespace(d.strip())) for d in dates]
date_str, date2 = dates
if date_str != date2:
counters["mismatch: date"] += 1
if date2 and not date_str:
counters["date: added"] += 1
elif date_str and not date2:
counters["date: dropped"] += 1
else:
counters["date: changed"] += 1

# print("---")
# print(id)
# print(date_str)
# print(date2)

titles = [title, title2]
titles = [clean_title(normalize_whitespace(t)) for t in titles]
title, title2 = titles

if title != title2:
counters["mismatch: title mismatch"] += 1
if title == title2 + ".":
counters["title mismatch: drop dot"] += 1
elif title2 == title + "]":
counters["title mismatch: add bracket"] += 1
elif strip_punctuation(title).lower() == strip_punctuation(title2).lower():
counters["title mismatch: other punctuation"] += 1
elif not title2.isascii():
counters["title mismatch: non-ascii"] += 1
elif title == "No Title":
counters["title mismatch: add title"] += 1
# print("---", id, "---")
# print(title)
# print(title2)
elif "Directories" in topics or "directory" in title2.lower():
counters["title mismatch: directory"] += 1
elif outside_nyc(geographics):
counters["title mismatch: outside nyc"] += 1
elif title2.replace(" and ", "") == title:
counters["title mismatch: add and"] += 1
else:
counters["title mismatch: other"] += 1
# print("---", id, "---")
# print(title)
# print(title2)
date2 = row2["date"] or (mods_detail["date"] if mods_detail else None) or ""
if date2 in GENERIC_DATES:
date2 = ""
counters["date2: generic"] += 1
date2 = clean_date(normalize_whitespace(date2.strip()))

title2 = clean_title(normalize_whitespace(title2))

mods_detail = mods_details.get(uuid)
# TODO: store as array
alt_title2 = (
"\n".join(mods_detail.get("titles"))
if mods_detail
else (row2["alternative_title"].strip() if row2["alternative_title"] else None)
)
alt_titles = [alt_title, alt_title2]
alt_titles = [clean_title(normalize_whitespace(t)) if t else None for t in alt_titles]
alt_title, alt_title2 = alt_titles

if alt_title != alt_title2:
counters["mismatch: alt_title mismatch"] += 1
# print("---")
# print(alt_title)
# print(alt_title2)
alt_title2 = mods_detail.get("titles")[1:] if mods_detail else None
if not alt_title2:
alt_title2 = [row2["alternative_title"].strip()] if row2["alternative_title"] else []
alt_title2 = [clean_title(normalize_whitespace(t)) for t in alt_title2]

if alt_title:
counters["alt_title"] += 1
if alt_title2:
counters["alt_title2"] += 1

Expand Down Expand Up @@ -228,16 +205,22 @@ def run():
counters["filtered: directory"] += 1
continue

creator = (clean_creator(mods_detail["creator"] or "") or None) if mods_detail else None
creator = CREATOR_PATCHES.get(creator, creator) if creator else None

source = " / ".join(mods_detail["sources"]) if mods_detail else ""
source = patch_source(source)

r = Item(
id=id,
uuid=uuid,
url=url,
photo_url=f"https://images.nypl.org/?id={id}&t=w",
date=date2 or date_str or None,
date=date2 or None,
title=title2,
alt_title=alt_title2 or [],
back_id=back_id,
creator=clean_creator(creator) or None,
creator=creator,
source=source,
back_text=back_text,
back_text_source=back_text_source,
Expand Down