Skip to content

Commit

Permalink
make maps filterable by dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
kdutia committed Nov 18, 2024
1 parent 159e296 commit 7ad44c1
Showing 1 changed file with 48 additions and 9 deletions.
57 changes: 48 additions & 9 deletions src/streamlit_apps/searchable_world_map.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from typing import Optional
from pathlib import Path
import duckdb
import geopandas as gpd
Expand Down Expand Up @@ -35,27 +36,41 @@ def load_data():
return db


def get_geography_count_for_texts(texts: list[str]) -> pd.DataFrame:
def get_geography_count_for_texts(
texts: list[str], corpus_type_names: Optional[list[str]]
) -> pd.DataFrame:
"""
Get the number of paragraphs containing any of the given texts, grouped by geography.
Returns dataframe with columns 'geography ISO', and 'count'.
"""
texts = [f"\\b{text.lower()}\\b" for text in texts]
regex = f".*({'|'.join(texts)}).*"
results_df = db.sql(
f"""

if corpus_type_names is None:
corpus_type_names_clause = ""
else:
corpus_type_names_string = (
"(" + ",".join([f"'{name}'" for name in corpus_type_names]) + ")"
)
corpus_type_names_clause = f"""AND "document_metadata.corpus_type_name" IN {corpus_type_names_string} """

sql_query = f"""
SELECT "document_metadata.geographies", COUNT(*)
FROM open_data
WHERE "text_block.language" = 'en'
AND (lower("text_block.text") SIMILAR TO '{regex}')
AND "document_metadata.geographies" IS NOT NULL
AND "document_metadata.geographies" <> ['XAA']
AND ("text_block.type" = 'title' OR "text_block.type" = 'Text' OR "text_block.type" = 'sectionHeading')
{corpus_type_names_clause}
GROUP BY "document_metadata.geographies"
ORDER BY COUNT(*) DESC
"""
).to_df()

print(sql_query)

results_df = db.sql(sql_query).to_df()

results_df["document_metadata.geographies"] = results_df[
"document_metadata.geographies"
Expand All @@ -76,6 +91,19 @@ def get_num_paragraphs_in_db() -> int:
return db.sql("SELECT COUNT(*) FROM open_data").to_df().iloc[0, 0]


@st.cache_data
def get_unique_corpus_type_names() -> list[str]:
    """
    Return the distinct corpus type names found in the open_data table.

    NULLs are excluded in SQL so the result is a plain list of strings,
    avoiding a second filtering pass in Python. Cached by Streamlit so the
    query only runs once per session.
    """
    return (
        db.sql(
            """
            SELECT DISTINCT "document_metadata.corpus_type_name"
            FROM open_data
            WHERE "document_metadata.corpus_type_name" IS NOT NULL
            """
        )
        .to_df()["document_metadata.corpus_type_name"]
        .tolist()
    )


@st.cache_data
def load_world_geometries():
"""
Expand All @@ -97,6 +125,7 @@ def load_world_geometries():

def plot_country_map(
keywords: list[str],
corpus_type_names: Optional[list[str]] = None,
normalize_counts: bool = False,
axis=None,
):
Expand All @@ -105,7 +134,7 @@ def plot_country_map(
Returns the raw results.
"""
results_df = get_geography_count_for_texts(keywords)
results_df = get_geography_count_for_texts(keywords, corpus_type_names)

# normalise by paragraph_count_by_geography
if normalize_counts:
Expand Down Expand Up @@ -172,18 +201,20 @@ def plot_country_map(


def plot_normalised_unnormalised_subplots(
kwds,
kwds, corpus_type_names: Optional[list[str]] = None
) -> tuple[plt.Figure, pd.DataFrame, pd.DataFrame]: # type: ignore
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 9), dpi=300)

df_unnorm = plot_country_map(
kwds,
corpus_type_names=corpus_type_names,
normalize_counts=False,
axis=ax1,
)

df_norm = plot_country_map(
kwds,
corpus_type_names=corpus_type_names,
normalize_counts=True,
axis=ax2,
)
Expand All @@ -197,7 +228,6 @@ def plot_normalised_unnormalised_subplots(
db = load_data()
world = load_world_geometries()
num_paragraphs_in_db = get_num_paragraphs_in_db()
paragraph_count_by_geography = get_geography_count_for_texts([".*"])

st.title("Searchable World Map")
st.markdown(
Expand All @@ -210,6 +240,13 @@ def plot_normalised_unnormalised_subplots(
- `elephants?`: match "elephant" and "elephants"
""")

corpus_type_names = get_unique_corpus_type_names()
st.markdown("## Select corpus type")
corpus_types = st.multiselect(
"Select corpus types", corpus_type_names, default=corpus_type_names
)
paragraph_count_by_geography = get_geography_count_for_texts([".*"], corpus_types)

kwds = st.text_input(
"Enter keywords separated by commas (spaces next to commas will be ignored)"
)
Expand All @@ -218,7 +255,7 @@ def plot_normalised_unnormalised_subplots(
kwds = [word.strip() for word in kwds.split(",")]

st.markdown("## all keywords")
fig, data1, data2 = plot_normalised_unnormalised_subplots(kwds)
fig, data1, data2 = plot_normalised_unnormalised_subplots(kwds, corpus_types)
n_paragraphs = data1["count"].sum()
percentage = round(n_paragraphs / num_paragraphs_in_db * 100, 2)
st.markdown(f"Num paragraphs: {n_paragraphs:,} ({percentage}%)")
Expand All @@ -227,7 +264,9 @@ def plot_normalised_unnormalised_subplots(
if len(kwds) > 1:
for keyword in kwds:
st.markdown(f"## {keyword}")
fig, data1, data2 = plot_normalised_unnormalised_subplots([keyword])
fig, data1, data2 = plot_normalised_unnormalised_subplots(
[keyword], corpus_types
)
n_paragraphs = data1["count"].sum()
percentage = round(n_paragraphs / num_paragraphs_in_db * 100, 2)
st.markdown(f"Num paragraphs: {n_paragraphs:,} ({percentage}%)")
Expand Down

0 comments on commit 7ad44c1

Please sign in to comment.