
Cluster multiple thresholds v2 #2437

Merged · 16 commits merged into master on Oct 3, 2024

Conversation

@RobinL commented Sep 30, 2024

Supersedes #2414

@RobinL commented Oct 1, 2024

Testing approach
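The script below generates a random graph, assigns uniform random match probabilities to its edges, clusters at several thresholds in a single call, and asserts that the number of distinct clusters at each threshold matches the output of the existing single-threshold function: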
```python
import random

import duckdb
import networkx as nx
import numpy as np
import pandas as pd

from splink import DuckDBAPI
from splink.internals.clustering import (
    cluster_pairwise_predictions_at_multiple_thresholds,
    cluster_pairwise_predictions_at_threshold,
)

# ":default:" uses duckdb's default module-level connection, so plain
# duckdb.sql() calls below can query the results
db_api = DuckDBAPI(":default:")


def generate_random_graph(graph_size, seed=None):
    if not seed:
        seed = random.randint(5, 1000000)

    # Density 1/n gives an average degree of ~1, producing many small components
    density = 1 / graph_size
    print(f"Graph size: {graph_size}, Density: {density}")

    graph = nx.fast_gnp_random_graph(graph_size, density, seed=seed, directed=False)
    return graph


def nodes_and_edges_from_graph(G):
    edges = nx.to_pandas_edgelist(G)
    edges.columns = ["unique_id_l", "unique_id_r"]

    nodes = pd.DataFrame({"unique_id": list(G.nodes)})

    return nodes, edges


G = generate_random_graph(1_000)
combined_nodes, combined_edges = nodes_and_edges_from_graph(G)
combined_edges["match_probability"] = np.random.uniform(0, 1, len(combined_edges))


# Thresholds include near-duplicate values and the 0.0 and 1.0 extremes
thresholds = [0.0, 0.5, 0.501, 0.502, 0.53, 0.540, 1.0]


all_clusters = cluster_pairwise_predictions_at_multiple_thresholds(
    combined_nodes,
    combined_edges,
    node_id_column_name="unique_id",
    db_api=db_api,
    match_probability_thresholds=thresholds,
)


# The output has one cluster column per threshold; pair each with its threshold
cluster_cols = [c.name for c in all_clusters.columns if "cluster" in c.name]

zipped_clusters = list(zip(thresholds, cluster_cols))
zipped_clusters

all_clusters_ddb = all_clusters.as_duckdbpyrelation()

for threshold, cluster_col in zipped_clusters:
    print("--")
    single_res = cluster_pairwise_predictions_at_threshold(
        combined_nodes,
        combined_edges,
        node_id_column_name="unique_id",
        db_api=db_api,
        threshold_match_probability=threshold,
    ).as_duckdbpyrelation()

    sql = """
    select count(*) from (
        select distinct cluster_id
        from single_res
    )
    """
    single_res_count = duckdb.sql(sql).fetchone()[0]

    sql = f"""
    select count(*) from (
        select distinct {cluster_col}
        from all_clusters_ddb
    )
    """
    all_clusters_count = duckdb.sql(sql).fetchone()[0]

    print(f"Threshold: {threshold}")
    print(f"Single res count: {single_res_count}")
    print(f"All clusters count: {all_clusters_count}")
    assert single_res_count == all_clusters_count
```
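As an extra cross-check (a sketch, not part of the PR's test suite), the same counts can be reproduced with networkx connected components. This assumes Splink retains edges whose match_probability is greater than or equal to the threshold:

```python
# Independent verification: the number of distinct clusters at a threshold
# should equal the number of connected components of the graph restricted
# to edges with match_probability >= threshold (assumed semantics)
for threshold in thresholds:
    kept = combined_edges[combined_edges["match_probability"] >= threshold]
    g = nx.Graph()
    g.add_nodes_from(combined_nodes["unique_id"])
    g.add_edges_from(zip(kept["unique_id_l"], kept["unique_id_r"]))
    print(threshold, nx.number_connected_components(g))
```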

@RobinL commented Oct 1, 2024

Chart of distinct cluster counts by threshold
```python
import altair as alt

from splink import DuckDBAPI, Linker, splink_datasets
from splink.internals.clustering import (
    cluster_pairwise_predictions_at_multiple_thresholds,
)
from splink.internals.misc import bayes_factor_to_prob, match_weight_to_bayes_factor

# Load the dataset and set up the DuckDB API
df = splink_datasets.historical_50k
df = df.reset_index()

# Replace unique_id with a sequential integer id, keeping an int32 copy as "index"
df = df.drop(columns=["unique_id"])
df["unique_id"] = df.index
df = df.drop(columns=["index"])
df["index"] = df.index.astype("int32")

db_api = DuckDBAPI(":default:")

# Create the first_name_surname_concat column
df["first_name_surname_concat"] = df["first_name"] + " " + df["surname"]

# Initialize the Linker
linker = Linker(df, "50k_model.json", db_api=db_api)

# Generate predictions
df_edges = linker.inference.predict(threshold_match_probability=0.01)


# Match weight thresholds from -10 to 9.5 in steps of 0.5
thresholds_mw = [i / 2 for i in range(-20, 20, 1)]

threshold_probs = [
    bayes_factor_to_prob(match_weight_to_bayes_factor(mw)) for mw in thresholds_mw
]
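# Note: a Splink match weight mw is log2 of the Bayes factor, so the
# conversion above is equivalent to p = 2**mw / (1 + 2**mw)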


# logging.getLogger("splink").setLevel(1)

# Cluster at multiple thresholds
all_clusters = cluster_pairwise_predictions_at_multiple_thresholds(
    df,
    df_edges,
    node_id_column_name="unique_id",
    db_api=db_api,
    match_probability_thresholds=threshold_probs,
    output_number_of_distinct_clusters_only=True,
)
all_clusters_ddb = all_clusters.as_duckdbpyrelation()


# Convert to a pandas DataFrame for charting
res = all_clusters_ddb.df()
res

# Create the Altair chart
chart = (
    alt.Chart(res)
    .mark_line(point=True)
    .encode(
        x=alt.X("threshold", title="Threshold"),
        y=alt.Y("distinct_clusters", title="Number of Distinct Clusters"),
        tooltip=["threshold", "distinct_clusters"],
    )
    .properties(title="Number of Distinct Clusters by Threshold", width=600, height=400)
)

# Display the chart
chart
```
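Outside a notebook the chart will not render on its own; it can be written to disk with Altair's save method (HTML output needs no extra dependencies; the filename here is illustrative):

```python
# Write the chart to a standalone interactive HTML file
chart.save("distinct_clusters_by_threshold.html")
```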

@RobinL commented Oct 3, 2024

Tractable example
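A minimal example small enough to verify by hand: five nodes joined in a chain by edges of increasing match probability, plus a separate connected pair, using a non-default node id column name (abc). The expected cluster counts are worked through after the code.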
```python
from splink import DuckDBAPI
from splink.internals.clustering import (
    cluster_pairwise_predictions_at_multiple_thresholds,
)

db_api = DuckDBAPI()


nodes = [
    {"abc": 1},
    {"abc": 2},
    {"abc": 3},
    {"abc": 4},
    {"abc": 5},
    {"abc": 6},
    {"abc": 7},
]

# A chain 1-2-3-4-5 with increasing match probabilities, plus a separate pair 6-7
edges = [
    {"abc_l": 1, "abc_r": 2, "match_probability": 0.45},
    {"abc_l": 2, "abc_r": 3, "match_probability": 0.55},
    {"abc_l": 3, "abc_r": 4, "match_probability": 0.65},
    {"abc_l": 4, "abc_r": 5, "match_probability": 0.75},
    {"abc_l": 6, "abc_r": 7, "match_probability": 0.9},
]


cc = cluster_pairwise_predictions_at_multiple_thresholds(
    nodes,
    edges,
    node_id_column_name="abc",
    db_api=db_api,
    match_probability_thresholds=[0.4, 0.5, 0.6, 0.99],
)

cc.as_duckdbpyrelation()
# Inspect which intermediate tables were materialised in the cache
for t in db_api._intermediate_table_cache:
    print(t)

dc = cluster_pairwise_predictions_at_multiple_thresholds(
    nodes,
    edges,
    node_id_column_name="abc",
    db_api=db_api,
    match_probability_thresholds=[0.4, 0.5, 0.6, 0.99],
    output_number_of_distinct_clusters_only=True,
)
dc.as_duckdbpyrelation()
for t in db_api._intermediate_table_cache:
    print(t)
```
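For reference, assuming edges are kept when match_probability is at or above the threshold, the expected distinct cluster counts here are: 2 at threshold 0.4 (components {1, 2, 3, 4, 5} and {6, 7}), 3 at 0.5 (the 1-2 edge drops out), 4 at 0.6 (the 2-3 edge also drops), and 7 at 0.99 (every node becomes a singleton).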

@RobinL merged commit 97481cf into master on Oct 3, 2024 · 25 checks passed
@RobinL deleted the cluster_multiple_thresholds_v2 branch on October 3, 2024 at 14:15
@RobinL commented Oct 14, 2024

Results from runs on a big dataset comparing cluster_pairwise_predictions_at_threshold to cluster_pairwise_predictions_at_multiple_thresholds

splink=4.0.4 (released version, i.e. the new algorithm). The three single-threshold tables below correspond to thresholds 0.8, 0.999, and 0.95 respectively, matching the rows of the combined summary table:

```
┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                 7957 │              29164679 │
└──────────────────────┴───────────────────────┘


┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                  585 │              30049917 │
└──────────────────────┴───────────────────────┘

┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                 1017 │              29345750 │
└──────────────────────┴───────────────────────┘



┌───────────┬──────────────┬──────────────────┬────────────────────┐
│ threshold │ num_clusters │ max_cluster_size │  avg_cluster_size  │
│   float   │    int64     │      int64       │       double       │
├───────────┼──────────────┼──────────────────┼────────────────────┤
│       0.8 │     29164679 │             7957 │  1.854098857045538 │
│      0.95 │     29345750 │             1017 │ 1.8426585791809718 │
│     0.999 │     30049917 │              585 │ 1.7994791133699304 │
└───────────┴──────────────┴──────────────────┴────────────────────┘
```

@RobinL commented Oct 14, 2024

Using the old algorithm (splink=4.0.3) we get the same results:

```
┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                 7957 │              29164679 │
└──────────────────────┴───────────────────────┘
┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                 1017 │              29345750 │
└──────────────────────┴───────────────────────┘
┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                  585 │              30049917 │
└──────────────────────┴───────────────────────┘
```
