
Cluster multiple thresholds v2 #2437

Merged · 16 commits merged into master on Oct 3, 2024

Conversation

@RobinL commented Sep 30, 2024

Supersedes #2414

@RobinL commented Oct 1, 2024

Testing approach
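The script below generates a random graph, assigns uniform random match probabilities to its edges, clusters at several thresholds in a single call, and asserts that the number of distinct clusters at each threshold matches the output of the existing single-threshold function: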
```python
import random

import duckdb
import networkx as nx
import numpy as np
import pandas as pd

from splink import DuckDBAPI
from splink.internals.clustering import (
    cluster_pairwise_predictions_at_multiple_thresholds,
    cluster_pairwise_predictions_at_threshold,
)

# ":default:" uses duckdb's default module-level connection, so plain
# duckdb.sql() calls below can query the results
db_api = DuckDBAPI(":default:")


def generate_random_graph(graph_size, seed=None):
    if not seed:
        seed = random.randint(5, 1000000)

    # Density 1/n gives an average degree of ~1, producing many small components
    density = 1 / graph_size
    print(f"Graph size: {graph_size}, Density: {density}")

    graph = nx.fast_gnp_random_graph(graph_size, density, seed=seed, directed=False)
    return graph


def nodes_and_edges_from_graph(G):
    edges = nx.to_pandas_edgelist(G)
    edges.columns = ["unique_id_l", "unique_id_r"]

    nodes = pd.DataFrame({"unique_id": list(G.nodes)})

    return nodes, edges


G = generate_random_graph(1_000)
combined_nodes, combined_edges = nodes_and_edges_from_graph(G)
combined_edges["match_probability"] = np.random.uniform(0, 1, len(combined_edges))


# Thresholds include near-duplicate values and the 0.0 and 1.0 extremes
thresholds = [0.0, 0.5, 0.501, 0.502, 0.53, 0.540, 1.0]


all_clusters = cluster_pairwise_predictions_at_multiple_thresholds(
    combined_nodes,
    combined_edges,
    node_id_column_name="unique_id",
    db_api=db_api,
    match_probability_thresholds=thresholds,
)


# The output has one cluster column per threshold; pair each with its threshold
cluster_cols = [c.name for c in all_clusters.columns if "cluster" in c.name]

zipped_clusters = list(zip(thresholds, cluster_cols))
zipped_clusters

all_clusters_ddb = all_clusters.as_duckdbpyrelation()

for threshold, cluster_col in zipped_clusters:
    print("--")
    single_res = cluster_pairwise_predictions_at_threshold(
        combined_nodes,
        combined_edges,
        node_id_column_name="unique_id",
        db_api=db_api,
        threshold_match_probability=threshold,
    ).as_duckdbpyrelation()

    sql = """
    select count(*) from (
        select distinct cluster_id
        from single_res
    )
    """
    single_res_count = duckdb.sql(sql).fetchone()[0]

    sql = f"""
    select count(*) from (
        select distinct {cluster_col}
        from all_clusters_ddb
    )
    """
    all_clusters_count = duckdb.sql(sql).fetchone()[0]

    print(f"Threshold: {threshold}")
    print(f"Single res count: {single_res_count}")
    print(f"All clusters count: {all_clusters_count}")
    assert single_res_count == all_clusters_count
```
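As an extra cross-check (a sketch, not part of the PR's test suite), the same counts can be reproduced with networkx connected components. This assumes Splink retains edges whose match_probability is greater than or equal to the threshold:

```python
# Independent verification: the number of distinct clusters at a threshold
# should equal the number of connected components of the graph restricted
# to edges with match_probability >= threshold (assumed semantics)
for threshold in thresholds:
    kept = combined_edges[combined_edges["match_probability"] >= threshold]
    g = nx.Graph()
    g.add_nodes_from(combined_nodes["unique_id"])
    g.add_edges_from(zip(kept["unique_id_l"], kept["unique_id_r"]))
    print(threshold, nx.number_connected_components(g))
```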

@RobinL commented Oct 1, 2024

Chart of distinct cluster counts by threshold
```python
import altair as alt

from splink import DuckDBAPI, Linker, splink_datasets
from splink.internals.clustering import (
    cluster_pairwise_predictions_at_multiple_thresholds,
)
from splink.internals.misc import bayes_factor_to_prob, match_weight_to_bayes_factor

# Load the dataset and set up the DuckDB API
df = splink_datasets.historical_50k
df = df.reset_index()

# Replace unique_id with a sequential integer id, keeping an int32 copy as "index"
df = df.drop(columns=["unique_id"])
df["unique_id"] = df.index
df = df.drop(columns=["index"])
df["index"] = df.index.astype("int32")

db_api = DuckDBAPI(":default:")

# Create the first_name_surname_concat column
df["first_name_surname_concat"] = df["first_name"] + " " + df["surname"]

# Initialize the Linker
linker = Linker(df, "50k_model.json", db_api=db_api)

# Generate predictions
df_edges = linker.inference.predict(threshold_match_probability=0.01)


# Match weight thresholds from -10 to 9.5 in steps of 0.5
thresholds_mw = [i / 2 for i in range(-20, 20, 1)]

threshold_probs = [
    bayes_factor_to_prob(match_weight_to_bayes_factor(mw)) for mw in thresholds_mw
]
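# Note: a Splink match weight mw is log2 of the Bayes factor, so the
# conversion above is equivalent to p = 2**mw / (1 + 2**mw)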


# logging.getLogger("splink").setLevel(1)

# Cluster at multiple thresholds
all_clusters = cluster_pairwise_predictions_at_multiple_thresholds(
    df,
    df_edges,
    node_id_column_name="unique_id",
    db_api=db_api,
    match_probability_thresholds=threshold_probs,
    output_number_of_distinct_clusters_only=True,
)
all_clusters_ddb = all_clusters.as_duckdbpyrelation()


# Convert to a pandas DataFrame for charting
res = all_clusters_ddb.df()
res

# Create the Altair chart
chart = (
    alt.Chart(res)
    .mark_line(point=True)
    .encode(
        x=alt.X("threshold", title="Threshold"),
        y=alt.Y("distinct_clusters", title="Number of Distinct Clusters"),
        tooltip=["threshold", "distinct_clusters"],
    )
    .properties(title="Number of Distinct Clusters by Threshold", width=600, height=400)
)

# Display the chart
chart
```
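Outside a notebook the chart will not render on its own; it can be written to disk with Altair's save method (HTML output needs no extra dependencies; the filename here is illustrative):

```python
# Write the chart to a standalone interactive HTML file
chart.save("distinct_clusters_by_threshold.html")
```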

@RobinL commented Oct 3, 2024

Tractable example
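A minimal example small enough to verify by hand: five nodes joined in a chain by edges of increasing match probability, plus a separate connected pair, using a non-default node id column name (abc). The expected cluster counts are worked through after the code.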
```python
from splink import DuckDBAPI
from splink.internals.clustering import (
    cluster_pairwise_predictions_at_multiple_thresholds,
)

db_api = DuckDBAPI()


nodes = [
    {"abc": 1},
    {"abc": 2},
    {"abc": 3},
    {"abc": 4},
    {"abc": 5},
    {"abc": 6},
    {"abc": 7},
]

# A chain 1-2-3-4-5 with increasing match probabilities, plus a separate pair 6-7
edges = [
    {"abc_l": 1, "abc_r": 2, "match_probability": 0.45},
    {"abc_l": 2, "abc_r": 3, "match_probability": 0.55},
    {"abc_l": 3, "abc_r": 4, "match_probability": 0.65},
    {"abc_l": 4, "abc_r": 5, "match_probability": 0.75},
    {"abc_l": 6, "abc_r": 7, "match_probability": 0.9},
]


cc = cluster_pairwise_predictions_at_multiple_thresholds(
    nodes,
    edges,
    node_id_column_name="abc",
    db_api=db_api,
    match_probability_thresholds=[0.4, 0.5, 0.6, 0.99],
)

cc.as_duckdbpyrelation()
# Inspect which intermediate tables were materialised in the cache
for t in db_api._intermediate_table_cache:
    print(t)

dc = cluster_pairwise_predictions_at_multiple_thresholds(
    nodes,
    edges,
    node_id_column_name="abc",
    db_api=db_api,
    match_probability_thresholds=[0.4, 0.5, 0.6, 0.99],
    output_number_of_distinct_clusters_only=True,
)
dc.as_duckdbpyrelation()
for t in db_api._intermediate_table_cache:
    print(t)
```
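For reference, assuming edges are kept when match_probability is at or above the threshold, the expected distinct cluster counts here are: 2 at threshold 0.4 (components {1, 2, 3, 4, 5} and {6, 7}), 3 at 0.5 (the 1-2 edge drops out), 4 at 0.6 (the 2-3 edge also drops), and 7 at 0.99 (every node becomes a singleton).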

@RobinL merged commit 97481cf into master on Oct 3, 2024 · 25 checks passed
@RobinL deleted the cluster_multiple_thresholds_v2 branch on October 3, 2024 at 14:15
@RobinL commented Oct 14, 2024

Results from runs on a big dataset comparing cluster_pairwise_predictions_at_threshold to cluster_pairwise_predictions_at_multiple_thresholds

splink=4.0.4 (released version, i.e. the new algorithm). The three single-threshold tables below correspond to thresholds 0.8, 0.999, and 0.95 respectively, matching the rows of the combined summary table:

```
┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                 7957 │              29164679 │
└──────────────────────┴───────────────────────┘


┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                  585 │              30049917 │
└──────────────────────┴───────────────────────┘

┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                 1017 │              29345750 │
└──────────────────────┴───────────────────────┘



┌───────────┬──────────────┬──────────────────┬────────────────────┐
│ threshold │ num_clusters │ max_cluster_size │  avg_cluster_size  │
│   float   │    int64     │      int64       │       double       │
├───────────┼──────────────┼──────────────────┼────────────────────┤
│       0.8 │     29164679 │             7957 │  1.854098857045538 │
│      0.95 │     29345750 │             1017 │ 1.8426585791809718 │
│     0.999 │     30049917 │              585 │ 1.7994791133699304 │
└───────────┴──────────────┴──────────────────┴────────────────────┘
```

@RobinL commented Oct 14, 2024

Using the old algorithm (splink=4.0.3) we get the same results:

```
┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                 7957 │              29164679 │
└──────────────────────┴───────────────────────┘
┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                 1017 │              29345750 │
└──────────────────────┴───────────────────────┘
┌──────────────────────┬───────────────────────┐
│ largest_cluster_size │ num_distinct_clusters │
│        int64         │         int64         │
├──────────────────────┼───────────────────────┤
│                  585 │              30049917 │
└──────────────────────┴───────────────────────┘
```
