5 rows × 1 columns
\n", - "[5 rows x 1 columns in total]" + "" ], "text/plain": [ - " consumer_complaint_narrative\n", - "2 COLLECTION BUREAU OF AMERICA ACCOUNT NO. XXXX...\n", - "3 Despite multiple written requests, the unverif...\n", - "6 Once again you guys have not provided me with ...\n", - "9 XX/XX/XXXX {$350.00} I received a outstating d...\n", - "10 Im am unable to withdraw money from my account...\n", - "\n", - "[5 rows x 1 columns]" + " consumer_complaint_narrative\n", + "24 I sent disputed to Transunion, XXXX and XXXX f...\n", + "942 on XX/XX/2017 I sent XXXX, transunion, XXXX pr...\n", + "1193 On Wednesday, XXXX XXXX , I initiated a wir...\n", + "1292 Dear Sir or Madam, I am a victim of identity t...\n", + "1377 For the purpose of this complaint, I will refe..." ] }, "execution_count": 7, @@ -390,7 +375,7 @@ ], "source": [ "issues_df = input_df[[\"consumer_complaint_narrative\"]].dropna()\n", - "issues_df.head(n=5) # View the first five complaints" + "issues_df.peek(n=5) # View an arbitrary five complaints" ] }, { @@ -433,7 +418,7 @@ { "data": { "text/html": [ - "Query job bd6b88fc-6e05-4d71-acb1-d5befaced079 is DONE. 0 Bytes processed. Open Job" + "Query job 15b352c2-783c-42b1-bc03-e5772f00381a is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "\n", + " | text_embedding | \n", + "statistics | \n", + "ml_embed_text_status | \n", + "content | \n", + "
---|---|---|---|---|
545 | \n", + "[ 1.82510037e-02 -1.27867460e-02 -1.57095697e-... | \n", + "{\"token_count\":178,\"truncated\":false} | \n", + "\n", + " | My payments have been approximately {$89.00} w... | \n", + "
614 | \n", + "[ 5.40032536e-02 -5.28502129e-02 -5.33268750e-... | \n", + "{\"token_count\":399,\"truncated\":false} | \n", + "\n", + " | Hi, I have contacted Trans Union XXXX XXXX abo... | \n", + "
1236 | \n", + "[-5.32836001e-03 -5.84292673e-02 -5.86670786e-... | \n", + "{\"token_count\":129,\"truncated\":false} | \n", + "\n", + " | I have a XXXX XXXX XXXX credit card on my Exp... | \n", + "
1477 | \n", + "[ 3.02605387e-02 -4.37121317e-02 -2.70802993e-... | \n", + "{\"token_count\":16,\"truncated\":false} | \n", + "\n", + " | Wrongs information, selling my information to ... | \n", + "
2261 | \n", + "[ 2.35723313e-02 -3.73509154e-02 -6.44604117e-... | \n", + "{\"token_count\":33,\"truncated\":false} | \n", + "\n", + " | Please investigate and delete disputed item th... | \n", + "
2361 | \n", + "[ 1.04440488e-02 -9.37070698e-03 -7.36323372e-... | \n", + "{\"token_count\":45,\"truncated\":false} | \n", + "\n", + " | By the provisions of the Fair Credit Reporting... | \n", + "
2378 | \n", + "[ 3.04989032e-02 -4.08191867e-02 -6.18648790e-... | \n", + "{\"token_count\":892,\"truncated\":false} | \n", + "\n", + " | Since XX/XX/XXXX I have been trying to dispute... | \n", + "
3133 | \n", + "[ 0.00152804 -0.04189068 -0.04220504 -0.053740... | \n", + "{\"token_count\":90,\"truncated\":false} | \n", + "\n", + " | Out of the blue I received a debt collection n... | \n", + "
3140 | \n", + "[ 3.11435573e-02 -4.44000624e-02 -2.10917685e-... | \n", + "{\"token_count\":372,\"truncated\":false} | \n", + "\n", + " | My wife and I have been sending money to XXXX ... | \n", + "
3322 | \n", + "[ 2.75927987e-02 -6.23729872e-03 -3.83295454e-... | \n", + "{\"token_count\":36,\"truncated\":false} | \n", + "\n", + " | Phone calls from Convergent Outsourcing XXXX. ... | \n", + "
3583 | \n", + "[ 9.20385588e-03 -3.83387171e-02 -6.46291822e-... | \n", + "{\"token_count\":52,\"truncated\":false} | \n", + "\n", + " | I recently received a copy of my credit report... | \n", + "
4134 | \n", + "[-7.04960374e-04 -3.52595337e-02 -1.65264793e-... | \n", + "{\"token_count\":412,\"truncated\":false} | \n", + "\n", + " | I have been sending the creditor what they hav... | \n", + "
4496 | \n", + "[ 3.67735326e-02 1.21120387e-03 -5.20942472e-... | \n", + "{\"token_count\":182,\"truncated\":false} | \n", + "\n", + " | This is my second complaint. Their response to... | \n", + "
5260 | \n", + "[ 2.07133405e-02 -1.69602726e-02 -5.07124476e-... | \n", + "{\"token_count\":103,\"truncated\":false} | \n", + "\n", + " | XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte... | \n", + "
5400 | \n", + "[ 1.44114876e-02 -2.34710164e-02 -6.58538565e-... | \n", + "{\"token_count\":60,\"truncated\":false} | \n", + "\n", + " | Upon checking my XXXX credit report I noticed ... | \n", + "
5425 | \n", + "[ 3.10326386e-02 -2.19427086e-02 -6.56386837e-... | \n", + "{\"token_count\":87,\"truncated\":false} | \n", + "\n", + " | Follow up to previous complaint XXXX XXXX XXXX... | \n", + "
6014 | \n", + "[ 1.90773793e-02 -2.27493346e-02 -3.27166244e-... | \n", + "{\"token_count\":175,\"truncated\":false} | \n", + "\n", + " | My new XXXX lease was over always paid on time... | \n", + "
8192 | \n", + "[ 0.01937891 -0.05466933 -0.06070872 -0.059028... | \n", + "{\"token_count\":131,\"truncated\":false} | \n", + "\n", + " | I have no idea where this account cane from. B... | \n", + "
8240 | \n", + "[ 4.34123818e-03 -3.40953320e-02 -4.06381376e-... | \n", + "{\"token_count\":87,\"truncated\":false} | \n", + "\n", + " | I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F... | \n", + "
8720 | \n", + "[ 0.03133732 -0.03972461 -0.00178199 -0.035876... | \n", + "{\"token_count\":645,\"truncated\":false} | \n", + "\n", + " | XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum... | \n", + "
8914 | \n", + "[ 1.75969116e-02 -2.25022305e-02 -5.70390299e-... | \n", + "{\"token_count\":180,\"truncated\":false} | \n", + "\n", + " | On XX/XX/21 I sent a letter regarding inaccura... | \n", + "
10021 | \n", + "[ 5.02460636e-02 -5.25112189e-02 -4.12914790e-... | \n", + "{\"token_count\":30,\"truncated\":false} | \n", + "\n", + " | XX/XX/XXXX and XX/XX/XXXX inaccurate informati... | \n", + "
10327 | \n", + "[-0.00979626 -0.04912931 -0.08654705 -0.021063... | \n", + "{\"token_count\":194,\"truncated\":false} | \n", + "\n", + " | When I reviewed my credit report, I discovered... | \n", + "
10345 | \n", + "[-0.04292191 -0.02636929 -0.06177032 -0.076520... | \n", + "{\"token_count\":262,\"truncated\":false} | \n", + "\n", + " | U.S. Bank sent two letters containing Visa Deb... | \n", + "
10369 | \n", + "[ 2.16020197e-02 -5.62509745e-02 -5.93873672e-... | \n", + "{\"token_count\":77,\"truncated\":false} | \n", + "\n", + " | I requested from XXXX that they reverse the la... | \n", + "
25 rows × 4 columns
\n", + "5 rows × 4 columns
\n", - "[5 rows x 4 columns in total]" + "25 rows × 4 columns
\n", + "[10000 rows x 4 columns in total]" ], "text/plain": [ - " text_embedding \\\n", - "251 [ 2.20562406e-02 -3.51827666e-02 7.63384486e-... \n", - "300 [ 0.01977486 -0.04289974 -0.05289588 -0.027267... \n", - "414 [ 1.37719307e-02 -4.15441953e-02 -7.81692266e-... \n", - "493 [ 4.48844060e-02 -1.40293539e-02 -3.46709713e-... \n", - "545 [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-... \n", + " text_embedding \\\n", + "545 [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-... \n", + "614 [ 5.40032536e-02 -5.28502129e-02 -5.33268750e-... \n", + "1236 [-5.32836001e-03 -5.84292673e-02 -5.86670786e-... \n", + "1477 [ 3.02605387e-02 -4.37121317e-02 -2.70802993e-... \n", + "2261 [ 2.35723313e-02 -3.73509154e-02 -6.44604117e-... \n", + "2361 [ 1.04440488e-02 -9.37070698e-03 -7.36323372e-... \n", + "2378 [ 3.04989032e-02 -4.08191867e-02 -6.18648790e-... \n", + "3133 [ 0.00152804 -0.04189068 -0.04220504 -0.053740... \n", + "3140 [ 3.11435573e-02 -4.44000624e-02 -2.10917685e-... \n", + "3322 [ 2.75927987e-02 -6.23729872e-03 -3.83295454e-... \n", + "3583 [ 9.20385588e-03 -3.83387171e-02 -6.46291822e-... \n", + "4134 [-7.04960374e-04 -3.52595337e-02 -1.65264793e-... \n", + "4496 [ 3.67735326e-02 1.21120387e-03 -5.20942472e-... \n", + "5260 [ 2.07133405e-02 -1.69602726e-02 -5.07124476e-... \n", + "5400 [ 1.44114876e-02 -2.34710164e-02 -6.58538565e-... \n", + "5425 [ 3.10326386e-02 -2.19427086e-02 -6.56386837e-... \n", + "6014 [ 1.90773793e-02 -2.27493346e-02 -3.27166244e-... \n", + "8192 [ 0.01937891 -0.05466933 -0.06070872 -0.059028... \n", + "8240 [ 4.34123818e-03 -3.40953320e-02 -4.06381376e-... \n", + "8720 [ 0.03133732 -0.03972461 -0.00178199 -0.035876... \n", + "8914 [ 1.75969116e-02 -2.25022305e-02 -5.70390299e-... \n", + "10021 [ 5.02460636e-02 -5.25112189e-02 -4.12914790e-... \n", + "10327 [-0.00979626 -0.04912931 -0.08654705 -0.021063... \n", + "10345 [-0.04292191 -0.02636929 -0.06177032 -0.076520... \n", + "10369 [ 2.16020197e-02 -5.62509745e-02 -5.93873672e-... \n", "\n", - " statistics ml_embed_text_status \\\n", - "251 {\"token_count\":145,\"truncated\":false} \n", - "300 {\"token_count\":498,\"truncated\":false} \n", - "414 {\"token_count\":263,\"truncated\":false} \n", - "493 {\"token_count\":395,\"truncated\":false} \n", - "545 {\"token_count\":178,\"truncated\":false} \n", + " statistics ml_embed_text_status \\\n", + "545 {\"token_count\":178,\"truncated\":false} \n", + "614 {\"token_count\":399,\"truncated\":false} \n", + "1236 {\"token_count\":129,\"truncated\":false} \n", + "1477 {\"token_count\":16,\"truncated\":false} \n", + "2261 {\"token_count\":33,\"truncated\":false} \n", + "2361 {\"token_count\":45,\"truncated\":false} \n", + "2378 {\"token_count\":892,\"truncated\":false} \n", + "3133 {\"token_count\":90,\"truncated\":false} \n", + "3140 {\"token_count\":372,\"truncated\":false} \n", + "3322 {\"token_count\":36,\"truncated\":false} \n", + "3583 {\"token_count\":52,\"truncated\":false} \n", + "4134 {\"token_count\":412,\"truncated\":false} \n", + "4496 {\"token_count\":182,\"truncated\":false} \n", + "5260 {\"token_count\":103,\"truncated\":false} \n", + "5400 {\"token_count\":60,\"truncated\":false} \n", + "5425 {\"token_count\":87,\"truncated\":false} \n", + "6014 {\"token_count\":175,\"truncated\":false} \n", + "8192 {\"token_count\":131,\"truncated\":false} \n", + "8240 {\"token_count\":87,\"truncated\":false} \n", + "8720 {\"token_count\":645,\"truncated\":false} \n", + "8914 {\"token_count\":180,\"truncated\":false} \n", + "10021 {\"token_count\":30,\"truncated\":false} \n", + "10327 {\"token_count\":194,\"truncated\":false} \n", + "10345 {\"token_count\":262,\"truncated\":false} \n", + "10369 {\"token_count\":77,\"truncated\":false} \n", "\n", - " content \n", - "251 A purse was purchased from XXXX XXXX on XX/XX/... \n", - "300 XXXX XXXX XXXXXXXX has reported on my credit r... \n", - "414 I have tried to dispute US BKPT CT TX XXXXXXXX... \n", - "493 Discover Student Loan has been holding onto {$... \n", - "545 My payments have been approximately {$89.00} w... \n", + " content \n", + "545 My payments have been approximately {$89.00} w... \n", + "614 Hi, I have contacted Trans Union XXXX XXXX abo... \n", + "1236 I have a XXXX XXXX XXXX credit card on my Exp... \n", + "1477 Wrongs information, selling my information to ... \n", + "2261 Please investigate and delete disputed item th... \n", + "2361 By the provisions of the Fair Credit Reporting... \n", + "2378 Since XX/XX/XXXX I have been trying to dispute... \n", + "3133 Out of the blue I received a debt collection n... \n", + "3140 My wife and I have been sending money to XXXX ... \n", + "3322 Phone calls from Convergent Outsourcing XXXX. ... \n", + "3583 I recently received a copy of my credit report... \n", + "4134 I have been sending the creditor what they hav... \n", + "4496 This is my second complaint. Their response to... \n", + "5260 XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte... \n", + "5400 Upon checking my XXXX credit report I noticed ... \n", + "5425 Follow up to previous complaint XXXX XXXX XXXX... \n", + "6014 My new XXXX lease was over always paid on time... \n", + "8192 I have no idea where this account cane from. B... \n", + "8240 I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F... \n", + "8720 XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum... \n", + "8914 On XX/XX/21 I sent a letter regarding inaccura... \n", + "10021 XX/XX/XXXX and XX/XX/XXXX inaccurate informati... \n", + "10327 When I reviewed my credit report, I discovered... \n", + "10345 U.S. Bank sent two letters containing Visa Deb... \n", + "10369 I requested from XXXX that they reverse the la... \n", + "...\n", "\n", - "[5 rows x 4 columns]" + "[10000 rows x 4 columns]" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Will take ~3 minutes to compute the embeddings\n", - "predicted_embeddings = model.predict(downsampled_issues_df)\n", - "# Notice the lists of numbers that are our text embeddings for each complaint\n", - "predicted_embeddings.head() " + "successful_rows = (\n", + " (predicted_embeddings[\"ml_embed_text_status\"] == \"\")\n", + " # Series.str.len() gives the length of an array.\n", + " # See: https://stackoverflow.com/a/41340543/101923\n", + " & (predicted_embeddings[\"text_embedding\"].str.len() != 0)\n", + ")\n", + "predicted_embeddings = predicted_embeddings[successful_rows]\n", + "predicted_embeddings\n" ] }, { @@ -653,7 +1185,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "id": "AhNTnEC5FRz2" }, @@ -674,7 +1206,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "id": "6poSxh-fGJF7" }, @@ -682,7 +1214,7 @@ { "data": { "text/html": [ - "Query job 37f432dd-9ed7-4bbd-adc1-f33b8cbab33a is DONE. 61.5 MB processed. Open Job" + "Query job fa4bbc13-3831-4c80-9b59-9939e605ed58 is DONE. 61.7 MB processed. Open Job" ], "text/plain": [ "5 rows × 6 columns
\n", - "[5 rows x 6 columns in total]" + "" ], "text/plain": [ - " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", - "251 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.534540549592... \n", - "300 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.437379245910... \n", - "414 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.482813493921... \n", - "493 9 [{'CENTROID_ID': 9, 'DISTANCE': 0.561752335987... \n", - "545 9 [{'CENTROID_ID': 9, 'DISTANCE': 0.540487926907... \n", + " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", + "182250 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.570560301900... \n", + "3023485 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.523572693768... \n", + "407254 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.515173566816... \n", + "1509454 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.645342721754... \n", + "2357848 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.519872186251... \n", "\n", - " text_embedding \\\n", - "251 [ 2.20562406e-02 -3.51827666e-02 7.63384486e-... \n", - "300 [ 0.01977486 -0.04289974 -0.05289588 -0.027267... \n", - "414 [ 1.37719307e-02 -4.15441953e-02 -7.81692266e-... \n", - "493 [ 4.48844060e-02 -1.40293539e-02 -3.46709713e-... \n", - "545 [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-... \n", + " text_embedding \\\n", + "182250 [ 4.70298417e-02 -4.08669300e-02 -2.99868709e-... \n", + "3023485 [ 1.55437263e-02 -1.93240177e-02 -2.48466972e-... \n", + "407254 [-0.01293471 -0.01959546 -0.02238463 -0.066214... \n", + "1509454 [ 3.21860723e-02 -2.67103072e-02 -4.78175096e-... \n", + "2357848 [-1.88122243e-02 -2.68064123e-02 -4.69480827e-... \n", "\n", - " statistics ml_embed_text_status \\\n", - "251 {\"token_count\":145,\"truncated\":false} \n", - "300 {\"token_count\":498,\"truncated\":false} \n", - "414 {\"token_count\":263,\"truncated\":false} \n", - "493 {\"token_count\":395,\"truncated\":false} \n", - "545 {\"token_count\":178,\"truncated\":false} \n", + " statistics ml_embed_text_status \\\n", + "182250 {\"token_count\":10,\"truncated\":false} \n", + "3023485 {\"token_count\":10,\"truncated\":false} \n", + "407254 {\"token_count\":10,\"truncated\":false} \n", + "1509454 {\"token_count\":10,\"truncated\":false} \n", + "2357848 {\"token_count\":10,\"truncated\":false} \n", "\n", - " content \n", - "251 A purse was purchased from XXXX XXXX on XX/XX/... \n", - "300 XXXX XXXX XXXXXXXX has reported on my credit r... \n", - "414 I have tried to dispute US BKPT CT TX XXXXXXXX... \n", - "493 Discover Student Loan has been holding onto {$... \n", - "545 My payments have been approximately {$89.00} w... \n", - "\n", - "[5 rows x 6 columns]" + " content \n", + "182250 These are not my accounts. Please remove them. \n", + "3023485 This debt is not mine due to identity theft. \n", + "407254 I do not owe this company money!!!!! \n", + "1509454 VIOLATES HIPPA AND CRA \n", + "2357848 Receive numerous phone calls. I have no debt. " ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -886,7 +1391,7 @@ "clustered_result = cluster_model.predict(predicted_embeddings)\n", "# Notice the CENTROID_ID column, which is the ID number of the group that\n", "# each complaint belongs to.\n", - "clustered_result.head(n=5)" + "clustered_result.peek(n=5)" ] }, { @@ -904,7 +1409,7 @@ "id": "21rNsFMHo8hO" }, "source": [ - "## Step 3: Use PaLM2 LLM model to summarize complaint clusters" + "## Step 3: Use Gemini to summarize complaint clusters" ] }, { @@ -917,7 +1422,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "id": "2E7wXM_jGqo6" }, @@ -925,7 +1430,7 @@ { "data": { "text/html": [ - "Query job 84f95981-01c7-49ca-a10c-5842f07d867f is DONE. 10.6 MB processed. Open Job" + "Query job 85ead687-4ba9-44bf-88da-23a066f45960 is DONE. 10.7 MB processed. Open Job" ], "text/plain": [ "