-
Notifications
You must be signed in to change notification settings - Fork 12
/
name_match.pl
418 lines (350 loc) · 15.4 KB
/
name_match.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
#!/usr/bin/perl
# Copyright 2009 CorpWatch.org
# San Francisco, CA | 94110, USA | Tel: +1-415-641-1633
# Developed by Greg Michalec and Skye Bender-deMoll
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#-------------------------------
# This code is used to generate match scores for sets of company names in the database. It is used to match company names associated with stock ticker codes, fortune 500 companies, and croc company names
#---------------------------------
require 'common.pl';
#use Text::JaroWinkler qw( strcmp95 );
use Data::Dumper;
use Parallel::ForkManager;
select(STDOUT); $| = 1; #unbuffer STDOUT
$match_table = '_croc_company_matches'; #where the output goes
$match_source = 'cw_companies'; #what we match against: 'cw_companies' 'cik_names' 'relation_names' 'filer_names'
$match_keep_level = 75; # only put matches in db if bigger than this
$efficient_matching = 0; #only match in one direction based on id
$match_locations = 0; #include location informtion in the match scores
print "Matching against ".$match_source." saving results with score above ".$match_keep_level." into table ".$match_table."\n";
#get the sets of names we are gonna be comparing
#$namequeries[1] = "select ucase(name),id from fortune1000";
#$namequeries[1] = "select lcase(clean_company),relationship_id as id,null,null from relationships";
$namequeries[1] = "select lcase(croc_company_name), row_id as id from croc_companies";
#$namequeries[2] = "select name,cw_id as id from company_names where source = 'filer_match_name' or source = 'relationships_clean_company' ";
our $db;
%stopterms = &get_stopterms();
$db->do("DROP TABLE IF EXISTS `$match_table`");
$db->do("CREATE TABLE `$match_table` ( `id` int(11) NOT NULL auto_increment, `name1` varchar(255) default NULL, `name2` varchar(255) default NULL, `score` decimal(5,2) default NULL, id_a varchar(25), id_b varchar(25),`match_type` varchar(10), `match` int(1) default 0, PRIMARY KEY (`id`), KEY `id1` (`name1`), KEY `id2` (`name2`), KEY `score` (`score`))");
#load in the single word frequency rates
my $weights = $db->selectall_hashref("select lcase(word) as word, weight from word_freq", 'word');
#load in the bigram frequency rates
my $bi_weights = $db->selectall_hashref("select bigram, weight from bigram_freq", 'bigram');
if ($match_locations) {
print "loading country frequencies...\n";
#load in the country codes and scores
$country_weights = $db->selectall_hashref("select country_code, 1/(log(count(*))+1) score from (select country_code, subdiv_code from company_locations group by cw_id,country_code) locs group by country_code ","country_code");
#subdive counts (and scores) are conditional on country scores: "US:DE"
$subdiv_weights = $db->selectall_hashref("select concat(country_code,':', subdiv_code) code, 1/(log(count(*))+1) score from (select country_code, subdiv_code from company_locations group by cw_id,country_code) locs where subdiv_code is not null group by country_code,subdiv_code","code");
}
my $clean;
#foreach ('1', '2') {
foreach ('1') {
my $set = $_;
print "Fetching names $set...\n";
#$query = "select ucase(name) from person_names where (sourcetable= 'facultylist' or sourcetable='courtesyapps') group by ucase(name) order by name";
#$query = "select ucase(name) from person_names group by ucase(name) order by name";
my $sth = $db->prepare($namequeries[$set]) || print "$DBI->errstr\n";
$sth->execute() || print "$DBI->errstr\n".$db->errstr;
while (my $row = $sth->fetchrow_arrayref) {
#clean out puncuation for matching
$row->[0] = &clean_for_match($row->[0]);
$names = {name=>$row->[0], id=>$row->[1], country_code=>$row->[2], subdiv_code=>$row->[3]};
push(@{$clean->[$set]}, $names);
}
}
#print Data::Dumper::Dumper($clean);
print "Finding matches...\n";
#prepare a statment for inserting matches into db.
my $count = 0;
my $y = 0;
my $listsize = scalar(@{$clean->[1]});
my $time = time();
print "$count/$listsize\n";
my $matches;
my $percent = int($#{$clean->[1]}/2000);
$db->disconnect;
$manager = new Parallel::ForkManager( 5 );
$manager->run_on_finish(
sub { $count++; }
);
#---------------------NAME MATCHING LOOP------------------
#loop over the names, computing match score for each.
foreach my $names (@{$clean->[1]}) {
#tracker to print out percent done
if ($y == $percent) {
print "\r".$count/$listsize*100 ."% (";
my $ntime = time() - $time;
print "$ntime)";
$y = 0;
$time = time();
}
$y++;
#get the name off the list
my $pid = $manager->start and next;
print ".";
my $name1 = ${$names}{name};
$db = &dbconnect();
#set up a query to get the a corresponding subset of names to match against
#print "getting matchlist for ".$name1." (id: ".${$names}{id}.")\n";
my @match_subset;
my $query = &name_subset_query($name1,$match_source);
#print ($query."\n");
#my $sth3 = $db->prepare($query) || print "$DBI->errstr\n";
#$sth3->execute() || print "$DBI->errstr\n".$db->errstr;
#load the query results into array (assuming they already cleaned)
# while (my $row = $db->selectrow_arrayref($query)) {
# $record = {name=>$row->[0], id=>$row->[1],country_code=>$row->[2],subdiv_code=>$row->[3]};
# push(@match_subset, $record);
# }
# if ($db->errstr){print $db->errstr."\n";}
#print " comparing to ".scalar(@match_subset)." names.\n";
my $sth = $db->prepare($query);
$sth->execute;
#print "comparing ".$name1." (id: ".${$names}{id}.") against ".$sth->rows." names:\n";
while (my $row = $sth->fetchrow_arrayref) {
$names2 = {name=>$row->[0], id=>$row->[1],country_code=>$row->[2],subdiv_code=>$row->[3]};
#if (${$names}{primary_id} == ${$names2}{primary_id}) { next; }
my $name2 = ${$names2}{name};
#$matches->{$name1}->{$name2}->{'count'}++;
#print "$name1 (".${$names}{id}." ".${$names}{country_code}.":".${$names}{subdiv_code}.")\t$name2 (".${$names2}{id}." ".${$names2}{country_code}.":".${$names2}{subdiv_code}.")\n";
#------ location matching (if we are doing it
my $loc_score = 0;
if ($match_locations){
$loc_score = &location_match_score(${$names}{country_code}, ${$names}{subdiv_code}, ${$names2}{country_code},${$names2}{subdiv_code});
}
#----- bigram matching
#if matching efficiently, only match pair in one direction
unless ($efficient_matching && ${$names}{id} > ${$names2}{id}) {
my $match = 0;
if ($name1 eq $name2) {
$match = 100;
} else {
$match = &get_bigram_score(${$names}{name}, ${$names2}{name});
$match *=100;
unless ($match) { $match = 0; }
}
$match += $loc_score; #add in the location score
#print "\t".$name2." (bigram): ".$match."\n";
#if the match is above a threshold, insert in db
if ($match > $match_keep_level) {
my $sth = $db->prepare_cached("insert into $match_table (name1, name2, score, id_a, id_b,match_type) values (?,?,?,?,?,?)");
$sth->execute($name1, $name2, $match, ${$names}{id}, ${$names2}{id},"bigram_".$match_locations);
}
#print "\tbigram:$match";
} #else { print "dupe\n"; }
# ----------- term frequency matching
#if matching efficiently, only match pair in one direction
unless ($efficient_matching && ${$names}{id} > ${$names2}{id}) {
my $match = 0;
if ($name1 eq $name2) {
$match = 100;
} else {
$match = &get_term_score(${$names}{name}, ${$names2}{name});
$match *=100;
unless ($match) { $match = 0; }
}
$match += $loc_score; #add in the location score
#print "\t".$name2." (term_freq): ".$match."\n";
#if the match is above a threshold, insert in db
if ($match > $match_keep_level) {
my $sth = $db->prepare_cached("insert into $match_table (name1, name2, score, id_a, id_b,match_type) values (?,?,?,?,?,?)");
$sth->execute($name1, $name2, $match, ${$names}{id}, ${$names2}{id},"term_".$match_locations);
}
#print " term_freq:$match";
} #else { print "dupe\n"; }
}
$manager->finish;
#print total_size($clean)."\t".total_size($matches)."\t".total_size($sth)."\n";
#print "=============================\n";
}
#$db->do("insert into $match_table select null, name2, name1, score from $match_table");
print "\nDone.\n";
exit;
#compute a match score based on the intersecting set of terms, weighted by their observed frequency in our set of names
sub get_term_score() {
my ($comp1, $comp2) = @_;
my $score = 0;
my $no_match_weight = 0.2;
my @tokens1 = split(/ /,$comp1);
my @tokens2 = split(/ /,$comp2);
#//score= 2*(sum score tokens in comp) / (sum score comp1)+(sum score comp2);
my $sum1 =0;
my $sum2 =0;
my $sumBoth = 0;
foreach my $token (@tokens1){
#if it has a weight, it is at least somewhat common
if (defined $weights->{$token}){
$sum1 += $weights->{$token}->{weight};
} else {
#//it didn't show up the the db, so weight it as $no_match_weight
$sum1 += $no_match_weight ;
}
#//check if it is in the other company set
if (grep {$_ eq $token } @tokens2){
if (defined $weights->{$token}){
$sumBoth += $weights->{$token}->{weight} * 2;
} else {
$sumBoth += $no_match_weight*2;
}
}
}
#// now compute weight for 2nd company
foreach my $token (@tokens2){
if (defined $weights->{$token}){
$sum2 += $weights->{$token}->{weight};
} else {
#since it is not in our list of common tokens, assume it is rare
$sum2 += $no_match_weight ;
}
}
$score = $sumBoth/($sum1+$sum2);
#debug
#print($comp1." $comp2 : $sumBoth $sum1 $sum2 = $score \n");
return $score;
}
#compute a match score based on the frequency of bigram occurances observed in our set of names
sub get_bigram_score() {
my ($comp1, $comp2) = @_;
my $score = 0;
my $no_match_weight = 0.2;
#how to handle names that have just a single term?
my @tokens1 = &list_bigrams($comp1);
my @tokens2 = &list_bigrams($comp2);
#Score is compute by comparing the weights of the matching tokens to the weights of each name's tokens:
#score= 2*(sum score tokens in comp) / (sum score comp1)+(sum score comp2);
my $sum1 =0;
my $sum2 =0;
my $sumBoth = 0;
foreach my $token (@tokens1){
#if it has a weight, it is at least somewhat common
if (defined $bi_weights->{$token}){
$sum1 += $bi_weights->{$token}->{weight};
} else {
#//it didn't show up the the db, so weight it as $no_match_weight
$sum1 += $no_match_weight ;
}
#//check if it is in the other company set
if (grep {$_ eq $token } @tokens2){
if (defined $bi_weights->{$token}){
$sumBoth += $bi_weights->{$token}->{weight} * 2;
} else {
$sumBoth += $no_match_weight*2;
}
}
}
#// now compute weight for 2nd company
foreach my $token (@tokens2){
if (defined $bi_weights->{$token}){
$sum2 += $bi_weights->{$token}->{weight};
} else {
#since it is not in our list of common tokens, assume it is rare
$sum2 += $no_match_weight ;
}
}
#deal with case to avoid divide by zero
if ($sum1+$sum2 > 0){
$score = $sumBoth/($sum1+$sum2);
}
return $score;
}
#create a query that will return a subset of names that match at least one term each. This costs some mysql query time, but makes it so we are only matching aginst 100 or a few thousand names instead of tens of thousands.
sub name_subset_query() {
my $comp1 = $_[0]; #the company name that will be matched.
my $match_set = $_[1]; #this determines what nameset we should match against
#need to escape quotes in company name for db
$comp1 = &clean_for_match($comp1);
# $comp1 =~ s/'/\'/;
#$comp1 =~ s/"/\"/;
my $query = "";
my @tokens1;
#split the name on into tokens on space, kick out tokens that match the stopword list
foreach my $token (split(/ /,$comp1)) {
if (!exists $stopterms{$token}) {
push(@tokens1,$token);
}
}
#if we are matching against all EDGAR names in cik lookup table
if ($match_set eq "cik_names") {
$first = pop(@tokens1);
$query = "select lcase(match_name),cik as id,null,null from cik_name_lookup where match_name like '%".$first."%'";
foreach my $token (@tokens1){
$query = $query." union distinct select lcase(match_name),cik as id ,null,null from cik_name_lookup where match_name like '%".$token."%'";
}
}
#use these queries if we are matching against names of companies in relationships table
elsif ($match_set eq "relation_names") {
$first = pop(@tokens1);
$query = "select lcase(clean_company) as name, relationship_id as id, country_code,subdiv_code from relationships where clean_company like '%".$first."%' ";
foreach my $token (@tokens1){
$query = $query."union distinct select lcase(clean_company), relationship_id as id,country_code,subdiv_code from relationships where clean_company like '%".$token."%' ";
}
}
#use these quries if we are only matching aginst company_names table
elsif ($match_set eq "cw_companies") {
$first = pop(@tokens1);
$query = "select lcase(name) as name, cw_id as id,null as country_code,null as subdiv_code from company_names where name like '%".$first."%' ";
foreach my $token (@tokens1){
$query = $query."union distinct select lcase(name), cw_id as id,null,null from company_names where name like '%".$token."%' ";
}
}
#use these queries if we are matching against filer names
#TODO: add location info
elsif ($match_set eq "filer_names") {
$first = pop(@tokens1);
$query = "select lcase(match_name) as name, cik as id,null as country_code,null as subdiv_code from filers where match_name like '%".$first."%' ";
foreach my $token (@tokens1){
$query = $query."union distinct select lcase(match_name), cik as id, null,null from filers where match_name like '%".$token."%' ";
}
}else {
#uh oh, what should default be?
}
return $query;
}
#returns a hash with the list of some of the most common terms to avoid using in queires
sub get_stopterms() {
my %stopterms = ();
my $stopquery = "select lcase(word) from word_freq order by count desc limit 75";
my $sth4 = $db->prepare($stopquery) || print "$DBI->errstr\n";
$sth4->execute() || print "$DBI->errstr\n".$db->errstr;
#load the query results into the hash
while (my $row = $sth4->fetchrow_arrayref) {
$stopterms{$row->[0]} = $row->[0];
}
return %stopterms;
}
#scores match country_code and subdiv_code of passed arguments
sub location_match_score() {
my ($country1, $subdiv1,$country2,$subdiv2) = @_;
$subdiv1 = $country1.":".$subdiv1;
$subdiv2 = $country2.":".$subdiv2;
my $score = 0.25;
#if either side is null, don't match
if ((defined $country1) & (defined $country2 )){
if ($country1 eq $country2){
$score = $country_weights->{$country1}->{score};
#now check if the subdivision also matches
if ((defined $subdiv1) & (defined $subdiv2)){
if ($subdiv1 eq $subdiv2) {
$score = $score+ $subdiv_weights->{$subdiv1}->{score};
} else {
#mismatch, so punish
$score=0;
}
}
} else {
#countries do *not* match, so punish
$score=0;
}
}
return $score * 100;
}