-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.php
98 lines (77 loc) · 3.12 KB
/
index.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
<?php
// lift PHP's execution time limit (0 means no limit) - presumably because the
// matching passes below test every product against every remaining listing
set_time_limit(0);
/**
 * Turns a product field into the body of a tolerant regular expression.
 *
 * Every run of non-alphanumeric characters becomes an optional run of
 * non-alphanumeric characters, and the same optional run is inserted at each
 * digit/letter and letter/digit boundary. The result has no delimiters or
 * flags; callers add them and are expected to use the case-insensitive flag.
 *
 * @param string $str raw text to convert
 * @return string regex body
 */
function prepareRegex($str) {
    // matches zero or more characters that are neither letters nor digits
    $gap = '[^a-z\d]*';

    // step 1: punctuation / whitespace runs become optional
    $pattern = preg_replace('/[^a-z\d]+/i', $gap, $str);
    // step 2: allow a separator where a digit is followed by a letter
    $pattern = preg_replace('/(\d)([a-z])/i', '$1'.$gap.'$2', $pattern);
    // step 3: allow a separator where a letter is followed by a digit
    return preg_replace('/([a-z])(\d)/i', '$1'.$gap.'$2', $pattern);
}
/**
 * Builds a complete case-insensitive pattern that only matches when the
 * prepared text is bounded by word boundaries on both sides.
 *
 * @param string $str raw text to convert
 * @return string delimited regex ready for preg_match()
 */
function makeRegex($str) {
    $body = prepareRegex($str);
    return '/\b' . $body . '\b/i';
}
/**
 * Builds a complete case-insensitive pattern with no word-boundary anchors,
 * so the prepared text may match inside a longer token.
 *
 * @param string $str raw text to convert
 * @return string delimited regex ready for preg_match()
 */
function makeRegexNoWord($str) {
    $body = prepareRegex($str);
    return '/' . $body . '/i';
}
/**
 * Reads a file containing one JSON object per line.
 *
 * Lines that do not decode to an array (blank lines, the empty string after a
 * trailing newline, malformed JSON) and records lacking any of the required
 * keys are skipped, so later code can index the records safely.
 *
 * @param string $path         file to read
 * @param array  $requiredKeys keys every returned record must have set
 * @return array list of decoded records in file order
 * @throws RuntimeException when the file cannot be read
 */
function readJsonLines($path, array $requiredKeys) {
    $raw = file_get_contents($path);
    if ($raw === false) throw new RuntimeException('Unable to read '.$path);

    $records = array();
    foreach (explode("\n", $raw) as $line) {
        $record = json_decode($line, true);
        if (!is_array($record)) continue;
        foreach ($requiredKeys as $key) {
            if (!isset($record[$key])) continue 2;// incomplete record - skip it
        }
        $records[] = $record;
    }
    return $records;
}

/**
 * Removes and returns every listing whose manufacturer matches $manufacturer
 * and whose title matches all of $titlePatterns.
 *
 * @param array  $listings      pool of unmatched listings; matched entries are removed from it
 * @param string $manufacturer  regex tested against each listing's 'manufacturer'
 * @param array  $titlePatterns regexes that must all match the listing's 'title'
 * @return array list of the listings that were removed, in pool order
 */
function takeMatchingListings(array &$listings, $manufacturer, array $titlePatterns) {
    $taken = array();
    foreach ($listings as $i => $listing) {
        if (!preg_match($manufacturer, $listing['manufacturer'])) continue;
        foreach ($titlePatterns as $pattern) {
            if (!preg_match($pattern, $listing['title'])) continue 2;
        }
        $taken[] = $listing;
        unset($listings[$i]);// a listing is assigned to at most one product
    }
    return $taken;
}

// initialize output
$output = array();
define('OUTPUT_FILE', 'results.txt');
file_put_contents(OUTPUT_FILE, '');// start from an empty results file

// read in listings and products
// (records are collected into fresh arrays rather than decoded in place through
// a by-reference foreach: a reference left behind by such a loop makes a later
// by-value foreach using the same variable overwrite the last element)
$listings = readJsonLines('listings.txt', array('manufacturer', 'title'));
$products = readJsonLines('products.txt', array('product_name'));

// initialize every product in output so products without matches are still reported
foreach ($products as $product) {
    $name = $product['product_name'];
    // keep the existing entry if a product name appears more than once
    if (isset($output[$name])) continue;
    $output[$name] = array(
        'product_name' => $name,
        'listings' => array()
    );
}

// 1st pass - search for ideal match first - manufacturer + family + full model
foreach ($products as $product) {
    if (empty($product['manufacturer']) || empty($product['model']) || empty($product['family'])) continue;
    $name = $product['product_name'];
    $matched = takeMatchingListings(
        $listings,
        makeRegex($product['manufacturer']),
        array(makeRegex($product['model']), makeRegex($product['family']))
    );
    $output[$name]['listings'] = array_merge($output[$name]['listings'], $matched);
}

// 2nd pass - look for manufacturer + full model from listings that haven't been matched yet
foreach ($products as $product) {
    if (empty($product['manufacturer']) || empty($product['model'])) continue;
    $name = $product['product_name'];
    $matched = takeMatchingListings(
        $listings,
        makeRegex($product['manufacturer']),
        array(makeRegex($product['model']))
    );
    $output[$name]['listings'] = array_merge($output[$name]['listings'], $matched);
}

// 3rd pass - look for partial models - e.g. model is DSC123 but listing has DSC123S
// skipped if model is numeric or contains no digit (too general - many false positives)
foreach ($products as $product) {
    if (empty($product['manufacturer']) || empty($product['model']) || is_numeric($product['model']) || !preg_match('/\d/', $product['model'])) continue;
    $name = $product['product_name'];
    $matched = takeMatchingListings(
        $listings,
        makeRegex($product['manufacturer']),
        array(makeRegexNoWord($product['model']))
    );
    $output[$name]['listings'] = array_merge($output[$name]['listings'], $matched);
}

// write output - one JSON object per line, written in a single call
$lines = '';
foreach ($output as $entry) $lines .= json_encode($entry)."\n";
if (file_put_contents(OUTPUT_FILE, $lines) === false) throw new RuntimeException('Unable to write '.OUTPUT_FILE);
exit(0);
?>