-
Notifications
You must be signed in to change notification settings - Fork 2
/
tfidf.php
91 lines (86 loc) · 2.65 KB
/
tfidf.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
<?php
/**
 * Small TF-IDF toolkit: term frequency, inverse document frequency,
 * TF-IDF weighting, cosine similarity and min-max normalization.
 *
 * Documents passed to getInverseDocumentFrequency() are comma-separated
 * strings of tokens; the other methods work on already-tokenized arrays.
 */
class TfIdf
{
    /**
     * Returns the term frequency (TF) of $term in $words: the number of
     * words equal to $term divided by the total number of words.
     *
     * Words and term are compared by their exact string form, so
     * numeric-looking tokens such as "1e3" and "1000" are not conflated.
     *
     * @param string $term  Term to look for.
     * @param array  $words Tokenized document (list of words).
     * @return float|int TF rounded to 6 decimals, or 0 when $term does not occur.
     */
    public function getTermFrequency($term, $words)
    {
        $wordCount = count($words);
        $matches = array_keys(array_map('strval', $words), strval($term), true);
        $frequency = count($matches);
        if (!$frequency) {
            // Also covers an empty $words list, so no division by zero below.
            return 0;
        }

        return round(floatval($frequency) / floatval($wordCount), 6);
    }

    /**
     * Returns the inverse document frequency (IDF) of $term across
     * $documents, computed as 1 + ln(total documents / documents containing
     * the term).
     *
     * @param string $term      Term to look for.
     * @param array  $documents List of documents, each a comma-separated string of tokens.
     * @return float|int IDF value, or 0 when no document contains $term.
     */
    public function getInverseDocumentFrequency($term, $documents)
    {
        $docCount = count($documents);
        $needle = strval($term);
        $docWithTerms = 0;
        foreach ($documents as $document) {
            // Drop only empty tokens; a bare array_filter() would also discard "0".
            $words = array_diff(explode(',', $document), array(''));
            if (in_array($needle, $words, true)) {
                $docWithTerms++;
            }
        }
        if (!$docWithTerms) {
            return 0;
        }

        return 1 + log($docCount / $docWithTerms);
    }

    /**
     * Returns the TF-IDF weight of $query in $words: the term frequency of
     * $query multiplied by the supplied $idf.
     *
     * @param string $query Term to weight.
     * @param array  $words Tokenized document (list of words).
     * @param float  $idf   Inverse document frequency of $query.
     * @return float|int
     */
    public function getTFIDF($query, $words, $idf)
    {
        $tf = $this->getTermFrequency($query, $words);

        return $tf * $idf;
    }

    /**
     * Returns the cosine similarity between a query vector and a document
     * vector.
     *
     * The query vector gives every term the same term frequency,
     * 1 / count($idf), weighted by that term's IDF. The document vector is
     * $tfidf. Only the keys present in $tfidf take part in the sums; a key
     * missing from $idf contributes a query weight of 0.
     *
     * @param array $idf   IDF values keyed by term.
     * @param array $tfidf TF-IDF values of the document, keyed like $idf.
     * @return float|int Similarity, or 0 when $idf is empty or either vector has zero length.
     */
    public function getCosineSimilarity($idf, $tfidf)
    {
        $termCount = count($idf);
        if (!$termCount) {
            // Without terms the query vector is empty; avoid dividing by zero.
            return 0;
        }

        $tf = 1 / $termCount;
        $dotProduct = 0;
        $query = 0;
        $document = 0;
        foreach ($tfidf as $key => $value) {
            $weight = isset($idf[$key]) ? $tf * $idf[$key] : 0;
            $dotProduct += $weight * $value;
            $query += pow($weight, 2);
            $document += pow($value, 2);
        }

        $norm = sqrt($query) * sqrt($document);
        if (!$norm) {
            return 0;
        }

        return $dotProduct / $norm;
    }

    /**
     * Returns $data rescaled with min-max normalization:
     * (x - min) / (max - min), keeping the original keys.
     *
     * When all values are equal (max - min is 0) every entry becomes 0, and
     * an empty input yields an empty array.
     *
     * @param array $data Numeric values.
     * @return array Normalized values under the same keys.
     */
    public function getNormalizeData($data)
    {
        if (empty($data)) {
            // min()/max() reject an empty array.
            return array();
        }

        $minX = floatval(min($data));
        $range = floatval(max($data)) - $minX;
        foreach ($data as $key => $value) {
            // Check the range before dividing so a constant data set cannot divide by zero.
            $data[$key] = $range ? (floatval($value) - $minX) / $range : 0;
        }

        return $data;
    }
}