-
Notifications
You must be signed in to change notification settings - Fork 30
/
classifer.js
198 lines (161 loc) · 6 KB
/
classifer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
const bayes = require('bayes');
const { normalisePropertyNames } = require('../util');
// Words that are rendered fully upper-cased instead of being title-cased
const acronyms = ['DSL', 'GUI', 'SDK'];

/**
 * Title-cases a category name one space-separated word at a time.
 *
 * A word whose upper-cased form appears in `acronyms` is returned fully upper-cased;
 * any other word gets an upper-case first character and a lower-case remainder.
 * The input is split on single spaces, so runs of spaces are preserved as-is.
 *
 * @param {string} category The category name to convert
 * @returns {string} The title-cased category name
 */
const titleCase = (category) => {
  const words = [];

  for (const word of category.split(' ')) {
    const shouted = word.toUpperCase();

    // TODO: the hard-coded acronym list needs to be vastly improved as it does not scale
    if (acronyms.includes(shouted)) {
      words.push(shouted);
    } else {
      words.push(`${word.charAt(0).toUpperCase()}${word.slice(1).toLowerCase()}`);
    }
  }

  return words.join(' ');
};
/**
 * Normalises a raw category value into a title-cased, space-separated name.
 *
 * The value is trimmed first. If it consists solely of word characters joined by
 * dashes (e.g. "code-generators") or by commas with optional following spaces
 * (e.g. "parsers, validators"), the separators are replaced by single spaces.
 * Any other value is left as trimmed. The result is passed through `titleCase`.
 *
 * @param {string} category The raw category value
 * @returns {string} The normalised category name
 */
const normaliseCategoryValue = (category) => {
  const trimmed = category.trim();
  const isDashJoined = /^(?:\w+(?:-|$))+$/.test(trimmed);
  const isCommaJoined = /^(?:\w+(?:(?:,|$) *))+$/.test(trimmed);

  if (!isDashJoined && !isCommaJoined) {
    return titleCase(trimmed);
  }

  // Words concatenated with dash or comma. Each part is trimmed and empty parts are
  // dropped so that "a, b" or a trailing separator ("a-") cannot leave a double or
  // trailing space in the normalised name.
  const words = trimmed
    .split(/-|,/)
    .map((part) => part.trim())
    .filter((part) => part.length > 0);

  return titleCase(words.join(' '));
};
/**
 * Learns from the categories and READMEs of a list of tools and uses the resulting
 * model to assign and rationalise tool categories.
 */
class Classifier {
  /**
   * @param {Object[]} tools The list of tools to classify
   * @throws {Error} If `tools` is not provided
   */
  constructor(tools) {
    if (!tools) {
      throw new Error('Missing parameter: tools');
    }

    // Category names (compared in lower case) that are dropped rather than learnt
    this.ignoredCategories = [
      'miscellaneous', 'random',
    ];
    this.tools = tools;
    this.classifier = bayes();
  }

  /**
   * Use the captured README data and categories to provide an inference of the correct categories
   *
   * For every tool the base64-encoded README (if any) is decoded onto `tool.readme`, ignored
   * categories are removed, and the remaining categories are normalised and fed to the
   * classifier together with the README text. Tools are updated in place.
   */
  learn() {
    const isIgnored = (value) => this.ignoredCategories.includes(value.toLowerCase());

    // Normalises a category and, when README text is available, trains the classifier on it.
    // NOTE(review): the return value of classifier.learn() is ignored here; if the installed
    // `bayes` version returns a promise from learn(), it is not awaited - confirm.
    const normaliseAndLearn = (readme, value) => {
      const normalisedValue = normaliseCategoryValue(value);

      if (readme) {
        this.classifier.learn(readme, normalisedValue);
      }

      return normalisedValue;
    };

    this.tools = this.tools
      .map((tool) => {
        const { base64Readme } = (tool.repositoryMetadata || {});
        const { category } = tool;
        const readme = base64Readme
          ? Buffer.from(base64Readme, 'base64').toString()
          : null;

        // Write decoded readme onto tooling array to be used for classification
        tool.readme = readme; // eslint-disable-line no-param-reassign

        if (!category) {
          return tool;
        }

        if (Array.isArray(category)) {
          tool.category = category // eslint-disable-line no-param-reassign
            .filter((value) => !isIgnored(value))
            .map((value) => normaliseAndLearn(readme, value));
        } else if (!isIgnored(category)) {
          tool.category = normaliseAndLearn(readme, category); // eslint-disable-line no-param-reassign
        } else {
          // Removed an ignored category
          delete tool.category; // eslint-disable-line no-param-reassign
        }

        return tool;
      });
  }

  /**
   * Provides a list of category names and the category name they should be mapped to
   *
   * @returns {Object} The result of `normalisePropertyNames` for the counted categories;
   * `categorize` reads it as an object keyed by category name
   */
  getNormalisedCategories() {
    // Collect up all categories and count their appearances in the data
    const weightedCategories = this.tools
      .reduce((output, tool) => {
        const update = output;

        // If category not set then ignore
        if (!tool.category) {
          return update;
        }

        // Treat a single string category as a one-element list so both shapes share a path
        const values = Array.isArray(tool.category) ? tool.category : [tool.category];

        values.forEach((value) => {
          const normalisedValue = normaliseCategoryValue(value);
          update[normalisedValue] = (update[normalisedValue] || 0) + 1;
        });

        return update;
      }, {});

    // Normalise all category names, coalescing on the most common
    // Dial-up the hamming distance to allow categories to become more concentrated
    return normalisePropertyNames(weightedCategories, 4);
  }

  /**
   * Returns the tools list categorised based on the model
   *
   * Each tool with a decoded README (and without `categoryByRequestIndicator`) is run through
   * the classifier and the inferred category is added to the tool's existing categories.
   * All categories are then mapped through `getNormalisedCategories()`; a category with no
   * entry in that mapping is kept unchanged. The decoded README is removed from every tool.
   *
   * @returns {Promise<Object[]>} The full tools list
   */
  async categorize() {
    const tools = await Promise.all(this.tools
      .map(async (tool) => {
        // If categoryByRequestIndicator is true then override analysis and persist
        // existing category, as tooling owner has asked specifically for that one
        if (!tool.categoryByRequestIndicator && tool.readme) {
          const newCategory = await this.classifier.categorize(tool.readme);

          delete tool.readme; // eslint-disable-line no-param-reassign

          // Skip a falsy result as well as the literal string 'null'
          if (newCategory && newCategory !== 'null') {
            // No category exists
            if (!tool.category) {
              return Object.assign(tool, { category: newCategory });
            }

            // Array of categories already exists
            if (Array.isArray(tool.category)) {
              if (!tool.category.includes(newCategory)) {
                return Object.assign(tool, { category: tool.category.concat([newCategory]) });
              }
            // A category already exists, so convert to array
            } else if (tool.category !== newCategory) {
              return Object.assign(tool, { category: [tool.category, newCategory] });
            }
          }
        }

        // Trash the decoded readme
        delete tool.readme; // eslint-disable-line no-param-reassign

        return tool;
      }));

    // Get the weighted categories
    const categories = this.getNormalisedCategories();

    // Fall back to the original name when the mapping has no entry, so that a category is
    // never replaced by undefined (applies to both the array and the single-string shape)
    const toCanonical = (name) => categories[name] || name;

    // Rationalise the categories, concentrating them into relatively few
    // This also tidies up any duplicates through the use of Set
    this.tools = tools
      .map((tool) => {
        if (!tool.category) {
          return tool;
        }

        return Object.assign(
          tool,
          {
            category: Array.isArray(tool.category)
              ? [...new Set(tool.category.map(toCanonical))]
              : toCanonical(tool.category),
          },
        );
      });

    return this.tools;
  }
}
// Expose the Classifier class as this module's export
module.exports = Classifier;