forked from spellcheck-ko/korean-dict-nikl-krdict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract.js
77 lines (71 loc) · 2.07 KB
/
extract.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import { readdir, readFile, writeFile } from 'node:fs/promises';
import { parseStringPromise as parseXml } from 'xml2js';
/**
 * Converts the XML document `5000.xml` into pretty-printed JSON and stores
 * the result as `5000.json`, creating or truncating that file.
 *
 * @returns {Promise<void>} Settles once the JSON file has been written.
 */
const load = async () => {
  const source = await readFile('5000.xml', { encoding: 'utf-8' });
  const parsed = await parseXml(source);
  const serialized = JSON.stringify(parsed, null, 2);
  await writeFile('5000.json', serialized, { flag: 'w+' });
};
/**
 * Tells whether an entry's collected features describe something worth
 * exporting: its lexical unit must be '단어' ("word") and its part of speech
 * must be neither '접사' ("affix") nor '품사 없음' ("no part of speech").
 *
 * @param {{ lexicalUnit?: string, partOfSpeech?: string }} feature
 *   Feature map built from an entry's `feat` elements.
 * @returns {boolean} `true` when the entry should be kept.
 */
const isGood = (feature) => {
  const excludedPartsOfSpeech = ['접사', '품사 없음'];
  const isWord = feature.lexicalUnit === '단어';
  return isWord && !excludedPartsOfSpeech.includes(feature.partOfSpeech);
};
/**
 * Flattens a list of parsed `feat` nodes (each carrying its XML attributes
 * under `$`) into a plain object mapping every `att` to its `val`.
 * A missing list yields an empty object; nodes without attributes are skipped.
 *
 * @param {Array<{ $?: { att: string, val: string } }>} [feats]
 * @returns {Record<string, string>}
 */
const collectFeats = (feats = []) => {
  const collected = {};
  for (const { $ } of feats) {
    if ($) {
      collected[$.att] = $.val;
    }
  }
  return collected;
};

/**
 * Parses one XML file and appends a `{ level, korean, english }` record to
 * `words` for every `writtenForm` of every lexical entry accepted by `isGood`.
 *
 * Optional child elements (`LexicalEntry`, `Sense`, `Equivalent`, `Lemma`,
 * `feat`) are treated as empty when absent, so a single sparse entry does not
 * abort the whole file. A document without `LexicalResource.Lexicon` still
 * throws, since it is not the expected kind of file.
 *
 * @param {string} file - Path of the XML file to read (UTF-8).
 * @param {Array<{ level?: string, korean: string, english?: string }>} words
 *   Accumulator owned by the caller; it is mutated in place.
 * @returns {Promise<void>}
 */
const process = async (file, words) => {
  const utf8 = await readFile(file, { encoding: 'utf-8' });
  const data = await parseXml(utf8);
  for (const lexicon of data.LexicalResource.Lexicon) {
    for (const entry of lexicon.LexicalEntry ?? []) {
      if (!entry.feat) {
        continue;
      }
      const feature = collectFeats(entry.feat);
      // The verdict depends only on the entry-level features, so decide once
      // per entry instead of once per lemma feature.
      if (!isGood(feature)) {
        continue;
      }
      for (const sense of entry.Sense ?? []) {
        for (const equivalent of sense.Equivalent ?? []) {
          const field = collectFeats(equivalent.feat);
          // When several English ('영어') equivalents exist, the last one wins.
          if (field.language === '영어') {
            feature.english = field.definition;
          }
        }
      }
      const { english, vocabularyLevel } = feature;
      for (const lemma of entry.Lemma ?? []) {
        for (const { $ } of lemma.feat ?? []) {
          if ($?.att === 'writtenForm') {
            words.push({ level: vocabularyLevel, korean: $.val, english });
          }
        }
      }
    }
  }
};
/**
 * Runs `process` on every entry of the current working directory whose name
 * ends in `.xml`, one file at a time, then writes the accumulated word list
 * to `out.json` as pretty-printed JSON (creating or truncating the file).
 *
 * @returns {Promise<void>} Settles once `out.json` has been written.
 */
const run = async () => {
  const words = [];
  for (const file of await readdir('.')) {
    if (file.endsWith('.xml')) {
      // Files are handled sequentially; all of them append to the same array.
      await process(file, words);
    }
  }
  await writeFile('out.json', JSON.stringify(words, null, 2), { encoding: 'utf-8', flag: 'w+' });
};
// Entry point. This file is an ES module, so the call is awaited at top level:
// a rejection from `run` then fails module evaluation instead of being left on
// a floating promise. (`process` is shadowed by a local function in this
// module, so `process.exitCode` is not available for a `.catch` handler.)
await run();