forked from liwenzhu/corpusZh
-
Notifications
You must be signed in to change notification settings - Fork 0
/
combineAll.js
72 lines (60 loc) · 1.33 KB
/
combineAll.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
var fs = require('fs');
var readline = require('readline');
// uncomment for debug
// var showMem = require('./showMemory');
// showMem(); // for debug
// Directory whose entries are merged into the combined output file.
var CORPUS_HOME = "./corpus";

// Sentinel handed to the readFile callback, in place of a line, once a file
// has been fully read.
var EOF = "#!EOF";

// Options passed to fs.createReadStream for every corpus file.
var readOptions = {
  flags: 'r',
  encoding: null,
  fd: null,
  // ES2015 octal literal: the legacy form `0666` is a SyntaxError in strict
  // mode and in ES modules.
  mode: 0o666,
  autoClose: true
};

// Options passed to fs.createWriteStream for the combined output file.
// NOTE(review): 'a+' appends, so re-running the script adds to an existing
// output file rather than replacing it — confirm this is intended.
var writeOptions = {
  flags: 'a+',
  encoding: null,
  mode: 0o666
};
// Script entry point: list the corpus directory and hand every entry name to
// combineFiles.
fs.readdir(CORPUS_HOME, function (err, files) {
  // Rethrow the original error object; wrapping it in `new Error(err)` would
  // stringify it and drop its `code`, `errno` and stack trace.
  if (err) throw err;
  combineFiles(files);
});
/**
 * Appends the formatted contents of every corpus file to './all.txt'.
 *
 * Files are processed one at a time, in the order given, so that:
 *   - each file's lines stay contiguous in the output (a buffer shared by
 *     concurrent readers would interleave lines from different files),
 *   - only one input file is open at any moment,
 *   - the output stream can be ended once the last file has been written.
 *
 * @param {string[]} files Entry names inside CORPUS_HOME.
 */
function combineFiles (files) {
  var writeStream = fs.createWriteStream('./all.txt', writeOptions);
  var nextIndex = 0;

  function combineNext () {
    if (nextIndex >= files.length) {
      // Nothing left to read: flush pending data and close the output file.
      writeStream.end();
      return;
    }

    var filePath = CORPUS_HOME + "/" + files[nextIndex];
    var chunks = []; // formatted lines of the current file only
    nextIndex += 1;

    readFile(filePath, function (err, text) {
      if (err) throw err;

      // Strict comparison: lines arrive as Buffers, the sentinel is a string,
      // so no Buffer-to-string coercion happens per line.
      if (text !== EOF) {
        chunks.push(text);
        return;
      }

      if (chunks.length === 0) {
        combineNext();
        return;
      }

      // Start the next file only after this one has been handed to the OS,
      // which keeps at most one file's worth of data queued in memory.
      writeStream.write(Buffer.concat(chunks), function (writeErr) {
        if (writeErr) throw writeErr;
        combineNext();
      });
    });
  }

  combineNext();
}
/**
 * Reads `filePath` line by line and reports each formatted line to `callback`.
 *
 * `callback(null, buffer)` is invoked once per line with the result of
 * formatLine; after the line reader closes, `callback(null, EOF)` is invoked
 * once. The first argument is always null: this function registers no
 * 'error' listener, so stream failures are not reported through `callback`.
 *
 * @param {string} filePath Path of the file to read.
 * @param {function(null, (Buffer|string))} callback Receives each formatted
 *     line, then the EOF sentinel.
 */
function readFile (filePath, callback) {
  var source = fs.createReadStream(filePath, readOptions);
  var lineReader = readline.createInterface({
    input: source,
    output: process.stdout,
    terminal: false
  });
  var lineTotal = 0;

  function handleLine (rawLine) {
    callback(null, formatLine(rawLine));
    lineTotal += 1;
  }

  function handleClose () {
    // Log how many lines this file produced, then signal end-of-file.
    console.log("end:", lineTotal);
    callback(null, EOF);
    // The reader has already emitted 'close' here; presumably a no-op, kept
    // to mirror the existing behaviour.
    lineReader.close();
  }

  lineReader.on('line', handleLine);
  lineReader.on('close', handleClose);
}
/**
 * Extracts the second "\t\t"-separated field of a corpus line and returns it
 * as a newline-terminated Buffer.
 *
 * A line without a "\t\t" separator has no second field; an empty Buffer is
 * returned for it, so that the literal text "undefined" is never written to
 * the output.
 *
 * @param {string} line One raw line of a corpus file.
 * @returns {Buffer} The second field followed by '\n', or an empty Buffer.
 */
function formatLine (line) {
  var fields = line.split("\t\t");
  if (fields.length < 2) {
    return Buffer.alloc(0);
  }
  // Buffer.from replaces the deprecated `new Buffer(string)` constructor.
  return Buffer.from(fields[1] + '\n');
}