-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
107 lines (100 loc) · 5.07 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/* Script Parser */
var request = require("request");
var requestNative = require("request-promise-native");
var fs = require('fs');
// Transcript sources for the scraper: season 1 of Frasier as hosted on kacl780.net.
const SEASON_ONE_ROOT = 'http://www.kacl780.net/frasier/transcripts/season_1';

// Episode 1 is kept for reference only. Its HTML is poorly formatted, so it is
// deliberately left out of pageURLS.
const page1url = `${SEASON_ONE_ROOT}/episode_1/the_good_son.html`;

// File-name slugs for episodes 2 through 24, in broadcast order. The position in
// this list determines the episode number used in the URL (index 0 -> episode 2).
const EPISODE_SLUGS_FROM_TWO = [
  'space_quest',
  'dinner_at_eight',
  'i_hate_frasier_crane',
  'heres_looking_at_you',
  'the_crucible',
  'call_me_irresponsible',
  'beloved_infidel',
  'selling_out',
  'oops',
  'death_becomes_him',
  'miracle_on_3rd_or_4th_street',
  'guess_whos_coming_to_breakfast',
  'cant_buy_me_love',
  'you_cant_tell_a_crook_by_his_cover',
  'the_show_where_lilith_comes_back',
  'a_mid_winter_nights_dream',
  'and_the_whimper_is',
  'give_him_the_chair',
  'fortysomething',
  'travels_with_martin',
  'author_author',
  'frasier_cranes_day_off',
  'my_coffee_with_niles',
];

// Full transcript URLs that the scraper downloads.
const pageURLS = EPISODE_SLUGS_FROM_TWO.map(
  (slug, offset) => `${SEASON_ONE_ROOT}/episode_${offset + 2}/${slug}.html`
);

// Accumulator of { question, response } records filled in by the page parser.
const corpus = [];
// Entry point: start downloading the transcript pages as soon as the script loads.
// Calling pullFromSite before its definition works because function declarations are hoisted.
pullFromSite();
/**
 * Download every page listed in `pageURLS`, feed each successfully fetched body
 * to `parseWebsite`, and then write the corpus file via `reformatToCorpus`.
 *
 * A page whose request fails is reported through `onError` and skipped; the
 * remaining pages are still processed.
 *
 * @returns {Promise<void>} Settles once parsing and the hand-off to
 *   `reformatToCorpus` have finished. It never rejects: unexpected failures are
 *   logged instead.
 */
function pullFromSite() {
  const downloads = pageURLS.map((url) => {
    console.log('Parsing url', url);
    return requestNative(url).then(
      (body) => ({ fetched: true, body }),
      (err) => {
        onError(err);
        return { fetched: false };
      }
    );
  });

  return Promise.all(downloads)
    .then((pages) => {
      // Parse only after every download has settled, walking the results in
      // pageURLS order. Parsing inside each request's own callback would fill
      // the shared corpus in network-arrival order, which differs between runs.
      pages.forEach((page) => {
        if (page.fetched) {
          parseWebsite(page.body);
        }
      });
      console.log('All the promises', corpus.length);
      reformatToCorpus();
    })
    .catch((err) => {
      // Without this handler an exception thrown while parsing or writing would
      // surface as an unhandled promise rejection.
      console.error('Failed to build the corpus', err);
    });
}
/**
 * Rejection handler for a failed page request: report the failure and let the
 * caller carry on with the remaining pages.
 *
 * @param {*} err - The rejection reason passed by the promise.
 * @returns {undefined} Nothing, so the handled promise resolves to `undefined`.
 */
function onError(err) {
  // Include the reason itself; the bare message alone hides which request
  // failed and why.
  console.error('There was an error', err);
}
/**
 * Tell whether a piece of transcript text contains one of the markers this
 * script treats as non-dialogue ('Scene' or a '<pre>' tag).
 */
function hasStageMarkup(text) {
  return text.includes('Scene') || text.includes('<pre>');
}

/**
 * Extract question/response pairs from one transcript page and append them to
 * the shared `corpus` array.
 *
 * The page is cut at every '<b>Frasier: </b>' marker. For each piece:
 *   - the text after the last '</b>' in the piece (the line spoken just before
 *     Frasier's next line) becomes the `question` of a new corpus entry;
 *   - the text before the first '<b>' in the piece (what Frasier said) becomes
 *     the `response` of the entry pushed for the previous piece.
 * Text containing 'Scene' or '<pre>' is replaced by `undefined`.
 *
 * @param {string} body - HTML of a transcript page.
 * @returns {undefined}
 */
function parseWebsite(body) {
  if (typeof body !== 'string') {
    // Nothing to parse; skip the page rather than crash on body.split.
    console.error('Skipping page: expected an HTML string but received', typeof body);
    return;
  }

  // Find Frasier's lines
  const frasierSplit = body.split('<b>Frasier: </b>');
  console.log('Frasier Split', frasierSplit[0]);
  console.log('Line Numbers: ', frasierSplit.length);

  frasierSplit.forEach((lineSet, index) => {
    const segments = lineSet.split('<b>');

    // The last '<b>'-delimited segment holds "Name: </b> text"; drop the name.
    const lastSpeakerLine = segments[segments.length - 1];
    let question = lastSpeakerLine.split('</b>')[1];
    if (question && hasStageMarkup(question)) {
      question = undefined;
    }

    // The first piece is whatever precedes Frasier's first line on THIS page
    // (page header etc.), so it is never a response. Deciding this per page,
    // rather than by "is the corpus empty", keeps one page's header from being
    // recorded as the reply to the previous page's final question.
    if (index > 0) {
      let response = segments[0];
      if (response && hasStageMarkup(response)) {
        response = undefined;
      }
      corpus[corpus.length - 1]['response'] = response;
    }

    corpus.push({ question: question });
  });
  console.log('Corpus: ', corpus.length);
}
/**
 * Convert the collected question/response records in `corpus` into a
 * ChatterBot-style YAML corpus and write it to 'fraisercrane.yml' in the
 * current working directory.
 *
 * Records missing a question or a response are skipped. Line breaks, square
 * brackets and colons are stripped from both texts (and '/' from questions),
 * and runs of spaces are collapsed to one.
 *
 * @returns {undefined} The write is asynchronous; its outcome is logged.
 */
function reformatToCorpus() {
  console.log('corpus', corpus);
  // Start with question - response format, reformat to chatterbot corpus format
  const header = 'categories:\n- Fraiser\nconversations:\n';
  console.log('Attempting to write corpus', corpus.length);

  const conversations = [];
  corpus.forEach((pair) => {
    if (!pair.question || !pair.response) {
      console.log('Undefined response');
      return;
    }
    // NOTE(review): '/' is stripped from questions but not from responses; the
    // two patterns are kept as they were - confirm whether that is intended.
    const question = pair.question.replace(/[\r\n\[\]\:\/]+/g, '').replace(/ +/g, ' ');
    const response = pair.response.replace(/[\r\n\[\]\:]+/g, '').replace(/ +/g, ' ');
    // The second item of the inner list must line up with the first one (two
    // columns in); a single space is not valid YAML for a nested sequence.
    conversations.push('- - ' + question + '\n  - ' + response + '\n');
  });

  // Write the whole document in one call. Issuing one fs.appendFile per entry
  // gives no ordering guarantee between the appends, and calling it without a
  // callback is rejected by current Node.js versions.
  fs.writeFile('fraisercrane.yml', header + conversations.join(''), function (err) {
    if (err) {
      console.error('Failed to write fraisercrane.yml', err);
      return;
    }
    console.log('File Written');
  });
}