forked from philip-nomad/stock-prediction-model
-
Notifications
You must be signed in to change notification settings - Fork 0
/
kosac_preprocessor.py
77 lines (62 loc) · 2.41 KB
/
kosac_preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import csv
import datetime
import os
import pandas as pd
from konlpy.tag import Hannanum
import news_contents_crawler
# Shared Hannanum tagger instance; this module uses it for noun extraction.
hannanum = Hannanum()
# Name of the directory (relative to the working directory) under which the
# per-company preprocessed CSV files are written.
NEWS_WORDS_DIR = 'news_words'
def mkdir(company_code):
    """Ensure the output directory for *company_code* exists.

    The directory is ``./<NEWS_WORDS_DIR>/<company_code>``; missing parent
    directories are created as well.

    Args:
        company_code: Identifier used as the directory name.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and makes repeated calls a no-op.
    os.makedirs(f"./{NEWS_WORDS_DIR}/{company_code}", exist_ok=True)
def _nouns_as_text(text):
    """Return the nouns Hannanum extracts from *text*, joined by single spaces."""
    return " ".join(hannanum.nouns(text)).strip()


def _read_news_rows(path):
    """Return the data rows of the CSV at *path*, or None if the file is missing."""
    try:
        # newline='' lets the csv module handle line endings itself, which
        # keeps quoted fields containing line breaks intact.
        with open(path, 'r', encoding='utf-8', newline='') as news:
            reader = csv.reader(news)
            # Skip the header row; the None default tolerates an empty file
            # instead of raising StopIteration.
            next(reader, None)
            return list(reader)
    except FileNotFoundError:
        return None


def start(company_code, start_date, end_date):
    """Preprocess the crawled news of one company, one day at a time.

    For every day from *start_date* to *end_date* (inclusive) the crawled
    file ``./<news_contents_crawler.NEWS_DIR>/<company_code>/<company_code>_<day>.csv``
    is read. Columns 1, 2 and 3 of each row are taken as time, title and
    content. Title and content are reduced to their nouns (space separated)
    and the result is written with the columns ``time``, ``title`` and
    ``context`` to ``./<NEWS_WORDS_DIR>/<company_code>/<company_code>_<day>.csv``.
    Days without a crawled file are skipped silently.

    Args:
        company_code: Identifier used in the directory and file names.
        start_date: First day to process. Assumed to be a ``datetime.date``
            or ``datetime.datetime`` (its ``str()`` must begin with
            ``YYYY-MM-DD``) -- confirm against the caller.
        end_date: Last day to process (inclusive); same type as *start_date*.

    Raises:
        IndexError: If a data row has fewer than four columns.
    """
    print(f"company_code: {company_code} 뉴스기사 전처리 시작")
    mkdir(company_code)

    one_day = datetime.timedelta(days=1)
    while start_date <= end_date:
        # The first ten characters of str(date) are the YYYY-MM-DD part.
        day = str(start_date)[:10]
        source_path = f"./{news_contents_crawler.NEWS_DIR}/{company_code}/{company_code}_{day}.csv"
        target_path = f"./{NEWS_WORDS_DIR}/{company_code}/{company_code}_{day}.csv"

        rows = _read_news_rows(source_path)
        # None means no news file exists for this day; nothing is written.
        if rows is not None:
            df = pd.DataFrame(
                {
                    'time': [row[1] for row in rows],
                    'title': [_nouns_as_text(row[2]) for row in rows],
                    'context': [_nouns_as_text(row[3]) for row in rows],
                },
                columns=['time', 'title', 'context'],
            )
            # to_csv creates/truncates the file itself, so no separate
            # open()/close() is needed beforehand.
            df.to_csv(target_path, index=False)

        start_date += one_day