-
Notifications
You must be signed in to change notification settings - Fork 0
/
sendToDB.py
108 lines (85 loc) · 3.98 KB
/
sendToDB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# -*- coding: utf-8 -*-
import firebase_admin
from firebase_admin import credentials, storage, firestore
import pandas as pd
from summarizer import Summarizer
from dotenv import load_dotenv
import os
import asyncio
import requests
from datetime import datetime
# Load variables from a .env file into the environment so os.getenv below can read them.
load_dotenv()
# Authenticate with the service-account key file (path is relative to the working directory).
cred = credentials.Certificate("./serviceAccountKey.json")
# NOTE: if FIRSTORE_PROJECT_ID is unset, os.getenv returns None and the string
# concatenation below raises TypeError at import time.
firebase_admin.initialize_app(cred, {"storageBucket": os.getenv("FIRSTORE_PROJECT_ID") + ".appspot.com"})
# Firestore client used by sendDataToDB.
db = firestore.client()
# Bucket handle used to upload images to Storage.
bucket = storage.bucket()
def sendDataToDB(data) -> None:
    """Write one post to Firestore under its category.

    The document is created at
    ``autoInformation/<data['category']>/autoInformation_posts/<auto id>``.
    Any ``Exception`` raised while building the reference or writing the
    document is caught and printed instead of propagating.

    Args:
        data: Mapping with the post fields. It must contain a ``'category'``
            key, which selects the parent document.
    """
    try:
        # autoInformation (collection) -> category name (doc)
        #   -> autoInformation_posts (collection) -> individual post (doc)
        categoryDoc = db.collection('autoInformation').document(data['category'])
        postsCollection = categoryDoc.collection('autoInformation_posts')
        # No meaningful ID is supplied: document() without arguments lets
        # Cloud Firestore generate one (equivalent to add()).
        postRef = postsCollection.document()
        postRef.set(data)
    except Exception as e:
        print(e, ": data Firestore 전송 실패")
def sendImagesToStorage(fileName, imageUrlsGroup, timeout=30):
    """Copy remote images into Firebase Storage and return their ``gs://`` URLs.

    Args:
        fileName: Base name used as the prefix of every uploaded object name.
        imageUrlsGroup: One list of image URLs per row. A row equal to
            ``[""]`` means "no images" and yields an empty list.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            host cannot stall the run indefinitely.

    Returns:
        A list with one entry per input row; each entry is the list of
        ``gs://`` URLs of the images of that row that were uploaded.
        Images whose download or upload fails are reported with ``print``
        and left out of the result.
    """
    storageUrlsGroup = []
    for rowIndex, imageUrls in enumerate(imageUrlsGroup):
        storageUrls = []
        if imageUrls == [""]:
            storageUrlsGroup.append(storageUrls)
            continue
        for imageIndex, imageUrl in enumerate(imageUrls):
            try:
                response = requests.get(imageUrl, timeout=timeout)
                response.raise_for_status()
                imageData = response.content
                stamp = datetime.now().strftime("_%Y-%m-%d_%H-%M")
                # The image's position inside its row is part of the object
                # name: the timestamp only has minute resolution, so without
                # it every image of a row would overwrite the previous one.
                url = f"autoInformation_images/{fileName}_{rowIndex}_{imageIndex}{stamp}.jpg"
                blob = bucket.blob(url)
                blob.upload_from_string(imageData, content_type="image/jpeg")
                storageUrl = 'gs://' + os.getenv("FIRSTORE_PROJECT_ID") + '.appspot.com/' + url
                storageUrls.append(storageUrl)
            except requests.exceptions.HTTPError as e:
                print(f"HTTP 에러 발생: {e}")
            except requests.exceptions.RequestException as e:
                print(f"다운로드 에러 발생: {e}")
            except Exception as e:
                print(f"알 수 없는 에러 발생: {e}")
        storageUrlsGroup.append(storageUrls)
    return storageUrlsGroup
# Fetch the names of the crawled files.
def getCrawledFileNames():
    """Return the names of the entries found in the ``result/crawl`` directory."""
    crawlDir = "result/crawl"
    entryNames = os.listdir(crawlDir)
    return entryNames
def _parseListString(text):
    """Split a stringified list such as ``"['a', 'b']"`` into ``['a', 'b']``."""
    # Remove the quotes, strip the surrounding brackets, then split on ", ".
    return text.replace('\'', '')[1:-1].split(', ')


async def main():
    """Summarize every crawled file and push the resulting posts to Firebase.

    For each name returned by ``getCrawledFileNames()`` the summarizer loads
    and summarizes the data, the ``region`` and ``image`` columns are turned
    from stringified lists into real lists, the images are uploaded through
    ``sendImagesToStorage``, and every row is sent with ``sendDataToDB``.
    A failure while handling one file is printed and the loop moves on to
    the next file.
    """
    # Create the Summarizer instance.
    summarizer = Summarizer("chatGPT")

    files = getCrawledFileNames()
    print(files)
    for file in files:
        # Strip the extension without assuming it is exactly four characters.
        fileName = os.path.splitext(file)[0]
        try:
            summarizer.setFileName(fileName)
            summarizer.loadDataframe()
            await summarizer.summarizeContents()

            # Drop the leftover index column if it is there; a file without
            # it should not abort the whole run for that file.
            summarizer.df.drop(['Unnamed: 0'], axis=1, errors='ignore', inplace=True)
            summarizer.df = summarizer.df.astype({"category": "str", "region": "str", "image": "str"})

            # The list-valued columns arrive as strings; parse them back.
            summarizer.df["region"] = [_parseListString(value) for value in summarizer.df["region"]]
            summarizer.df["image"] = [_parseListString(value) for value in summarizer.df["image"]]

            # Replace the source image URLs by the Storage URLs of the copies.
            summarizer.df["image"] = sendImagesToStorage(fileName, summarizer.df["image"].tolist())
            # Naive timestamp holding the current UTC time, shared by all rows.
            summarizer.df["post_date"] = datetime.utcnow()
            print(summarizer.df["image"].tolist())

            for rowLabel in summarizer.df.index:
                data = summarizer.df.loc[rowLabel].to_dict()
                data['scraps'] = 0
                data['scrapsUser'] = []
                # Search keywords are the space-separated words of the title.
                data['keyword'] = data['title'].split(' ')
                print(data)
                sendDataToDB(data)
            summarizer.saveDataframeToCSV()
        except Exception as e:
            print(e, f"{fileName} 요약 실패")
if __name__ == "__main__":
    # Run the async pipeline to completion when executed as a script.
    asyncio.run(main())