
Commit

Merge pull request #129 from kookmin-sw/develop-ai
Develop ai
guahama authored May 3, 2024
2 parents 95ba0d6 + 98e5ae8 commit 3aa931b
Showing 24 changed files with 691 additions and 70 deletions.
1 change: 1 addition & 0 deletions ai/.gitignore
@@ -5,3 +5,4 @@
*.pyc
__pycache__
data/
cap30.pem
Binary file added ai/FAISS/NOTICE/index.pkl
Binary file added ai/FAISS/SCHOOL_INFO/index.faiss
Binary file added ai/FAISS/SCHOOL_INFO/index.pkl
Binary file removed ai/FAISS/index.pkl
Binary file modified ai/crawler/__pycache__/clova_ocr.cpython-39.pyc
Binary file modified ai/crawler/__pycache__/notice_crawler.cpython-39.pyc
137 changes: 137 additions & 0 deletions ai/crawler/ciss_crawler.py
@@ -0,0 +1,137 @@
import bs4
from langchain.document_loaders import WebBaseLoader
import requests
import pickle
import os
from tqdm import tqdm

class CissClawer:
    def __init__(self, urls_txt='./crawler/ciss_url.txt'):
        self.urls = self.load_urls_from_file(urls_txt)
        self.static_urls = self.urls[:29]  # everything else (static pages)
        self.notice_urls = self.urls[29:]  # notice boards
        self.notice_categories = ['academic', 'visa', 'scholarship', 'event', 'program', 'gks']

    def load_urls_from_file(self, file_path='./ciss_url.txt'):
        urls = []
        with open(file_path, 'r') as file:
            for line in file:
                url = line.strip()  # strip the newline
                if url:  # keep only non-empty lines
                    urls.append(url)
        return urls

    def crawling_content_url(self, urls_lst):
        loader = WebBaseLoader(
            web_paths=(urls_lst),
            bs_kwargs=dict(
                parse_only=bs4.SoupStrainer(
                    class_=("content-wrap")
                )
            ),
        )

        docs = loader.load()
        metadata_lst = self.extract_page_metadata(urls_lst)

        for i, doc in enumerate(tqdm(docs)):
            doc.page_content = doc.page_content.replace(u"\xa0", u" ")
            try:
                doc.metadata['title'] = metadata_lst[i][0]
                if len(metadata_lst[i]) >= 2:
                    doc.metadata['datetime'] = metadata_lst[i][1]
            except:
                print(f'ERROR ! : {urls_lst[i], metadata_lst[i]}')

        return docs


    def crawling(self, path='./CISS/'):
        print(f' crawling CISS notice ... ')
        notice_path = path + 'NOTICE/'
        for i, url in enumerate(self.notice_urls):
            print(f' crawling {self.notice_categories[i]} ... ')
            child_url_lst = self.get_notice_child_urls(url)

            docs = self.crawling_content_url(child_url_lst)
            if not os.path.exists(notice_path):
                os.makedirs(notice_path)
            with open(notice_path + self.notice_categories[i] + '.pkl', 'wb') as f:
                pickle.dump(docs, f)

        non_notice_path = path + 'SCHOOL_INFO/'
        print(f' crawling CISS non_notice ... ')
        docs = self.crawling_content_url(self.static_urls)

        if not os.path.exists(non_notice_path):
            os.makedirs(non_notice_path)
        with open(non_notice_path + 'non_notice' + '.pkl', 'wb') as f:
            pickle.dump(docs, f)

    def get_notice_child_urls(self, notice_categori_url):
        href_values = []
        base_url = notice_categori_url.split('?')[0]

        try:
            # fetch the page
            response = requests.get(notice_categori_url)
            response.raise_for_status()

            # parse the HTML
            soup = bs4.BeautifulSoup(response.text, 'html.parser')

            # extract the elements with the target class
            boxes = soup.find_all(class_='b-title-box')

            # collect the href values of the <a> tags inside each box
            for box in boxes:
                links = box.find_all('a', href=True)
                for link in links:
                    href_values.append(base_url + link['href'])

        except Exception as e:
            print(f"Error processing URL '{notice_categori_url}': {e}")

        return href_values

    def extract_page_metadata(self, urls):
        results = []  # holds the title (and date, if present) for each URL
        for url in urls:
            try:
                # fetch the page
                response = requests.get(url)
                response.raise_for_status()  # raise on HTTP errors
                # parse the HTML
                soup = bs4.BeautifulSoup(response.text, 'html.parser')
                # extract the elements with class 'page-title' or 'b-date-box'
                page_title_elements = soup.find_all(class_=['page-title', 'b-date-box'])
                # add them to the results
                page_titles = [element.text.strip() for element in page_title_elements]
                results.append(page_titles)

            except Exception as e:
                print(f"Error processing URL '{url}': {e}")

        return results


if __name__ == '__main__':
    def change_working_directory_to_script_location():
        # get the directory that contains this script
        script_directory = os.path.dirname(__file__)

        # switch the working directory to the script's directory
        os.chdir(script_directory)

        # return the new working directory
        return os.getcwd()

    # change the working directory
    new_working_directory = change_working_directory_to_script_location()

    # print the current working directory
    current_working_directory = os.getcwd()
    print("Current working directory:", current_working_directory)
    urls_txt = 'url_path'
    cc = CissClawer(urls_txt)
    cc.crawling()
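
The commit also adds prebuilt FAISS indexes under ai/FAISS/NOTICE and ai/FAISS/SCHOOL_INFO, which presumably embed document lists like the pickles that CissClawer.crawling() writes. A minimal indexing sketch under that assumption follows; the embedding model and paths are placeholders, not taken from this diff.

import pickle

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings  # assumed embedding model; not shown in this diff

# load one of the pickled Document lists written by CissClawer.crawling()
with open('./CISS/SCHOOL_INFO/non_notice.pkl', 'rb') as f:
    docs = pickle.load(f)

# embed the documents and persist the index; save_local() writes index.faiss and index.pkl,
# matching the binary files added under ai/FAISS/SCHOOL_INFO in this commit
index = FAISS.from_documents(docs, OpenAIEmbeddings())
index.save_local('./FAISS/SCHOOL_INFO')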
35 changes: 35 additions & 0 deletions ai/crawler/ciss_url.txt
@@ -0,0 +1,35 @@
https://cms.kookmin.ac.kr/kmuciss/ciss/intro.do
https://cms.kookmin.ac.kr/kmuciss/ciss/organize.do
https://cms.kookmin.ac.kr/kmuciss/ciss/location.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad01.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad02.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad04.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad05.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad06.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad01.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad02.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad04.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad05.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad06.do
https://cms.kookmin.ac.kr/kmuciss/scholarship/undergrad01.do
https://cms.kookmin.ac.kr/kmuciss/scholarship/undergrad02.do
https://cms.kookmin.ac.kr/kmuciss/scholarship/undergrad03.do
https://cms.kookmin.ac.kr/kmuciss/scholarship/undergrad03.do
https://cms.kookmin.ac.kr/kmuciss/visa/visa04.do
https://cms.kookmin.ac.kr/kmuciss/visa/visa01.do
https://cms.kookmin.ac.kr/kmuciss/visa/visa02.do
https://cms.kookmin.ac.kr/kmuciss/visa/visa03.do
https://cms.kookmin.ac.kr/kmuciss/program/program01.do
https://cms.kookmin.ac.kr/kmuciss/program/program02.do
https://cms.kookmin.ac.kr/kmuciss/program/program03.do
https://cms.kookmin.ac.kr/kmuciss/program/program04.do
https://cms.kookmin.ac.kr/kmuciss/program/program05.do
https://cms.kookmin.ac.kr/kmuciss/program/program06.do
https://cms.kookmin.ac.kr/kmuciss/program/program07.do
https://cms.kookmin.ac.kr/kmuciss/program/program08.do
https://cms.kookmin.ac.kr/kmuciss/notice/academic.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/visa.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/scholarship.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/event.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/program.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/gks.do?&articleLimit=1000
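
For reference, a quick sanity-check sketch (not part of the commit) showing that this list lines up with the 29/6 split hard-coded in CissClawer.__init__: the first 29 URLs are static pages and the last six are the notice boards fetched with articleLimit=1000.

# the split indices below mirror CissClawer.__init__
with open('ai/crawler/ciss_url.txt') as f:
    urls = [line.strip() for line in f if line.strip()]

static_urls, notice_urls = urls[:29], urls[29:]
assert len(static_urls) == 29 and len(notice_urls) == 6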
25 changes: 19 additions & 6 deletions ai/crawler/pdf_reader.py
@@ -1,17 +1,30 @@
 from langchain_community.document_loaders import PyPDFLoader
 import pickle
+import os
+from tqdm import tqdm

 class PdfReader:
     def __init__(self) -> None:
         pass

-    def read_pdf(filename, path):
+    def read_pdf(self, filepath, path='./data/', name='default'):
+
+        filename, _ = os.path.splitext(os.path.basename(filepath))
+        path += filename + '/' + 'SCHOOL_INFO/'
+
-        loader = PyPDFLoader(filename)
+        if not os.path.exists(path):
+            os.makedirs(path)
+        print(f'-- Load pdf file {filename} --')
+        loader = PyPDFLoader(filepath)
         pages = loader.load()
-        for page_no in range(len(pages)):
+        print('-- start --')
+        total_pdf = []
+        for page_no in tqdm(range(10)):
             doc = pages[page_no]
             doc.page_content = doc.page_content.replace(u"\xa0", u" ")
-            with open(path+str(page_no)+'.pkl', 'wb') as f:
-                pickle.dump(doc, f)
+            doc.page_content = doc.page_content.replace("·", "")
+            if doc.page_content:
+                total_pdf.append(doc)
+        with open(path+name+'.pkl', 'wb') as f:
+            pickle.dump(total_pdf, f)
+        print(total_pdf)
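
A brief usage sketch of the updated reader; the module path, PDF path, and name below are hypothetical. Note that, as written, the loop only pickles the first ten pages (range(10)).

from crawler.pdf_reader import PdfReader  # module path is an assumption

reader = PdfReader()
# cleans non-breaking spaces and "·" from each page, keeps non-empty pages,
# and pickles the resulting list to ./data/guide/SCHOOL_INFO/guide.pkl
reader.read_pdf('./data/guide.pdf', path='./data/', name='guide')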
159 changes: 159 additions & 0 deletions ai/crawler/sw_crawler.py
@@ -0,0 +1,159 @@
import bs4
from langchain.document_loaders import WebBaseLoader
import requests
import pickle
import os
from tqdm import tqdm

class SoftwareCrawler:
    def __init__(self, urls_txt='./crawler/sw_url.txt'):
        self.urls = self.load_urls_from_file(urls_txt)
        self.static_urls = self.urls[:-4]
        self.notice_urls = self.urls[-4:]

        self.notice_categories = {'notice': self.notice_urls[0],
                                  'jobs': self.notice_urls[1],
                                  'scholarship': self.notice_urls[2],
                                  'event': self.notice_urls[3]}

        self.crawling_range = {'notice': (1971, 2413),
                               'jobs': (1530, 1642),
                               'scholarship': (85, 86),
                               'event': (1113, 1207)}

    def load_urls_from_file(self, file_path='./ciss_url.txt'):
        urls = []
        with open(file_path, 'r') as file:
            for line in file:
                url = line.strip()  # strip the newline
                if url:  # keep only non-empty lines
                    urls.append(url)
        return urls

    def crawling_content_url(self, urls_lst, page_type=None):
        loader = WebBaseLoader(
            web_paths=(urls_lst),
            bs_kwargs=dict(
                parse_only=bs4.SoupStrainer(
                    id='content'
                )
            ),
        )

        docs = loader.load()
        if page_type == 'notice':
            metadata_lst = self.extract_notice_metadata(urls_lst)
        else:
            metadata_lst = self.extract_static_metadata(urls_lst)

        for i, doc in enumerate(tqdm(docs)):
            doc.page_content = doc.page_content.replace(u"\xa0", u" ")
            doc.metadata['title'] = metadata_lst[i][0]
            if page_type == 'notice':
                doc.metadata['datetime'] = metadata_lst[i][1]

        return docs

    def notice_url_lst(self, categori):
        start = self.crawling_range[categori][0]
        end = self.crawling_range[categori][1] + 1
        url_lst = []
        base_url = self.notice_categories[categori]
        for i in range(start, end):
            tmp_url = base_url + '/' + str(i)
            response = requests.get(tmp_url)
            try:
                response.raise_for_status()  # raise on HTTP errors
            except:
                continue

            # parse the HTML
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            td_elements = soup.find_all('td')
            try:
                # treat the page as a valid article only if it has at least two <td> cells
                test = td_elements[1]
                url_lst.append(tmp_url)
            except:
                print(f'not valid url : {tmp_url}')

        return url_lst

    def crawling(self, path='./SW/'):

        non_notice_path = path + 'SCHOOL_INFO/'

        if not os.path.exists(non_notice_path):
            os.makedirs(non_notice_path)

        docs = self.crawling_content_url(self.static_urls)
        with open(non_notice_path + 'non_notice' + '.pkl', 'wb') as f:
            pickle.dump(docs, f)

        notice_path = path + 'NOTICE/'
        notice_categories = self.notice_categories.keys()

        if not os.path.exists(notice_path):
            os.makedirs(notice_path)

        for c in notice_categories:
            docs = self.crawling_content_url(self.notice_url_lst(c), page_type='notice')
            with open(notice_path + 'sw_' + c + '.pkl', 'wb') as f:
                pickle.dump(docs, f)

    def extract_static_metadata(self, urls):
        results = []  # list that collects the page titles
        for i, url in enumerate(urls):
            try:
                # fetch the page
                response = requests.get(url)
                response.raise_for_status()  # raise on HTTP errors
                # parse the HTML
                soup = bs4.BeautifulSoup(response.text, 'html.parser')
                # extract the elements with class 'page-title'
                page_title_elements = soup.find_all(class_='page-title')
                # prefix the title with '소프트웨어학부' (School of Software) and add it to the results
                page_titles = ['소프트웨어학부 ' + page_title_elements[0].text.strip()]
                results.append(page_titles)
            except Exception as e:
                print(f"Error processing URL 'idx{i} {url}': {e}")
                if i == 5:
                    results.append(['멘토링 시스템'])  # wrapped in a list to match the other entries
                    print('successfully handle exception! : 멘토링 시스템')
                elif i == 20:
                    results.append(['일반대학원 입학'])  # wrapped in a list to match the other entries
                    print('successfully handle exception! : 일반대학원 입학')

        return results

    def extract_notice_metadata(self, urls):
        result = []
        for url in urls:
            try:
                # fetch the page
                response = requests.get(url)
                response.raise_for_status()  # raise on HTTP errors

                # parse the HTML
                soup = bs4.BeautifulSoup(response.text, 'html.parser')

                # find the elements with class 'view-title'
                view_title_elements = soup.find_all(class_='view-title')
                view_titles = [element.text.strip() for element in view_title_elements]

                # find the element with class 'aricle-subject'
                article_subject_element = soup.find(class_='aricle-subject')

                # collect the text of the <td> tags that follow it
                td_values = []
                if article_subject_element:
                    td_elements = article_subject_element.find_all_next('td')
                    for td_element in td_elements:
                        td_values.append(td_element.text.strip())

                result.append(view_titles + td_values)

            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
                result.append(['None', 'None'])  # append a placeholder pair (list.append takes a single argument)

        return result
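
As with the CISS crawler, a minimal usage sketch (module path, working directory, and file paths are assumptions): run the crawler, then load one of the dumped pickles and inspect its metadata.

import pickle

from crawler.sw_crawler import SoftwareCrawler  # module path is an assumption

sc = SoftwareCrawler('./crawler/sw_url.txt')
sc.crawling(path='./SW/')  # writes ./SW/SCHOOL_INFO/non_notice.pkl and ./SW/NOTICE/sw_<category>.pkl

with open('./SW/NOTICE/sw_notice.pkl', 'rb') as f:
    docs = pickle.load(f)
print(docs[0].metadata)  # 'title' and 'datetime' set above, plus 'source' from WebBaseLoader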
