Merge pull request #129 from kookmin-sw/develop-ai
Develop ai
Showing 24 changed files with 691 additions and 70 deletions.
@@ -5,3 +5,4 @@
 *.pyc
 __pycache__
 data/
+cap30.pem
7 binary files changed (content not shown).
@@ -0,0 +1,137 @@
import bs4
from langchain.document_loaders import WebBaseLoader
import requests
import pickle
import os
from tqdm import tqdm


class CissClawer:
    def __init__(self, urls_txt='./crawler/ciss_url.txt'):
        self.urls = self.load_urls_from_file(urls_txt)
        self.static_urls = self.urls[:29]  # everything else (static pages)
        self.notice_urls = self.urls[29:]  # notice boards
        self.notice_categories = ['academic', 'visa', 'scholarship', 'event', 'program', 'gks']

    def load_urls_from_file(self, file_path='./ciss_url.txt'):
        urls = []
        with open(file_path, 'r') as file:
            for line in file:
                url = line.strip()  # drop the trailing newline
                if url:  # keep only non-empty lines
                    urls.append(url)
        return urls

    def crawling_content_url(self, urls_lst):
        loader = WebBaseLoader(
            web_paths=(urls_lst),
            bs_kwargs=dict(
                parse_only=bs4.SoupStrainer(
                    class_=("content-wrap")
                )
            ),
        )

        docs = loader.load()
        metadata_lst = self.extract_page_metadata(urls_lst)

        for i, doc in enumerate(tqdm(docs)):
            doc.page_content = doc.page_content.replace(u"\xa0", u" ")
            try:
                doc.metadata['title'] = metadata_lst[i][0]
                if len(metadata_lst[i]) >= 2:
                    doc.metadata['datetime'] = metadata_lst[i][1]
            except Exception:
                print(f'ERROR ! : {urls_lst[i], metadata_lst[i]}')

        return docs

    def crawling(self, path='./CISS/'):
        print(f' crawling CISS notice ... ')
        notice_path = path + 'NOTICE/'
        for i, url in enumerate(self.notice_urls):
            print(f' crawling {self.notice_categories[i]} ... ')
            child_url_lst = self.get_notice_child_urls(url)

            docs = self.crawling_content_url(child_url_lst)
            if not os.path.exists(notice_path):
                os.makedirs(notice_path)
            with open(notice_path + self.notice_categories[i] + '.pkl', 'wb') as f:
                pickle.dump(docs, f)

        non_notice_path = path + 'SCHOOL_INFO/'
        print(f' crawling CISS non_notice ... ')
        docs = self.crawling_content_url(self.static_urls)

        if not os.path.exists(non_notice_path):
            os.makedirs(non_notice_path)
        with open(non_notice_path + 'non_notice' + '.pkl', 'wb') as f:
            pickle.dump(docs, f)

    def get_notice_child_urls(self, notice_categori_url):
        href_values = []
        base_url = notice_categori_url.split('?')[0]

        try:
            # Fetch the page
            response = requests.get(notice_categori_url)
            response.raise_for_status()

            # Parse the HTML
            soup = bs4.BeautifulSoup(response.text, 'html.parser')

            # Extract the elements with the given class
            boxes = soup.find_all(class_='b-title-box')

            # From each box, collect the href of every child <a> tag
            for box in boxes:
                links = box.find_all('a', href=True)
                for link in links:
                    href_values.append(base_url + link['href'])

        except Exception as e:
            print(f"Error processing URL '{notice_categori_url}': {e}")

        return href_values

    def extract_page_metadata(self, urls):
        results = []  # list of the titles extracted for each URL
        for url in urls:
            try:
                # Fetch the page
                response = requests.get(url)
                response.raise_for_status()  # raise an exception on HTTP errors
                # Parse the HTML
                soup = bs4.BeautifulSoup(response.text, 'html.parser')
                # Extract elements whose class is 'page-title' or 'b-date-box'
                page_title_elements = soup.find_all(class_=['page-title', 'b-date-box'])
                # Append to the result list
                page_titles = [element.text.strip() for element in page_title_elements]
                results.append(page_titles)

            except Exception as e:
                print(f"Error processing URL '{url}': {e}")

        return results


if __name__ == '__main__':
    def change_working_directory_to_script_location():
        # Get the directory that contains this script
        script_directory = os.path.dirname(__file__)

        # Switch the working directory to the script's directory
        os.chdir(script_directory)

        # Return the new working directory
        return os.getcwd()

    # Change the working directory by calling the helper
    new_working_directory = change_working_directory_to_script_location()

    # Print the current working directory
    current_working_directory = os.getcwd()
    print("Current working directory:", current_working_directory)
    urls_txt = 'url_path'  # placeholder; point this at the URL list file (the constructor defaults to ./crawler/ciss_url.txt)
    cc = CissClawer(urls_txt)
    cc.crawling()
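The crawler above pickles a list of LangChain Document objects per notice category. As a minimal sketch of how those outputs could be inspected afterwards (the ./CISS/NOTICE/academic.pkl path assumes crawling() was run with its default path argument, and that the run succeeded):

import pickle

# Load one of the pickled category files written by CissClawer.crawling().
with open('./CISS/NOTICE/academic.pkl', 'rb') as f:
    docs = pickle.load(f)

# Each entry is a langchain Document carrying page_content plus the
# title/datetime metadata attached during crawling.
for doc in docs[:3]:
    print(doc.metadata.get('title'), doc.metadata.get('datetime'))
    print(doc.page_content[:200])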
@@ -0,0 +1,35 @@
https://cms.kookmin.ac.kr/kmuciss/ciss/intro.do
https://cms.kookmin.ac.kr/kmuciss/ciss/organize.do
https://cms.kookmin.ac.kr/kmuciss/ciss/location.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad01.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad02.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad04.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad05.do
https://cms.kookmin.ac.kr/kmuciss/academic/undergrad06.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad01.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad02.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad04.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad05.do
https://cms.kookmin.ac.kr/kmuciss/academic/grad06.do
https://cms.kookmin.ac.kr/kmuciss/scholarship/undergrad01.do
https://cms.kookmin.ac.kr/kmuciss/scholarship/undergrad02.do
https://cms.kookmin.ac.kr/kmuciss/scholarship/undergrad03.do
https://cms.kookmin.ac.kr/kmuciss/scholarship/undergrad03.do
https://cms.kookmin.ac.kr/kmuciss/visa/visa04.do
https://cms.kookmin.ac.kr/kmuciss/visa/visa01.do
https://cms.kookmin.ac.kr/kmuciss/visa/visa02.do
https://cms.kookmin.ac.kr/kmuciss/visa/visa03.do
https://cms.kookmin.ac.kr/kmuciss/program/program01.do
https://cms.kookmin.ac.kr/kmuciss/program/program02.do
https://cms.kookmin.ac.kr/kmuciss/program/program03.do
https://cms.kookmin.ac.kr/kmuciss/program/program04.do
https://cms.kookmin.ac.kr/kmuciss/program/program05.do
https://cms.kookmin.ac.kr/kmuciss/program/program06.do
https://cms.kookmin.ac.kr/kmuciss/program/program07.do
https://cms.kookmin.ac.kr/kmuciss/program/program08.do
https://cms.kookmin.ac.kr/kmuciss/notice/academic.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/visa.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/scholarship.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/event.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/program.do?&articleLimit=1000
https://cms.kookmin.ac.kr/kmuciss/notice/gks.do?&articleLimit=1000
@@ -1,17 +1,30 @@
 from langchain_community.document_loaders import PyPDFLoader
 import pickle
+import os
+from tqdm import tqdm

 class PdfReader:
     def __init__(self) -> None:
         pass

-    def read_pdf(filename, path):
+    def read_pdf(self, filepath, path='./data/', name='default'):
+
+        filename, _ = os.path.splitext(os.path.basename(filepath))
+        path += filename + '/' + 'SCHOOL_INFO/'
+
-        loader = PyPDFLoader(filename)
+        if not os.path.exists(path):
+            os.makedirs(path)
+        print(f'-- Load pdf file {filename} --')
+        loader = PyPDFLoader(filepath)
         pages = loader.load()
-        for page_no in range(len(pages)):
+        print('-- start --')
+        total_pdf = []
+        for page_no in tqdm(range(10)):
             doc = pages[page_no]
             doc.page_content = doc.page_content.replace(u"\xa0", u" ")
-            with open(path+str(page_no)+'.pkl', 'wb') as f:
-                pickle.dump(doc, f)
+            doc.page_content = doc.page_content.replace("·", "")
+            if doc.page_content:
+                total_pdf.append(doc)
+        with open(path+name+'.pkl', 'wb') as f:
+            pickle.dump(total_pdf, f)
+        print(total_pdf)
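The revised read_pdf bundles the cleaned pages into a single pickle per PDF instead of one file per page. A minimal usage sketch, under assumptions not confirmed by the diff: the module is saved as pdf_reader.py, the input file handbook.pdf exists, and it has at least 10 pages (the loop is hard-coded to range(10)):

from pdf_reader import PdfReader  # module name is an assumption; adjust to the actual file

reader = PdfReader()
# With these arguments the method writes ./data/handbook/SCHOOL_INFO/handbook.pkl,
# containing the first 10 pages after whitespace cleanup.
reader.read_pdf('handbook.pdf', path='./data/', name='handbook')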
@@ -0,0 +1,159 @@
import bs4
from langchain.document_loaders import WebBaseLoader
import requests
import pickle
import os
from tqdm import tqdm


class SoftwareCrawler:
    def __init__(self, urls_txt='./crawler/sw_url.txt'):
        self.urls = self.load_urls_from_file(urls_txt)
        self.static_urls = self.urls[:-4]
        self.notice_urls = self.urls[-4:]

        self.notice_categories = {'notice': self.notice_urls[0],
                                  'jobs': self.notice_urls[1],
                                  'scholarship': self.notice_urls[2],
                                  'event': self.notice_urls[3]}

        self.crawling_range = {'notice': (1971, 2413),
                               'jobs': (1530, 1642),
                               'scholarship': (85, 86),
                               'event': (1113, 1207)}

    def load_urls_from_file(self, file_path='./ciss_url.txt'):
        urls = []
        with open(file_path, 'r') as file:
            for line in file:
                url = line.strip()  # drop the trailing newline
                if url:  # keep only non-empty lines
                    urls.append(url)
        return urls

    def crawling_content_url(self, urls_lst, page_type=None):
        loader = WebBaseLoader(
            web_paths=(urls_lst),
            bs_kwargs=dict(
                parse_only=bs4.SoupStrainer(
                    id='content'
                )
            ),
        )

        docs = loader.load()
        if page_type == 'notice':
            metadata_lst = self.extract_notice_metadata(urls_lst)
        else:
            metadata_lst = self.extract_static_metadata(urls_lst)

        for i, doc in enumerate(tqdm(docs)):
            doc.page_content = doc.page_content.replace(u"\xa0", u" ")
            doc.metadata['title'] = metadata_lst[i][0]
            if page_type == 'notice':
                doc.metadata['datetime'] = metadata_lst[i][1]

        return docs

    def notice_url_lst(self, categori):
        start = self.crawling_range[categori][0]
        end = self.crawling_range[categori][1] + 1
        url_lst = []
        base_url = self.notice_categories[categori]
        for i in range(start, end):
            tmp_url = base_url + '/' + str(i)
            response = requests.get(tmp_url)
            try:
                response.raise_for_status()  # raise an exception on HTTP errors
            except Exception:
                continue

            # Parse the HTML
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            td_elements = soup.find_all('td')
            try:
                test = td_elements[1]  # raises IndexError when the page has no post table
                url_lst.append(tmp_url)
            except Exception:
                print(f'not valid url : {tmp_url}')

        return url_lst

    def crawling(self, path='./SW/'):

        non_notice_path = path + 'SCHOOL_INFO/'

        if not os.path.exists(non_notice_path):
            os.makedirs(non_notice_path)

        docs = self.crawling_content_url(self.static_urls)
        with open(non_notice_path + 'non_notice' + '.pkl', 'wb') as f:
            pickle.dump(docs, f)

        notice_path = path + 'NOTICE/'
        notice_categories = self.notice_categories.keys()

        if not os.path.exists(notice_path):
            os.makedirs(notice_path)

        for c in notice_categories:
            docs = self.crawling_content_url(self.notice_url_lst(c), page_type='notice')
            with open(notice_path + 'sw_' + c + '.pkl', 'wb') as f:
                pickle.dump(docs, f)

    def extract_static_metadata(self, urls):
        results = []  # list that collects the extracted titles
        for i, url in enumerate(urls):
            try:
                # Fetch the page
                response = requests.get(url)
                response.raise_for_status()  # raise an exception on HTTP errors
                # Parse the HTML
                soup = bs4.BeautifulSoup(response.text, 'html.parser')
                # Extract the elements whose class is 'page-title'
                page_title_elements = soup.find_all(class_='page-title')
                # Append to the result list
                page_titles = ['소프트웨어학부 ' + page_title_elements[0].text.strip()]
                results.append(page_titles)
            except Exception as e:
                print(f"Error processing URL 'idx{i} {url}': {e}")
                if i == 5:
                    results.append(['멘토링 시스템'])  # wrapped in a list so [0] yields the full title
                    print('successfully handle exception! : 멘토링 시스템')
                elif i == 20:
                    results.append(['일반대학원 입학'])  # wrapped in a list so [0] yields the full title
                    print('successfully handle exception! : 일반대학원 입학')

        return results

    def extract_notice_metadata(self, urls):
        result = []
        for url in urls:
            try:
                # Fetch the page
                response = requests.get(url)
                response.raise_for_status()  # raise an exception on HTTP errors

                # Parse the HTML
                soup = bs4.BeautifulSoup(response.text, 'html.parser')

                # Find the elements with the 'view-title' class
                view_title_elements = soup.find_all(class_='view-title')
                view_titles = [element.text.strip() for element in view_title_elements]

                # Find the element with the 'aricle-subject' class
                article_subject_element = soup.find(class_='aricle-subject')

                # Collect the text of the <td> tags that follow it
                td_values = []
                if article_subject_element:
                    td_elements = article_subject_element.find_all_next('td')
                    for td_element in td_elements:
                        td_values.append(td_element.text.strip())

                result.append(view_titles + td_values)

            except Exception as e:
                print(f"Error processing URL '{url}': {e}")
                result.append(['None', 'None'])  # append a single list; two arguments would raise TypeError

        return result
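As with the CISS crawler, a short driver sketch; the module name sw_crawler.py is an assumption not confirmed by the diff, and the URL list is expected at the constructor's default location:

from sw_crawler import SoftwareCrawler  # hypothetical module name

crawler = SoftwareCrawler(urls_txt='./crawler/sw_url.txt')
# Crawls the static pages first, then the four notice boards, pickling the
# resulting Document lists under ./SW/SCHOOL_INFO/ and ./SW/NOTICE/.
crawler.crawling(path='./SW/')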