Merge pull request #3 from boostcampaitech6/gunwoo

add AI_Poet
boostcampaitech6 · Mar 9, 2024 · b405e7b · b405e7b
2 parents 569c4c2 + abcee34
commit b405e7b
Show file tree

Hide file tree

Showing 9 changed files with 1,939,753 additions and 3 deletions.
diff --git a/ML/AI_Poet/EDA.ipynb b/ML/AI_Poet/EDA.ipynb
@@ -0,0 +1,103 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 삭제해야 할 패턴들\n",
+    "1. \"\\xa0\"\n",
+    "2. \"\\n\\n \"\n",
+    "3. \"\\n\" * m (m > 2)\n",
+    "4. \"\\s\" * m (m > 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import PreTrainedTokenizerFast\n",
+    "import pandas as pd\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def data_preprocessing(df):\n",
+    "\n",
+    "    # NaN 제거\n",
+    "    dataset = df.dropna(subset=[\"poem\"]) # 현재 poem쪽에 NaN이 존재해서 처리해줌. TODO : 크롤링할 때, poem==NaN이면 skip하도록 구현\n",
+    "\n",
+    "    # 중복 제거\n",
+    "    dataset = dataset.drop_duplicates([\"poem\"], keep=\"first\")\n",
+    "\n",
+    "    # 1000자 이상의 데이터 제거\n",
+    "    dataset = dataset[dataset[\"poem\"].str.len() <= 1000]\n",
+    "\n",
+    "    # index 재설정\n",
+    "    dataset = dataset.reset_index(drop=True) \n",
+    "\n",
+    "    # 아래의 패턴들을 전처리\n",
+    "    # 1. \"\\xa0\"\n",
+    "    # 2. \"\\s\" * m (m > 1)\n",
+    "    # 3. \"\\n\\n \"\n",
+    "    # 4. \"\\n\" * m (m > 2)\n",
+    "    for i in range(len(dataset)):\n",
+    "        dataset.iloc[i, 2] = dataset.iloc[i, 2].replace(u\"\\xa0\", u\"\")\n",
+    "        dataset.iloc[i, 2] = re.sub(r\"( {2,})\", \"\", dataset[\"poem\"][i])\n",
+    "        dataset.iloc[i, 2] = re.sub(r\"\\n\\n \", \"\", dataset[\"poem\"][i])\n",
+    "        dataset.iloc[i, 2] = re.sub(r\"(\\n{3,})\", \"\", dataset[\"poem\"][i])\n",
+    "\n",
+    "    return dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv(\"dataset/crawling_poem.csv\")\n",
+    "train_dataset = data_preprocessing(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset[\"first_line\"] = \"\"\n",
+    "for i in range(len(train_dataset)):\n",
+    "    text_until_newline = re.split('\\n', train_dataset[\"poem\"][i], 1)[0]\n",
+    "    train_dataset.iloc[i, 4] = text_until_newline"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/ML/AI_Poet/__init__.py b/ML/AI_Poet/__init__.py
diff --git a/ML/AI_Poet/crawling.ipynb b/ML/AI_Poet/crawling.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 207,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bs4 import BeautifulSoup\n",
+    "import requests\n",
+    "import re\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 222,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<Response [200]>\n",
+      "<div class=\"panel panel-default view-head\">\n",
+      "<div class=\"panel-heading\">\n",
+      "<font color=\"#7A8FDB\">저자</font> : <strong>류인순</strong>\n",
+      "\t\t\t\t\n",
+      "\t\t\t\t\t\t\t\t    <font color=\"#7A8FDB\">시집명</font> : <strong></strong>\n",
+      "<font color=\"#7A8FDB\">출판(발표)연도</font> : <strong>2024.03.05.</strong>\n",
+      "\t\t\t\t\n",
+      "\t\t\t\t\t\t\t\t    <font color=\"#7A8FDB\">출판사</font> : <strong></strong>\n",
+      "</div>\n",
+      "</div>\n",
+      "류인순\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = requests.get(\"https://poemlove.co.kr/bbs/board.php?bo_table=tb01&wr_id=271686\")\n",
+    "print(response)\n",
+    "# 웹페이지의 내용을 성공적으로 가져온 경우\n",
+    "if response.status_code == 200:\n",
+    "    soup = BeautifulSoup(response.content, \"lxml\")\n",
+    "    fail_message = soup.find(\"meta\", {\"name\": 'title'})\n",
+    "    if \"오류안내 페이지\" in fail_message[\"content\"]:\n",
+    "        print(\"fail\")\n",
+    "    headline = soup.find(\"h1\", {\"itemprop\": \"headline\"}).get_text().strip() # 시 제목\n",
+    "    author_box = soup.find_all(\"div\", {\"class\": \"panel panel-default view-head\"})\n",
+    "    print(author_box[-1])\n",
+    "    all_text = author_box[-1].find_all(\"strong\")[0].get_text()\n",
+    "    print(all_text)               \n",
+    "    # author = author_box.find(\"span\", {\"class\": \"member\"}).get_text()        # 시 저자\n",
+    "    # poem = soup.find(\"div\", {'itemprop': 'description', 'class': 'view-content'}).get_text(separator=\"\\n\")\n",
+    "    # print(headline)\n",
+    "    # print(author)\n",
+    "    # print(poem)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# '\\n\\t\\t\\t\\t햇살의 기쁨\\r\\n \\n\n",
+    "# '\\n\\t\\t\\t\\t매화\\r\\n \\n\\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 글 : 박동수\\r\\n \\n\n",
+    "# '\\n\\t\\t\\t\\t내가 먼저/ 홍수희\\r\\n \\n \\n\n",
+    "# '\\n\\t\\t\\t\\t풍경 ( 風 磬 )\\r\\n\\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 - 다서 신형식\\r\\n \\n\n",
+    "# '\\n\\t\\t\\t\\t홍매화 향기처럼 / 정심 김덕성\\r\\n \\n \\n\n",
+    "# '\\n\\t\\t\\t\\t때로는 꽃들이 마음에 쉼표가 -> 처음으로 \\r을 만날 때\n",
+    "# '\\n\\t\\t\\t\\t아지랑이\\r\\n소산/문 재학\\r\\n \\n\n",
+    "# '\\n\\t\\t\\t\\t그래서 순수만이 만져볼 수 있는\n",
+    "# '\\n\\t\\t\\t\\t너에게 쓰는 편지\\r\\n \\n\n",
+    "# '\\n\\t\\t\\t\\t문신 / 이재봉\\r\\n \\n\n",
+    "# \"\\n\\t\\t\\t\\t삼일절 \\r\\n \\n靑山 손병흥 \\r\\n \\n\n",
+    "# '\\n\\t\\t\\t\\t5월 들길 / 성백군\\r\\n \\n \\n늙은 봄과 젊은 여름이\\r\\n공생하는 5월 들길을 걷는다\\r\\n \\n\n",
+    "\n",
+    "# 1. \\r\\n \\n 가 1번 나오는 경우 / 2번 나오는 경우\n",
+    "# 2. \\r\\n \\n \\n\n",
+    "# 3. \\r\\n \\n \\n이 나오고 내용이 나오지만 그 뒤에 \\r\\n \\n이 나오는 경우 -> 2,3의 경우에는 이 패턴이 나오면 끝내야 함\n",
+    "# 4. 아예 없는 경우\n",
+    "\n",
+    "# 제목과 작가를 strip(제목 or 작가)로 지우고, 나머지 특수기호들을 strip으로 싹 지워준다면 가장 깔끔할 것 같다\n",
+    "# strip()을 사용하니 양끝에서 일반문자를 만날 때까지 특수기호를 제거한다. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 173,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\n\\t\\t\\t\\t풍경 ( 風 磬 )\\r\\n\\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 - 다서 신형식\\r\\n \\n꽃향기 냄새가\\r\\n풍경 ( 風磬 )앞에서 맑다\\r\\n은은하다 못해 찬란하다\\r\\n난 아직도 나를 몰라\\r\\n바람에 흔들리고\\r\\n나를 깨우는 그대 때문에\\r\\n혼자 일어나\\r\\n조용한 새벽을 준비한다\\t\\t\\t'"
+      ]
+     },
+     "execution_count": 173,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "poem"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 178,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 시의 내용만 가져오기 위한 전처리\n",
+    "def preprocessing(headline: str, author: str, poem: str) -> str:\n",
+    "    data = poem.replace(\"\\r\", \"\")                           # 특수문자 제거\n",
+    "    data = data.replace(\"\\n \\n\", \"\\n\\n\")                    # 연 구분\n",
+    "    if data.find(author) > 0:\n",
+    "        data = data[data.find(author) + len(author):]       # 작가 제거\n",
+    "    data = data.replace(headline, \"\", 1).replace(\"/\", \"\")   # 제목 및 / 제거\n",
+    "    data = re.sub(r'[^가-힣0-9\\n\\s,.]', '', data)           # 외국어 및 특수문자 제거\n",
+    "    data = data.strip()                                     # 양옆 \\기호 제거\n",
+    "\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 179,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'다서 신형식\\n\\n꽃향기 냄새가\\n풍경   앞에서 맑다\\n은은하다 못해 찬란하다\\n난 아직도 나를 몰라\\n바람에 흔들리고\\n나를 깨우는 그대 때문에\\n혼자 일어나\\n조용한 새벽을 준비한다'"
+      ]
+     },
+     "execution_count": 179,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preprocessed_poem = preprocessing(headline, author, poem)\n",
+    "preprocessed_poem"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# url request -> html속에서 시 추출 -> 시 전처리 -> csv 파일에 저장"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 182,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_file = {\"title\":[],\n",
+    "            \"author\": [],\n",
+    "            \"poem\": []}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 184,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = [\"\\n\\t\\t\\t\\t햇살의 기쁨\\r\\n \\n\",\n",
+    "\"\\n\\t\\t\\t\\t매화\\r\\n \\n\\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 글 : 박동수\\r\\n \\n\",\n",
+    "\"\\n\\t\\t\\t\\t내가 먼저/ 홍수희\\r\\n \\n \\n\",\n",
+    "\"\\n\\t\\t\\t\\t풍경 ( 風 磬 )\\r\\n\\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 - 다서 신형식\\r\\n \\n\",\n",
+    "\"\\n\\t\\t\\t\\t홍매화 향기처럼 / 정심 김덕성\\r\\n \\n \\n\"]\n",
+    "\n",
+    "for i in text:\n",
+    "    data_file[\"title\"].append(\"A\")\n",
+    "    data_file[\"author\"].append(\"B\")\n",
+    "    data_file[\"poem\"].append(i)\n",
+    "\n",
+    "df = pd.DataFrame(data_file)\n",
+    "df.to_csv(\"df.csv\", index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}