[Week2] - huiju #42

Merged
merged 5 commits into from Dec 21, 2024

Changes from all commits
43 changes: 0 additions & 43 deletions README.md

This file was deleted.

68 changes: 68 additions & 0 deletions week2-huiju/task1.py
@@ -0,0 +1,68 @@

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

# 1. Prepare the data
iris = load_iris()
X = iris["data"][:, 3:]  # petal width only
y = (iris["target"] == 2).astype(int)  # binarize the target: Virginica -> 1, everything else -> 0

# Print dataset information
print("Feature names:", iris.feature_names)
print("Target names:", iris.target_names)
print("Data shape:", y.reshape(-1, 1).shape)

# 2. Split the data
# Split into a training set (80%) and a test set (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the training and test set sizes
print("Training set size:", y_train.reshape(-1, 1).shape)
print("Test set size:", y_test.reshape(-1, 1).shape)

# 3. Train the model / predict and evaluate
# Define and train a logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

res = log_reg.predict(X_test)  # run predictions on the test set
accuracy = accuracy_score(y_test, res)  # compute the model's accuracy

print("Model accuracy:", accuracy)  # print the result

# 4. Visualize the decision boundary
X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
y_proba = log_reg.predict_proba(X_new)
decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]

plt.plot(X_new, y_proba[:, 1], "g-", label="Iris virginica")
plt.axvline(x=decision_boundary, color="r", linestyle="--", label="Decision Boundary")
plt.xlabel("Petal width (cm)")
plt.ylabel("Probability")
plt.title("Logistic Regression - Decision Boundary")
plt.legend()  # add a legend
plt.grid()
plt.show()
print(f"Decision boundary: {decision_boundary}")

# 5. Alternative visualization: probability curve with the 0.5 threshold
x_min, x_max = 0, 3
xx = np.linspace(x_min, x_max, 1000).reshape(-1, 1)
probabilities = log_reg.predict_proba(xx)[:, 1]

plt.figure(figsize=(10, 6))
plt.plot(xx, probabilities, label='Iris virginica probability', color='blue')
plt.axhline(0.5, color='red', linestyle='--', label='Decision threshold')
plt.title('Iris virginica probability curve and decision threshold')
plt.xlabel('Petal width (cm)')
plt.ylabel('Probability')
plt.legend()
plt.grid()
plt.show()

# Analysis:
# The model performs very well with an accuracy of 1.0, and the decision-boundary plot confirms that petal width alone cleanly separates Iris virginica from the other two species.
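
# Reviewer sketch (not part of the submitted diff): the boundary found above can
# be cross-checked in closed form, since a one-feature logistic model predicts
# exactly 0.5 where coef * x + intercept == 0.
w = log_reg.coef_[0][0]
b = log_reg.intercept_[0]
print(f"Analytic boundary: {-b / w:.3f}")  # should agree with decision_boundary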
57 changes: 57 additions & 0 deletions week2-huiju/task2.py
@@ -0,0 +1,57 @@
from sklearn.datasets import load_iris
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt

# 1. Prepare the data
iris = load_iris()
X = iris.data[:, 2:4]  # petal length and petal width
y = iris.target

# Keep only the Iris setosa and Iris versicolor classes
setosa_or_versicolor = (y == 0) | (y == 1)
X = X[setosa_or_versicolor]
y = y[setosa_or_versicolor]

# Print dataset information
print("Feature names:", iris.feature_names[2:4])
print("Target names:", iris.target_names[:2])
print("Data shape:", X.shape)

# 2. Train the model
svm_clf = SVC(kernel="linear", C=1)
svm_clf.fit(X, y)

# Print the support vectors
print("Support vectors:")
print(svm_clf.support_vectors_)

# 3. Visualize the decision boundary
def plot_decision_boundary(svm_clf, X, y):
    # Evaluate the decision function over a grid covering the data
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))

    Z = svm_clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plot the decision boundary and the margins
    plt.contour(xx, yy, Z, colors='k', levels=[0], alpha=1, linewidths=2)    # decision boundary
    plt.contour(xx, yy, Z, colors='k', levels=[-1, 1], linestyles='dashed')  # margins

    # Plot the data points
    plt.scatter(X[y == 0, 0], X[y == 0, 1], color='green', label='Setosa', marker='o')
    plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue', label='Versicolor', marker='s')

    # Highlight the support vectors
    plt.scatter(svm_clf.support_vectors_[:, 0], svm_clf.support_vectors_[:, 1],
                facecolors='none', edgecolors='k', s=200, label='Support Vectors', marker='o')
    plt.xlabel("Petal Length (cm)")
    plt.ylabel("Petal Width (cm)")
    plt.legend()
    plt.grid()

# Call the plotting function
plot_decision_boundary(svm_clf, X, y)
plt.show()
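
# Reviewer sketch (not part of the submitted diff): for a linear SVM the dashed
# margin lines sit at distance 1/||w|| on either side of the boundary, so the
# full margin width can be read directly off the fitted weights.
w = svm_clf.coef_[0]
print(f"Margin width: {2 / np.linalg.norm(w):.3f}")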
64 changes: 64 additions & 0 deletions week2-huiju/task3.py
@@ -0,0 +1,64 @@
# Import the required libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

# 1. Prepare the data
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Decision tree hyperparameter tuning
param_grid = {
    'max_depth': [3, 5, 6, 7, 8],  # None removed to keep the trees shallow
    'max_leaf_nodes': [10, 15, 17, 20],
    'min_samples_split': [2, 5, 10]
}

dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Check the test accuracy of the model trained with the best hyperparameters
test_accuracy_dt = accuracy_score(y_test, best_model.predict(X_test))
print("Best Parameters:", best_params)
print("Single Tree Accuracy:", test_accuracy_dt)

# 3. Implement a random forest by hand
n_subsets = 100
subtree_accuracies = []

for i in range(n_subsets):
    # Draw a bootstrap sample of the training set
    X_sub, y_sub = resample(X_train, y_train, n_samples=len(X_train), random_state=i)
    rf_tree = DecisionTreeClassifier(max_depth=best_params['max_depth'],
                                     max_leaf_nodes=best_params['max_leaf_nodes'],
                                     min_samples_split=best_params['min_samples_split'])
    rf_tree.fit(X_sub, y_sub)
    subtree_accuracies.append(accuracy_score(y_test, rf_tree.predict(X_test)))

# 4. Majority-vote ensemble
ensemble_predictions = np.zeros((len(X_test), n_subsets))

for i in range(n_subsets):
    X_sub, y_sub = resample(X_train, y_train, n_samples=len(X_train), random_state=i)
    rf_tree = DecisionTreeClassifier(max_depth=best_params['max_depth'],
                                     max_leaf_nodes=best_params['max_leaf_nodes'],
                                     min_samples_split=best_params['min_samples_split'])
    rf_tree.fit(X_sub, y_sub)
    ensemble_predictions[:, i] = rf_tree.predict(X_test)

# Final prediction by majority vote
final_predictions = np.round(np.mean(ensemble_predictions, axis=1))
ensemble_accuracy = accuracy_score(y_test, final_predictions)
ensemble_accuracy = accuracy_score(y_test, final_predictions)

# Print the results
print("Average Single Tree Accuracy:", np.mean(subtree_accuracies))
print("Ensemble Accuracy:", ensemble_accuracy)