[Week2] - huiju #42

Merged
merged 5 commits into from Dec 21, 2024

Changes from all commits
43 changes: 0 additions & 43 deletions README.md

This file was deleted.

68 changes: 68 additions & 0 deletions week2-huiju/task1.py
@@ -0,0 +1,68 @@

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

# 1. Prepare the data
iris = load_iris()
X = iris["data"][:, 3:]  # petal width only
y = (iris["target"] == 2).astype(int)  # binarize the target: Virginica -> 1, everything else -> 0

# Print dataset information
print("Feature names:", iris.feature_names)
print("Target names:", iris.target_names)
print("Data shape:", y.reshape(-1, 1).shape)

# 2. Split the data
# Split into a training set (80%) and a test set (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the training and test set sizes
print("Training set size:", y_train.reshape(-1, 1).shape)
print("Test set size:", y_test.reshape(-1, 1).shape)

# 3. Train the model / predict and evaluate
# Define and train a logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

res = log_reg.predict(X_test)  # run predictions on the test set
accuracy = accuracy_score(y_test, res)  # compute the model's accuracy

print("Model accuracy:", accuracy)  # print the result

# 4. Visualize the decision boundary
X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
y_proba = log_reg.predict_proba(X_new)
decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]

plt.plot(X_new, y_proba[:, 1], "g-", label="Iris virginica")
plt.axvline(x=decision_boundary, color="r", linestyle="--", label="Decision Boundary")
plt.xlabel("Petal width (cm)")
plt.ylabel("Probability")
plt.title("Logistic Regression - Decision Boundary")
plt.legend()  # add a legend
plt.grid()
plt.show()
print(f"Decision boundary: {decision_boundary}")

# 5. Alternative visualization: probability curve with the 0.5 threshold
x_min, x_max = 0, 3
xx = np.linspace(x_min, x_max, 1000).reshape(-1, 1)
probabilities = log_reg.predict_proba(xx)[:, 1]

plt.figure(figsize=(10, 6))
plt.plot(xx, probabilities, label='Iris virginica probability', color='blue')
plt.axhline(0.5, color='red', linestyle='--', label='Decision threshold')
plt.title('Iris virginica probability curve and decision threshold')
plt.xlabel('Petal width (cm)')
plt.ylabel('Probability')
plt.legend()
plt.grid()
plt.show()

# Analysis:
# The model performs very well with an accuracy of 1.0, and the decision-boundary plot confirms that petal width alone cleanly separates Iris virginica from the other two species.
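
# Reviewer sketch (not part of the submitted diff): the boundary found above can
# be cross-checked in closed form, since a one-feature logistic model predicts
# exactly 0.5 where coef * x + intercept == 0.
w = log_reg.coef_[0][0]
b = log_reg.intercept_[0]
print(f"Analytic boundary: {-b / w:.3f}")  # should agree with decision_boundary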
57 changes: 57 additions & 0 deletions week2-huiju/task2.py
@@ -0,0 +1,57 @@
from sklearn.datasets import load_iris
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt

# 1. Prepare the data
iris = load_iris()
X = iris.data[:, 2:4]  # petal length and petal width
y = iris.target

# Keep only the Iris setosa and Iris versicolor classes
setosa_or_versicolor = (y == 0) | (y == 1)
X = X[setosa_or_versicolor]
y = y[setosa_or_versicolor]

# Print dataset information
print("Feature names:", iris.feature_names[2:4])
print("Target names:", iris.target_names[:2])
print("Data shape:", X.shape)

# 2. Train the model
svm_clf = SVC(kernel="linear", C=1)
svm_clf.fit(X, y)

# Print the support vectors
print("Support vectors:")
print(svm_clf.support_vectors_)

# 3. Visualize the decision boundary
def plot_decision_boundary(svm_clf, X, y):
    # Evaluate the decision function over a grid covering the data
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))

    Z = svm_clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plot the decision boundary and the margins
    plt.contour(xx, yy, Z, colors='k', levels=[0], alpha=1, linewidths=2)    # decision boundary
    plt.contour(xx, yy, Z, colors='k', levels=[-1, 1], linestyles='dashed')  # margins

    # Plot the data points
    plt.scatter(X[y == 0, 0], X[y == 0, 1], color='green', label='Setosa', marker='o')
    plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue', label='Versicolor', marker='s')

    # Highlight the support vectors
    plt.scatter(svm_clf.support_vectors_[:, 0], svm_clf.support_vectors_[:, 1],
                facecolors='none', edgecolors='k', s=200, label='Support Vectors', marker='o')
    plt.xlabel("Petal Length (cm)")
    plt.ylabel("Petal Width (cm)")
    plt.legend()
    plt.grid()

# Call the plotting function
plot_decision_boundary(svm_clf, X, y)
plt.show()
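
# Reviewer sketch (not part of the submitted diff): for a linear SVM the dashed
# margin lines sit at distance 1/||w|| on either side of the boundary, so the
# full margin width can be read directly off the fitted weights.
w = svm_clf.coef_[0]
print(f"Margin width: {2 / np.linalg.norm(w):.3f}")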
64 changes: 64 additions & 0 deletions week2-huiju/task3.py
@@ -0,0 +1,64 @@
# Import the required libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

# 1. Prepare the data
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Decision tree hyperparameter tuning
param_grid = {
    'max_depth': [3, 5, 6, 7, 8],  # None removed to keep the trees shallow
    'max_leaf_nodes': [10, 15, 17, 20],
    'min_samples_split': [2, 5, 10]
}

dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Check the test accuracy of the model trained with the best hyperparameters
test_accuracy_dt = accuracy_score(y_test, best_model.predict(X_test))
print("Best Parameters:", best_params)
print("Single Tree Accuracy:", test_accuracy_dt)

# 3. Implement a random forest by hand
n_subsets = 100
subtree_accuracies = []

for i in range(n_subsets):
    # Draw a bootstrap sample of the training set
    X_sub, y_sub = resample(X_train, y_train, n_samples=len(X_train), random_state=i)
    rf_tree = DecisionTreeClassifier(max_depth=best_params['max_depth'],
                                     max_leaf_nodes=best_params['max_leaf_nodes'],
                                     min_samples_split=best_params['min_samples_split'])
    rf_tree.fit(X_sub, y_sub)
    subtree_accuracies.append(accuracy_score(y_test, rf_tree.predict(X_test)))

# 4. Majority-vote ensemble
ensemble_predictions = np.zeros((len(X_test), n_subsets))

for i in range(n_subsets):
    X_sub, y_sub = resample(X_train, y_train, n_samples=len(X_train), random_state=i)
    rf_tree = DecisionTreeClassifier(max_depth=best_params['max_depth'],
                                     max_leaf_nodes=best_params['max_leaf_nodes'],
                                     min_samples_split=best_params['min_samples_split'])
    rf_tree.fit(X_sub, y_sub)
    ensemble_predictions[:, i] = rf_tree.predict(X_test)

# Final prediction by majority vote
final_predictions = np.round(np.mean(ensemble_predictions, axis=1))
ensemble_accuracy = accuracy_score(y_test, final_predictions)
ensemble_accuracy = accuracy_score(y_test, final_predictions)

# Print the results
print("Average Single Tree Accuracy:", np.mean(subtree_accuracies))
print("Ensemble Accuracy:", ensemble_accuracy)