# prediction.py

# -*- coding: utf-8 -*-
"""Prediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1BfPPTnXBWm3AWg3YtDvUWsRQtWvggemf
"""

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Regression experiment: predict Win_Margin with a random forest and report
# the RMSE on a held-out 20% split.

# Season CSV files to load (paths as uploaded to the Colab runtime).
file_paths = [
    "/content/nba_1819.csv",
    "/content/nba_1920.csv",
    "/content/nba_2021.csv",
    "/content/nba_2122.csv",
    "/content/nba_2223.csv",
    "/content/nba_2324.csv"
]

# Read every season file into its own DataFrame.
data_frames = [pd.read_csv(file_path) for file_path in file_paths]

# Stack the seasons into a single table with a fresh 0..n-1 index.
combined_data = pd.concat(data_frames, ignore_index=True)

# Drop the columns that are not used as model inputs.
columns_to_exclude = ['Date', 'remarks', 'overtime']
combined_data = combined_data.drop(columns=columns_to_exclude)

# Convert weekday names to integers (Monday=1 ... Sunday=7).
# Any value that is not a key of the mapping becomes NaN.
weekday_mapping = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7
}
combined_data['weekday'] = combined_data['weekday'].map(weekday_mapping)

# Label-encode the team columns. The encoder is fitted ONCE on the union of
# both columns so that a given team receives the same integer code whether it
# appears as Home or Away, and so that `label_encoder` can later
# inverse-transform values from either column.
label_encoder = LabelEncoder()
label_encoder.fit(
    pd.concat([combined_data['Home'], combined_data['Away']], ignore_index=True)
)
combined_data['Home'] = label_encoder.transform(combined_data['Home'])
combined_data['Away'] = label_encoder.transform(combined_data['Away'])

# Use Win_Margin as the target and every remaining column as a feature, then
# split into train / test sets with a fixed seed.
# NOTE(review): if the CSVs also contain final-score columns, they leak the
# target into X -- verify the schema and drop them if so.
X = combined_data.drop(columns=["Win_Margin"])
y = combined_data["Win_Margin"]
train_data, test_data, train_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model. The forest is seeded so repeated runs give the same score.
rf = RandomForestRegressor(random_state=42)
rf.fit(train_data, train_labels)
predictions = rf.predict(test_data)

# Evaluate with RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = mean_squared_error(test_labels, predictions) ** 0.5
print("Root Mean Squared Error (RMSE) on test data:", rmse)

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Classification experiment: learn from past games whether the home team
# wins, then predict the winner for a matchup typed in by the user.

# Season CSV files to load (paths as uploaded to the Colab runtime).
file_paths = [
    "/content/nba_1819.csv",
    "/content/nba_1920.csv",
    "/content/nba_2021.csv",
    "/content/nba_2122.csv",
    "/content/nba_2223.csv",
    "/content/nba_2324.csv"
]

# Read every season file into its own DataFrame.
data_frames = [pd.read_csv(file_path) for file_path in file_paths]

# Stack the seasons into a single table with a fresh 0..n-1 index.
combined_data = pd.concat(data_frames, ignore_index=True)

# Drop the 'overtime' and 'remarks' columns.
columns_to_drop = ['overtime', 'remarks']
combined_data = combined_data.drop(columns=columns_to_drop)

# Features: only the two team names. Those are the only values known when a
# prediction is requested below, so training on any other column would force
# it to be zero-filled at prediction time.
features = combined_data[['Home', 'Away']]

# Target: 1 when the home team won (Win_Margin > 0), otherwise 0. This matches
# the `> 0` decision rule used when reporting the prediction and keeps the
# problem binary instead of one class per distinct margin value.
target = (combined_data['Win_Margin'] > 0).astype(int)

# One-hot encode the team names; columns are named "Home_<team>" and
# "Away_<team>".
features_encoded = pd.get_dummies(features)

# Split into train / test sets with a fixed seed.
train_features, test_features, train_target, test_target = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

# Train the model.
log_reg = LogisticRegression(max_iter=3000)
log_reg.fit(train_features, train_target)

# Ask the user for the two team names; surrounding whitespace is ignored.
home_team = input("홈 팀 이름을 입력하세요: ").strip()
away_team = input("원정 팀 이름을 입력하세요: ").strip()

# Reject names that never appeared in the training data. Without this check
# an unknown name would silently turn into an all-zero feature row and still
# produce a (meaningless) prediction.
home_column = f"Home_{home_team}"
away_column = f"Away_{away_team}"
for team_name, column in ((home_team, home_column), (away_team, away_column)):
    if column not in features_encoded.columns:
        raise ValueError(f"알 수 없는 팀 이름입니다: {team_name}")

# Build a single feature row with exactly the training columns: all zeros
# except the two one-hot columns for the requested matchup.
team_features = pd.DataFrame(0, index=[0], columns=features_encoded.columns)
team_features.loc[0, [home_column, away_column]] = 1

# Predict and report the expected winner (1 = home win, 0 = away win).
predicted_result = log_reg.predict(team_features)[0]
if predicted_result > 0:
    print(f"{home_team}이(가) {away_team}을(를) 이길 것으로 예상됩니다.")
else:
    print(f"{away_team}이(가) {home_team}을(를) 이길 것으로 예상됩니다.")