-
Notifications
You must be signed in to change notification settings - Fork 5
/
sparktodataframe.py
59 lines (45 loc) · 1.91 KB
/
sparktodataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
"""sparkToDataframe.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1fw5VrLGcJCbuGs2qvL1zl81BKApNJoyr
"""
import pandas as pd
# Paths of the CSV files to load.
file_paths = [
    "/content/nba_1819.csv",
    "/content/nba_1920.csv",
    "/content/nba_2021.csv",
    "/content/nba_2122.csv",
    "/content/nba_2223.csv",
    "/content/nba_2324.csv",
]

# Columns excluded from the modelling table.
columns_to_drop = ['Date', 'Home', 'Away', 'weekday', 'overtime', 'remarks']

# Read each file into its own DataFrame.
data_frames = list(map(pd.read_csv, file_paths))

# Stack the frames under a fresh 0..n-1 index, then discard the excluded columns.
combined_data = pd.concat(data_frames, ignore_index=True).drop(columns=columns_to_drop)
# Feature vectorization: pandas needs no separate VectorAssembler step.
# Split into training (80%) and test (remaining 20%) sets.
# The test set must be the rows NOT chosen for training: sampling twice with
# the same random_state draws from the same shuffled order, so the test rows
# overlap the training rows and the evaluation leaks training data.
# Dropping by index label relies on combined_data having a unique index
# (it is built with ignore_index=True).
train_data = combined_data.sample(frac=0.8, random_state=42)
test_data = combined_data.drop(index=train_data.index)
# Model training: plain scikit-learn, no Spark ML Pipeline required.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor()
# Every column except the target "Win_Margin" is used as a feature.
rf.fit(train_data.drop(columns=["Win_Margin"]), train_data["Win_Margin"])
predictions = rf.predict(test_data.drop(columns=["Win_Margin"]))

# Model evaluation: RMSE on the test set.
# Take the square root of the MSE explicitly rather than passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and
# removed in 1.6, where passing it raises a TypeError.
rmse = mean_squared_error(test_data["Win_Margin"], predictions) ** 0.5
print("Root Mean Squared Error (RMSE) on test data:", rmse)
import pandas as pd
import matplotlib.pyplot as plt
# Visualize the predictions as an actual-vs-predicted scatter plot.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(test_data['Win_Margin'], predictions, alpha=0.5)
ax.set_xlabel('Actual Win Margin')
ax.set_ylabel('Predicted Win Margin')
ax.set_title('Actual vs. Predicted Win Margin')
ax.grid(True)
plt.show()