forked from afif2100/image-clasifier-tf-keras
-
Notifications
You must be signed in to change notification settings - Fork 0
/
split.py
109 lines (80 loc) · 3.1 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# extract the archive from a notebook if needed
# !unzip flowers-recognition.zip
import os
mypath = 'flowers/'


def collect_files(root):
    """Walk *root* and list every file found beneath it.

    Returns three parallel lists ``(paths, names, tags)``: the full path of
    each file, its bare file name, and its tag, i.e. the name of the
    directory that directly contains the file.

    Directories and files are visited in sorted order. ``os.walk`` otherwise
    yields them in an arbitrary, filesystem-dependent order, which would make
    any seeded split of the resulting lists differ between machines.
    """
    paths, names, tags = [], [], []
    for path, subdirs, files in os.walk(root):
        # Sorting in place steers the order in which os.walk descends.
        subdirs.sort()
        for name in sorted(files):
            paths.append(os.path.join(path, name))
            # basename handles every separator the platform accepts,
            # unlike splitting on a hard-coded '/'.
            tags.append(os.path.basename(path))
            names.append(name)
    return paths, names, tags


full_path, file_name, tag = collect_files(mypath)
import pandas as pd
# Put the lists collected by the directory walk into a single DataFrame so
# the path, file name and tag of every file sit side by side.
df = pd.DataFrame({"path": full_path, 'file_name': file_name, "tag": tag})

# Number of files per tag. Printed explicitly: as a bare expression the
# result is only displayed inside a notebook and is discarded in a script.
print(df.groupby(['tag']).size())
# Output recorded from an earlier run:
#tag
#daisy 1538
#dandelion 2110
#rose 1568
#sunflower 1468
#tulip 1968
#dtype: int64

# Check a sample of the data
print(df.head())
# load the library used for the train/test split
from sklearn.model_selection import train_test_split
# Variables used for splitting the data: file paths as features, tags as labels
X = df['path']
y = df['tag']

# Split the initial dataset: 80% train, 20% held out
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=300)

# Then halve the held-out part into the final test set and a validation set
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=100)

# Combine each split into its own dataframe, labelled with the set name
df_tr = pd.DataFrame({'path': X_train,
                      'tag': y_train,
                      'set': 'train'})
df_te = pd.DataFrame({'path': X_test,
                      'tag': y_test,
                      'set': 'test'})
df_val = pd.DataFrame({'path': X_val,
                       'tag': y_val,
                       'set': 'validation'})

# Each label reports its own set (validation and test were swapped before)
print('train size', len(df_tr))
print('val size', len(df_val))
print('test size', len(df_te))
# Inspect the per-set proportions to decide whether the split is acceptable
# or still needs adjusting.
# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True renumbers the rows 0..n-1 like reset_index(drop=True).
df_all = pd.concat([df_tr, df_te, df_val], ignore_index=True)

print('===================================================== \n')
print(df_all.groupby(['set', 'tag']).size(), '\n')
print('===================================================== \n')

# Check a sample of the data (capped so a tiny dataset does not raise)
print(df_all.sample(min(3, len(df_all))))
print('===================================================== \n')
# remove the dataset folder if needed
#!rm -rf dataset/
import shutil
from tqdm import tqdm as tq
datasource_path = "flowers/"
dataset_path = "dataset/"

# Copy every file listed in df_all into dataset/<set>/<tag>/, skipping files
# that are already present at the destination.
for _, row in tq(df_all.iterrows(), total=df_all.shape[0]):
    # Locate the source file
    file_path = row['path']
    if not os.path.exists(file_path):
        # Fall back to <datasource>/<tag>/<file name>. df_all has no 'image'
        # column, so the name is taken from the recorded path and keeps its
        # extension.
        file_path = os.path.join(
            datasource_path, row['tag'], os.path.basename(row['path']))

    # Make the destination directory; exist_ok avoids a check-then-create race
    dest_dir = os.path.join(dataset_path, row['set'], row['tag'])
    os.makedirs(dest_dir, exist_ok=True)

    # Define the destination file
    file_dest = os.path.join(dest_dir, os.path.basename(file_path))

    # Copy from source to destination unless it is already there
    if not os.path.exists(file_dest):
        shutil.copy2(file_path, file_dest)
# Output progress bar (notebook):
# HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))