-
Notifications
You must be signed in to change notification settings - Fork 0
/
book_file_handle.py
83 lines (72 loc) · 3.19 KB
/
book_file_handle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# -*- coding:utf-8 -*-
__author__ = 'StackOF'
import os
import time
import pandas as pd
def get_all_file(folder_path, file_ext=''):
    """Recursively collect the paths of all files below ``folder_path``.

    Args:
        folder_path: Root directory, walked with ``os.walk``.
        file_ext: Extension to keep, given with or without the leading dot
            (``'epub'`` and ``'.epub'`` select the same files). The
            comparison is case-sensitive. An empty string (the default)
            keeps every file.

    Returns:
        A list of paths, each formed by joining the directory reported by
        ``os.walk`` with the file name.

    Raises:
        ValueError: If ``folder_path`` is ``None``.
    """
    if folder_path is None:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("folder_path is None")

    # Normalise the wanted extension once, outside the walk. Accepting an
    # optional leading dot avoids building '..epub', which can never match.
    if file_ext:
        wanted_ext = file_ext if file_ext.startswith('.') else '.' + file_ext
    else:
        wanted_ext = ''

    file_list = []
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        for name in filenames:
            if not wanted_ext or os.path.splitext(name)[1] == wanted_ext:
                file_list.append(os.path.join(dirpath, name))
    return file_list
def get_filename_ext(file_path):
    """Break a path into its file-name components.

    Args:
        file_path: Path to a file.

    Returns:
        A 4-tuple ``(file_path, full_filename, filename, ext)``:
        the path exactly as given, its final component, that component
        without its extension, and the extension including the leading
        dot (an empty string when there is none).
    """
    base_name = os.path.split(file_path)[1]
    name_parts = os.path.splitext(base_name)
    return (file_path, base_name, name_parts[0], name_parts[1])
def get_book_file_df(path:str)->pd.DataFrame:
    """Build a table describing every file found below ``path``.

    Args:
        path: Root directory, scanned recursively via ``get_all_file``.

    Returns:
        A DataFrame sorted by ``filename`` with a fresh 0..n-1 index and
        the columns:

        * ``filepath``, ``full_filename``, ``filename``, ``ext`` - the
          tuple returned by ``get_filename_ext`` for each file;
        * ``full_duplicated`` - True for every occurrence of a
          ``full_filename`` after the first one;
        * ``filename_duplicated`` - True for every row whose ``filename``
          (name without extension) occurs more than once.

        If no files are found, the frame is empty but has all six columns.
    """
    columns = ['filepath', 'full_filename', 'filename', 'ext']
    # Building from a list of tuples also works for an empty folder, where
    # selecting column 0 of an empty frame would raise KeyError.
    records = [get_filename_ext(file_path) for file_path in get_all_file(path)]
    book_df = pd.DataFrame(records, columns=columns)

    # Duplicate flags are computed before sorting, so "first" refers to the
    # order in which the files were discovered.
    book_df['full_duplicated'] = book_df.duplicated('full_filename', keep='first')
    book_df['filename_duplicated'] = book_df.duplicated('filename', keep=False)

    book_df.sort_values(by=['filename'], inplace=True)
    # reset_index() returns a new frame unless inplace=True, so a bare call
    # has no effect; drop=True keeps the old index from becoming a column.
    book_df.reset_index(drop=True, inplace=True)
    return book_df
def get_second():
    """Return the current local time as a ``YYYYMMDD_HHMMSS`` string."""
    now = time.localtime()
    stamp_format = "%Y%m%d_%H%M%S"
    return time.strftime(stamp_format, now)
def write_df_lst_to_xlsx(xlsx_file:str,df_lst:list,sheet_lst:list,index_int=0):
    """Write each DataFrame in ``df_lst`` to its own sheet of one workbook.

    Every sheet gets a frozen header row, an autofilter over the written
    table, and column widths derived from the header labels.

    Args:
        xlsx_file: Target workbook path. If it cannot be opened for
            writing, a timestamp from ``get_second`` is inserted before
            the extension and that path is used instead.
        df_lst: DataFrames to write, one per sheet.
        sheet_lst: Sheet names, matched to ``df_lst`` by position. Extra
            names are ignored.
        index_int: Passed to ``DataFrame.to_excel`` as ``index``; a truthy
            value also writes the index columns.

    Raises:
        IndexError: If ``sheet_lst`` has fewer entries than ``df_lst``.
        OSError: If the timestamped fallback path cannot be opened either.
    """
    # Fail before creating any file rather than half-way through writing.
    if len(sheet_lst) < len(df_lst):
        raise IndexError('sheet_lst has fewer entries than df_lst')

    try:
        xf = xlsx_file
        ew = pd.ExcelWriter(xlsx_file, engine='xlsxwriter')
    except OSError:
        # Only path-related failures can be cured by switching to another
        # file name; anything else would fail again on the retry.
        print(f'can not open {xlsx_file}')
        pre, ext = os.path.splitext(xlsx_file)
        xf = pre + '_' + get_second() + ext
        ew = pd.ExcelWriter(xf, engine='xlsxwriter')

    with ew as writer:
        for df, sheet_name in zip(df_lst, sheet_lst):
            df.to_excel(writer, sheet_name=sheet_name, index=index_int, freeze_panes=(1, 0))

            n_cols = len(df.columns)
            if n_cols == 0:
                # Nothing to filter or size; a last column of -1 is invalid.
                continue

            worksheet = writer.sheets[sheet_name]
            # When the index is written it occupies the leftmost columns,
            # which shifts every data column to the right.
            col_offset = df.index.nlevels if index_int else 0

            # Cover the header row plus all data rows.
            worksheet.autofilter(0, 0, len(df), col_offset + n_cols - 1)

            for i, label in enumerate(df.columns):
                # Width is the GBK byte length of the label plus padding
                # (presumably so double-byte CJK characters count as two
                # columns). str() and errors='replace' keep non-string or
                # non-GBK labels from raising.
                width = len(str(label).encode('gbk', errors='replace')) + 5
                worksheet.set_column(col_offset + i, col_offset + i, width)

    print(f'write the {sheet_lst} to {xf}')
def delete_dup_file(path):
    """Delete ``.epub`` files below ``path`` whose name ends in ``(x)``.

    A file is removed when its name without the extension ends with an
    opening parenthesis, exactly one arbitrary character and a closing
    parenthesis, e.g. ``book(1).epub``. Multi-character suffixes such as
    ``(10)`` are not matched.

    NOTE(review): the file is removed without checking that an un-suffixed
    original exists - confirm this is intended before running on new data.

    Args:
        path: Root directory, scanned recursively via ``get_all_file``.

    Failures to remove a file are printed and do not stop the run.
    """
    for cur_file in get_all_file(path, file_ext='epub'):
        _, _, filename, _ = get_filename_ext(cur_file)

        # Check the length first: indexing [-3] on a stem shorter than
        # three characters (e.g. "a)") would raise IndexError and abort
        # the whole run.
        if len(filename) < 3 or filename[-1] != ")" or filename[-3] != "(":
            continue

        try:
            os.remove(cur_file)
        except OSError as e:
            # Best effort: report the file that could not be removed and
            # carry on with the rest.
            print(cur_file, e)
# Script section: these statements sit at module level, so they execute
# whenever this file is run or imported.
# NOTE(review): delete_dup_file() removes files from disk; consider moving
# this behind an `if __name__ == "__main__":` guard - confirm nobody relies
# on the import-time behaviour first.

# Alternative root directory, currently disabled:
# path = '/home/kali/SSD4T_1/Book'
path = '/home/kali/backup/ePUBee-backup'
delete_dup_file(path)

# Disabled workflow: build the file table, export it to a workbook, then
# remove rows flagged as duplicates.
# NOTE(review): the condition below tests cur_file_info['filename'] is True,
# but get_book_file_df() stores the file name string in that column, so the
# test can never succeed; 'filename_duplicated' was probably intended -
# confirm before re-enabling.
# book_df_all = get_book_file_df(path)
# write_df_lst_to_xlsx('/home/kali/SSD4T_1/book_info_0104.xlsx',[book_df_all],['bookinfo'])
# for cur_row,cur_file_info in book_df_all.iterrows():
# if cur_file_info['full_duplicated'] is True or (cur_file_info['filename'] is True and cur_file_info['ext'] != '.mobi'):
# try:
# os.remove(cur_file_info['filepath'])
# except Exception as e:
# print(cur_file_info['filepath'],e)