-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
173 lines (136 loc) · 7.28 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import argparse
import os
import shutil
import zipfile
from tempfile import mkdtemp
import xml.etree.ElementTree as ET
# Register the namespace globally to avoid 'ns0:' prefixes
ET.register_namespace('', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main')
def unzip_excel_as_zip(filename, template_dir):
if not filename.endswith('.xlsx'):
raise ValueError("File must be an Excel (.xlsx) file")
temp_dir = mkdtemp()
extracted_dir = os.path.join(temp_dir, "Extracted")
extracted_template_dir = os.path.join(temp_dir, "Template")
output_dir = os.path.join(temp_dir, "output")
try:
shutil.copytree(template_dir, extracted_template_dir)
zip_path = filename + '.zip'
os.rename(filename, zip_path)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extracted_dir)
os.rename(zip_path, filename)
# Copy Template directory to output directory
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
shutil.copytree(template_dir, output_dir)
except Exception as e:
shutil.rmtree(temp_dir)
raise e
return temp_dir, extracted_dir, output_dir
def parse_template_sheet_data(filepath):
tree = ET.parse(filepath)
root = tree.getroot()
return tree, root # Return both the tree (for manipulation) and the root
def parse_sheet_data(filepath, namespaces):
tree = ET.parse(filepath)
root = tree.getroot()
sheet_data_elements = root.findall('.//ns:sheetData', namespaces=namespaces)
for sheet_data in sheet_data_elements:
if sheet_data.find('.//ns:row', namespaces=namespaces) is not None:
return sheet_data
raise ValueError(f"No <sheetData> element with a <row> child found in {filepath}")
def process_xml_files(extracted_dir, template_tree, template_root, namespaces, temp_dir):
external_links_path = os.path.join(extracted_dir, 'xl', 'externalLinks')
output_dir = os.path.join(temp_dir, "output")
os.makedirs(output_dir, exist_ok=True)
template_sheet_data = template_root.find('.//ns:sheetData', namespaces=namespaces)
output_count = 1
if template_sheet_data is not None and os.path.exists(external_links_path):
for i, filename in enumerate(sorted(os.listdir(external_links_path)), start=1):
if filename.endswith('.xml'):
full_path = os.path.join(external_links_path, filename)
try:
new_sheet_data = parse_sheet_data(full_path, namespaces)
template_sheet_data.clear() # Clear existing children
for child in new_sheet_data:
for subchild in child:
# Check if the node is a 'cell' and rename it to 'c'
if subchild.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}cell':
subchild.tag = '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}c'
template_sheet_data.append(child)
output_filename = os.path.join(output_dir, "xl", "worksheets", f"sheet{output_count}.xml")
template_tree.write(output_filename, encoding='UTF-8', xml_declaration=True)
print(f"Processed: {filename} -> {output_filename}")
output_count += 1
except ValueError as e:
print(e)
else:
raise ValueError("Template does not contain a <sheetData> element.")
return output_count - 1
def update_workbook_xml(output_dir, num_sheets):
workbook_xml_path = os.path.join(output_dir, "xl", "workbook.xml")
tree = ET.parse(workbook_xml_path)
root = tree.getroot()
root.attrib['xmlns:r'] = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
namespaces = {'ns': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
# Find the <sheets> node
sheets_node = root.find('ns:sheets', namespaces)
if sheets_node is not None:
sheets_node.clear() # Clear existing <sheet> children
# Append a new <sheet> node for each generated file
for i in range(1, num_sheets + 1):
sheet = ET.SubElement(sheets_node, '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}sheet',
attrib={
'name': f"Sheet{i}",
'sheetId': str(i),
'state': 'visible',
'r:id': f"rId{i+1}"
})
tree.write(workbook_xml_path, encoding='UTF-8', xml_declaration=True)
def update_workbook_rels(output_dir, num_sheets):
ET.register_namespace('', 'http://schemas.openxmlformats.org/package/2006/relationships')
workbook_xml_path = os.path.join(output_dir, "xl", "_rels", "workbook.xml.rels")
tree = ET.parse(workbook_xml_path)
root = tree.getroot()
if root is not None:
# Append a new <sheet> node for each generated file
for i in range(1, num_sheets + 1):
relationship = ET.SubElement(root, '{http://schemas.openxmlformats.org/package/2006/relationships}Relationship',
attrib={
'Id': f"rId{i+1}",
'Type': "http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet",
'Target': f"worksheets/sheet{i}.xml"
})
tree.write(workbook_xml_path, encoding='UTF-8', xml_declaration=True)
def create_new_excel_file(input_filepath, output_dir):
# Extract the base name and directory of the input file
base_name = os.path.splitext(os.path.basename(input_filepath))[0]
input_dir = os.path.dirname(input_filepath)
# Define the name and path for the zip (temporary) and final Excel file
temp_zip_path = os.path.join(input_dir, f"{base_name}-extracted")
final_excel_path = f"{temp_zip_path}.xlsx"
# Create a zip archive of the "output" directory
shutil.make_archive(temp_zip_path, 'zip', output_dir)
# Rename the zip file to have a ".xlsx" extension (overwrites if exists)
os.rename(f"{temp_zip_path}.zip", final_excel_path)
print(f"New Excel file created at: {final_excel_path}")
return final_excel_path
def main():
parser = argparse.ArgumentParser(description="Extract and process cached data from an Excel file.")
parser.add_argument("filename", type=str, help="Path to the Excel file")
args = parser.parse_args()
template_dir = "Template"
namespaces = {'ns': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
try:
temp_dir, extracted_dir, output_dir = unzip_excel_as_zip(args.filename, template_dir)
template_sheet_path = os.path.join(temp_dir, "Template", "xl", "worksheets", "sheet1.xml")
template_tree, template_root = parse_template_sheet_data(template_sheet_path)
num_sheets = process_xml_files(extracted_dir, template_tree, template_root, namespaces, temp_dir)
update_workbook_xml(output_dir, num_sheets)
update_workbook_rels(output_dir, num_sheets)
create_new_excel_file(args.filename, output_dir)
finally:
shutil.rmtree(temp_dir)
if __name__ == "__main__":
main()