-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathecc.py
422 lines (325 loc) · 15.7 KB
/
ecc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
import argparse
from collections import OrderedDict
import logging
import os
import pprint
import re
import shutil
import uuid
TERMSIZE = shutil.get_terminal_size() # get the terminal size for formatting
HEADER = lambda text: f"――― {text} {'―' * (TERMSIZE.columns - len(text) - 5)}"
PPRINT = lambda text: pprint.pformat(text, sort_dicts=False)
DEL = r'\.del\s+(?P<tgt>.+?)(?=\.\w+\s+|$)'
SUB = r'\.sub\s+(?P<tgt>.+?);(?P<src>.+?)(?=\.\w+\s+|$)'
BAL = r'\.bal\s+(?P<prefix>.)(?P<suffix>.)'
ORG = r'\.org\s+(?P<sym>.+?)(?=\.\w+\s+|$)'
FMT = r'\.fmt\s+(?P<sym>\w+)\s*::=\s*(?P<exprs>.*?)(?=\.\w+\s+|$)'
MAP = r'\.map\s+(?P<sym>\w+)\s*::=\s*(?P<exprs>.*?)(?=\.\w+\s+|$)'
INS = r'(?P<opc>[&*#]?\w+)(?:\s+(?P<tgt>[&*#]\w+))?\s*(?:,\s*(?P<src>[&*#]\w+))?'
OR = r'\s*\|\s*'
AND = r'\s*;\s*'
class Parser():
def __init__(self, grammar, **kwargs):
""" Initialize a source code parser from a given source language grammar.
Args:
grammar (str): source language grammar
"""
grammar = re.sub(r'\s{2,}', '\t', grammar)
grammar = re.sub(r'(\n|\t)', '', grammar)
logging.debug(HEADER('Source Processing'))
self.sdel = re.findall(DEL, grammar)
logging.debug(f"DEL: {PPRINT(self.sdel)}")
self.ssub = re.findall(SUB, grammar)
logging.debug(f"SUB: {PPRINT(self.ssub)}")
self.sbal = re.findall(BAL, grammar)
logging.debug(f"BAL: {PPRINT(self.sbal)}")
logging.debug(HEADER('Source Grammar'))
sfmt = [(sym, re.split(OR, exprs)) for sym, exprs in re.findall(FMT, grammar)]
self.sfmt = OrderedDict(sfmt)
for sym in self.sfmt: # balance delimeters in format using paired id's
for var, fmt in enumerate(self.sfmt[sym]):
ctr = lim = _ord = 0 # lifo counter references namespaces reserved with the limit
for prefix, suffix in self.sbal:
index, PREFIX, SUFFIX = 0, f'\\{prefix}', f'\\{suffix}'
while index < len(fmt): # single-forward-pass delimeter substitution
if fmt[index:].startswith(PREFIX): # if prefix in fmt...
grp = rf'(?P<d{lim}>@[0-9]+){PREFIX}' # declare a delimeter namespace
fmt = fmt[:index] + grp + fmt[index + len(PREFIX):]
ctr, lim, _ord, index = lim + 1, lim + 1, _ord + 1, index + len(grp)
elif fmt[index:].startswith(SUFFIX): # if suffix in fmt...
ctr -= 1 # decrement counter before reference; limit is maintained
grp = rf'(?P=d{ctr}){SUFFIX}' # reference a delimeter namespace
fmt = fmt[:index] + grp + fmt[index + len(SUFFIX):]
_ord, index = _ord - 1, index + len(grp)
else: index += 1 # increment if balanced delimeters not found
if _ord != 0: raise SyntaxError(f"Balanced delimeters are not balanced; residual nesting order is {ctr}. The incorrect format is variant {var} of {sym}: '{fmt}'")
else: self.sfmt[sym][var] = fmt
logging.debug(PPRINT(dict(self.sfmt)))
logging.debug(HEADER('Source Origins'))
self.sorg = {sym: exprs for sym, exprs in self.sfmt.items()
if sym in re.findall(ORG, grammar)}
logging.debug(PPRINT(self.sorg))
logging.debug(HEADER('Source Map'))
smap = {sym: [[re.match(INS, expr).groupdict()
for expr in re.split(AND, exprs)]
for exprs in re.split(OR, exprss)]
for sym, exprss in re.findall(MAP, grammar)}
self.smap = OrderedDict(smap)
logging.debug(PPRINT(dict(self.smap)))
def parse(self, source):
""" Parse the given source code into an AST with recursive decent, that is
translated into an internal representation.
Args:
source (str): source code to parse; formatted according to grammar
Returns:
ir (list, dict): internal representation of the given source code
"""
logging.info(HEADER('Source Code'))
logging.info(source)
logging.debug(HEADER('Source Code (Pre-Processed)'))
_source = self._preprocess(source)
logging.debug(_source)
logging.info(HEADER('Abstract Syntax'))
org = self.sorg.items() if self.sorg else self.sfmt.items()
ast = Parser._reduce(self._parse(_source, org))
logging.info(PPRINT(ast))
logging.info(HEADER('Internal Representation'))
ir = Parser._reduce(self._translate(ast))
logging.info(PPRINT(ir))
return ir
def _preprocess(self, source):
""" Modify given source code according to grammar configuration
Args:
self._del (str): characters to exclude
source (str): source code to preprocess
Returns:
source (str): preprocessed source code
"""
for _del in self.sdel: source = re.sub(_del, '', source)
for tgt, src in self.ssub: source = re.sub(tgt, src, source)
ctr = 0 # lifo counter provides stack-like balance tracking
for prefix, suffix in self.sbal:
idx = 0 # source will grow, so end condition is calculated each iteration
while idx < len(source): # single-forward-pass delimeter substitution
if source[idx:].startswith(prefix):
_id = rf'@{ctr}{prefix}' # balanced delimeter identifier
source = source[:idx] + _id + source[idx + len(prefix):]
ctr, idx = ctr + 1, idx + len(_id)
elif source[idx:].startswith(suffix):
ctr -= 1 # decrement counter before reference; limit is maintained
_id = rf'@{ctr}{suffix}' # balancing delimeter identifier
source = source[:idx] + _id + source[idx + len(suffix):]
idx = idx + len(_id)
else: idx += 1 # increment if balanced delimeters not found
if ctr != 0: raise SyntaxError(f"Balanced delimeters are not balanced; residual nesting order is {ctr}.")
return source
def _parse(self, source, targets):
""" Parse the given source code into an AST with recursive decent.
Args:
source (str): source code to parse; formatted according to grammar
targets (dict): targeted symbol(s) of recursive decent
Returns:
ast (list, dict): abstract syntax tree of the given source code
"""
ast = []
while source: # sequentially match source to grammar
for sym, exprs in targets:
for expr in exprs:
for prefix, suffix in self.sbal:
for delim in re.finditer(rf"@(?P<id>[0-9]+)\{suffix}", source):
if not re.search(rf"@{delim.group('id')}\{prefix}", source):
source = re.sub(rf"@{delim.group('id')}\{suffix}", '', source)
if not (match := re.match(expr, source, re.DOTALL)): continue
# recursively parse subexpressions (grammar references)
_ast = {sym: {}}
try:
if (_match := match.groupdict().items()):
for _sym, _expr in _match:
if not re.sub('@[0-9]+', '', _expr): continue
_targets = {_sym: self.sfmt[_sym]}.items()
_ast[sym][_sym] = self._parse(_expr, _targets)
else:
_ast[sym] = match.group(0)
ast.append(_ast); break
except: continue
else: continue
source = source[match.end():]; break
else: raise SyntaxError(f"{source}")
return ast
def _translate(self, ast):
""" Translate the given AST into an internal representation.
Args:
ast (list, dict): abstract syntax tree of the given source code
Returns:
ir (list, dict): internal representation of the given source code
"""
ir = []
for _ast in ast if isinstance(ast, list) else [ast]:
for sym, attrs in _ast.items(): # identify ast variant
for var, expr in enumerate(self.sfmt[sym]):
groups = re.compile(expr).groupindex.keys()
groups = [group for group in groups if not re.match(r'^d\d+$', group)]
if isinstance(attrs, dict) and set(attrs.keys()) == set(groups): break
elif not isinstance(attrs, dict) and re.match(expr, attrs): break
else: raise SyntaxError(f"{_ast}")
for ins in self.smap[sym][var]: # construct ir from ast
if (opc := ins['opc']).startswith('&'): # if opc is dereferenced...
opc = self._translate({opc[1:]: attrs[opc[1:]]})
elif opc.startswith('#'): opc = '#' + attrs # if opc is explicit...
elif opc.startswith('*'): opc = '*' + attrs # if opc is a symbol...
# initialize ir from instruction
ins = {attr: val for attr, val in ins.items()
if (attr != 'opc') and (val != None)}
if not ins: ir.append(opc); continue
# recursively translate slices of the ast
ir.append({opc: {attr: (self._translate({val[1:]: attrs[val[1:]]})
if val.startswith('&') else val)
for attr, val in ins.items()}})
return ir if len(ir) > 1 else ir[0]
@staticmethod
def _reduce(struct):
""" Recursively reduce nested lists and dictionaries.
Reduce any list with a single element to that element.
In any dictionary, if the value corresping to a key is a dictionary with
only one key, and that key is the same as the outer key, reduce the value
corresonding to the outer key to the value corresponding to the inner key.
"""
if isinstance(struct, list):
# recursively reduce lists by element(s)
_struct = [Parser._reduce(item) for item in struct]
# if a list has one element, replace it with that element
if len(_struct) == 1: return _struct[0]
else: return _struct
elif isinstance(struct, dict):
_struct = {}
for key, value in struct.items(): # recursively reduce dictionaries by key
_value = Parser._reduce(value)
if (isinstance(_value, dict) and len(_value) == 1 and key in _value):
_struct[key] = _value[key]
else: _struct[key] = _value
return _struct
else: return struct
class Optimizer():
def __init__(self, **kwargs):
""" Initialize an internal representation optimizer. """
def optimize(self, ir): return ir
class Generator():
def __init__(self, grammar, **kwargs):
""" Initialize a code generator from a given target language grammar.
Args:
grammar (str): target language grammar
"""
grammar = re.sub(r'\s{2,}', '\t', grammar)
grammar = re.sub(r'(\n|\t)', '', grammar)
logging.debug(HEADER('Target Map'))
tmap = [(sym, re.split(OR, exprs)) for sym, exprs in re.findall(MAP, grammar)]
self.tmap = OrderedDict(tmap)
logging.debug(PPRINT(dict(self.tmap)))
logging.debug(HEADER('Target Grammar'))
tfmt = [(sym, re.split(OR, exprs)) for sym, exprs in re.findall(FMT, grammar)]
self.tfmt = OrderedDict(tfmt)
logging.debug(PPRINT(dict(self.tfmt)))
logging.debug(HEADER('Target Deletions'))
self.tdel = re.findall(DEL, grammar)
logging.debug(PPRINT(self.tdel))
logging.debug(HEADER('Target Substitutions'))
self.tsub = re.findall(SUB, grammar)
logging.debug(PPRINT(self.tsub))
def generate(self, ir):
""" Generate target code from the given internal representation with
recursive decent.
Args:
ir (list, dict): internal representation of the given source code
Returns:
code (str): generated target code; formatted according to grammar
"""
logging.debug(HEADER('Target Code'))
_code = self._generate(ir)
logging.debug(_code)
logging.info(HEADER('Target Code (Post-Processed)'))
code = self._postprocess(_code)
logging.info(code)
return code
def _generate(self, ir, _syms={}, ofs=-2):
""" Generate target code from the given internal representation with
recursive decent.
Args:
ir (list, dict): partial internal representation of the given source code
Returns:
code (str): generated target code; formatted according to grammar
"""
code, syms = "", {sym:loc for sym, loc in _syms.items()}
for _ir in ir if isinstance(ir, list) else [ir]:
if isinstance(_ir, list): # recursive generation of ir as list
code += self._generate(_ir, syms, ofs); continue
for opc, args in _ir.items():
ins = {'opc':opc, 'tgt':None, 'src':None}
for opr in ('tgt', 'src'): # construct instruction from ir
if not opr in args: continue
if isinstance(args[opr], dict): ins[opr] = f"&{opr}"
else: ins[opr] = f"{args[opr][0]}{opr}"
for var, expr in enumerate(self.tmap[opc]): # identify ir variant
if ins == re.match(INS, expr).groupdict(): break
else: raise SyntaxError(f"Could not map {_ir} to a variant of {opc}.")
fmt = self.tfmt[opc][var] # format mapped ir variant
for opr in {opr:val for opr, val in args.items() if opr != 'opc'}:
if isinstance(args[opr], dict): # if operand is a nested ir...
fmt = re.sub(rf"\&{opr}", self._generate(args[opr], syms, ofs), fmt)
if re.search(rf"\!{opr}", fmt): # if temporary symbol is declared...
syms[(_id := uuid.uuid4())] = rf"{(ofs := ofs + 2)}"
fmt = re.sub(rf"\!{opr}", syms[_id], fmt)
elif args[opr][0] == '#': # if operand is an explicit value...
fmt = re.sub(rf"\${opr}", args[opr][1:], fmt)
elif args[opr][0] == '*': # if operand is a symbol...
if re.search(rf"!{opr}", fmt): # if symbol is declared...
if args[opr][1:] not in syms: # if first declaration of symbol...
syms[args[opr][1:]] = rf"{(ofs := ofs + 2)}"
fmt = re.sub(rf"!{opr}", rf"&{opr}", fmt)
if re.search(rf"&{opr}", fmt): # if symbol is dereferenced...
if args[opr][1:] not in syms: # if dereference before declaration...
raise UnboundLocalError(f"Symbol {args[opr][1:]} referenced before declaration.")
fmt = re.sub(rf"\&{opr}", syms[args[opr][1:]], fmt)
fmt = re.sub(rf"\${opr}", args[opr][1:], fmt) # format raw symbol
code += fmt.encode('utf-8') \
.decode('unicode_escape')
return code
def _postprocess(self, code):
""" Modify given target code according to grammar configuration
Args:
self.tdel (str): expressions to remove
self.tsub (str): expressions to substitute
code (str): target code to preprocess
Returns:
code (str): postprocessed target code
"""
for _del in self.tdel: code = re.sub(_del, '', code)
for tgt, src in self.tsub: code = re.sub(tgt, src, code)
return code
if __name__ == '__main__':
parser = argparse.ArgumentParser(prog='ecc', description='x86 assembly compilers generated from Backus Naur grammar extended with regular expressions')
parser.add_argument('source', help='source grammar file path')
parser.add_argument('target', help='target grammar file path')
parser.add_argument('code', help='source code file path')
parser.add_argument('-o', '--output', nargs='?', const='',
help="Output file path (default: 'temp.txt' if -o is passed without a value).")
parser.add_argument('-v', '--verbose', default='WARNING',
choices=['DEBUG', 'INFO', 'WARNING'],
help='Set the logging level (default: WARNING).')
args = parser.parse_args()
logging.basicConfig(level=args.verbose, format=f'%(message)s')
with open(args.source, 'r') as file: sgrammar = file.read()
parser = Parser(sgrammar)
with open(args.code, 'r') as file: source = file.read()
ir = parser.parse(source)
optimizer = Optimizer()
ir = optimizer.optimize(ir)
with open(args.target, 'r') as file: tgrammar = file.read()
generator = Generator(tgrammar)
code = generator.generate(ir)
if args.output is not None:
if args.output == '':
filepath = os.path.dirname(args.code)
filename = os.path.splitext(os.path.basename(args.code))[0]
extension = os.path.splitext(os.path.basename(args.target))[0]
args.output = os.path.join(filepath, f"{filename}.{extension}")
with open(args.output, 'w') as file: file.write(code)