-
Notifications
You must be signed in to change notification settings - Fork 25
/
cyk_parser.py
153 lines (134 loc) · 6.27 KB
/
cyk_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os.path
import argparse
import grammar_converter
class Node:
"""
Used for storing information about a non-terminal symbol. A node can have a maximum of two
children because of the CNF of the grammar.
It is possible though that there are multiple parses of a sentence. In this case information
about an alternative child is stored in self.child1 or self.child2 (the parser will decide
where according to the ambiguous rule).
Either child1 is a terminal symbol passed as string, or both children are Nodes.
"""
def __init__(self, symbol, child1, child2=None):
self.symbol = symbol
self.child1 = child1
self.child2 = child2
def __repr__(self):
"""
:return: the string representation of a Node object.
"""
return self.symbol
class Parser:
"""
A CYK parser which is able to parse any grammar in CNF. The grammar can be read from a file or
passed as a string. It either returns a string representation of the parse tree(s) or prints it
to standard out.
"""
def __init__(self, grammar, sentence):
"""
Creates a new parser object which will read in the grammar and transform it into CNF and
then parse the given sentence with that grammar.
:param grammar: the file path to the grammar/the string repr. of the grammar to read in
:param sentence: the file path to the sentence/the string repr. of the sentence to read in
"""
self.parse_table = None
self.prods = {}
self.grammar = None
if os.path.isfile(grammar):
self.grammar_from_file(grammar)
else:
self.grammar_from_string(grammar)
self.__call__(sentence)
def __call__(self, sentence, parse=False):
"""
Parse the given sentence (string or file) with the earlier given grammar.
:param sentence: the sentence to parse with self.grammar
"""
if os.path.isfile(sentence):
with open(sentence) as inp:
self.input = inp.readline().split()
if parse:
self.parse()
else:
self.input = sentence.split()
def grammar_from_file(self, grammar):
"""
Reads in a CFG from a given file, converts it to CNF and stores it in self.grammar.
:param grammar: the file in which the grammar is stored.
"""
self.grammar = grammar_converter.convert_grammar(grammar_converter.read_grammar(grammar))
def grammar_from_string(self, grammar):
"""
Reads in a CFG from a string, converts it to CNF and stores it in self.grammar.
:param grammar: the CFG in string representation.
:return:
"""
self.grammar = grammar_converter.convert_grammar([x.replace("->", "").split() for x in grammar.split("\n")])
def parse(self):
"""
Does the actual parsing according to the CYK algorithm. The parse table is stored in
self.parse_table.
"""
length = len(self.input)
# self.parse_table[y][x] is the list of nodes in the x+1 cell of y+1 row in the table.
# That cell covers the word below it and y more words after.
self.parse_table = [[[] for x in range(length - y)] for y in range(length)]
for i, word in enumerate(self.input):
# Find out which non terminals can generate the terminals in the input string
# and put them into the parse table. One terminal could be generated by multiple
# non terminals, therefore the parse table will contain a list of non terminals.
for rule in self.grammar:
if f"'{word}'" == rule[1]:
self.parse_table[0][i].append(Node(rule[0], word))
for words_to_consider in range(2, length + 1):
for starting_cell in range(0, length - words_to_consider + 1):
for left_size in range(1, words_to_consider):
right_size = words_to_consider - left_size
left_cell = self.parse_table[left_size - 1][starting_cell]
right_cell = self.parse_table[right_size - 1][starting_cell + left_size]
for rule in self.grammar:
left_nodes = [n for n in left_cell if n.symbol == rule[1]]
if left_nodes:
right_nodes = [n for n in right_cell if n.symbol == rule[2]]
self.parse_table[words_to_consider - 1][starting_cell].extend(
[Node(rule[0], left, right) for left in left_nodes for right in right_nodes]
)
def print_tree(self, output=True):
"""
Print the parse tree starting with the start symbol. Alternatively it returns the string
representation of the tree(s) instead of printing it.
"""
start_symbol = self.grammar[0][0]
final_nodes = [n for n in self.parse_table[-1][0] if n.symbol == start_symbol]
if final_nodes:
if output:
print("The given sentence is contained in the language produced by the given grammar!")
print("\nPossible parse(s):")
trees = [generate_tree(node) for node in final_nodes]
if output:
for tree in trees:
print(tree)
else:
return trees
else:
print("The given sentence is not contained in the language produced by the given grammar!")
def generate_tree(node):
"""
Generates the string representation of the parse tree.
:param node: the root node.
:return: the parse tree in string form.
"""
if node.child2 is None:
return f"[{node.symbol} '{node.child1}']"
return f"[{node.symbol} {generate_tree(node.child1)} {generate_tree(node.child2)}]"
if __name__ == '__main__':
argparser = argparse.ArgumentParser()
argparser.add_argument("grammar",
help="File containing the grammar or string directly representing the grammar.")
argparser.add_argument("sentence",
help="File containing the sentence or string directly representing the sentence.")
args = argparser.parse_args()
CYK = Parser(args.grammar, args.sentence)
CYK.parse()
CYK.print_tree()