Parsing with PLY: The Good, Bad, and Ugly
PLY is a Python implementation of Lex and Yacc originally written for Unix. Previously, I used it for tokenizing prerequisite strings. The goal is to have a program that performs the following trick:
|
At first glance, it looks like a simple task. Just replace “,” “and” with “,”, and replace “/” “or” with “/”. Regular expressions could do that. Then there comes the issue with the comma and its interpretation depending on the level in the hierarchy. So using PLY to do the same thing turned out to be much easier, since grammar rules are used instead of regular expressions. I put together a quick prototype in an hour.
import sys
sys.path.append("/Documents/Projects/ply/")
from prereqlex import tokens
def p_prereq_cp(p):
'prereq : cp'
p[0] = p [1]
def p_prereq_cp_and(p):
'cp : course AND cp'
p[0] = p[1] + ' ' + p[3]
def p_prereq_cp_or(p):
'cp : course OR cp'
p[0] = p[1] + ' ' + p[3]
def p_prereq_cp_comma(p):
'cp : course COMMA cp'
p[0] = p[1] + ' ' + p[3]
def p_prereq_cp_parens(p):
'cp : LPARENS cp RPARENS'
p[0] = p[2]
def p_prereq_cp_course(p):
'cp : course'
p[0] = p[1]
def p_prereq_course_extra(p):
'course : course OR NUM'
p[0] = p[1] + ' ' + p[3]
def p_prereq_course_default(p):
'course : DEPT NUM'
p[0] = p[1] + ' ' + p[2]
# Error rule for syntax errors
def p_error(p):
print "Syntax error in input!"
# Build the parser
parser = yacc.yacc()
while True:
try:
s = raw_input('> ')
except EOFError:
break
if not s: continue
result = parser.parse(s)
print result
> MATH 125 or 135
MATH 125 135
> MATH 125 or MATH 135
MATH 125 MATH 135
That’s pretty good for a start, but doesn’t get much better for interpreting the input correctly.
'prereq : cp SEMI lp SEMI ep'
p[0] = p[1]
def p_prereq_cp_lp(p):
'prereq : cp SEMI lp'
p[0] = (p[1] , 'Level ' + p[3])
def p_prereq_cp_ep(p):
'prereq : cp SEMI ep'
p[0] = p[1] + ' Exclude ' + p[3]
def p_prereq_cp(p):
'prereq : cp'
p[0] = p[1]
def p_prereq_cp_and(p):
'cp : cp AND cg'
p[0] = ('AND', p[1], p[3])
def p_prereq_cp_comma(p):
'cp : cp COMMA cg'
p[0] = ('COMMA', p[1], p[3])
def p_prereq_cp_comma_and(p):
'cp : cp COMMA AND cg'
p[0] = ('COMMA', p[1], p[4])
def p_prereq_cp_semi(p):
'cp : cp SEMI cg'
p[0] = ('SEMI', p[1], p[3])
def p_prereq_cp_semi_and(p):
'cp : cp SEMI AND cg'
p[0] = ('SEMI', p[1], p[4])
def p_prereq_cp_defualt(p):
'cp : cg'
p[0] = p[1]
def p_prereq_cg_or(p):
'cg : cg OR courses'
p[0] = ('OR', p[1], p[3])
def p_prereq_cg_slash(p):
'cg : cg SLASH courses'
p[0] = ('SLASH', p[1], p[3])
def p_prereq_cg_comma(p):
'cg : cg COMMA courses'
p[0] = ('COMMA', p[1], p[3])
def p_prereq_cg_comma_or(p):
'cg : cg COMMA OR courses'
p[0] = ('COMMA', p[1], p[4])
def p_prereq_cg_parens(p):
'cg : LPARENS cg RPARENS'
p[0] = p[2]
def p_prereq_cg_one_of(p):
'cg : ONE OF cg'
p[0] = ('OR', p[3])
def p_prereq_cg_courses(p):
'cg : courses'
p[0] = p[1]
def p_prereq_courses_or(p):
'courses : courses OR course'
p[0] = ('OR', p[1], p[3])
def p_prereq_courses_slash(p):
'courses : courses SLASH course'
p[0] = ('SLASH', p[1], p[3])
def p_prereq_courses_comma(p):
'courses : courses COMMA course'
p[0] = ('COMMA', p[1], p[3])
def p_prereq_courses_default(p):
'courses : course'
p[0] = p[1]
def p_prereq_course_default(p):
'course : DEPT NUM'
p[0] = p[1] + ' ' + p[2]
def p_prereq_course_short(p):
'course : NUM'
p[0] = p[1]
def p_prereq_lp(p):
'lp : LEVEL AT LEAST TERM'
p[0] = p[4]
def p_prereq_ep(p):
'ep : NOT OPEN TO'
p[0] = ''
precedence = (
('left', 'AND', 'SEMI'),
('left', 'COMMA'),
('left', 'SLASH'),
('left', 'OR'),
('left', 'DEPT'),
('right', 'NUM'),
)
# Error rule for syntax errors
def p_error(p):
print "Syntax error at '%s'" % p
import logging
logging.basicConfig(
level = logging.INFO,
filename = "parselog.txt",
filemode = "w",
format = "%(filename)10s:%(lineno)4d:%(message)s"
)
log = logging.getLogger()
parser = yacc.yacc(debug=True,debuglog=log)
prereqs = ('CHEM 120, 123; one of PHYS 111, 121; and one of PHYS 112, 122',
'EARTH 121, 121L or 122, 122L or 153 or CIVE 153 or CIVE 253 or GEOE 153 or ENVE 153',
'PHYS 112 or 122 or 125',
'CHEM 254 or EARTH 421, CHEM 350',
'(One of PHYS 112, 122, 125) and (one of MATH 118, 119, 128, 138, 148) and (one of AMATH 250, 350, CIVE 222, ENVE 223, MATH 218, 228, ME 203, SYDE 211)',
'One of PSCI 250, 255, 260, 264',
'One of ANTH 201/CLAS 221, ANTH 203, 233',
'MATH 135 or 145, STAT 220 or 230 or 240',
'AMATH 250 and MATH 237 or 247; Level at least 3A',
'BIOL 140, 140L, (BIOL 208 or 330)',
'(CS 240 or SE 240), CS 246, ECE 222',
'CS 246, (SE 240 or CS 240)',
'(AMATH 341/CM 271/ CS 371 or CS 370) and (MATH 128 or 138 or 148)',
'MATH 135 or 145, STAT 220 or 230 or 240',
'AMATH 250 and MATH 237',
'(BIOL 140,BIOL 208 or 330) and (BIOL 308 or 330)',
'MATH 229 or 239 or 249 and (One of CO 227, 250/350, 355, CO 352/CM 340)',
'(CS 240 or SE 240), CS 246, ECE 222',
'CS 241 and (STAT 206 or 230 or 240)')
limit = float("infinity");
i = 0;
for s in prereqs:
if i < limit:
result = parser.parse(s, debug=log)
print result
i += 1
Output:
('SEMI', ('SEMI', ('COMMA', 'CHEM 120', '123'), ('OR', ('COMMA', 'PHYS 111', '121'))), ('OR', ('COMMA', 'PHYS 112', '122'))) ('OR', ('OR', ('OR', ('OR', ('OR', ('COMMA', ('OR', ('COMMA', 'EARTH 121', '121L'), '122'), '122L'), '153'), 'CIVE 153'), 'CIVE 253'), 'GEOE 153'), 'ENVE 153') ('OR', ('OR', 'PHYS 112', '122'), '125') ('COMMA', ('OR', 'CHEM 254', 'EARTH 421'), 'CHEM 350') ('AND', ('AND', ('OR', ('COMMA', ('COMMA', 'PHYS 112', '122'), '125')), ('OR', ('COMMA', ('COMMA', ('COMMA', ('COMMA', 'MATH 118', '119'), '128'), '138'), '148'))), ('OR', ('COMMA', ('COMMA', ('COMMA', ('COMMA', ('COMMA', ('COMMA', ('COMMA', 'AMATH 250', '350'), 'CIVE 222'), 'ENVE 223'), 'MATH 218'), '228'), 'ME 203'), 'SYDE 211'))) ('OR', ('COMMA', ('COMMA', ('COMMA', 'PSCI 250', '255'), '260'), '264')) ('OR', ('COMMA', ('COMMA', ('SLASH', 'ANTH 201', 'CLAS 221'), 'ANTH 203'), '233')) ('OR', ('OR', ('COMMA', ('OR', 'MATH 135', '145'), 'STAT 220'), '230'), '240') (('AND', 'AMATH 250', ('OR', 'MATH 237', '247')), 'Level 3A') Syntax error at 'LexToken(LPARENS,'(',1,16)' Syntax error at 'LexToken(RPARENS,')',1,32)' None ('COMMA', ('OR', 'CS 240', 'SE 240'), ('COMMA', 'CS 246', 'ECE 222')) Syntax error at 'LexToken(LPARENS,'(',1,8)' Syntax error at 'LexToken(RPARENS,')',1,25)' None ('AND', ('OR', ('SLASH', ('SLASH', 'AMATH 341', 'CM 271'), 'CS 371'), 'CS 370'), ('OR', ('OR', 'MATH 128', '138'), '148')) ('OR', ('OR', ('COMMA', ('OR', 'MATH 135', '145'), 'STAT 220'), '230'), '240') ('AND', 'AMATH 250', 'MATH 237') ('AND', ('OR', ('COMMA', 'BIOL 140', 'BIOL 208'), '330'), ('OR', 'BIOL 308', '330')) ('AND', ('OR', ('OR', 'MATH 229', '239'), '249'), ('OR', ('SLASH', ('COMMA', ('COMMA', ('SLASH', ('COMMA', 'CO 227', '250'), '350'), '355'), 'CO 352'), 'CM 340'))) ('COMMA', ('OR', 'CS 240', 'SE 240'), ('COMMA', 'CS 246', 'ECE 222')) ('AND', 'CS 241', ('OR', ('OR', 'STAT 206', '230'), '240'))
This was the BNF grammar encoded:
prereq : cp cp : cp AND cgparens | cp COMMA cgparens | cp COMMA AND cgparens | cp SEMI cgparens | cp SEMI AND cgparens | cg cg : cg OR courses | cg SLASH courses | cg COMMA courses | cg COMMA OR courses | LPARENS cg RPARENS | ONE OF cg | courses courses : courses OR course | courses SLASH course | courses COMMA course | course course : DEPT NUM | NUM
, which gives a surprising error whenever parenthesis appeared:
yacc.py: 389:Action : Reduce rule [course -> DEPT NUM] with ['BIOL','140'] and goto state 23 yacc.py: 423:Result : ('BIOL 140') yacc.py: 389:Action : Reduce rule [courses -> course] with ['BIOL 140'] and goto state 22 yacc.py: 423:Result : ('BIOL 140') yacc.py: 389:Action : Reduce rule [course -> NUM] with ['140L'] and goto state 24 yacc.py: 423:Result : ('140L') yacc.py: 389:Action : Reduce rule [courses -> courses COMMA course] with ['BIOL 140',',','140L'] and goto state 21 yacc.py: 423:Result : (('COMMA', 'BIOL 140', '140L')) yacc.py: 493:Error : courses COMMA . LexToken(LPARENS,'(',1,16) yacc.py: 493:Error : courses COMMA . error yacc.py: 493:Error : courses . error yacc.py: 493:Error : . error yacc.py: 389:Action : Reduce rule [course -> DEPT NUM] with ['BIOL','208'] and goto state 23 yacc.py: 423:Result : ('BIOL 208') yacc.py: 389:Action : Reduce rule [courses -> course] with ['BIOL 208'] and goto state 22 yacc.py: 423:Result : ('BIOL 208') yacc.py: 389:Action : Reduce rule [course -> NUM] with ['330'] and goto state 24 yacc.py: 423:Result : ('330') yacc.py: 389:Action : Reduce rule [courses -> courses OR course] with ['BIOL 208','or','330'] and goto state 19 yacc.py: 423:Result : (('OR', 'BIOL 208', '330')) yacc.py: 389:Action : Reduce rule [cg -> courses] with [] and goto state 18 yacc.py: 423:Result : (('OR', 'BIOL 208', '330')) yacc.py: 493:Error : cg . LexToken(RPARENS,')',1,32) yacc.py: 493:Error : cg . error yacc.py: 493:Error : . error yacc.py: 493:Error : . $end
It means the parser just refused to use the rule cg : courses and subsequently cg : LPARENS cg RPARENS. At that time, I thought it must be something wrong with the grammar. Forgetting the parenthesis would have been an option, only if the compiler gave the correct interpretation. With that in mind, I worked on getting the final result:
'prereq : cp SEMI lp SEMI ep'
p[0] = p[1]
def p_prereq_cp_lp(p):
'prereq : cp SEMI lp'
p[0] = p[1] + 'Level ' + p[3]
def p_prereq_cp_ep(p):
'prereq : cp SEMI ep'
p[0] = p[1] + ' Exclude ' + p[3]
def p_prereq_cp(p):
'prereq : cp'
p[0] = p[1]
def p_cp(p):
'''cp : cg
| cp AND cg
| cp COMMA cg
| cp COMMA AND cg
| cp SEMI cg
| cp SEMI AND cg'''
if (len(p) == 2):
p[0] = p[1]
elif (len(p) == 4):
p[0] = p[1] + ', ' + p[3]
else:
p[0] = p[1] + ', ' + p[4]
def p_cg(p):
'''cg : cl
| LPARENS cg RPARENS
| ONE OF cl'''
if (len(p) == 2):
p[0] = p[1]
elif (p[1] == 'LPARENS'):
p[0] = p[2]
else:
p[0] = p[3]
def p_cl(p):
'''cl : courses
| cl OR courses
| cl SLASH courses
| cl COMMA courses'''
if (len(p) == 2):
p[0] = p[1]
else:
p[0] = p[1] + '/' + p[3]
def p_courses(p):
'courses : DEPT nums'
p[0] = p[1] + ' ' + p[2]
def p_nums(p):
'''nums : NUM
| nums OR NUM
| nums SLASH NUM
| nums COMMA NUM'''
if (len(p) == 2):
p[0] = p[1]
else:
p[0] = p[1] + '/' + p[3]
def p_lp(p):
'lp : LEVEL AT LEAST TERM'
p[0] = p[4]
def p_ep(p):
'ep : NOT OPEN TO'
p[0] = ''
# Error rule for syntax errors
def p_error(p):
print "Syntax error at '%s'" % p
I realize at this point that PLY couldn’t fly:
yacc.py: 389:Action : Reduce rule [nums -> NUM] with ['121'] and goto state 19 yacc.py: 423:Result : ('121') yacc.py: 389:Action : Reduce rule [nums -> nums COMMA NUM] with ['121',',','121L'] and goto state 22 yacc.py: 423:Result : ('121/121L') yacc.py: 389:Action : Reduce rule [nums -> nums OR NUM] with ['121/121L','or','122'] and goto state 20 yacc.py: 423:Result : ('121/121L/122') yacc.py: 389:Action : Reduce rule [nums -> nums COMMA NUM] with ['121/121L/122',',','122L'] and goto state 22 yacc.py: 423:Result : ('121/121L/122/122L') yacc.py: 389:Action : Reduce rule [nums -> nums OR NUM] with [,'or','153'] and goto state 20 yacc.py: 423:Result : ('121/121L/122/122L/153') yacc.py: 493:Error : DEPT nums OR . LexToken(DEPT,'CIVE',1,39)
It seems to be stuck on the rule nums : nums OR NUM. I did a test by deleting that rule. Then it parsed the rest of the string fine, just missing a few NUMs. For some reason, PLY just refused to use the rule cl : cl OR courses. I checked the PLY documentation. Yup, PLY turned out to be an iron balloon. “shift/reduce conflicts are resolved in favor of shifting (the default).” Of course, once the mystery is solved, the solution is not hard to find.