1 # Yapps 2.0 - yet another python parser system
2 # Amit J Patel, January 1999
3 # See http://theory.stanford.edu/~amitp/Yapps/ for documentation and updates
5 # v2.0.1 changes (October 2001):
6 # * The exceptions inherit the standard Exception class (thanks Rich Salz)
7 # * The scanner can use either a different set of regular expressions
8 # per instance, or allows the subclass to define class fields with
9 # the patterns. This improves performance when many Scanner objects
10 # are being created, because the regular expressions don't have to
11 # be recompiled each time. (thanks Amaury Forgeot d'Arc)
12 # v2.0.2 changes (April 2002)
13 # * Fixed a bug in generating the 'else' clause when the comment was too
14 # long. v2.0.1 was missing a newline. (thanks Steven Engelhardt)
15 # v2.0.3 changes (August 2002)
16 # * Fixed a bug with inline tokens using the r"" syntax.
25 def __init__(self, name, options, tokens, rules):
28 self.options = options
30 self.postparser = None
32 self.tokens = {} # Map from tokens to regexps
33 self.ignore = [] # List of token names to ignore in parsing
34 self.terminals = [] # List of token names (to maintain ordering)
39 if n in self.tokens.keys() and self.tokens[n] != t:
40 print 'Warning: token', n, 'multiply defined.'
42 self.terminals.append(n)
44 self.rules = {} # Map from rule names to parser nodes
45 self.params = {} # Map from rule names to parameters
46 self.goals = [] # List of rule names (to maintain ordering)
53 self.output = sys.stdout
55 def __getitem__(self, name):
57 return self.options.get(name, 0)
def non_ignored_tokens(self):
    """Return the terminal names not listed in self.ignore.

    Preserves the declaration order of self.terminals.  A list
    comprehension replaces the dated filter+lambda-default idiom; the
    result is identical in Python 2 and stays a list under Python 3.
    """
    return [t for t in self.terminals if t not in self.ignore]
63 self.change_count = 1+self.change_count
65 def subset(self, a, b):
66 "See if all elements of a are inside b"
68 if x not in b: return 0
71 def equal_set(self, a, b):
72 "See if a and b have the same elements"
73 if len(a) != len(b): return 0
75 return self.subset(a, b) and self.subset(b, a)
77 def add_to(self, parent, additions):
78 "Modify parent to include all elements in additions"
84 def equate(self, a, b):
88 def write(self, *args):
92 def in_test(self, x, full, b):
94 if len(b)==1: return '%s == %s' % (x, `b[0]`)
95 if full and len(b) > len(full)/2:
96 # Reverse the sense of the test.
97 not_b = filter(lambda x, b=b: x not in b, full)
98 return self.not_in_test(x, full, not_b)
99 return '%s in %s' % (x, `b`)
101 def not_in_test(self, x, full, b):
103 if len(b)==1: return '%s != %s' % (x, `b[0]`)
104 return '%s not in %s' % (x, `b`)
106 def peek_call(self, a):
108 if self.equal_set(a, self.non_ignored_tokens()): a_set = ''
109 if self['context-insensitive-scanner']: a_set = ''
110 return 'self._peek(%s)' % a_set
def peek_test(self, a, b):
    """Return source text for a test that the next token (drawn from set
    a) belongs to set b."""
    if self.subset(a, b):
        # b covers every token a allows, so the test is trivially true.
        return '1'
    if self['context-insensitive-scanner']:
        # Such scanners may return ANY token, not just those in a.
        a = self.non_ignored_tokens()
    return self.in_test(self.peek_call(a), a, b)
def not_peek_test(self, a, b):
    """Return source text for a test that the next token (drawn from set
    a) is NOT in set b."""
    if self.subset(a, b):
        # b covers everything a allows; the negated test can never hold.
        return '0'
    return self.not_in_test(self.peek_call(a), a, b)
124 self.rules[r].setup(self, r)
125 if self.change_count == 0: break
126 self.change_count = 0
130 self.rules[r].update(self)
131 if self.change_count == 0: break
132 self.change_count = 0
134 def dump_information(self):
137 print ' _____' + '_'*len(r)
138 print ('___/Rule '+r+'\\' + '_'*80)[:79]
139 queue = [self.rules[r]]
148 if top.accepts_epsilon: eps = ['(null)']
149 print ' FIRST:', join(top.first+eps, ', ')
150 print ' FOLLOW:', join(top.follow, ', ')
151 for x in top.get_children(): queue.append(x)
153 def generate_output(self):
155 self.write(self.preparser)
156 self.write("from string import *\n")
157 self.write("import re\n")
158 self.write("from yappsrt import *\n")
160 self.write("class ", self.name, "Scanner(Scanner):\n")
161 self.write(" patterns = [\n")
162 for p in self.terminals:
163 self.write(" (%s, re.compile(%s)),\n" % (
164 `p`, `self.tokens[p]`))
166 self.write(" def __init__(self, str):\n")
167 self.write(" Scanner.__init__(self,None,%s,str)\n" %
171 self.write("class ", self.name, "(Parser):\n")
173 self.write(INDENT, "def ", r, "(self")
174 if self.params[r]: self.write(", ", self.params[r])
176 self.rules[r].output(self, INDENT+INDENT)
180 self.write("def parse(rule, text):\n")
181 self.write(" P = ", self.name, "(", self.name, "Scanner(text))\n")
182 self.write(" return wrap_error_reporter(P, rule)\n")
184 if self.postparser is not None:
185 self.write(self.postparser)
187 self.write("if __name__=='__main__':\n")
188 self.write(INDENT, "from sys import argv, stdin\n")
189 self.write(INDENT, "if len(argv) >= 2:\n")
190 self.write(INDENT*2, "if len(argv) >= 3:\n")
191 self.write(INDENT*3, "f = open(argv[2],'r')\n")
192 self.write(INDENT*2, "else:\n")
193 self.write(INDENT*3, "f = stdin\n")
194 self.write(INDENT*2, "print parse(argv[1], f.read())\n")
195 self.write(INDENT, "else: print 'Args: <rule> [<filename>]'\n")
197 ######################################################################
202 self.accepts_epsilon = 0
205 def setup(self, gen, rule):
206 # Setup will change accepts_epsilon,
207 # sometimes from 0 to 1 but never 1 to 0.
208 # It will take a finite number of steps to set things up
211 def used(self, vars):
212 "Return two lists: one of vars used, and the other of vars assigned"
215 def get_children(self):
216 "Return a list of sub-nodes"
def update(self, gen):
    """If this node can match the empty string, everything that may
    follow it may also begin it, so fold FOLLOW into FIRST."""
    if not self.accepts_epsilon:
        return
    gen.add_to(self.first, self.follow)
def output(self, gen, indent):
    """Emit code into _gen_ at _indent_ (a string of spaces).

    The base class emits a fail-fast placeholder; subclasses are
    expected to override this.
    """
    gen.write(indent, "assert 0 # Invalid parser node\n")
230 class Terminal(Node):
231 def __init__(self, token):
234 self.accepts_epsilon = 0
239 def update(self, gen):
240 Node.update(self, gen)
241 if self.first != [self.token]:
242 self.first = [self.token]
245 def output(self, gen, indent):
247 if re.match('[a-zA-Z_]+$', self.token):
248 gen.write(self.token, " = ")
249 gen.write("self._scan(%s)\n" % `self.token`)
252 def __init__(self, expr):
256 def setup(self, gen, rule):
257 Node.setup(self, gen, rule)
258 if not self.accepts_epsilon:
259 self.accepts_epsilon = 1
263 return '{{ %s }}' % strip(self.expr)
def output(self, gen, indent):
    """Write the embedded statement at the given indentation.

    Uses the str.strip() method rather than string.strip(); the
    string-module function form relies on the file-wide
    'from string import *' and was removed in Python 3.
    """
    gen.write(indent, self.expr.strip(), '\n')
268 class NonTerminal(Node):
269 def __init__(self, name, args):
274 def setup(self, gen, rule):
275 Node.setup(self, gen, rule)
277 self.target = gen.rules[self.name]
278 if self.accepts_epsilon != self.target.accepts_epsilon:
279 self.accepts_epsilon = self.target.accepts_epsilon
281 except KeyError: # Oops, it's nonexistent
282 print 'Error: no rule <%s>' % self.name
286 return '<%s>' % self.name
def update(self, gen):
    """Keep FIRST/FOLLOW synchronized with the rule this reference
    points at (self.target)."""
    Node.update(self, gen)
    for own, ref in ((self.first, self.target.first),
                     (self.follow, self.target.follow)):
        gen.equate(own, ref)
293 def output(self, gen, indent):
295 gen.write(self.name, " = ")
296 gen.write("self.", self.name, "(", self.args, ")\n")
298 class Sequence(Node):
299 def __init__(self, *children):
301 self.children = children
303 def setup(self, gen, rule):
304 Node.setup(self, gen, rule)
305 for c in self.children: c.setup(gen, rule)
307 if not self.accepts_epsilon:
308 # If it's not already accepting epsilon, it might now do so.
309 for c in self.children:
310 # any non-epsilon means all is non-epsilon
311 if not c.accepts_epsilon: break
313 self.accepts_epsilon = 1
316 def get_children(self):
320 return '( %s )' % join(map(lambda x: str(x), self.children))
322 def update(self, gen):
323 Node.update(self, gen)
324 for g in self.children:
328 for g_i in range(len(self.children)):
329 g = self.children[g_i]
331 if empty: gen.add_to(self.first, g.first)
332 if not g.accepts_epsilon: empty = 0
334 if g_i == len(self.children)-1:
337 next = self.children[1+g_i].first
338 gen.add_to(g.follow, next)
341 gen.add_to(self.follow, self.children[-1].follow)
343 def output(self, gen, indent):
345 for c in self.children:
346 c.output(gen, indent)
348 # Placeholder for empty sequences, just in case
349 gen.write(indent, 'pass\n')
352 def __init__(self, *children):
354 self.children = children
356 def setup(self, gen, rule):
357 Node.setup(self, gen, rule)
358 for c in self.children: c.setup(gen, rule)
360 if not self.accepts_epsilon:
361 for c in self.children:
362 if c.accepts_epsilon:
363 self.accepts_epsilon = 1
366 def get_children(self):
370 return '( %s )' % join(map(lambda x: str(x), self.children), ' | ')
372 def update(self, gen):
373 Node.update(self, gen)
374 for g in self.children:
377 for g in self.children:
378 gen.add_to(self.first, g.first)
379 gen.add_to(self.follow, g.follow)
380 for g in self.children:
381 gen.add_to(g.follow, self.follow)
382 if self.accepts_epsilon:
383 gen.add_to(self.first, self.follow)
385 def output(self, gen, indent):
387 gen.write(indent, "_token_ = ", gen.peek_call(self.first), "\n")
389 tokens_unseen = self.first[:]
390 if gen['context-insensitive-scanner']:
391 # Context insensitive scanners can return ANY token,
392 # not only the ones in first.
393 tokens_unseen = gen.non_ignored_tokens()
394 for c in self.children:
401 if x in tokens_unseen: tokens_unseen.remove(x)
402 tokens_seen = tokens_seen + testset
405 print 'Error in rule', self.rule+':', c, 'never matches.'
407 print 'Warning:', self
408 print ' * These tokens are being ignored:', join(removed, ', ')
409 print ' due to previous choices using them.'
412 if not tokens_unseen: # context sensitive scanners only!
414 # if it's the first AND last test, then
415 # we can simply put the code without an if/else
416 c.output(gen, indent)
418 gen.write(indent, "else: ")
419 t = gen.in_test('', [], testset)
420 if len(t) < 70-len(indent):
423 c.output(gen, indent+INDENT)
425 gen.write(indent, test, " ",
426 gen.in_test('_token_', tokens_unseen, testset),
428 c.output(gen, indent+INDENT)
431 if gen['context-insensitive-scanner'] and tokens_unseen:
432 gen.write(indent, "else:\n")
433 gen.write(indent, INDENT, "raise SyntaxError(self._pos, ")
434 gen.write("'Could not match ", self.rule, "')\n")
437 def __init__(self, child):
def setup(self, gen, rule):
    """Initialize this node, then recurse into the wrapped child."""
    Node.setup(self, gen, rule)
    self.child.setup(gen, rule)
445 def get_children(self):
def update(self, gen):
    """Refresh the child, then absorb its sets: anything that can start
    the child can start us, and whatever follows us follows it."""
    Node.update(self, gen)
    self.child.update(gen)
    gen.add_to(self.first, self.child.first)
    gen.equate(self.follow, self.child.follow)
454 class Option(Wrapper):
455 def setup(self, gen, rule):
456 Wrapper.setup(self, gen, rule)
457 if not self.accepts_epsilon:
458 self.accepts_epsilon = 1
462 return '[ %s ]' % str(self.child)
464 def output(self, gen, indent):
465 if self.child.accepts_epsilon:
466 print 'Warning in rule', self.rule+': contents may be empty.'
467 gen.write(indent, "if %s:\n" %
468 gen.peek_test(self.first, self.child.first))
469 self.child.output(gen, indent+INDENT)
472 def setup(self, gen, rule):
473 Wrapper.setup(self, gen, rule)
474 if self.accepts_epsilon != self.child.accepts_epsilon:
475 self.accepts_epsilon = self.child.accepts_epsilon
479 return '%s+' % str(self.child)
def update(self, gen):
    """A '+' repetition may start over after each iteration, so its own
    FIRST set also belongs in its FOLLOW set."""
    Wrapper.update(self, gen)
    gen.add_to(self.follow, self.first)
485 def output(self, gen, indent):
486 if self.child.accepts_epsilon:
487 print 'Warning in rule', self.rule+':'
488 print ' * The repeated pattern could be empty. The resulting'
489 print ' parser may not work properly.'
490 gen.write(indent, "while 1:\n")
491 self.child.output(gen, indent+INDENT)
492 union = self.first[:]
493 gen.add_to(union, self.follow)
494 gen.write(indent+INDENT, "if %s: break\n" %
495 gen.not_peek_test(union, self.child.first))
498 def setup(self, gen, rule):
499 Wrapper.setup(self, gen, rule)
500 if not self.accepts_epsilon:
501 self.accepts_epsilon = 1
505 return '%s*' % str(self.child)
507 def output(self, gen, indent):
508 if self.child.accepts_epsilon:
509 print 'Warning in rule', self.rule+':'
510 print ' * The repeated pattern could be empty. The resulting'
511 print ' parser probably will not work properly.'
512 gen.write(indent, "while %s:\n" %
513 gen.peek_test(self.follow, self.child.first))
514 self.child.output(gen, indent+INDENT)
516 ######################################################################
517 # The remainder of this file is from parsedesc.{g,py}
def add_inline_token(tokens, str):
    """Prepend an inline token to *tokens* as a (pattern, value) pair.

    *str* is the quoted literal text as it appeared in the grammar;
    eval() turns it into the actual string value it denotes.

    NOTE(security): eval() executes arbitrary expressions even with
    empty globals/locals -- acceptable only because grammar files are
    trusted input here.
    """
    # 'str' shadows the builtin; the name is kept for interface
    # compatibility with existing callers.
    tokens.insert(0, (str, eval(str, {}, {})))
def cleanup_choice(lst):
    """Collapse a list of alternative nodes into a single node.

    An empty list becomes an empty Sequence, a single item is returned
    unchanged, and anything longer becomes a Choice over all items.
    """
    if not lst:
        return Sequence([])
    if len(lst) == 1:
        return lst[0]
    # apply() was removed in Python 3; extended call syntax is the
    # direct equivalent and works in Python 2 as well.
    return Choice(*lst)
def cleanup_sequence(lst):
    """Collapse a list of sequential nodes into a single node.

    A single item is returned unchanged; otherwise the items are
    wrapped in a Sequence.
    """
    if len(lst) == 1:
        return lst[0]
    # apply() was removed in Python 3; extended call syntax is the
    # direct equivalent and works in Python 2 as well.
    return Sequence(*lst)
537 def cleanup_rep(node, rep):
538 if rep == 'star': return Star(node)
539 elif rep == 'plus': return Plus(node)
542 def resolve_name(tokens, id, args):
543 if id in map(lambda x: x[0], tokens):
546 print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args)
549 # It's a name, so assume it's a nonterminal
550 return NonTerminal(id, args)
555 from yappsrt import *
557 class ParserDescriptionScanner(Scanner):
558 def __init__(self, str):
559 Scanner.__init__(self,[
561 ('"ignore"', 'ignore'),
562 ('"token"', 'token'),
563 ('"option"', 'option'),
565 ('"parser"', 'parser'),
566 ('[ \011\015\012]+', '[ \011\015\012]+'),
567 ('#.*?\015?\012', '#.*?\015?\012'),
571 ('ID', '[a-zA-Z_][a-zA-Z_0-9]*'),
572 ('STR', '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"'),
580 ], ['[ \011\015\012]+', '#.*?\015?\012'], str)
582 class ParserDescription(Parser):
584 self._scan('"parser"')
585 ID = self._scan('ID')
587 Options = self.Options()
588 Tokens = self.Tokens()
589 Rules = self.Rules(Tokens)
590 END = self._scan('END')
591 return Generator(ID,Options,Tokens,Rules)
595 while self._peek('"option"', '"token"', '"ignore"', 'END', '"rule"') == '"option"':
596 self._scan('"option"')
604 while self._peek('"token"', '"ignore"', 'END', '"rule"') in ['"token"', '"ignore"']:
605 _token_ = self._peek('"token"', '"ignore"')
606 if _token_ == '"token"':
607 self._scan('"token"')
608 ID = self._scan('ID')
611 tok.append( (ID,Str) )
612 else: # == '"ignore"'
613 self._scan('"ignore"')
616 tok.append( ('#ignore',Str) )
619 def Rules(self, tokens):
621 while self._peek('"rule"', 'END') == '"rule"':
623 ID = self._scan('ID')
624 OptParam = self.OptParam()
626 ClauseA = self.ClauseA(tokens)
627 rul.append( (ID,OptParam,ClauseA) )
630 def ClauseA(self, tokens):
631 ClauseB = self.ClauseB(tokens)
633 while self._peek('OR', 'RP', 'RB', '"rule"', 'END') == 'OR':
634 OR = self._scan('OR')
635 ClauseB = self.ClauseB(tokens)
637 return cleanup_choice(v)
639 def ClauseB(self, tokens):
641 while self._peek('STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END') in ['STR', 'ID', 'LP', 'LB', 'STMT']:
642 ClauseC = self.ClauseC(tokens)
644 return cleanup_sequence(v)
646 def ClauseC(self, tokens):
647 ClauseD = self.ClauseD(tokens)
648 _token_ = self._peek('PLUS', 'STAR', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END')
649 if _token_ == 'PLUS':
650 PLUS = self._scan('PLUS')
652 elif _token_ == 'STAR':
653 STAR = self._scan('STAR')
658 def ClauseD(self, tokens):
659 _token_ = self._peek('STR', 'ID', 'LP', 'LB', 'STMT')
661 STR = self._scan('STR')
662 t = (STR, eval(STR,{},{}))
663 if t not in tokens: tokens.insert( 0, t )
665 elif _token_ == 'ID':
666 ID = self._scan('ID')
667 OptParam = self.OptParam()
668 return resolve_name(tokens, ID, OptParam)
669 elif _token_ == 'LP':
670 LP = self._scan('LP')
671 ClauseA = self.ClauseA(tokens)
672 RP = self._scan('RP')
674 elif _token_ == 'LB':
675 LB = self._scan('LB')
676 ClauseA = self.ClauseA(tokens)
677 RB = self._scan('RB')
678 return Option(ClauseA)
680 STMT = self._scan('STMT')
681 return Eval(STMT[2:-2])
684 if self._peek('ATTR', '":"', 'PLUS', 'STAR', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END') == 'ATTR':
685 ATTR = self._scan('ATTR')
690 STR = self._scan('STR')
691 return eval(STR,{},{})
697 # This replaces the default main routine
700 ('context-insensitive-scanner', 'context-insensitive-scanner',
701 'Scan all tokens (see docs)')
704 def generate(inputfilename, outputfilename='', dump=0, **flags):
705 """Generate a grammar, given an input filename (X.g)
706 and an output filename (defaulting to X.py)."""
708 if not outputfilename:
709 if inputfilename[-2:]=='.g': outputfilename = inputfilename[:-2]+'.py'
710 else: raise "Invalid Filename", outputfilename
712 print 'Input Grammar:', inputfilename
713 print 'Output File:', outputfilename
715 DIVIDER = '\n%%\n' # This pattern separates the pre/post parsers
716 preparser, postparser = None, None # Code before and after the parser desc
718 # Read the entire file
719 s = open(inputfilename,'r').read()
721 # See if there's a separation between the pre-parser and parser
723 if f >= 0: preparser, s = s[:f]+'\n\n', s[f+len(DIVIDER):]
725 # See if there's a separation between the parser and post-parser
727 if f >= 0: s, postparser = s[:f], '\n\n'+s[f+len(DIVIDER):]
729 # Create the parser and scanner
730 p = ParserDescription(ParserDescriptionScanner(s))
734 t = wrap_error_reporter(p, 'Parser')
735 if not t: return # Error
736 if preparser is not None: t.preparser = preparser
737 if postparser is not None: t.postparser = postparser
740 for f in t.options.keys():
741 for opt,_,_ in yapps_options:
744 print 'Warning: unrecognized option', f
745 # Add command line options to the set
746 for f in flags.keys(): t.options[f] = flags[f]
748 # Generate the output
752 t.output = open(outputfilename, 'w')
755 if __name__=='__main__':
757 optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['dump'])
758 if not args or len(args) > 2:
760 print ' python', sys.argv[0], '[flags] input.g [output.py]'
762 print (' --dump' + ' '*40)[:35] + 'Dump out grammar information'
763 for flag, _, doc in yapps_options:
764 print (' -f' + flag + ' '*40)[:35] + doc
766 # Read in the options and create a list of flags
769 for flag, name, _ in yapps_options:
770 if opt == ('-f', flag):
774 if opt == ('--dump', ''):
777 print 'Warning - unrecognized option: ', opt[0], opt[1]
779 apply(generate, tuple(args), flags)