1 -----------------------------------------------------------------------------
3 -- Module : Text.Parsec.Token
4 -- Copyright : (c) Daan Leijen 1999-2001, (c) Paolo Martini 2007
5 -- License : BSD-style (see the LICENSE file)
7 -- Maintainer : derek.a.elkins@gmail.com
8 -- Stability : provisional
9 -- Portability : non-portable (uses local universal quantification: PolymorphicComponents)
11 -- A helper module to parse lexical elements (tokens). See 'makeTokenParser'
12 -- for a description of how to use it.
14 -----------------------------------------------------------------------------
16 {-# LANGUAGE PolymorphicComponents #-}
17 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
19 module Text.Parsec.Token
27 import Data.Char ( isAlpha, toLower, toUpper, isSpace, digitToInt )
28 import Data.List ( nub, sort )
29 import Control.Monad.Identity
30 import Text.Parsec.Prim
31 import Text.Parsec.Char
32 import Text.Parsec.Combinator
34 -----------------------------------------------------------
35 -- Language Definition
36 -----------------------------------------------------------
38 type LanguageDef st = GenLanguageDef String st Identity
40 -- | The @GenLanguageDef@ type is a record that contains all parameterizable
41 -- features of the 'Text.Parsec.Token' module. The module 'Text.Parsec.Language'
42 -- contains some default definitions.
44 data GenLanguageDef s u m
47 -- | Describes the start of a block comment. Use the empty string if the
48 -- language doesn't support block comments. For example \"\/*\".
50 commentStart :: String,
52 -- | Describes the end of a block comment. Use the empty string if the
53 -- language doesn't support block comments. For example \"*\/\".
57 -- | Describes the start of a line comment. Use the empty string if the
58 -- language doesn't support line comments. For example \"\/\/\".
60 commentLine :: String,
62 -- | Set to 'True' if the language supports nested block comments.
64 nestedComments :: Bool,
66 -- | This parser should accept any start characters of identifiers. For
67 -- example @letter \<|> char \"_\"@.
69 identStart :: ParsecT s u m Char,
71 -- | This parser should accept any legal tail characters of identifiers.
72 -- For example @alphaNum \<|> char \"_\"@.
74 identLetter :: ParsecT s u m Char,
76 -- | This parser should accept any start characters of operators. For
77 -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@
79 opStart :: ParsecT s u m Char,
81 -- | This parser should accept any legal tail characters of operators.
82 -- Note that this parser should even be defined if the language doesn't
83 -- support user-defined operators, or otherwise the 'reservedOp'
84 -- parser won't work correctly.
86 opLetter :: ParsecT s u m Char,
88 -- | The list of reserved identifiers.
90 reservedNames :: [String],
92 -- | The list of reserved operators.
94 reservedOpNames:: [String],
96 -- | Set to 'True' if the language is case sensitive.
102 -----------------------------------------------------------
103 -- A first class module: TokenParser
104 -----------------------------------------------------------
106 type TokenParser st = GenTokenParser String st Identity
108 -- | The type of the record that holds lexical parsers that work on
109 -- @s@ streams with state @u@ over a monad @m@.
111 data GenTokenParser s u m
114 -- | This lexeme parser parses a legal identifier. Returns the identifier
115 -- string. This parser will fail on identifiers that are reserved
116 -- words. Legal identifier (start) characters and reserved words are
117 -- defined in the 'LanguageDef' that is passed to
118 -- 'makeTokenParser'. An @identifier@ is treated as
119 -- a single token using 'try'.
121 identifier :: ParsecT s u m String,
123 -- | The lexeme parser @reserved name@ parses @symbol
124 -- name@, but it also checks that the @name@ is not a prefix of a
125 -- valid identifier. A @reserved@ word is treated as a single token
128 reserved :: String -> ParsecT s u m (),
130 -- | This lexeme parser parses a legal operator. Returns the name of the
131 -- operator. This parser will fail on any operators that are reserved
132 -- operators. Legal operator (start) characters and reserved operators
133 -- are defined in the 'LanguageDef' that is passed to
134 -- 'makeTokenParser'. An @operator@ is treated as a
135 -- single token using 'try'.
137 operator :: ParsecT s u m String,
139 -- |The lexeme parser @reservedOp name@ parses @symbol
140 -- name@, but it also checks that the @name@ is not a prefix of a
141 -- valid operator. A @reservedOp@ is treated as a single token using
144 reservedOp :: String -> ParsecT s u m (),
147 -- | This lexeme parser parses a single literal character. Returns the
148 -- literal character value. This parsers deals correctly with escape
149 -- sequences. The literal character is parsed according to the grammar
150 -- rules defined in the Haskell report (which matches most programming
151 -- languages quite closely).
153 charLiteral :: ParsecT s u m Char,
155 -- | This lexeme parser parses a literal string. Returns the literal
156 -- string value. This parsers deals correctly with escape sequences and
157 -- gaps. The literal string is parsed according to the grammar rules
158 -- defined in the Haskell report (which matches most programming
159 -- languages quite closely).
161 stringLiteral :: ParsecT s u m String,
163 -- | This lexeme parser parses a natural number (a positive whole
164 -- number). Returns the value of the number. The number can be
165 -- specified in 'decimal', 'hexadecimal' or
166 -- 'octal'. The number is parsed according to the grammar
167 -- rules in the Haskell report.
169 natural :: ParsecT s u m Integer,
171 -- | This lexeme parser parses an integer (a whole number). This parser
172 -- is like 'natural' except that it can be prefixed with
173 -- sign (i.e. \'-\' or \'+\'). Returns the value of the number. The
174 -- number can be specified in 'decimal', 'hexadecimal'
175 -- or 'octal'. The number is parsed according
176 -- to the grammar rules in the Haskell report.
178 integer :: ParsecT s u m Integer,
180 -- | This lexeme parser parses a floating point value. Returns the value
181 -- of the number. The number is parsed according to the grammar rules
182 -- defined in the Haskell report.
184 float :: ParsecT s u m Double,
186 -- | This lexeme parser parses either 'natural' or a 'float'.
187 -- Returns the value of the number. This parsers deals with
188 -- any overlap in the grammar rules for naturals and floats. The number
189 -- is parsed according to the grammar rules defined in the Haskell report.
191 naturalOrFloat :: ParsecT s u m (Either Integer Double),
193 -- | Parses a positive whole number in the decimal system. Returns the
194 -- value of the number.
196 decimal :: ParsecT s u m Integer,
198 -- | Parses a positive whole number in the hexadecimal system. The number
199 -- should be prefixed with \"0x\" or \"0X\". Returns the value of the
202 hexadecimal :: ParsecT s u m Integer,
204 -- | Parses a positive whole number in the octal system. The number
205 -- should be prefixed with \"0o\" or \"0O\". Returns the value of the
208 octal :: ParsecT s u m Integer,
210 -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
211 -- trailing white space.
213 symbol :: String -> ParsecT s u m String,
215 -- | @lexeme p@ first applies parser @p@ and than the 'whiteSpace'
216 -- parser, returning the value of @p@. Every lexical
217 -- token (lexeme) is defined using @lexeme@, this way every parse
218 -- starts at a point without white space. Parsers that use @lexeme@ are
219 -- called /lexeme/ parsers in this document.
221 -- The only point where the 'whiteSpace' parser should be
222 -- called explicitly is the start of the main parser in order to skip
223 -- any leading white space.
225 -- > mainParser = do{ whiteSpace
226 -- > ; ds <- many (lexeme digit)
228 -- > ; return (sum ds)
231 lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a,
233 -- | Parses any white space. White space consists of /zero/ or more
234 -- occurrences of a 'space', a line comment or a block (multi
235 -- line) comment. Block comments may be nested. How comments are
236 -- started and ended is defined in the 'LanguageDef'
237 -- that is passed to 'makeTokenParser'.
239 whiteSpace :: ParsecT s u m (),
241 -- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis,
242 -- returning the value of @p@.
244 parens :: forall a. ParsecT s u m a -> ParsecT s u m a,
246 -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (\'{\' and
247 -- \'}\'), returning the value of @p@.
249 braces :: forall a. ParsecT s u m a -> ParsecT s u m a,
251 -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (\'\<\'
252 -- and \'>\'), returning the value of @p@.
254 angles :: forall a. ParsecT s u m a -> ParsecT s u m a,
256 -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (\'[\'
257 -- and \']\'), returning the value of @p@.
259 brackets :: forall a. ParsecT s u m a -> ParsecT s u m a,
261 -- | DEPRECATED: Use 'brackets'.
263 squares :: forall a. ParsecT s u m a -> ParsecT s u m a,
265 -- | Lexeme parser |semi| parses the character \';\' and skips any
266 -- trailing white space. Returns the string \";\".
268 semi :: ParsecT s u m String,
270 -- | Lexeme parser @comma@ parses the character \',\' and skips any
271 -- trailing white space. Returns the string \",\".
273 comma :: ParsecT s u m String,
275 -- | Lexeme parser @colon@ parses the character \':\' and skips any
276 -- trailing white space. Returns the string \":\".
278 colon :: ParsecT s u m String,
280 -- | Lexeme parser @dot@ parses the character \'.\' and skips any
281 -- trailing white space. Returns the string \".\".
283 dot :: ParsecT s u m String,
285 -- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@
286 -- separated by 'semi'. Returns a list of values returned by
289 semiSep :: forall a . ParsecT s u m a -> ParsecT s u m [a],
291 -- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@
292 -- separated by 'semi'. Returns a list of values returned by @p@.
294 semiSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a],
296 -- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of
297 -- @p@ separated by 'comma'. Returns a list of values returned
300 commaSep :: forall a . ParsecT s u m a -> ParsecT s u m [a],
302 -- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of
303 -- @p@ separated by 'comma'. Returns a list of values returned
306 commaSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a]
309 -----------------------------------------------------------
310 -- Given a LanguageDef, create a token parser.
311 -----------------------------------------------------------
313 -- | The expression @makeTokenParser language@ creates a 'GenTokenParser'
314 -- record that contains lexical parsers that are
315 -- defined using the definitions in the @language@ record.
317 -- The use of this function is quite stylized - one imports the
318 -- appropiate language definition and selects the lexical parsers that
319 -- are needed from the resulting 'GenTokenParser'.
321 -- > module Main where
323 -- > import Text.Parsec
324 -- > import qualified Text.Parsec.Token as P
325 -- > import Text.Parsec.Language (haskellDef)
330 -- > expr = parens expr
336 -- > lexer = P.makeTokenParser haskellDef
338 -- > parens = P.parens lexer
339 -- > braces = P.braces lexer
340 -- > identifier = P.identifier lexer
341 -- > reserved = P.reserved lexer
344 makeTokenParser :: (Stream s m Char)
345 => GenLanguageDef s u m -> GenTokenParser s u m
346 makeTokenParser languageDef
347 = TokenParser{ identifier = identifier
348 , reserved = reserved
349 , operator = operator
350 , reservedOp = reservedOp
352 , charLiteral = charLiteral
353 , stringLiteral = stringLiteral
357 , naturalOrFloat = naturalOrFloat
359 , hexadecimal = hexadecimal
364 , whiteSpace = whiteSpace
369 , brackets = brackets
376 , semiSep1 = semiSep1
377 , commaSep = commaSep
378 , commaSep1 = commaSep1
382 -----------------------------------------------------------
384 -----------------------------------------------------------
385 parens p = between (symbol "(") (symbol ")") p
386 braces p = between (symbol "{") (symbol "}") p
387 angles p = between (symbol "<") (symbol ">") p
388 brackets p = between (symbol "[") (symbol "]") p
395 commaSep p = sepBy p comma
396 semiSep p = sepBy p semi
398 commaSep1 p = sepBy1 p comma
399 semiSep1 p = sepBy1 p semi
402 -----------------------------------------------------------
404 -----------------------------------------------------------
405 charLiteral = lexeme (between (char '\'')
406 (char '\'' <?> "end of character")
410 characterChar = charLetter <|> charEscape
411 <?> "literal character"
413 charEscape = do{ char '\\'; escapeCode }
414 charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
418 stringLiteral = lexeme (
419 do{ str <- between (char '"')
420 (char '"' <?> "end of string")
422 ; return (foldr (maybe id (:)) "" str)
424 <?> "literal string")
426 stringChar = do{ c <- stringLetter; return (Just c) }
428 <?> "string character"
430 stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
432 stringEscape = do{ char '\\'
433 ; do{ escapeGap ; return Nothing }
434 <|> do{ escapeEmpty; return Nothing }
435 <|> do{ esc <- escapeCode; return (Just esc) }
438 escapeEmpty = char '&'
439 escapeGap = do{ many1 space
440 ; char '\\' <?> "end of string gap"
446 escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
449 charControl = do{ char '^'
451 ; return (toEnum (fromEnum code - fromEnum 'A'))
454 charNum = do{ code <- decimal
455 <|> do{ char 'o'; number 8 octDigit }
456 <|> do{ char 'x'; number 16 hexDigit }
457 ; return (toEnum (fromInteger code))
460 charEsc = choice (map parseEsc escMap)
462 parseEsc (c,code) = do{ char c; return code }
464 charAscii = choice (map parseAscii asciiMap)
466 parseAscii (asc,code) = try (do{ string asc; return code })
469 -- escape code tables
470 escMap = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'")
471 asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
473 ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
474 "FS","GS","RS","US","SP"]
475 ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
476 "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
477 "CAN","SUB","ESC","DEL"]
479 ascii2 = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI',
480 '\EM','\FS','\GS','\RS','\US','\SP']
481 ascii3 = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK',
482 '\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK',
483 '\SYN','\ETB','\CAN','\SUB','\ESC','\DEL']
486 -----------------------------------------------------------
488 -----------------------------------------------------------
489 naturalOrFloat = lexeme (natFloat) <?> "number"
491 float = lexeme floating <?> "float"
492 integer = lexeme int <?> "integer"
493 natural = lexeme nat <?> "natural"
497 floating = do{ n <- decimal
502 natFloat = do{ char '0'
507 zeroNumFloat = do{ n <- hexadecimal <|> octal
514 decimalFloat = do{ n <- decimal
519 fractFloat n = do{ f <- fractExponent n
523 fractExponent n = do{ fract <- fraction
524 ; expo <- option 1.0 exponent'
525 ; return ((fromInteger n + fract)*expo)
528 do{ expo <- exponent'
529 ; return ((fromInteger n)*expo)
532 fraction = do{ char '.'
533 ; digits <- many1 digit <?> "fraction"
534 ; return (foldr op 0.0 digits)
538 op d f = (f + fromIntegral (digitToInt d))/10.0
540 exponent' = do{ oneOf "eE"
542 ; e <- decimal <?> "exponent"
543 ; return (power (f e))
547 power e | e < 0 = 1.0/power(-e)
548 | otherwise = fromInteger (10^e)
551 -- integers and naturals
552 int = do{ f <- lexeme sign
557 sign = (char '-' >> return negate)
558 <|> (char '+' >> return id)
561 nat = zeroNumber <|> decimal
563 zeroNumber = do{ char '0'
564 ; hexadecimal <|> octal <|> decimal <|> return 0
568 decimal = number 10 digit
569 hexadecimal = do{ oneOf "xX"; number 16 hexDigit }
570 octal = do{ oneOf "oO"; number 8 octDigit }
572 number base baseDigit
573 = do{ digits <- many1 baseDigit
574 ; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits
578 -----------------------------------------------------------
579 -- Operators & reserved ops
580 -----------------------------------------------------------
584 ; notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
590 ; if (isReservedOp name)
591 then unexpected ("reserved operator " ++ show name)
596 do{ c <- (opStart languageDef)
597 ; cs <- many (opLetter languageDef)
603 isReserved (sort (reservedOpNames languageDef)) name
606 -----------------------------------------------------------
607 -- Identifiers & Reserved words
608 -----------------------------------------------------------
612 ; notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
616 | caseSensitive languageDef = string name
617 | otherwise = do{ walk name; return name }
620 walk (c:cs) = do{ caseChar c <?> msg; walk cs }
622 caseChar c | isAlpha c = char (toLower c) <|> char (toUpper c)
631 ; if (isReservedName name)
632 then unexpected ("reserved word " ++ show name)
638 = do{ c <- identStart languageDef
639 ; cs <- many (identLetter languageDef)
645 = isReserved theReservedNames caseName
647 caseName | caseSensitive languageDef = name
648 | otherwise = map toLower name
651 isReserved names name
655 scan (r:rs) = case (compare r name) of
661 | caseSensitive languageDef = sortedNames
662 | otherwise = map (map toLower) sortedNames
664 sortedNames = sort (reservedNames languageDef)
668 -----------------------------------------------------------
669 -- White space & symbols
670 -----------------------------------------------------------
672 = lexeme (string name)
675 = do{ x <- p; whiteSpace; return x }
680 | noLine && noMulti = skipMany (simpleSpace <?> "")
681 | noLine = skipMany (simpleSpace <|> multiLineComment <?> "")
682 | noMulti = skipMany (simpleSpace <|> oneLineComment <?> "")
683 | otherwise = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment <?> "")
685 noLine = null (commentLine languageDef)
686 noMulti = null (commentStart languageDef)
690 skipMany1 (satisfy isSpace)
693 do{ try (string (commentLine languageDef))
694 ; skipMany (satisfy (/= '\n'))
699 do { try (string (commentStart languageDef))
704 | nestedComments languageDef = inCommentMulti
705 | otherwise = inCommentSingle
708 = do{ try (string (commentEnd languageDef)) ; return () }
709 <|> do{ multiLineComment ; inCommentMulti }
710 <|> do{ skipMany1 (noneOf startEnd) ; inCommentMulti }
711 <|> do{ oneOf startEnd ; inCommentMulti }
714 startEnd = nub (commentEnd languageDef ++ commentStart languageDef)
717 = do{ try (string (commentEnd languageDef)); return () }
718 <|> do{ skipMany1 (noneOf startEnd) ; inCommentSingle }
719 <|> do{ oneOf startEnd ; inCommentSingle }
722 startEnd = nub (commentEnd languageDef ++ commentStart languageDef)