1 -----------------------------------------------------------------------------
3 -- Module : Text.Parsec.Token
4 -- Copyright : (c) Daan Leijen 1999-2001, (c) Paolo Martini 2007
5 -- License : BSD-style (see the LICENSE file)
7 -- Maintainer : derek.a.elkins@gmail.com
8 -- Stability : provisional
9 -- Portability : non-portable (uses local universal quantification: PolymorphicComponents)
11 -- A helper module to parse lexical elements (tokens). See 'makeTokenParser'
12 -- for a description of how to use it.
14 -----------------------------------------------------------------------------
16 {-# LANGUAGE PolymorphicComponents #-}
17 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
18 {-# OPTIONS_GHC -XFlexibleContexts #-}
20 module Text.Parsec.Token
28 import Data.Char ( isAlpha, toLower, toUpper, isSpace, digitToInt )
29 import Data.List ( nub, sort )
30 import Control.Monad.Identity
31 import Text.Parsec.Prim
32 import Text.Parsec.Char
33 import Text.Parsec.Combinator
35 -----------------------------------------------------------
36 -- Language Definition
37 -----------------------------------------------------------
39 type LanguageDef st = GenLanguageDef String st Identity
41 -- | The @GenLanguageDef@ type is a record that contains all parameterizable
42 -- features of the 'Text.Parsec.Token' module. The module 'Text.Parsec.Language'
43 -- contains some default definitions.
45 data GenLanguageDef s u m
48 -- | Describes the start of a block comment. Use the empty string if the
49 -- language doesn't support block comments. For example \"\/*\".
51 commentStart :: String,
53 -- | Describes the end of a block comment. Use the empty string if the
54 -- language doesn't support block comments. For example \"*\/\".
58 -- | Describes the start of a line comment. Use the empty string if the
59 -- language doesn't support line comments. For example \"\/\/\".
61 commentLine :: String,
63 -- | Set to 'True' if the language supports nested block comments.
65 nestedComments :: Bool,
67 -- | This parser should accept any start characters of identifiers. For
68 -- example @letter \<|> char \"_\"@.
70 identStart :: ParsecT s u m Char,
72 -- | This parser should accept any legal tail characters of identifiers.
73 -- For example @alphaNum \<|> char \"_\"@.
75 identLetter :: ParsecT s u m Char,
77 -- | This parser should accept any start characters of operators. For
78 -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@
80 opStart :: ParsecT s u m Char,
82 -- | This parser should accept any legal tail characters of operators.
83 -- Note that this parser should even be defined if the language doesn't
84 -- support user-defined operators, or otherwise the 'reservedOp'
85 -- parser won't work correctly.
87 opLetter :: ParsecT s u m Char,
89 -- | The list of reserved identifiers.
91 reservedNames :: [String],
93 -- | The list of reserved operators.
95 reservedOpNames:: [String],
97 -- | Set to 'True' if the language is case sensitive.
103 -----------------------------------------------------------
104 -- A first class module: TokenParser
105 -----------------------------------------------------------
107 type TokenParser st = GenTokenParser String st Identity
109 -- | The type of the record that holds lexical parsers that work on
110 -- @s@ streams with state @u@ over a monad @m@.
112 data GenTokenParser s u m
115 -- | This lexeme parser parses a legal identifier. Returns the identifier
116 -- string. This parser will fail on identifiers that are reserved
117 -- words. Legal identifier (start) characters and reserved words are
118 -- defined in the 'LanguageDef' that is passed to
119 -- 'makeTokenParser'. An @identifier@ is treated as
120 -- a single token using 'try'.
122 identifier :: ParsecT s u m String,
124 -- | The lexeme parser @reserved name@ parses @symbol
125 -- name@, but it also checks that the @name@ is not a prefix of a
126 -- valid identifier. A @reserved@ word is treated as a single token
129 reserved :: String -> ParsecT s u m (),
131 -- | This lexeme parser parses a legal operator. Returns the name of the
132 -- operator. This parser will fail on any operators that are reserved
133 -- operators. Legal operator (start) characters and reserved operators
134 -- are defined in the 'LanguageDef' that is passed to
135 -- 'makeTokenParser'. An @operator@ is treated as a
136 -- single token using 'try'.
138 operator :: ParsecT s u m String,
140 -- |The lexeme parser @reservedOp name@ parses @symbol
141 -- name@, but it also checks that the @name@ is not a prefix of a
142 -- valid operator. A @reservedOp@ is treated as a single token using
145 reservedOp :: String -> ParsecT s u m (),
148 -- | This lexeme parser parses a single literal character. Returns the
149 -- literal character value. This parsers deals correctly with escape
150 -- sequences. The literal character is parsed according to the grammar
151 -- rules defined in the Haskell report (which matches most programming
152 -- languages quite closely).
154 charLiteral :: ParsecT s u m Char,
156 -- | This lexeme parser parses a literal string. Returns the literal
157 -- string value. This parsers deals correctly with escape sequences and
158 -- gaps. The literal string is parsed according to the grammar rules
159 -- defined in the Haskell report (which matches most programming
160 -- languages quite closely).
162 stringLiteral :: ParsecT s u m String,
164 -- | This lexeme parser parses a natural number (a positive whole
165 -- number). Returns the value of the number. The number can be
166 -- specified in 'decimal', 'hexadecimal' or
167 -- 'octal'. The number is parsed according to the grammar
168 -- rules in the Haskell report.
170 natural :: ParsecT s u m Integer,
172 -- | This lexeme parser parses an integer (a whole number). This parser
173 -- is like 'natural' except that it can be prefixed with
174 -- sign (i.e. \'-\' or \'+\'). Returns the value of the number. The
175 -- number can be specified in 'decimal', 'hexadecimal'
176 -- or 'octal'. The number is parsed according
177 -- to the grammar rules in the Haskell report.
179 integer :: ParsecT s u m Integer,
181 -- | This lexeme parser parses a floating point value. Returns the value
182 -- of the number. The number is parsed according to the grammar rules
183 -- defined in the Haskell report.
185 float :: ParsecT s u m Double,
187 -- | This lexeme parser parses either 'natural' or a 'float'.
188 -- Returns the value of the number. This parsers deals with
189 -- any overlap in the grammar rules for naturals and floats. The number
190 -- is parsed according to the grammar rules defined in the Haskell report.
192 naturalOrFloat :: ParsecT s u m (Either Integer Double),
194 -- | Parses a positive whole number in the decimal system. Returns the
195 -- value of the number.
197 decimal :: ParsecT s u m Integer,
199 -- | Parses a positive whole number in the hexadecimal system. The number
200 -- should be prefixed with \"0x\" or \"0X\". Returns the value of the
203 hexadecimal :: ParsecT s u m Integer,
205 -- | Parses a positive whole number in the octal system. The number
206 -- should be prefixed with \"0o\" or \"0O\". Returns the value of the
209 octal :: ParsecT s u m Integer,
211 -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
212 -- trailing white space.
214 symbol :: String -> ParsecT s u m String,
216 -- | @lexeme p@ first applies parser @p@ and than the 'whiteSpace'
217 -- parser, returning the value of @p@. Every lexical
218 -- token (lexeme) is defined using @lexeme@, this way every parse
219 -- starts at a point without white space. Parsers that use @lexeme@ are
220 -- called /lexeme/ parsers in this document.
222 -- The only point where the 'whiteSpace' parser should be
223 -- called explicitly is the start of the main parser in order to skip
224 -- any leading white space.
226 -- > mainParser = do{ whiteSpace
227 -- > ; ds <- many (lexeme digit)
229 -- > ; return (sum ds)
232 lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a,
234 -- | Parses any white space. White space consists of /zero/ or more
235 -- occurrences of a 'space', a line comment or a block (multi
236 -- line) comment. Block comments may be nested. How comments are
237 -- started and ended is defined in the 'LanguageDef'
238 -- that is passed to 'makeTokenParser'.
240 whiteSpace :: ParsecT s u m (),
242 -- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis,
243 -- returning the value of @p@.
245 parens :: forall a. ParsecT s u m a -> ParsecT s u m a,
247 -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (\'{\' and
248 -- \'}\'), returning the value of @p@.
250 braces :: forall a. ParsecT s u m a -> ParsecT s u m a,
252 -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (\'\<\'
253 -- and \'>\'), returning the value of @p@.
255 angles :: forall a. ParsecT s u m a -> ParsecT s u m a,
257 -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (\'[\'
258 -- and \']\'), returning the value of @p@.
260 brackets :: forall a. ParsecT s u m a -> ParsecT s u m a,
262 -- | DEPRECATED: Use 'brackets'.
264 squares :: forall a. ParsecT s u m a -> ParsecT s u m a,
266 -- | Lexeme parser |semi| parses the character \';\' and skips any
267 -- trailing white space. Returns the string \";\".
269 semi :: ParsecT s u m String,
271 -- | Lexeme parser @comma@ parses the character \',\' and skips any
272 -- trailing white space. Returns the string \",\".
274 comma :: ParsecT s u m String,
276 -- | Lexeme parser @colon@ parses the character \':\' and skips any
277 -- trailing white space. Returns the string \":\".
279 colon :: ParsecT s u m String,
281 -- | Lexeme parser @dot@ parses the character \'.\' and skips any
282 -- trailing white space. Returns the string \".\".
284 dot :: ParsecT s u m String,
286 -- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@
287 -- separated by 'semi'. Returns a list of values returned by
290 semiSep :: forall a . ParsecT s u m a -> ParsecT s u m [a],
292 -- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@
293 -- separated by 'semi'. Returns a list of values returned by @p@.
295 semiSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a],
297 -- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of
298 -- @p@ separated by 'comma'. Returns a list of values returned
301 commaSep :: forall a . ParsecT s u m a -> ParsecT s u m [a],
303 -- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of
304 -- @p@ separated by 'comma'. Returns a list of values returned
307 commaSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a]
310 -----------------------------------------------------------
311 -- Given a LanguageDef, create a token parser.
312 -----------------------------------------------------------
314 -- | The expression @makeTokenParser language@ creates a 'GenTokenParser'
315 -- record that contains lexical parsers that are
316 -- defined using the definitions in the @language@ record.
318 -- The use of this function is quite stylized - one imports the
319 -- appropiate language definition and selects the lexical parsers that
320 -- are needed from the resulting 'GenTokenParser'.
322 -- > module Main where
324 -- > import Text.Parsec
325 -- > import qualified Text.Parsec.Token as P
326 -- > import Text.Parsec.Language (haskellDef)
331 -- > expr = parens expr
337 -- > lexer = P.makeTokenParser haskellDef
339 -- > parens = P.parens lexer
340 -- > braces = P.braces lexer
341 -- > identifier = P.identifier lexer
342 -- > reserved = P.reserved lexer
345 makeTokenParser :: (Stream s m Char)
346 => GenLanguageDef s u m -> GenTokenParser s u m
347 makeTokenParser languageDef
348 = TokenParser{ identifier = identifier
349 , reserved = reserved
350 , operator = operator
351 , reservedOp = reservedOp
353 , charLiteral = charLiteral
354 , stringLiteral = stringLiteral
358 , naturalOrFloat = naturalOrFloat
360 , hexadecimal = hexadecimal
365 , whiteSpace = whiteSpace
370 , brackets = brackets
377 , semiSep1 = semiSep1
378 , commaSep = commaSep
379 , commaSep1 = commaSep1
383 -----------------------------------------------------------
385 -----------------------------------------------------------
386 parens p = between (symbol "(") (symbol ")") p
387 braces p = between (symbol "{") (symbol "}") p
388 angles p = between (symbol "<") (symbol ">") p
389 brackets p = between (symbol "[") (symbol "]") p
396 commaSep p = sepBy p comma
397 semiSep p = sepBy p semi
399 commaSep1 p = sepBy1 p comma
400 semiSep1 p = sepBy1 p semi
403 -----------------------------------------------------------
405 -----------------------------------------------------------
406 charLiteral = lexeme (between (char '\'')
407 (char '\'' <?> "end of character")
411 characterChar = charLetter <|> charEscape
412 <?> "literal character"
414 charEscape = do{ char '\\'; escapeCode }
415 charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
419 stringLiteral = lexeme (
420 do{ str <- between (char '"')
421 (char '"' <?> "end of string")
423 ; return (foldr (maybe id (:)) "" str)
425 <?> "literal string")
427 stringChar = do{ c <- stringLetter; return (Just c) }
429 <?> "string character"
431 stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
433 stringEscape = do{ char '\\'
434 ; do{ escapeGap ; return Nothing }
435 <|> do{ escapeEmpty; return Nothing }
436 <|> do{ esc <- escapeCode; return (Just esc) }
439 escapeEmpty = char '&'
440 escapeGap = do{ many1 space
441 ; char '\\' <?> "end of string gap"
447 escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
450 charControl = do{ char '^'
452 ; return (toEnum (fromEnum code - fromEnum 'A'))
455 charNum = do{ code <- decimal
456 <|> do{ char 'o'; number 8 octDigit }
457 <|> do{ char 'x'; number 16 hexDigit }
458 ; return (toEnum (fromInteger code))
461 charEsc = choice (map parseEsc escMap)
463 parseEsc (c,code) = do{ char c; return code }
465 charAscii = choice (map parseAscii asciiMap)
467 parseAscii (asc,code) = try (do{ string asc; return code })
470 -- escape code tables
471 escMap = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'")
472 asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
474 ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
475 "FS","GS","RS","US","SP"]
476 ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
477 "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
478 "CAN","SUB","ESC","DEL"]
480 ascii2 = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI',
481 '\EM','\FS','\GS','\RS','\US','\SP']
482 ascii3 = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK',
483 '\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK',
484 '\SYN','\ETB','\CAN','\SUB','\ESC','\DEL']
487 -----------------------------------------------------------
489 -----------------------------------------------------------
490 naturalOrFloat = lexeme (natFloat) <?> "number"
492 float = lexeme floating <?> "float"
493 integer = lexeme int <?> "integer"
494 natural = lexeme nat <?> "natural"
498 floating = do{ n <- decimal
503 natFloat = do{ char '0'
508 zeroNumFloat = do{ n <- hexadecimal <|> octal
515 decimalFloat = do{ n <- decimal
520 fractFloat n = do{ f <- fractExponent n
524 fractExponent n = do{ fract <- fraction
525 ; expo <- option 1.0 exponent'
526 ; return ((fromInteger n + fract)*expo)
529 do{ expo <- exponent'
530 ; return ((fromInteger n)*expo)
533 fraction = do{ char '.'
534 ; digits <- many1 digit <?> "fraction"
535 ; return (foldr op 0.0 digits)
539 op d f = (f + fromIntegral (digitToInt d))/10.0
541 exponent' = do{ oneOf "eE"
543 ; e <- decimal <?> "exponent"
544 ; return (power (f e))
548 power e | e < 0 = 1.0/power(-e)
549 | otherwise = fromInteger (10^e)
552 -- integers and naturals
553 int = do{ f <- lexeme sign
558 sign = (char '-' >> return negate)
559 <|> (char '+' >> return id)
562 nat = zeroNumber <|> decimal
564 zeroNumber = do{ char '0'
565 ; hexadecimal <|> octal <|> decimal <|> return 0
569 decimal = number 10 digit
570 hexadecimal = do{ oneOf "xX"; number 16 hexDigit }
571 octal = do{ oneOf "oO"; number 8 octDigit }
573 number base baseDigit
574 = do{ digits <- many1 baseDigit
575 ; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits
579 -----------------------------------------------------------
580 -- Operators & reserved ops
581 -----------------------------------------------------------
585 ; notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
591 ; if (isReservedOp name)
592 then unexpected ("reserved operator " ++ show name)
597 do{ c <- (opStart languageDef)
598 ; cs <- many (opLetter languageDef)
604 isReserved (sort (reservedOpNames languageDef)) name
607 -----------------------------------------------------------
608 -- Identifiers & Reserved words
609 -----------------------------------------------------------
613 ; notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
617 | caseSensitive languageDef = string name
618 | otherwise = do{ walk name; return name }
621 walk (c:cs) = do{ caseChar c <?> msg; walk cs }
623 caseChar c | isAlpha c = char (toLower c) <|> char (toUpper c)
632 ; if (isReservedName name)
633 then unexpected ("reserved word " ++ show name)
639 = do{ c <- identStart languageDef
640 ; cs <- many (identLetter languageDef)
646 = isReserved theReservedNames caseName
648 caseName | caseSensitive languageDef = name
649 | otherwise = map toLower name
652 isReserved names name
656 scan (r:rs) = case (compare r name) of
662 | caseSensitive languageDef = sortedNames
663 | otherwise = map (map toLower) sortedNames
665 sortedNames = sort (reservedNames languageDef)
669 -----------------------------------------------------------
670 -- White space & symbols
671 -----------------------------------------------------------
673 = lexeme (string name)
676 = do{ x <- p; whiteSpace; return x }
681 | noLine && noMulti = skipMany (simpleSpace <?> "")
682 | noLine = skipMany (simpleSpace <|> multiLineComment <?> "")
683 | noMulti = skipMany (simpleSpace <|> oneLineComment <?> "")
684 | otherwise = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment <?> "")
686 noLine = null (commentLine languageDef)
687 noMulti = null (commentStart languageDef)
691 skipMany1 (satisfy isSpace)
694 do{ try (string (commentLine languageDef))
695 ; skipMany (satisfy (/= '\n'))
700 do { try (string (commentStart languageDef))
705 | nestedComments languageDef = inCommentMulti
706 | otherwise = inCommentSingle
709 = do{ try (string (commentEnd languageDef)) ; return () }
710 <|> do{ multiLineComment ; inCommentMulti }
711 <|> do{ skipMany1 (noneOf startEnd) ; inCommentMulti }
712 <|> do{ oneOf startEnd ; inCommentMulti }
715 startEnd = nub (commentEnd languageDef ++ commentStart languageDef)
718 = do{ try (string (commentEnd languageDef)); return () }
719 <|> do{ skipMany1 (noneOf startEnd) ; inCommentSingle }
720 <|> do{ oneOf startEnd ; inCommentSingle }
723 startEnd = nub (commentEnd languageDef ++ commentStart languageDef)