sparsec

simple Haskell parser combinator library
git clone git://git.rr3.xyz/sparsec
Log | Files | Refs | README | LICENSE

commit a9b161714eae5633acb6095afb28058b8a21bf14
parent d4de0fa6982ad9b4be8024e371e1b25dcb99275a
Author: Robert Russell <robert@rr3.xyz>
Date:   Sun, 21 Dec 2025 21:52:53 -0800

Start huge refactor

Diffstat:
DSparsec.hs | 628-------------------------------------------------------------------------------
Mpackage.yaml | 17++++++++---------
Asrc/Sparsec.hs | 407+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/Sparsec/Char.hs | 38++++++++++++++++++++++++++++++++++++++
Asrc/Sparsec/Loc.hs | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/Sparsec/Span.hs | 43+++++++++++++++++++++++++++++++++++++++++++
6 files changed, 558 insertions(+), 637 deletions(-)

diff --git a/Sparsec.hs b/Sparsec.hs @@ -1,628 +0,0 @@ -module Sparsec ( - -- Char - charIsUpperLatin, - charIsLowerLatin, - charIsLatin, - charIsUpperGreek, - charIsLowerGreek, - charIsGreek, - charIsSafeUpperGreek, - charIsSafeLowerGreek, - charIsSafeGreek, - charIsNum2, - charIsNum4, - charIsNum8, - charIsNum10, - charIsNum16, - -- Rune - Rune (..), - runeElim, - runeIsUpperLatin, - runeIsLowerLatin, - runeIsLatin, - runeIsUpperGreek, - runeIsLowerGreek, - runeIsGreek, - runeIsSafeUpperGreek, - runeIsSafeLowerGreek, - runeIsSafeGreek, - runeIsNum2, - runeIsNum4, - runeIsNum8, - runeIsNum10, - runeIsNum16, - -- Loc - Loc#, - pattern Loc#, - locByte#, - locRow#, - locCol#, - locEq#, - Loc (..), - locZero, - locBox, - locUnbox, - -- Span - Span#, - pattern Span#, - spanBeg#, - spanEnd#, - spanEq#, - Span (..), - spanBox, - spanUnbox, - -- State - State#, - pattern State#, - stateInput#, - stateLoc#, - stateEq#, - State (..), - stateBox, - stateUnbox, - -- Result - Result# (..), - -- Parse monad - ParseT (..), - runParseT#, - runParseT, - mapParseT, - Parse, - runParse#, - runParse, - mapParse, - -- Core combinators - fail, - err, - succeed, - continue, - read, - write, - getLoc, - putLoc, - save, - load, - -- Derived combinators - not, - catch, - eof, - lookahead, - spanOf, - bytesOf, - cut, - try, - iter, - chainl, - chainr, - branch, - choice, - match, - someSepBy, - manySepBy, - -- TODO: Byte combinators - -- Utf8 combinators - Utf8Error (..), - nextRune, - nextChar, - runeIfM, - runeIf, - charIfM, - charIf, - runeWhileM, - runeWhile, - charWhileM, - charWhile, - rune, - char, - string, - natural, -) where - -import Control.Applicative -import Control.Monad hiding (fail) -import Control.Monad.Identity (Identity (..)) -import Control.Monad.Reader (MonadReader (..)) -import Control.Monad.State (MonadState (..)) -import Control.Monad.Trans (MonadTrans (..)) -import Data.ByteString (ByteString) -import Data.ByteString qualified as BS -import Data.ByteString.UTF8 qualified as UTF8 -import Data.Char -import Data.Foldable -import Data.Functor -import GHC.Exts (Int (..), Int#, (==#)) -import Numeric.Natural -import Text.Printf -import Prelude hiding (fail, not, read) - --------------------------------------------------------------------------------- --- Util - -intEq# :: Int# -> Int# -> Bool -intEq# x y = case x ==# y of - 0# -> False - _ -> True - --------------------------------------------------------------------------------- --- Char - -charIsUpperLatin, charIsLowerLatin, charIsLatin :: Char -> Bool -charIsUpperLatin c = 'A' <= c && c <= 'Z' -charIsLowerLatin c = 'a' <= c && c <= 'z' -charIsLatin c = charIsUpperLatin c || charIsLowerLatin c - -charIsUpperGreek, charIsLowerGreek, charIsGreek :: Char -> Bool -charIsUpperGreek c = 'Α' <= c && c <= 'Ω' -charIsLowerGreek c = 'α' <= c && c <= 'ω' -charIsGreek c = charIsUpperGreek c || charIsLowerGreek c - -charIsSafeUpperGreek, charIsSafeLowerGreek, charIsSafeGreek :: Char -> Bool -charIsSafeUpperGreek c = c == 'Γ' || c == 'Δ' || c == 'Θ' || c == 'Λ' || c == 'Ξ' || c == 'Π' || c == 'Σ' || c == 'Φ' || c == 'Ψ' || c == 'Ω' -charIsSafeLowerGreek c = charIsLowerGreek c && c /= 'ο' -charIsSafeGreek c = charIsSafeUpperGreek c || charIsSafeLowerGreek c - -charIsNum2, charIsNum4, charIsNum8, charIsNum10, charIsNum16 :: Char -> Bool -charIsNum2 c = '0' <= c && c <= '1' -charIsNum4 c = '0' <= c && c <= '3' -charIsNum8 c = '0' <= c && c <= '7' -charIsNum10 c = '0' <= c && c <= '9' -charIsNum16 c = '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' - --------------------------------------------------------------------------------- --- Rune --- TODO: Are runes bloat? Should we delete them? - -data Rune = RuneEof | RuneChar Char deriving (Eq) - -instance Show Rune where - show = \case - RuneEof -> "EOF" - RuneChar c -> show c - -runeElim :: a -> (Char -> a) -> Rune -> a -runeElim eof chr = \case - RuneEof -> eof - RuneChar c -> chr c - -runeIsUpperLatin, runeIsLowerLatin, runeIsLatin :: Rune -> Bool -runeIsUpperLatin = runeElim False charIsUpperLatin -runeIsLowerLatin = runeElim False charIsLowerLatin -runeIsLatin = runeElim False charIsLatin - -runeIsUpperGreek, runeIsLowerGreek, runeIsGreek :: Rune -> Bool -runeIsUpperGreek = runeElim False charIsUpperGreek -runeIsLowerGreek = runeElim False charIsLowerGreek -runeIsGreek = runeElim False charIsGreek - -runeIsSafeUpperGreek, runeIsSafeLowerGreek, runeIsSafeGreek :: Rune -> Bool -runeIsSafeUpperGreek = runeElim False charIsSafeUpperGreek -runeIsSafeLowerGreek = runeElim False charIsSafeLowerGreek -runeIsSafeGreek = runeElim False charIsSafeGreek - -runeIsNum2, runeIsNum4, runeIsNum8, runeIsNum10, runeIsNum16 :: Rune -> Bool -runeIsNum2 = runeElim False charIsNum2 -runeIsNum4 = runeElim False charIsNum4 -runeIsNum8 = runeElim False charIsNum8 -runeIsNum10 = runeElim False charIsNum10 -runeIsNum16 = runeElim False charIsNum16 - --------------------------------------------------------------------------------- --- Loc - -newtype Loc# = Loc_# (# Int#, Int#, Int# #) -pattern Loc# :: Int# -> Int# -> Int# -> Loc# -pattern Loc# b r c = Loc_# (# b, r, c #) -{-# COMPLETE Loc# #-} - -locByte#, locRow#, locCol# :: Loc# -> Int# -locByte# (Loc# b _ _) = b -locRow# (Loc# _ r _) = r -locCol# (Loc# _ _ c) = c - -locEq# :: Loc# -> Loc# -> Bool -locEq# (Loc# b0 r0 c0) (Loc# b1 r1 c1) = - intEq# b0 b1 && intEq# r0 r1 && intEq# c0 c1 - --- | Loc is a source code location, tracking byte offset, row (number of '\n' --- characters seen so far), and column (number of characters into the current --- row). All three of these quantities are 0-based. The column tracking is --- somewhat stupid in that it treats each Unicode codepoint as one column. -data Loc = Loc {locByte :: Int, locRow :: Int, locCol :: Int} deriving (Eq) - -locZero :: Loc -locZero = Loc 0 0 0 - --- Weird Show instance intended for debugging -instance Show Loc where - show (Loc b r c) = printf "%d:%d#%d" r c b - -locBox :: Loc# -> Loc -locBox (Loc# b r c) = Loc (I# b) (I# r) (I# c) - -locUnbox :: Loc -> Loc# -locUnbox (Loc (I# b) (I# r) (I# c)) = Loc# b r c - --------------------------------------------------------------------------------- --- Span - -newtype Span# = Span_# (# Loc#, Loc# #) -pattern Span# :: Loc# -> Loc# -> Span# -pattern Span# beg end = Span_# (# beg, end #) -{-# COMPLETE Span# #-} - -spanBeg#, spanEnd# :: Span# -> Loc# -spanBeg# (Span# beg _) = beg -spanEnd# (Span# _ end) = end - -spanEq# :: Span# -> Span# -> Bool -spanEq# (Span# beg0 end0) (Span# beg1 end1) = - locEq# beg0 beg1 && locEq# end0 end1 - --- | Span is a pair of Loc's specifying a range in the input. The beginning/left --- Loc is inclusive and the ending/right Loc is exclusive. -data Span = Span {spanBeg :: Loc, spanEnd :: Loc} deriving (Eq) - --- Weird Show instance intended for debugging -instance Show Span where - show (Span beg end) = printf "%s--%s" (show beg) (show end) - -spanBox :: Span# -> Span -spanBox (Span# beg end) = Span (locBox beg) (locBox end) - -spanUnbox :: Span -> Span# -spanUnbox (Span beg end) = Span# (locUnbox beg) (locUnbox end) - --------------------------------------------------------------------------------- --- State - -newtype State# = State_# (# ByteString, Loc# #) -pattern State# :: ByteString -> Loc# -> State# -pattern State# i l = State_# (# i, l #) -{-# COMPLETE State# #-} - -stateInput# :: State# -> ByteString -stateInput# (State# i _) = i - -stateLoc# :: State# -> Loc# -stateLoc# (State# _ l) = l - -stateEq# :: State# -> State# -> Bool -stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && locEq# l0 l1 - -data State = State {stateInput :: ByteString, stateLoc :: Loc} deriving (Eq) - -stateBox :: State# -> State -stateBox (State# i l) = State i (locBox l) - -stateUnbox :: State -> State# -stateUnbox (State i l) = State# i (locUnbox l) - --------------------------------------------------------------------------------- --- Result - --- TODO: Result (no hash) - -data Result# e a = Failure# | Error# e | Success# a State# - -instance (Eq e, Eq a) => Eq (Result# e a) where - Failure# == Failure# = True - Error# e0 == Error# e1 = e0 == e1 - Success# a0 s0 == Success# a1 s1 = a0 == a1 && stateEq# s0 s1 - _ == _ = False - -instance (Show e, Show a) => Show (Result# e a) where - show = \case - Failure# -> "failure" - Error# e -> "error: " ++ show e - Success# a (State# i _) -> - if BS.null i - then - printf "success: %s" (show a) - else - printf - "success (%d bytes remaining): %s" - (BS.length i) - (show a) - -instance Functor (Result# e) where - fmap f = \case - Failure# -> Failure# - Error# e -> Error# e - Success# a s -> Success# (f a) s - --------------------------------------------------------------------------------- --- ParseT - --- | ParseT is a monad transformer for parsing. It effectively has a ByteString --- and Loc state effect for the input, and an error effect (of some type e) for --- parsing errors. Parsing *errors* are distinct from parsing *failures* in --- that only the latter trigger backtracking in the Alternative instance. -newtype ParseT e m a = ParseT (State# -> m (Result# e a)) - -runParseT# :: (Monad m) => ParseT e m a -> State# -> m (Result# e a) -runParseT# (ParseT f) = f - -runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result# e a) -runParseT p i l = p `runParseT#` State# i (locUnbox l) - -mapParseT :: - (Monad m, Monad m') => - (m (Result# e a) -> m' (Result# e' a')) -> - ParseT e m a -> - ParseT e' m' a' -mapParseT f p = ParseT \s -> f $ p `runParseT#` s - --------------------------------------------------------------------------------- --- Parse - -type Parse e a = ParseT e Identity a - -runParse# :: Parse e a -> State# -> Result# e a -runParse# p s = runIdentity $ p `runParseT#` s - -runParse :: Parse e a -> ByteString -> Loc -> Result# e a -runParse p i l = p `runParse#` State# i (locUnbox l) - -mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a' -mapParse f = mapParseT (Identity . f . runIdentity) - --------------------------------------------------------------------------------- --- Core combinators - -fail :: (Monad m) => ParseT e m a -fail = ParseT \_ -> pure Failure# - -err :: (Monad m) => e -> ParseT e m a -err e = ParseT \_ -> pure $ Error# e - -succeed :: (Monad m) => a -> ParseT e m a -succeed a = ParseT \s -> pure $ Success# a s - -continue :: (Monad m) => ParseT e' m a' -> (e -> ParseT e' m a') -> (a -> ParseT e' m a') -> ParseT e m a -> ParseT e' m a' -continue kf ke ks p = ParseT \s -> - p `runParseT#` s >>= \case - Failure# -> kf `runParseT#` s - Error# e -> ke e `runParseT#` s - Success# a s' -> ks a `runParseT#` s' - --- | Read the entire input without consuming it. -read :: (Monad m) => ParseT e m ByteString -read = ParseT \s -> pure $ Success# (stateInput# s) s - --- | Replace the entire input without affecting the current location. -write :: (Monad m) => ByteString -> ParseT e m () -write i = ParseT \(State# _ l) -> pure $ Success# () (State# i l) - -getLoc :: (Monad m) => ParseT e m Loc -getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s - -putLoc :: (Monad m) => Loc -> ParseT e m () -putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l)) - -save :: (Monad m) => ParseT e m State -save = ParseT \s -> pure $ Success# (stateBox s) s - -load :: (Monad m) => State -> ParseT e m () -load s = ParseT \_ -> pure $ Success# () (stateUnbox s) - --------------------------------------------------------------------------------- --- Instances - -instance (Monad m) => Functor (ParseT e m) where - fmap = liftM - -instance (Monad m) => Applicative (ParseT e m) where - pure = succeed - (<*>) = ap - -instance (Monad m) => Monad (ParseT e m) where - p >>= k = continue fail err k p - -instance (Monad m) => Alternative (ParseT e m) where - empty = fail - p <|> q = continue q err succeed p - -instance MonadTrans (ParseT e) where - lift m = ParseT \s -> (`Success#` s) <$> m - -instance (MonadReader r m) => MonadReader r (ParseT e m) where - ask = lift ask - local f p = ParseT \s -> local f $ p `runParseT#` s - -instance (MonadState s m) => MonadState s (ParseT e m) where - get = lift get - put = lift . put - --------------------------------------------------------------------------------- --- General combinators - --- | Turn a failure into a success and vice versa. -not :: (Monad m) => ParseT e m a -> ParseT e m () -not = continue (succeed ()) err (const fail) - --- | Catch an error. -catch :: (Monad m) => ParseT e m a -> (e -> ParseT e' m a) -> ParseT e' m a -p `catch` h = continue fail h succeed p - -eof :: (Monad m) => ParseT e m () -eof = guard . BS.null =<< read - --- | Run a parser without changing the parser state. -lookahead :: (Monad m) => ParseT e m a -> ParseT e m a -lookahead p = do - s <- save - a <- p - load s - pure a - -spanOf :: (Monad m) => ParseT e m a -> ParseT e m (a, Span) -spanOf p = do - beg <- getLoc - a <- p - end <- getLoc - pure (a, Span beg end) - -bytesOf :: (Monad m) => ParseT e m a -> ParseT e m (a, ByteString) -bytesOf p = do - State i beg <- save - a <- p - end <- getLoc - pure (a, BS.take (locByte end - locByte beg) i) - --- | Convert a failure into an error. --- --- TODO: Why do we call this "cut"? -cut :: (Monad m) => ParseT e m a -> e -> ParseT e m a -p `cut` e = p <|> err e - --- | Convert an error into a failure. -try :: (Monad m) => ParseT e m a -> ParseT e m a -try p = p `catch` const fail - --- | Iterate a parsing function until it fails. I.e., --- @iter f a = pure a >>= f >>= f >>= ...@ --- where the chain of binds is as long as possible without failure. -iter :: (Monad m) => (a -> ParseT e m a) -> a -> ParseT e m a -iter f a = (f a >>= iter f) <|> pure a - --- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested --- fashion. -chainl :: (Monad m) => (b -> a -> b) -> ParseT e m b -> ParseT e m a -> ParseT e m b -chainl f pb pa = pb >>= iter \b -> f b <$> pa - --- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a --- right-nested fashion. -chainr :: (Monad m) => (a -> b -> b) -> ParseT e m a -> ParseT e m b -> ParseT e m b -chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb - --- | `branch pc pt pf` runs pc; if it succeeds, it continues with pt, and --- otherwise it continues with pf. -branch :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m b -> ParseT e m b -branch p kt kf = continue kf err (const kt) p - -choice :: (Monad m, Foldable t) => t (ParseT e m a) -> ParseT e m a -choice = asum - --- | `match scrut cases` binds scrut to the first non-failing case and returns --- the result. -match :: - (Monad m, Functor t, Foldable t) => - ParseT e m a -> - t (a -> ParseT e m b) -> - ParseT e m b -match scrut cases = choice $ (scrut >>=) <$> cases - -someSepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a] -someSepBy p sep = (:) <$> p <*> many (sep *> p) - -manySepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a] -manySepBy p sep = someSepBy p sep <|> pure [] - --------------------------------------------------------------------------------- --- Byte-wise parse combinators - --- TODO - --------------------------------------------------------------------------------- --- Utf8 parse combinators - --- All these combinators involve UTF-8 decoding and therefore require that --- the error type e supports UTF-8 errors. - -class Utf8Error e where - utf8Error :: Loc -> e - -nextRune :: forall e m. (Utf8Error e, Monad m) => ParseT e m Rune -nextRune = toRune . UTF8.decode =<< read - where - toRune :: Maybe (Char, Int) -> ParseT e m Rune - toRune = \case - Just (c, w) - | c == UTF8.replacement_char -> err . utf8Error =<< getLoc - | otherwise -> updateState c w $> RuneChar c - Nothing -> pure RuneEof - - updateState :: Char -> Int -> ParseT e m () - updateState c w = do - write . BS.drop w =<< read - loc <- getLoc - putLoc case c of - '\n' -> Loc (locByte loc + w) (locRow loc + 1) 0 - _ -> Loc (locByte loc + w) (locRow loc) (locCol loc + 1) - -nextChar :: (Utf8Error e, Monad m) => ParseT e m Char -nextChar = - nextRune >>= \case - RuneEof -> fail - RuneChar c -> pure c - -runeIfM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m Rune -runeIfM want = do - r <- nextRune - lift (want r) >>= guard - pure r - -runeIf :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m Rune -runeIf want = runeIfM (pure . want) - -charIfM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m Char -charIfM want = do - c <- nextChar - lift (want c) >>= guard - pure c - -charIf :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m Char -charIf want = charIfM (pure . want) - -runeWhileM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m ByteString -runeWhileM want = snd <$> bytesOf (many (runeIfM want)) - -runeWhile :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m ByteString -runeWhile want = runeWhileM (pure . want) - -charWhileM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m ByteString -charWhileM want = runeWhileM \case - RuneEof -> pure False - RuneChar c -> want c - -charWhile :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m ByteString -charWhile want = charWhileM (pure . want) - -rune :: (Utf8Error e, Monad m) => Rune -> ParseT e m () -rune r = void $ runeIf (== r) - -char :: (Utf8Error e, Monad m) => Char -> ParseT e m () -char c = void $ charIf (== c) - -string :: (Utf8Error e, Monad m) => String -> ParseT e m () -string = traverse_ char - -natural :: (Utf8Error e, Monad m) => Int -> ParseT e m Natural -natural = \case - 0 -> - let prefix l u = char '0' *> (char l <|> char u) - in prefix 'b' 'B' *> natural 2 - <|> prefix 'q' 'Q' *> natural 4 - <|> prefix 'o' 'O' *> natural 8 - <|> prefix 'x' 'X' *> natural 16 - <|> natural 10 - b - | 0 < b && b <= 36 -> - let nat :: Int -> Natural - nat = fromIntegral - - maxn = min (chr $ ord '0' + b) (succ '9') - maxu = min (chr $ ord 'A' + b - 10) (succ 'Z') - maxl = min (chr $ ord 'a' + b - 10) (succ 'z') - - numeral = - nextChar >>= \c -> - if - | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0') - | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10) - | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10) - | otherwise -> fail - - underscores = many (char '_') - in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral)) - _ -> error "natural: invalid base" diff --git a/package.yaml b/package.yaml @@ -1,5 +1,5 @@ name: sparsec -version: 0.1.1.0 +version: 0.2.0.0 author: "Robert Russell" license: ISC @@ -26,14 +26,13 @@ dependencies: - bytestring library: - source-dirs: . + source-dirs: src dependencies: - mtl - utf8-string - -executables: - example: - source-dirs: example - main: Main.hs - dependencies: - - sparsec +# executables: +# example: +# source-dirs: example +# main: Main.hs +# dependencies: +# - sparsec diff --git a/src/Sparsec.hs b/src/Sparsec.hs @@ -0,0 +1,407 @@ +module Sparsec ( + -- State + State#, + pattern State#, + stateInput#, + stateLoc#, + stateEq#, + State (..), + pattern State, + -- Result + Result# (..), + -- Parse monad + ParseT (..), + runParseT#, + runParseT, + mapParseT, + Parse, + runParse#, + runParse, + mapParse, + -- Core combinators + fail, + err, + succeed, + continue, + get#, + put#, + -- General combinators + not, + try, + cut, + catch, + branch, + lookahead, + withSpan, + withConsumed, + iter, + chainl, + chainr, + many, + some, + many_, + some_, + manySepBy, + someSepBy, + choose, + match, + -- Utf8 combinators + next, + eof, + anyChar, + charIf, + char, + string, + natural, +) where + +import Control.Applicative +import Control.Monad hiding (fail) +import Control.Monad.Identity (Identity (..)) +import Control.Monad.Reader (MonadReader (..)) +import Control.Monad.State (MonadState (..)) +import Control.Monad.Trans (MonadTrans (..)) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.UTF8 qualified as UTF8 +import Data.Char +import Data.Foldable +import Data.Functor +import GHC.Exts (Int (..), (-#)) +import Numeric.Natural +import Text.Printf +import Prelude hiding (fail, not, read) + +import Sparsec.Loc +import Sparsec.Span + +-------------------------------------------------------------------------------- +-- State# + +-- TODO: Try making ByteString part strict. + +newtype State# = State_# (# ByteString, Loc# #) + +pattern State# :: ByteString -> Loc# -> State# +pattern State# i l = State_# (# i, l #) +{-# COMPLETE State# #-} + +stateInput# :: State# -> ByteString +stateInput# (State# i _) = i + +stateLoc# :: State# -> Loc# +stateLoc# (State# _ l) = l + +stateEq# :: State# -> State# -> Bool +stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && l0 `locEq#` l1 + +data State = StateBox {stateUnbox :: State#} + +pattern State :: ByteString -> Loc# -> State +pattern State i l = StateBox (State# i l) +{-# COMPLETE State #-} + +instance Eq State where + StateBox s0 == StateBox s1 = s0 `stateEq#` s1 + +-------------------------------------------------------------------------------- +-- Result# + +data Result# e a + = Utf8Error# Loc# + | Failure# + | Error# e + | Success# a State# + +instance (Eq e, Eq a) => Eq (Result# e a) where + Utf8Error# l0 == Utf8Error# l1 = l0 `locEq#` l1 + Failure# == Failure# = True + Error# e0 == Error# e1 = e0 == e1 + Success# a0 s0 == Success# a1 s1 = a0 == a1 && s0 `stateEq#` s1 + _ == _ = False + +instance (Show e, Show a) => Show (Result# e a) where + show = \case + Utf8Error# l -> "utf8 error at " ++ locShow# l + Failure# -> "failure" + Error# e -> "error: " ++ show e + Success# a (State# i _) -> + if BS.null i + then printf "success: %s" (show a) + else printf "success (%d bytes remaining): %s" (BS.length i) (show a) + +instance Functor (Result# e) where + fmap f = \case + Utf8Error# l -> Utf8Error# l + Failure# -> Failure# + Error# e -> Error# e + Success# a s -> Success# (f a) s + +-------------------------------------------------------------------------------- +-- Result + +-- TODO? + +-------------------------------------------------------------------------------- +-- ParseT + +-- | ParseT is a monad transformer for parsing. It effectively has a ByteString +-- and Loc state effect for the input, and an error effect (of some type e) for +-- parsing errors. Parsing *errors* are distinct from parsing *failures* in +-- that only the latter trigger backtracking in the Alternative instance. +newtype ParseT e m a = ParseT (State# -> m (Result# e a)) + +runParseT# :: (Monad m) => ParseT e m a -> State# -> m (Result# e a) +runParseT# (ParseT f) = f + +runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result# e a) +runParseT p i l = p `runParseT#` State# i (locUnbox l) + +mapParseT :: + (Monad m, Monad m') => + (m (Result# e a) -> m' (Result# e' a')) -> + ParseT e m a -> + ParseT e' m' a' +mapParseT f p = ParseT \s -> f $ p `runParseT#` s + +-------------------------------------------------------------------------------- +-- Parse + +type Parse e a = ParseT e Identity a + +runParse# :: Parse e a -> State# -> Result# e a +runParse# p s = runIdentity $ p `runParseT#` s + +runParse :: Parse e a -> ByteString -> Loc -> Result# e a +runParse p i l = p `runParse#` State# i (locUnbox l) + +mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a' +mapParse f = mapParseT (Identity . f . runIdentity) + +-------------------------------------------------------------------------------- +-- Core combinators + +fail :: (Monad m) => ParseT e m a +fail = ParseT \_ -> pure Failure# + +err :: (Monad m) => e -> ParseT e m a +err e = ParseT \_ -> pure $ Error# e + +succeed :: (Monad m) => a -> ParseT e m a +succeed a = ParseT \s -> pure $ Success# a s + +continue :: (Monad m) => ParseT f m b -> (e -> ParseT f m b) -> (a -> ParseT f m b) -> ParseT e m a -> ParseT f m b +continue kf ke ks p = ParseT \s -> + p `runParseT#` s >>= \case + Utf8Error# l -> pure $ Utf8Error# l + Failure# -> kf `runParseT#` s + Error# e -> ke e `runParseT#` s + Success# a s' -> ks a `runParseT#` s' + +get# :: (Monad m) => (State# -> ParseT e m a) -> ParseT e m a +get# f = ParseT \s -> f s `runParseT#` s + +put# :: (Monad m) => State# -> ParseT e m a -> ParseT e m a +put# s p = ParseT \_ -> p `runParseT#` s + +-- | Read the entire input without consuming it. +-- getInput :: (Monad m) => ParseT e m ByteString +-- getInput = ParseT \s -> pure $ Success# (stateInput# s) s + +-- | Replace the entire input without affecting the current location. +-- putInput :: (Monad m) => ByteString -> ParseT e m () +-- putInput i = ParseT \(State# _ l) -> pure $ Success# () (State# i l) + +-- modifyInput :: (Monad m) => (ByteString -> ByteString) -> Parse e m () +-- modifyInput f = ParseT \(State# i l) -> pure $ Success# () (State (f i) l) + +-- getLoc :: (Monad m) => ParseT e m Loc +-- getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s + +-- putLoc :: (Monad m) => Loc -> ParseT e m () +-- putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l)) + +-- modifyLoc# :: (Monad m) => (Loc# -> Loc#) -> ParseT e m () +-- modifyLoc# f = ParseT \(State# i l) -> pure $ Success# () (State# i (f l)) + +-- modifyLoc :: (Monad m) => (Loc -> Loc) -> ParseT e m () +-- modifyLoc f = ParseT \(State# i l) -> pure $ Success# () (State# i (unboxLoc . f . boxLoc $ l)) + +-- save :: (Monad m) => ParseT e m State +-- save = ParseT \s -> pure $ Success# (stateBox s) s + +-- load :: (Monad m) => State -> ParseT e m () +-- load s = ParseT \_ -> pure $ Success# () (stateUnbox s) + +-------------------------------------------------------------------------------- +-- Instances + +instance (Monad m) => Functor (ParseT e m) where + fmap = liftM + +instance (Monad m) => Applicative (ParseT e m) where + pure = succeed + (<*>) = ap + +instance (Monad m) => Monad (ParseT e m) where + p >>= k = continue fail err k p + +instance (Monad m) => Alternative (ParseT e m) where + empty = fail + p <|> q = continue q err succeed p + +instance MonadTrans (ParseT e) where + lift m = ParseT \s -> (`Success#` s) <$> m + +instance (MonadReader r m) => MonadReader r (ParseT e m) where + ask = lift ask + local f p = ParseT \s -> local f $ p `runParseT#` s + +instance (MonadState s m) => MonadState s (ParseT e m) where + get = lift get + put = lift . put + +-------------------------------------------------------------------------------- +-- General combinators + +-- TODO: some_, many_ +-- TODO: Boxed variants of functions +-- TODO: Non-CPS variants of functions + +-- | Turn a failure into a success and vice versa. +not :: (Monad m) => ParseT e m a -> ParseT e m () +not = continue (succeed ()) err (const fail) + +-- | Convert an error into a failure. +try :: (Monad m) => ParseT e m a -> ParseT e m a +try = continue fail (const fail) succeed + +-- | Convert a failure into an error. +cut :: (Monad m) => ParseT e m a -> e -> ParseT e m a +p `cut` e = continue (err e) err succeed p + +-- | Catch an error. +catch :: (Monad m) => ParseT e m a -> (e -> ParseT f m a) -> ParseT f m a +p `catch` h = continue fail h succeed p + +-- | `branch p ks kf` runs p; if it succeeds, it continues with ks; if it fails, +-- it continues with kf. +branch :: (Monad m) => ParseT e m a -> (a -> ParseT e m b) -> ParseT e m b -> ParseT e m b +branch p ks kf = continue kf err ks p + +-- | Run a parser without changing the parser state. +lookahead :: (Monad m) => ParseT e m a -> ParseT e m a +lookahead p = get# \s -> p <* put# s (pure ()) + +withSpan :: (Monad m) => (a -> Span# -> ParseT e m b) -> ParseT e m a -> ParseT e m b +withSpan k p = + get# \(State# _ beg) -> + p >>= \a -> + get# \(State# _ end) -> + k a (Span# beg end) + +withConsumed :: (Monad m) => (a -> ByteString -> ParseT e m b) -> ParseT e m a -> ParseT e m b +withConsumed k p = + get# \(State# i (Loc# b0 _ _)) -> + p >>= \a -> + get# \(State# _ (Loc# b1 _ _)) -> + k a (BS.take (I# (b1 -# b0)) i) + +-- | Iterate a parsing function until it fails. I.e., +-- @iter f a = pure a >>= f >>= f >>= ...@ +-- where the chain of binds is as long as possible without failure. +iter :: (Monad m) => (a -> ParseT e m a) -> a -> ParseT e m a +iter f a = (f a >>= iter f) <|> pure a + +-- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested +-- fashion. +chainl :: (Monad m) => (b -> a -> b) -> ParseT e m b -> ParseT e m a -> ParseT e m b +chainl f pb pa = pb >>= iter \b -> f b <$> pa + +-- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a +-- right-nested fashion. +chainr :: (Monad m) => (a -> b -> b) -> ParseT e m a -> ParseT e m b -> ParseT e m b +chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb + +many_ :: (Monad m) => ParseT e m a -> ParseT e m () +many_ p = some_ p <|> pure () + +some_ :: (Monad m) => ParseT e m a -> ParseT e m () +some_ p = p *> many_ p + +manySepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a] +manySepBy p sep = someSepBy p sep <|> pure [] + +someSepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a] +someSepBy p sep = (:) <$> p <*> many (sep *> p) + +choose :: (Monad m, Foldable t) => t (ParseT e m a) -> ParseT e m a +choose = asum + +-- | `match scrut cases` binds scrut to the first non-failing case and returns +-- the result. +match :: + (Monad m, Functor t, Foldable t) => + ParseT e m a -> + t (a -> ParseT e m b) -> + ParseT e m b +match scrut cases = choose $ (scrut >>=) <$> cases + +-------------------------------------------------------------------------------- +-- Utf8 combinators + +-- TODO: Try writing custom utf8 decoder that doesn't allocate Maybes. +-- TODO: Non-CPS version of next? + +next :: (Monad m) => ParseT e m a -> (Char -> ParseT e m a) -> ParseT e m a +next ke kc = ParseT \s@(State# i l) -> + case UTF8.decode i of + Just (c, w) + | c == '\xFFFD' -> pure $ Utf8Error# l + | otherwise -> kc c `runParseT#` State# (BS.drop w i) (locAdvance# c w l) + Nothing -> ke `runParseT#` s + +eof :: (Monad m) => ParseT e m () +eof = get# \(State# i _) -> guard (BS.null i) + +anyChar :: (Monad m) => ParseT e m Char +anyChar = next fail succeed + +charIf :: (Monad m) => (Char -> Bool) -> ParseT e m Char +charIf want = next fail (\c -> guard (want c) $> c) + +char :: (Monad m) => Char -> ParseT e m () +char c = void $ charIf (== c) + +string :: (Monad m) => String -> ParseT e m () +string = traverse_ char + +natural :: (Monad m) => Int -> ParseT e m Natural +natural = \case + 0 -> + let prefix l u = char '0' *> (char l <|> char u) + in prefix 'b' 'B' *> natural 2 + <|> prefix 'q' 'Q' *> natural 4 + <|> prefix 'o' 'O' *> natural 8 + <|> prefix 'x' 'X' *> natural 16 + <|> natural 10 + b + | 0 < b && b <= 36 -> + let nat :: Int -> Natural + nat = fromIntegral + + maxn = min (chr $ ord '0' + b) (succ '9') + maxu = min (chr $ ord 'A' + b - 10) (succ 'Z') + maxl = min (chr $ ord 'a' + b - 10) (succ 'z') + + numeral = + anyChar >>= \c -> + if + | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0') + | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10) + | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10) + | otherwise -> fail + + underscores = many (char '_') + in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral)) + _ -> error "natural: invalid base" diff --git a/src/Sparsec/Char.hs b/src/Sparsec/Char.hs @@ -0,0 +1,38 @@ +module Sparsec.Char ( + charIsUpperLatin, + charIsLowerLatin, + charIsLatin, + charIsUpperGreek, + charIsLowerGreek, + charIsGreek, + charIsSafeUpperGreek, + charIsSafeLowerGreek, + charIsSafeGreek, + charIsNum2, + charIsNum4, + charIsNum8, + charIsNum10, + charIsNum16, +) where + +charIsUpperLatin, charIsLowerLatin, charIsLatin :: Char -> Bool +charIsUpperLatin c = 'A' <= c && c <= 'Z' +charIsLowerLatin c = 'a' <= c && c <= 'z' +charIsLatin c = charIsUpperLatin c || charIsLowerLatin c + +charIsUpperGreek, charIsLowerGreek, charIsGreek :: Char -> Bool +charIsUpperGreek c = 'Α' <= c && c <= 'Ω' +charIsLowerGreek c = 'α' <= c && c <= 'ω' +charIsGreek c = charIsUpperGreek c || charIsLowerGreek c + +charIsSafeUpperGreek, charIsSafeLowerGreek, charIsSafeGreek :: Char -> Bool +charIsSafeUpperGreek c = c == 'Γ' || c == 'Δ' || c == 'Θ' || c == 'Λ' || c == 'Ξ' || c == 'Π' || c == 'Σ' || c == 'Φ' || c == 'Ψ' || c == 'Ω' +charIsSafeLowerGreek c = charIsLowerGreek c && c /= 'ο' +charIsSafeGreek c = charIsSafeUpperGreek c || charIsSafeLowerGreek c + +charIsNum2, charIsNum4, charIsNum8, charIsNum10, charIsNum16 :: Char -> Bool +charIsNum2 c = '0' <= c && c <= '1' +charIsNum4 c = '0' <= c && c <= '3' +charIsNum8 c = '0' <= c && c <= '7' +charIsNum10 c = '0' <= c && c <= '9' +charIsNum16 c = '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' diff --git a/src/Sparsec/Loc.hs b/src/Sparsec/Loc.hs @@ -0,0 +1,62 @@ +module Sparsec.Loc ( + Loc#, + pattern Loc#, + locByte#, + locRow#, + locCol#, + locEq#, + locShow#, + locAdvance#, + Loc (..), + pattern Loc, + locZero, +) where + +import GHC.Exts (Int (..), Int#, (+#), (==#)) + +intEq# :: Int# -> Int# -> Bool +intEq# x y = case x ==# y of + 0# -> False + _ -> True + +newtype Loc# = Loc_# (# Int#, Int#, Int# #) + +pattern Loc# :: Int# -> Int# -> Int# -> Loc# +pattern Loc# b r c = Loc_# (# b, r, c #) +{-# COMPLETE Loc# #-} + +locByte#, locRow#, locCol# :: Loc# -> Int# +locByte# (Loc# b _ _) = b +locRow# (Loc# _ r _) = r +locCol# (Loc# _ _ c) = c + +locEq# :: Loc# -> Loc# -> Bool +locEq# (Loc# b0 r0 c0) (Loc# b1 r1 c1) = + b0 `intEq#` b1 && r0 `intEq#` r1 && c0 `intEq#` c1 + +locShow# :: Loc# -> String +locShow# (Loc# b r c) = show (I# b) ++ "@" ++ show (I# r) ++ ":" ++ show (I# c) + +locAdvance# :: Char -> Int -> Loc# -> Loc# +locAdvance# x (I# w) (Loc# b r c) = case x of + '\n' -> Loc# (b +# w) (r +# 1#) 0# + _ -> Loc# (b +# w) r (c +# 1#) + +-- | Loc is a source code location, tracking byte offset, row (number of '\n' +-- characters seen so far), and column (number of characters into the current +-- row). All three of these quantities are 0-based. The column tracking is +-- somewhat stupid in that it treats each Unicode codepoint as one column. +data Loc = LocBox {locUnbox :: Loc#} + +pattern Loc :: Int# -> Int# -> Int# -> Loc +pattern Loc b r c = LocBox (Loc# b r c) +{-# COMPLETE Loc #-} + +locZero :: Loc +locZero = Loc 0# 0# 0# + +instance Eq Loc where + LocBox l0 == LocBox l1 = l0 `locEq#` l1 + +instance Show Loc where + show (LocBox l) = locShow# l diff --git a/src/Sparsec/Span.hs b/src/Sparsec/Span.hs @@ -0,0 +1,43 @@ +module Sparsec.Span ( + Span#, + pattern Span#, + spanBeg#, + spanEnd#, + spanEq#, + spanShow#, + Span (..), + pattern Span, +) where + +import Sparsec.Loc + +newtype Span# = Span_# (# Loc#, Loc# #) + +pattern Span# :: Loc# -> Loc# -> Span# +pattern Span# beg end = Span_# (# beg, end #) +{-# COMPLETE Span# #-} + +spanBeg#, spanEnd# :: Span# -> Loc# +spanBeg# (Span# beg _) = beg +spanEnd# (Span# _ end) = end + +spanEq# :: Span# -> Span# -> Bool +spanEq# (Span# beg0 end0) (Span# beg1 end1) = + beg0 `locEq#` beg1 && end0 `locEq#` end1 + +spanShow# :: Span# -> String +spanShow# (Span# beg end) = locShow# beg ++ "--" ++ locShow# end + +-- | Span is a pair of Loc's specifying a range in the input. The beginning/left +-- Loc is inclusive and the ending/right Loc is exclusive. +data Span = SpanBox {spanUnbox :: Span#} + +pattern Span :: Loc# -> Loc# -> Span +pattern Span beg end = SpanBox (Span# beg end) +{-# COMPLETE Span #-} + +instance Eq Span where + SpanBox span0 == SpanBox span1 = span0 `spanEq#` span1 + +instance Show Span where + show (SpanBox span) = spanShow# span