sparsec

simple Haskell parser combinator library
git clone git://git.rr3.xyz/sparsec
Log | Files | Refs | README | LICENSE

commit ceefed6f32eac1424a789568be5c950d4f207798
parent e89fbd9aaa39e3916ad86d6362d377d5b18b3c59
Author: Robert Russell <robert@rr3.xyz>
Date:   Sat, 13 Dec 2025 22:20:38 -0800

Update to GHC 9.10 and language version GHC2024, and clean up

Diffstat:
MSparsec.hs | 314+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mpackage.yaml | 7+++----
Mstack.yaml | 2+-
3 files changed, 177 insertions(+), 146 deletions(-)

diff --git a/Sparsec.hs b/Sparsec.hs @@ -1,50 +1,52 @@ module Sparsec where + import Control.Applicative import Control.Monad hiding (fail) import Control.Monad.Identity import Control.Monad.Reader import Control.Monad.State import Data.ByteString (ByteString) -import qualified Data.ByteString as ByteString -import qualified Data.ByteString.UTF8 as UTF8 +import Data.ByteString qualified as ByteString +import Data.ByteString.UTF8 qualified as UTF8 import Data.Char import Data.Foldable +import Data.Functor import Numeric.Natural -import Prelude hiding (fail, read) import Text.Printf +import Prelude hiding (fail, read) -------------------------------------------------------------------------------- -- Rune -data Rune = RuneEof | RuneChar Char deriving Eq +data Rune = RuneEof | RuneChar Char deriving (Eq) instance Show Rune where show = \case - RuneEof -> "EOF" + RuneEof -> "EOF" RuneChar c -> show c -runeCase :: a -> (Char -> a) -> Rune -> a -runeCase eof chr = \case - RuneEof -> eof +runeElim :: a -> (Char -> a) -> Rune -> a +runeElim eof chr = \case + RuneEof -> eof RuneChar c -> chr c runeIsSpace :: Rune -> Bool -runeIsSpace = runeCase False isSpace +runeIsSpace = runeElim False isSpace runeIsLetter :: Rune -> Bool -runeIsLetter = runeCase False isLetter +runeIsLetter = runeElim False isLetter runeIsDigit :: Rune -> Bool -runeIsDigit = runeCase False isDigit +runeIsDigit = runeElim False isDigit -------------------------------------------------------------------------------- --- Locations +-- Loc --- Loc is a source code location, tracking byte offset, row (number of '\n' +-- | Loc is a source code location, tracking byte offset, row (number of '\n' -- characters seen so far), and column (number of characters into the current -- row). All three of these quantities are 0-based. The column tracking is -- somewhat stupid in that it treats each Unicode codepoint as one column. -data Loc = Loc { byte :: Int, row :: Int, col :: Int } deriving Eq +data Loc = Loc {byte :: Int, row :: Int, col :: Int} deriving (Eq) locZero :: Loc locZero = Loc 0 0 0 @@ -54,11 +56,11 @@ instance Show Loc where show (Loc b r c) = printf "%d:%d#%d" r c b -------------------------------------------------------------------------------- --- Spans +-- Span --- Span is a pair of Loc's specifying a range in the input. The beginning/left +-- | Span is a pair of Loc's specifying a range in the input. The beginning/left -- Loc is inclusive and the ending/right Loc is exclusive. -data Span = Span Loc Loc deriving Eq +data Span = Span Loc Loc deriving (Eq) -- Weird Show instance intended for debugging instance Show Span where @@ -68,44 +70,48 @@ instance Show Span where -- Result data Result e a - = Fail + = Failure | Error e | Ok a ByteString Loc - deriving Eq + deriving (Eq) instance (Show e, Show a) => Show (Result e a) where show = \case - Fail -> "fail" + Failure -> "failure" Error e -> "error: " ++ show e Ok a rest _ -> - if ByteString.null rest then - printf "ok: %s" (show a) - else - printf "ok (%d bytes remaining): %s" - (ByteString.length rest) (show a) + if ByteString.null rest + then + printf "ok: %s" (show a) + else + printf + "ok (%d bytes remaining): %s" + (ByteString.length rest) + (show a) instance Functor (Result e) where fmap f = \case - Fail -> Fail + Failure -> Failure Error e -> Error e Ok a rest loc -> Ok (f a) rest loc -------------------------------------------------------------------------------- --- Parse monad +-- Parse monad and core combinators --- ParseT is a monad transformer for parsing. It effectively has a ByteString +-- | ParseT is a monad transformer for parsing. It effectively has a ByteString -- and Loc state effect for the input, and an error effect (of some type e) for -- parsing errors. Parsing *errors* are distinct from parsing *failures* in -- that only the latter trigger backtracking in the Alternative instance. newtype ParseT e m a = ParseT (ByteString -> Loc -> m (Result e a)) -runParseT :: Monad m => ParseT e m a -> ByteString -> Loc -> m (Result e a) +runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result e a) runParseT (ParseT f) = f -mapParseT :: (Monad m, Monad m') - => (m (Result e a) -> m' (Result e' a')) - -> ParseT e m a - -> ParseT e' m' a' +mapParseT :: + (Monad m, Monad m') => + (m (Result e a) -> m' (Result e' a')) -> + ParseT e m a -> + ParseT e' m' a' mapParseT f p = ParseT \input loc -> f $ runParseT p input loc type Parse e a = ParseT e Identity a @@ -116,41 +122,39 @@ runParse (ParseT f) input loc = runIdentity $ f input loc mapParse :: (Result e a -> Result e' a') -> Parse e a -> Parse e' a' mapParse f = mapParseT (Identity . f . runIdentity) -instance Monad m => Functor (ParseT e m) where +instance (Monad m) => Functor (ParseT e m) where fmap = liftM -instance Monad m => Applicative (ParseT e m) where +instance (Monad m) => Applicative (ParseT e m) where pure a = ParseT \input loc -> pure $ Ok a input loc (<*>) = ap -instance Monad m => Monad (ParseT e m) where +instance (Monad m) => Monad (ParseT e m) where p >>= k = ParseT \input loc -> runParseT p input loc >>= \case - Fail -> pure $ Fail - Error e -> pure $ Error e + Failure -> pure Failure + Error e -> pure $ Error e Ok a input' loc' -> runParseT (k a) input' loc' -instance Monad m => Alternative (ParseT e m) where - empty = ParseT \_ _ -> pure Fail +instance (Monad m) => Alternative (ParseT e m) where + empty = ParseT \_ _ -> pure Failure p <|> q = ParseT \input loc -> runParseT p input loc >>= \case - Fail -> runParseT q input loc - Error e -> pure $ Error e + Failure -> runParseT q input loc + Error e -> pure $ Error e Ok a input' loc' -> pure $ Ok a input' loc' instance MonadTrans (ParseT e) where - lift m = ParseT \input loc -> do - a <- m - pure $ Ok a input loc + lift m = ParseT \input loc -> (\a -> Ok a input loc) <$> m -instance MonadReader r m => MonadReader r (ParseT e m) where +instance (MonadReader r m) => MonadReader r (ParseT e m) where ask = lift ask local f p = ParseT \input loc -> local f $ runParseT p input loc -instance MonadState s m => MonadState s (ParseT e m) where +instance (MonadState s m) => MonadState s (ParseT e m) where get = lift get put = lift . put @@ -158,123 +162,148 @@ instance MonadState s m => MonadState s (ParseT e m) where -- TODO: Separate a MTL-like MonadParse typeclass with (a subset of) the -- following methods? --- Read the entire input without consuming it. -read :: Monad m => ParseT e m ByteString +-- | Read the entire input without consuming it. +read :: (Monad m) => ParseT e m ByteString read = ParseT \input loc -> pure $ Ok input input loc --- Replace the entire input without affecting the current location. -write :: Monad m => ByteString -> ParseT e m () +-- | Replace the entire input without affecting the current location. +write :: (Monad m) => ByteString -> ParseT e m () write input = ParseT \_ loc -> pure $ Ok () input loc -getLoc :: Monad m => ParseT e m Loc +getLoc :: (Monad m) => ParseT e m Loc getLoc = ParseT \input loc -> pure $ Ok loc input loc -putLoc :: Monad m => Loc -> ParseT e m () +putLoc :: (Monad m) => Loc -> ParseT e m () putLoc loc = ParseT \input _ -> pure $ Ok () input loc --- Save parsing state. This is effectively a combination of read and getLoc. -save :: Monad m => ParseT e m (ByteString, Loc) +-- | Save parsing state. This is effectively a combination of read and getLoc. +save :: (Monad m) => ParseT e m (ByteString, Loc) save = ParseT \input loc -> pure $ Ok (input, loc) input loc --- Load parsing state. This is effectively a combination of write and putLoc. -load :: Monad m => (ByteString, Loc) -> ParseT e m () +-- | Load parsing state. This is effectively a combination of write and putLoc. +load :: (Monad m) => (ByteString, Loc) -> ParseT e m () load (input, loc) = ParseT \_ _ -> pure $ Ok () input loc -fail :: Monad m => ParseT e m a +fail :: (Monad m) => ParseT e m a fail = empty --- Turn a failure into a success and vice versa. -notp :: Monad m => ParseT e m a -> ParseT e m () +-- | Turn a failure into a success and vice versa. +notp :: (Monad m) => ParseT e m a -> ParseT e m () notp p = ParseT \input loc -> runParseT p input loc >>= \case - Fail -> pure $ Ok () input loc - Error e -> pure $ Error e - Ok _ _ _ -> pure Fail + Failure -> pure $ Ok () input loc + Error e -> pure $ Error e + Ok{} -> pure Failure -err :: Monad m => e -> ParseT e m a +err :: (Monad m) => e -> ParseT e m a err e = ParseT \_ _ -> pure $ Error e --- Catch an error. -catch :: Monad m => ParseT e m a -> (e -> ParseT e m a) -> ParseT e m a +-- | Catch an error. +catch :: (Monad m) => ParseT e m a -> (e -> ParseT e' m a) -> ParseT e' m a p `catch` h = ParseT \input loc -> runParseT p input loc >>= \case - Fail -> pure Fail - Error e -> runParseT (h e) input loc + Failure -> pure Failure + Error e -> runParseT (h e) input loc Ok a input' loc' -> pure $ Ok a input' loc' +-- handle generalizes the following: +-- (<|>): provide a continuation for failure +-- catch: provide a continuation for error +-- (>>=): provide a continuation for ok +-- handle let's you specify a continuation for all three cases. +-- handle :: (Monad m) => ParseT e' m a' -> (e -> ParseT e' m a') -> (a -> ParseT e' m a') -> ParseT e m a -> ParseT e' m a' +-- handle hf he ho p = ParseT \input loc -> +-- runParseT p input loc >>= \case +-- Failure -> runParseT hf input loc +-- Error e -> runParseT (he e) input loc +-- Ok a input' loc' -> runParseT (ho a) input' loc' +-- bind2 :: (Monad m) => (a -> ParseT e m a') -> ParseT e m a -> ParseT e m a' +-- bind2 h = handle fail err h +-- catch2 :: (Monad m) => (e -> ParseT e' m a) -> ParseT e m a -> ParseT e' m a +-- catch2 h = handle fail h pure +-- alt2 :: (Monad m) => ParseT e m a -> ParseT e m a -> ParseT e m a +-- alt2 h = handle h err pure +-- branch2 :: (Monad m) => ParseT e m a' -> (a -> ParseT e m a') -> ParseT e m a -> ParseT e m a' +-- branch2 hf ho = handle hf err ho +-- -- TODO: Rename Ok to Success +-- succeed :: (Monad m) => a -> ParseT e m a +-- succeed = pure + -------------------------------------------------------------------------------- --- General parse combinators +-- General combinators -eof :: Monad m => ParseT e m () -eof = (ByteString.null <$> read) >>= guard +eof :: (Monad m) => ParseT e m () +eof = guard . ByteString.null =<< read --- Run a parser without changing the parser state. -lookahead :: Monad m => ParseT e m a -> ParseT e m a +-- | Run a parser without changing the parser state. +lookahead :: (Monad m) => ParseT e m a -> ParseT e m a lookahead p = do s <- save a <- p load s pure a -spanOf :: Monad m => ParseT e m a -> ParseT e m (a, Span) +spanOf :: (Monad m) => ParseT e m a -> ParseT e m (a, Span) spanOf p = do beg <- getLoc a <- p end <- getLoc pure (a, Span beg end) -bytesOf :: Monad m => ParseT e m a -> ParseT e m (a, ByteString) +bytesOf :: (Monad m) => ParseT e m a -> ParseT e m (a, ByteString) bytesOf p = do (input, beg) <- save a <- p end <- getLoc pure (a, ByteString.take (end.byte - beg.byte) input) --- Convert a failure into an error. +-- | Convert a failure into an error. +-- -- TODO: Why do we call this "cut"? -cut :: Monad m => ParseT e m a -> e -> ParseT e m a +cut :: (Monad m) => ParseT e m a -> e -> ParseT e m a p `cut` e = p <|> err e --- Convert an error into a failure. -try :: Monad m => ParseT e m a -> ParseT e m a -try p = p `catch` \_ -> fail +-- | Convert an error into a failure. +try :: (Monad m) => ParseT e m a -> ParseT e m a +try p = p `catch` const fail --- Iterate a parsing function until it fails. I.e., --- iter f a = pure a >>= f >>= f >>= ... +-- | Iterate a parsing function until it fails. I.e., +-- @iter f a = pure a >>= f >>= f >>= ...@ -- where the chain of binds is as long as possible without failure. -iter :: Monad m => (a -> ParseT e m a) -> a -> ParseT e m a +iter :: (Monad m) => (a -> ParseT e m a) -> a -> ParseT e m a iter f a = (f a >>= iter f) <|> pure a --- Parse a `b` and then zero or more `a`s, and then combine in a left-nested +-- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested -- fashion. -chainl :: Monad m => (b -> a -> b) -> ParseT e m b -> ParseT e m a -> ParseT e m b -chainl f pb pa = pb >>= iter \b -> (f b <$> pa) +chainl :: (Monad m) => (b -> a -> b) -> ParseT e m b -> ParseT e m a -> ParseT e m b +chainl f pb pa = pb >>= iter \b -> f b <$> pa --- Parse zero or more `a`s (greedily) and then a `b`, and then combine in a +-- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a -- right-nested fashion. -chainr :: Monad m => (a -> b -> b) -> ParseT e m a -> ParseT e m b -> ParseT e m b +chainr :: (Monad m) => (a -> b -> b) -> ParseT e m a -> ParseT e m b -> ParseT e m b chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb --- `branch pc pt pf` runs pc; if it succeeds, it continues with pt, and +-- | `branch pc pt pf` runs pc; if it succeeds, it continues with pt, and -- otherwise it continues with pf. -branch :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m b -> ParseT e m b -branch pc pt pf = pc *> pt <|> pf +branch :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m b -> ParseT e m b +branch pc pt pf = pc *> pt <|> pf -- TODO: This is bugged. pt can backtrack into pf. choice :: (Monad m, Foldable t) => t (ParseT e m a) -> ParseT e m a choice = asum --- `match scrut cases` binds scrut to the first non-failing case and returns +-- | `match scrut cases` binds scrut to the first non-failing case and returns -- the result. -match :: (Monad m, Functor t, Foldable t) - => ParseT e m a - -> t (a -> ParseT e m b) - -> ParseT e m b +match :: + (Monad m, Functor t, Foldable t) => + ParseT e m a -> + t (a -> ParseT e m b) -> + ParseT e m b match scrut cases = choice $ (scrut >>=) <$> cases -someSepBy :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a] +someSepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a] someSepBy p sep = (:) <$> p <*> many (sep *> p) -manySepBy :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a] +manySepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a] manySepBy p sep = someSepBy p sep <|> pure [] -------------------------------------------------------------------------------- @@ -292,30 +321,33 @@ class Utf8Error e where utf8Error :: Loc -> e nextRune :: forall e m. (Utf8Error e, Monad m) => ParseT e m Rune -nextRune = - (UTF8.decode <$> read) >>= \case +nextRune = toRune . UTF8.decode =<< read + where + toRune :: Maybe (Char, Int) -> ParseT e m Rune + toRune = \case Just (c, w) - | c == UTF8.replacement_char -> getLoc >>= (err . utf8Error) - | otherwise -> updateState c w *> pure (RuneChar c) + | c == UTF8.replacement_char -> err . utf8Error =<< getLoc + | otherwise -> updateState c w $> RuneChar c Nothing -> pure RuneEof - where - updateState :: Char -> Int -> ParseT e m () - updateState c w = do - (ByteString.drop w <$> read) >>= write - loc <- getLoc - putLoc case c of - '\n' -> loc { byte = loc.byte + w, row = loc.row + 1, col = 0 } - _ -> loc { byte = loc.byte + w, col = loc.col + 1 } + + updateState :: Char -> Int -> ParseT e m () + updateState c w = do + write . ByteString.drop w =<< read + loc <- getLoc + putLoc case c of + '\n' -> loc{byte = loc.byte + w, row = loc.row + 1, col = 0} + _ -> loc{byte = loc.byte + w, col = loc.col + 1} nextChar :: (Utf8Error e, Monad m) => ParseT e m Char -nextChar = nextRune >>= \case - RuneEof -> fail - RuneChar c -> pure c +nextChar = + nextRune >>= \case + RuneEof -> fail + RuneChar c -> pure c runeIfM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m Rune runeIfM want = do r <- nextRune - (lift $ want r) >>= guard + lift (want r) >>= guard pure r runeIf :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m Rune @@ -324,14 +356,14 @@ runeIf want = runeIfM (pure . want) charIfM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m Char charIfM want = do c <- nextChar - (lift $ want c) >>= guard + lift (want c) >>= guard pure c charIf :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m Char charIf want = charIfM (pure . want) runeWhileM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m ByteString -runeWhileM want = snd <$> (bytesOf $ many (runeIfM want)) +runeWhileM want = snd <$> bytesOf (many (runeIfM want)) runeWhile :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m ByteString runeWhile want = runeWhileM (pure . want) @@ -345,40 +377,40 @@ charWhile :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m ByteString charWhile want = charWhileM (pure . want) rune :: (Utf8Error e, Monad m) => Rune -> ParseT e m () -rune r = runeIf (== r) *> pure () +rune r = void $ runeIf (== r) char :: (Utf8Error e, Monad m) => Char -> ParseT e m () -char c = charIf (== c) *> pure () +char c = void $ charIf (== c) string :: (Utf8Error e, Monad m) => String -> ParseT e m () string = traverse_ char natural :: (Utf8Error e, Monad m) => Int -> ParseT e m Natural natural = \case - 0 -> - let prefix l u = char '0' *> (char l <|> char u) in - prefix 'b' 'B' *> natural 2 + 0 -> + let prefix l u = char '0' *> (char l <|> char u) + in prefix 'b' 'B' *> natural 2 <|> prefix 'q' 'Q' *> natural 4 <|> prefix 'o' 'O' *> natural 8 <|> prefix 'x' 'X' *> natural 16 <|> natural 10 - - b | 0 < b && b <= 36 -> - let nat :: Int -> Natural - nat = fromIntegral - - maxn = min (chr $ ord '0' + b) (succ '9') - maxu = min (chr $ ord 'A' + b - 10) (succ 'Z') - maxl = min (chr $ ord 'a' + b - 10) (succ 'z') - - numeral = nextChar >>= \c -> if - | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0') - | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10) - | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10) - | otherwise -> fail - - underscores = many (char '_') in - - numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral)) - - _ -> error "natural: invalid base" + b + | 0 < b && b <= 36 -> + let nat :: Int -> Natural + nat = fromIntegral + + maxn = min (chr $ ord '0' + b) (succ '9') + maxu = min (chr $ ord 'A' + b - 10) (succ 'Z') + maxl = min (chr $ ord 'a' + b - 10) (succ 'z') + + numeral = + nextChar >>= \c -> + if + | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0') + | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10) + | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10) + | otherwise -> fail + + underscores = many (char '_') + in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral)) + _ -> error "natural: invalid base" diff --git a/package.yaml b/package.yaml @@ -1,5 +1,5 @@ name: sparsec -version: 0.1.0.0 +version: 0.1.1.0 author: "Robert Russell" license: ISC @@ -8,12 +8,11 @@ ghc-options: - -Wno-name-shadowing - -Wno-missing-signatures -language: GHC2021 +language: GHC2024 default-extensions: - BlockArguments - DuplicateRecordFields - - LambdaCase - MultiWayIf - NoFieldSelectors - OverloadedRecordDot @@ -21,7 +20,7 @@ default-extensions: - UndecidableInstances dependencies: - - base >= 4.18 && < 5 + - base >= 4.20 && < 5 - bytestring library: diff --git a/stack.yaml b/stack.yaml @@ -1 +1 @@ -resolver: lts-23.23 +resolver: lts-24.23