Start huge refactor - sparsec - simple Haskell parser combinator library

commit a9b161714eae5633acb6095afb28058b8a21bf14
parent d4de0fa6982ad9b4be8024e371e1b25dcb99275a
Author: Robert Russell <robert@rr3.xyz>
Date:   Sun, 21 Dec 2025 21:52:53 -0800

Start huge refactor

Diffstat:
D Sparsec.hs  | 628 -------------------------------------------------------------------------------
M package.yaml  | 17 ++++++++---------
A src/Sparsec.hs  | 407 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/Sparsec/Char.hs  | 38 ++++++++++++++++++++++++++++++++++++++
A src/Sparsec/Loc.hs  | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/Sparsec/Span.hs  | 43 +++++++++++++++++++++++++++++++++++++++++++

6 files changed, 558 insertions(+), 637 deletions(-)
diff --git a/Sparsec.hs b/Sparsec.hs
@@ -1,628 +0,0 @@
-module Sparsec (
-    -- Char
-    charIsUpperLatin,
-    charIsLowerLatin,
-    charIsLatin,
-    charIsUpperGreek,
-    charIsLowerGreek,
-    charIsGreek,
-    charIsSafeUpperGreek,
-    charIsSafeLowerGreek,
-    charIsSafeGreek,
-    charIsNum2,
-    charIsNum4,
-    charIsNum8,
-    charIsNum10,
-    charIsNum16,
-    -- Rune
-    Rune (..),
-    runeElim,
-    runeIsUpperLatin,
-    runeIsLowerLatin,
-    runeIsLatin,
-    runeIsUpperGreek,
-    runeIsLowerGreek,
-    runeIsGreek,
-    runeIsSafeUpperGreek,
-    runeIsSafeLowerGreek,
-    runeIsSafeGreek,
-    runeIsNum2,
-    runeIsNum4,
-    runeIsNum8,
-    runeIsNum10,
-    runeIsNum16,
-    -- Loc
-    Loc#,
-    pattern Loc#,
-    locByte#,
-    locRow#,
-    locCol#,
-    locEq#,
-    Loc (..),
-    locZero,
-    locBox,
-    locUnbox,
-    -- Span
-    Span#,
-    pattern Span#,
-    spanBeg#,
-    spanEnd#,
-    spanEq#,
-    Span (..),
-    spanBox,
-    spanUnbox,
-    -- State
-    State#,
-    pattern State#,
-    stateInput#,
-    stateLoc#,
-    stateEq#,
-    State (..),
-    stateBox,
-    stateUnbox,
-    -- Result
-    Result# (..),
-    -- Parse monad
-    ParseT (..),
-    runParseT#,
-    runParseT,
-    mapParseT,
-    Parse,
-    runParse#,
-    runParse,
-    mapParse,
-    -- Core combinators
-    fail,
-    err,
-    succeed,
-    continue,
-    read,
-    write,
-    getLoc,
-    putLoc,
-    save,
-    load,
-    -- Derived combinators
-    not,
-    catch,
-    eof,
-    lookahead,
-    spanOf,
-    bytesOf,
-    cut,
-    try,
-    iter,
-    chainl,
-    chainr,
-    branch,
-    choice,
-    match,
-    someSepBy,
-    manySepBy,
-    -- TODO: Byte combinators
-    -- Utf8 combinators
-    Utf8Error (..),
-    nextRune,
-    nextChar,
-    runeIfM,
-    runeIf,
-    charIfM,
-    charIf,
-    runeWhileM,
-    runeWhile,
-    charWhileM,
-    charWhile,
-    rune,
-    char,
-    string,
-    natural,
-) where
-
-import Control.Applicative
-import Control.Monad hiding (fail)
-import Control.Monad.Identity (Identity (..))
-import Control.Monad.Reader (MonadReader (..))
-import Control.Monad.State (MonadState (..))
-import Control.Monad.Trans (MonadTrans (..))
-import Data.ByteString (ByteString)
-import Data.ByteString qualified as BS
-import Data.ByteString.UTF8 qualified as UTF8
-import Data.Char
-import Data.Foldable
-import Data.Functor
-import GHC.Exts (Int (..), Int#, (==#))
-import Numeric.Natural
-import Text.Printf
-import Prelude hiding (fail, not, read)
-
---------------------------------------------------------------------------------
--- Util
-
-intEq# :: Int# -> Int# -> Bool
-intEq# x y = case x ==# y of
-    0# -> False
-    _ -> True
-
---------------------------------------------------------------------------------
--- Char
-
-charIsUpperLatin, charIsLowerLatin, charIsLatin :: Char -> Bool
-charIsUpperLatin c = 'A' <= c && c <= 'Z'
-charIsLowerLatin c = 'a' <= c && c <= 'z'
-charIsLatin c = charIsUpperLatin c || charIsLowerLatin c
-
-charIsUpperGreek, charIsLowerGreek, charIsGreek :: Char -> Bool
-charIsUpperGreek c = 'Α' <= c && c <= 'Ω'
-charIsLowerGreek c = 'α' <= c && c <= 'ω'
-charIsGreek c = charIsUpperGreek c || charIsLowerGreek c
-
-charIsSafeUpperGreek, charIsSafeLowerGreek, charIsSafeGreek :: Char -> Bool
-charIsSafeUpperGreek c = c == 'Γ' || c == 'Δ' || c == 'Θ' || c == 'Λ' || c == 'Ξ' || c == 'Π' || c == 'Σ' || c == 'Φ' || c == 'Ψ' || c == 'Ω'
-charIsSafeLowerGreek c = charIsLowerGreek c && c /= 'ο'
-charIsSafeGreek c = charIsSafeUpperGreek c || charIsSafeLowerGreek c
-
-charIsNum2, charIsNum4, charIsNum8, charIsNum10, charIsNum16 :: Char -> Bool
-charIsNum2 c = '0' <= c && c <= '1'
-charIsNum4 c = '0' <= c && c <= '3'
-charIsNum8 c = '0' <= c && c <= '7'
-charIsNum10 c = '0' <= c && c <= '9'
-charIsNum16 c = '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'
-
---------------------------------------------------------------------------------
--- Rune
--- TODO: Are runes bloat? Should we delete them?
-
-data Rune = RuneEof | RuneChar Char deriving (Eq)
-
-instance Show Rune where
-    show = \case
-        RuneEof -> "EOF"
-        RuneChar c -> show c
-
-runeElim :: a -> (Char -> a) -> Rune -> a
-runeElim eof chr = \case
-    RuneEof -> eof
-    RuneChar c -> chr c
-
-runeIsUpperLatin, runeIsLowerLatin, runeIsLatin :: Rune -> Bool
-runeIsUpperLatin = runeElim False charIsUpperLatin
-runeIsLowerLatin = runeElim False charIsLowerLatin
-runeIsLatin = runeElim False charIsLatin
-
-runeIsUpperGreek, runeIsLowerGreek, runeIsGreek :: Rune -> Bool
-runeIsUpperGreek = runeElim False charIsUpperGreek
-runeIsLowerGreek = runeElim False charIsLowerGreek
-runeIsGreek = runeElim False charIsGreek
-
-runeIsSafeUpperGreek, runeIsSafeLowerGreek, runeIsSafeGreek :: Rune -> Bool
-runeIsSafeUpperGreek = runeElim False charIsSafeUpperGreek
-runeIsSafeLowerGreek = runeElim False charIsSafeLowerGreek
-runeIsSafeGreek = runeElim False charIsSafeGreek
-
-runeIsNum2, runeIsNum4, runeIsNum8, runeIsNum10, runeIsNum16 :: Rune -> Bool
-runeIsNum2 = runeElim False charIsNum2
-runeIsNum4 = runeElim False charIsNum4
-runeIsNum8 = runeElim False charIsNum8
-runeIsNum10 = runeElim False charIsNum10
-runeIsNum16 = runeElim False charIsNum16
-
---------------------------------------------------------------------------------
--- Loc
-
-newtype Loc# = Loc_# (# Int#, Int#, Int# #)
-pattern Loc# :: Int# -> Int# -> Int# -> Loc#
-pattern Loc# b r c = Loc_# (# b, r, c #)
-{-# COMPLETE Loc# #-}
-
-locByte#, locRow#, locCol# :: Loc# -> Int#
-locByte# (Loc# b _ _) = b
-locRow# (Loc# _ r _) = r
-locCol# (Loc# _ _ c) = c
-
-locEq# :: Loc# -> Loc# -> Bool
-locEq# (Loc# b0 r0 c0) (Loc# b1 r1 c1) =
-    intEq# b0 b1 && intEq# r0 r1 && intEq# c0 c1
-
--- | Loc is a source code location, tracking byte offset, row (number of '\n'
--- characters seen so far), and column (number of characters into the current
--- row). All three of these quantities are 0-based. The column tracking is
--- somewhat stupid in that it treats each Unicode codepoint as one column.
-data Loc = Loc {locByte :: Int, locRow :: Int, locCol :: Int} deriving (Eq)
-
-locZero :: Loc
-locZero = Loc 0 0 0
-
--- Weird Show instance intended for debugging
-instance Show Loc where
-    show (Loc b r c) = printf "%d:%d#%d" r c b
-
-locBox :: Loc# -> Loc
-locBox (Loc# b r c) = Loc (I# b) (I# r) (I# c)
-
-locUnbox :: Loc -> Loc#
-locUnbox (Loc (I# b) (I# r) (I# c)) = Loc# b r c
-
---------------------------------------------------------------------------------
--- Span
-
-newtype Span# = Span_# (# Loc#, Loc# #)
-pattern Span# :: Loc# -> Loc# -> Span#
-pattern Span# beg end = Span_# (# beg, end #)
-{-# COMPLETE Span# #-}
-
-spanBeg#, spanEnd# :: Span# -> Loc#
-spanBeg# (Span# beg _) = beg
-spanEnd# (Span# _ end) = end
-
-spanEq# :: Span# -> Span# -> Bool
-spanEq# (Span# beg0 end0) (Span# beg1 end1) =
-    locEq# beg0 beg1 && locEq# end0 end1
-
--- | Span is a pair of Loc's specifying a range in the input. The beginning/left
--- Loc is inclusive and the ending/right Loc is exclusive.
-data Span = Span {spanBeg :: Loc, spanEnd :: Loc} deriving (Eq)
-
--- Weird Show instance intended for debugging
-instance Show Span where
-    show (Span beg end) = printf "%s--%s" (show beg) (show end)
-
-spanBox :: Span# -> Span
-spanBox (Span# beg end) = Span (locBox beg) (locBox end)
-
-spanUnbox :: Span -> Span#
-spanUnbox (Span beg end) = Span# (locUnbox beg) (locUnbox end)
-
---------------------------------------------------------------------------------
--- State
-
-newtype State# = State_# (# ByteString, Loc# #)
-pattern State# :: ByteString -> Loc# -> State#
-pattern State# i l = State_# (# i, l #)
-{-# COMPLETE State# #-}
-
-stateInput# :: State# -> ByteString
-stateInput# (State# i _) = i
-
-stateLoc# :: State# -> Loc#
-stateLoc# (State# _ l) = l
-
-stateEq# :: State# -> State# -> Bool
-stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && locEq# l0 l1
-
-data State = State {stateInput :: ByteString, stateLoc :: Loc} deriving (Eq)
-
-stateBox :: State# -> State
-stateBox (State# i l) = State i (locBox l)
-
-stateUnbox :: State -> State#
-stateUnbox (State i l) = State# i (locUnbox l)
-
---------------------------------------------------------------------------------
--- Result
-
--- TODO: Result (no hash)
-
-data Result# e a = Failure# | Error# e | Success# a State#
-
-instance (Eq e, Eq a) => Eq (Result# e a) where
-    Failure# == Failure# = True
-    Error# e0 == Error# e1 = e0 == e1
-    Success# a0 s0 == Success# a1 s1 = a0 == a1 && stateEq# s0 s1
-    _ == _ = False
-
-instance (Show e, Show a) => Show (Result# e a) where
-    show = \case
-        Failure# -> "failure"
-        Error# e -> "error: " ++ show e
-        Success# a (State# i _) ->
-            if BS.null i
-                then
-                    printf "success: %s" (show a)
-                else
-                    printf
-                        "success (%d bytes remaining): %s"
-                        (BS.length i)
-                        (show a)
-
-instance Functor (Result# e) where
-    fmap f = \case
-        Failure# -> Failure#
-        Error# e -> Error# e
-        Success# a s -> Success# (f a) s
-
---------------------------------------------------------------------------------
--- ParseT
-
--- | ParseT is a monad transformer for parsing. It effectively has a ByteString
--- and Loc state effect for the input, and an error effect (of some type e) for
--- parsing errors. Parsing *errors* are distinct from parsing *failures* in
--- that only the latter trigger backtracking in the Alternative instance.
-newtype ParseT e m a = ParseT (State# -> m (Result# e a))
-
-runParseT# :: (Monad m) => ParseT e m a -> State# -> m (Result# e a)
-runParseT# (ParseT f) = f
-
-runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result# e a)
-runParseT p i l = p `runParseT#` State# i (locUnbox l)
-
-mapParseT ::
-    (Monad m, Monad m') =>
-    (m (Result# e a) -> m' (Result# e' a')) ->
-    ParseT e m a ->
-    ParseT e' m' a'
-mapParseT f p = ParseT \s -> f $ p `runParseT#` s
-
---------------------------------------------------------------------------------
--- Parse
-
-type Parse e a = ParseT e Identity a
-
-runParse# :: Parse e a -> State# -> Result# e a
-runParse# p s = runIdentity $ p `runParseT#` s
-
-runParse :: Parse e a -> ByteString -> Loc -> Result# e a
-runParse p i l = p `runParse#` State# i (locUnbox l)
-
-mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a'
-mapParse f = mapParseT (Identity . f . runIdentity)
-
---------------------------------------------------------------------------------
--- Core combinators
-
-fail :: (Monad m) => ParseT e m a
-fail = ParseT \_ -> pure Failure#
-
-err :: (Monad m) => e -> ParseT e m a
-err e = ParseT \_ -> pure $ Error# e
-
-succeed :: (Monad m) => a -> ParseT e m a
-succeed a = ParseT \s -> pure $ Success# a s
-
-continue :: (Monad m) => ParseT e' m a' -> (e -> ParseT e' m a') -> (a -> ParseT e' m a') -> ParseT e m a -> ParseT e' m a'
-continue kf ke ks p = ParseT \s ->
-    p `runParseT#` s >>= \case
-        Failure# -> kf `runParseT#` s
-        Error# e -> ke e `runParseT#` s
-        Success# a s' -> ks a `runParseT#` s'
-
--- | Read the entire input without consuming it.
-read :: (Monad m) => ParseT e m ByteString
-read = ParseT \s -> pure $ Success# (stateInput# s) s
-
--- | Replace the entire input without affecting the current location.
-write :: (Monad m) => ByteString -> ParseT e m ()
-write i = ParseT \(State# _ l) -> pure $ Success# () (State# i l)
-
-getLoc :: (Monad m) => ParseT e m Loc
-getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s
-
-putLoc :: (Monad m) => Loc -> ParseT e m ()
-putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l))
-
-save :: (Monad m) => ParseT e m State
-save = ParseT \s -> pure $ Success# (stateBox s) s
-
-load :: (Monad m) => State -> ParseT e m ()
-load s = ParseT \_ -> pure $ Success# () (stateUnbox s)
-
---------------------------------------------------------------------------------
--- Instances
-
-instance (Monad m) => Functor (ParseT e m) where
-    fmap = liftM
-
-instance (Monad m) => Applicative (ParseT e m) where
-    pure = succeed
-    (<*>) = ap
-
-instance (Monad m) => Monad (ParseT e m) where
-    p >>= k = continue fail err k p
-
-instance (Monad m) => Alternative (ParseT e m) where
-    empty = fail
-    p <|> q = continue q err succeed p
-
-instance MonadTrans (ParseT e) where
-    lift m = ParseT \s -> (`Success#` s) <$> m
-
-instance (MonadReader r m) => MonadReader r (ParseT e m) where
-    ask = lift ask
-    local f p = ParseT \s -> local f $ p `runParseT#` s
-
-instance (MonadState s m) => MonadState s (ParseT e m) where
-    get = lift get
-    put = lift . put
-
---------------------------------------------------------------------------------
--- General combinators
-
--- | Turn a failure into a success and vice versa.
-not :: (Monad m) => ParseT e m a -> ParseT e m ()
-not = continue (succeed ()) err (const fail)
-
--- | Catch an error.
-catch :: (Monad m) => ParseT e m a -> (e -> ParseT e' m a) -> ParseT e' m a
-p `catch` h = continue fail h succeed p
-
-eof :: (Monad m) => ParseT e m ()
-eof = guard . BS.null =<< read
-
--- | Run a parser without changing the parser state.
-lookahead :: (Monad m) => ParseT e m a -> ParseT e m a
-lookahead p = do
-    s <- save
-    a <- p
-    load s
-    pure a
-
-spanOf :: (Monad m) => ParseT e m a -> ParseT e m (a, Span)
-spanOf p = do
-    beg <- getLoc
-    a <- p
-    end <- getLoc
-    pure (a, Span beg end)
-
-bytesOf :: (Monad m) => ParseT e m a -> ParseT e m (a, ByteString)
-bytesOf p = do
-    State i beg <- save
-    a <- p
-    end <- getLoc
-    pure (a, BS.take (locByte end - locByte beg) i)
-
--- | Convert a failure into an error.
---
--- TODO: Why do we call this "cut"?
-cut :: (Monad m) => ParseT e m a -> e -> ParseT e m a
-p `cut` e = p <|> err e
-
--- | Convert an error into a failure.
-try :: (Monad m) => ParseT e m a -> ParseT e m a
-try p = p `catch` const fail
-
--- | Iterate a parsing function until it fails. I.e.,
--- @iter f a = pure a >>= f >>= f >>= ...@
--- where the chain of binds is as long as possible without failure.
-iter :: (Monad m) => (a -> ParseT e m a) -> a -> ParseT e m a
-iter f a = (f a >>= iter f) <|> pure a
-
--- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested
--- fashion.
-chainl :: (Monad m) => (b -> a -> b) -> ParseT e m b -> ParseT e m a -> ParseT e m b
-chainl f pb pa = pb >>= iter \b -> f b <$> pa
-
--- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a
--- right-nested fashion.
-chainr :: (Monad m) => (a -> b -> b) -> ParseT e m a -> ParseT e m b -> ParseT e m b
-chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb
-
--- | `branch pc pt pf` runs pc; if it succeeds, it continues with pt, and
--- otherwise it continues with pf.
-branch :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m b -> ParseT e m b
-branch p kt kf = continue kf err (const kt) p
-
-choice :: (Monad m, Foldable t) => t (ParseT e m a) -> ParseT e m a
-choice = asum
-
--- | `match scrut cases` binds scrut to the first non-failing case and returns
--- the result.
-match ::
-    (Monad m, Functor t, Foldable t) =>
-    ParseT e m a ->
-    t (a -> ParseT e m b) ->
-    ParseT e m b
-match scrut cases = choice $ (scrut >>=) <$> cases
-
-someSepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a]
-someSepBy p sep = (:) <$> p <*> many (sep *> p)
-
-manySepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a]
-manySepBy p sep = someSepBy p sep <|> pure []
-
---------------------------------------------------------------------------------
--- Byte-wise parse combinators
-
--- TODO
-
---------------------------------------------------------------------------------
--- Utf8 parse combinators
-
--- All these combinators involve UTF-8 decoding and therefore require that
--- the error type e supports UTF-8 errors.
-
-class Utf8Error e where
-    utf8Error :: Loc -> e
-
-nextRune :: forall e m. (Utf8Error e, Monad m) => ParseT e m Rune
-nextRune = toRune . UTF8.decode =<< read
-  where
-    toRune :: Maybe (Char, Int) -> ParseT e m Rune
-    toRune = \case
-        Just (c, w)
-            | c == UTF8.replacement_char -> err . utf8Error =<< getLoc
-            | otherwise -> updateState c w $> RuneChar c
-        Nothing -> pure RuneEof
-
-    updateState :: Char -> Int -> ParseT e m ()
-    updateState c w = do
-        write . BS.drop w =<< read
-        loc <- getLoc
-        putLoc case c of
-            '\n' -> Loc (locByte loc + w) (locRow loc + 1) 0
-            _ -> Loc (locByte loc + w) (locRow loc) (locCol loc + 1)
-
-nextChar :: (Utf8Error e, Monad m) => ParseT e m Char
-nextChar =
-    nextRune >>= \case
-        RuneEof -> fail
-        RuneChar c -> pure c
-
-runeIfM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m Rune
-runeIfM want = do
-    r <- nextRune
-    lift (want r) >>= guard
-    pure r
-
-runeIf :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m Rune
-runeIf want = runeIfM (pure . want)
-
-charIfM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m Char
-charIfM want = do
-    c <- nextChar
-    lift (want c) >>= guard
-    pure c
-
-charIf :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m Char
-charIf want = charIfM (pure . want)
-
-runeWhileM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m ByteString
-runeWhileM want = snd <$> bytesOf (many (runeIfM want))
-
-runeWhile :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m ByteString
-runeWhile want = runeWhileM (pure . want)
-
-charWhileM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m ByteString
-charWhileM want = runeWhileM \case
-    RuneEof -> pure False
-    RuneChar c -> want c
-
-charWhile :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m ByteString
-charWhile want = charWhileM (pure . want)
-
-rune :: (Utf8Error e, Monad m) => Rune -> ParseT e m ()
-rune r = void $ runeIf (== r)
-
-char :: (Utf8Error e, Monad m) => Char -> ParseT e m ()
-char c = void $ charIf (== c)
-
-string :: (Utf8Error e, Monad m) => String -> ParseT e m ()
-string = traverse_ char
-
-natural :: (Utf8Error e, Monad m) => Int -> ParseT e m Natural
-natural = \case
-    0 ->
-        let prefix l u = char '0' *> (char l <|> char u)
-        in prefix 'b' 'B' *> natural 2
-            <|> prefix 'q' 'Q' *> natural 4
-            <|> prefix 'o' 'O' *> natural 8
-            <|> prefix 'x' 'X' *> natural 16
-            <|> natural 10
-    b
-        | 0 < b && b <= 36 ->
-            let nat :: Int -> Natural
-                nat = fromIntegral
-
-                maxn = min (chr $ ord '0' + b) (succ '9')
-                maxu = min (chr $ ord 'A' + b - 10) (succ 'Z')
-                maxl = min (chr $ ord 'a' + b - 10) (succ 'z')
-
-                numeral =
-                    nextChar >>= \c ->
-                        if
-                            | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0')
-                            | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10)
-                            | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10)
-                            | otherwise -> fail
-
-                underscores = many (char '_')
-            in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral))
-    _ -> error "natural: invalid base"
diff --git a/package.yaml b/package.yaml
@@ -1,5 +1,5 @@
 name: sparsec
-version: 0.1.1.0
+version: 0.2.0.0
 author: "Robert Russell"
 license: ISC
 
@@ -26,14 +26,13 @@ dependencies:
     - bytestring
 
 library:
-    source-dirs: .
+    source-dirs: src
     dependencies:
         - mtl
         - utf8-string
-
-executables:
-    example:
-        source-dirs: example
-        main: Main.hs
-        dependencies:
-            - sparsec
+# executables:
+#     example:
+#         source-dirs: example
+#         main: Main.hs
+#         dependencies:
+#             - sparsec
diff --git a/src/Sparsec.hs b/src/Sparsec.hs
@@ -0,0 +1,407 @@
+module Sparsec (
+    -- State
+    State#,
+    pattern State#,
+    stateInput#,
+    stateLoc#,
+    stateEq#,
+    State (..),
+    pattern State,
+    -- Result
+    Result# (..),
+    -- Parse monad
+    ParseT (..),
+    runParseT#,
+    runParseT,
+    mapParseT,
+    Parse,
+    runParse#,
+    runParse,
+    mapParse,
+    -- Core combinators
+    fail,
+    err,
+    succeed,
+    continue,
+    get#,
+    put#,
+    -- General combinators
+    not,
+    try,
+    cut,
+    catch,
+    branch,
+    lookahead,
+    withSpan,
+    withConsumed,
+    iter,
+    chainl,
+    chainr,
+    many,
+    some,
+    many_,
+    some_,
+    manySepBy,
+    someSepBy,
+    choose,
+    match,
+    -- Utf8 combinators
+    next,
+    eof,
+    anyChar,
+    charIf,
+    char,
+    string,
+    natural,
+) where
+
+import Control.Applicative
+import Control.Monad hiding (fail)
+import Control.Monad.Identity (Identity (..))
+import Control.Monad.Reader (MonadReader (..))
+import Control.Monad.State (MonadState (..))
+import Control.Monad.Trans (MonadTrans (..))
+import Data.ByteString (ByteString)
+import Data.ByteString qualified as BS
+import Data.ByteString.UTF8 qualified as UTF8
+import Data.Char
+import Data.Foldable
+import Data.Functor
+import GHC.Exts (Int (..), (-#))
+import Numeric.Natural
+import Text.Printf
+import Prelude hiding (fail, not, read)
+
+import Sparsec.Loc
+import Sparsec.Span
+
+--------------------------------------------------------------------------------
+-- State#
+
+-- TODO: Try making ByteString part strict.
+
+newtype State# = State_# (# ByteString, Loc# #)
+
+pattern State# :: ByteString -> Loc# -> State#
+pattern State# i l = State_# (# i, l #)
+{-# COMPLETE State# #-}
+
+stateInput# :: State# -> ByteString
+stateInput# (State# i _) = i
+
+stateLoc# :: State# -> Loc#
+stateLoc# (State# _ l) = l
+
+stateEq# :: State# -> State# -> Bool
+stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && l0 `locEq#` l1
+
+data State = StateBox {stateUnbox :: State#}
+
+pattern State :: ByteString -> Loc# -> State
+pattern State i l = StateBox (State# i l)
+{-# COMPLETE State #-}
+
+instance Eq State where
+    StateBox s0 == StateBox s1 = s0 `stateEq#` s1
+
+--------------------------------------------------------------------------------
+-- Result#
+
+data Result# e a
+    = Utf8Error# Loc#
+    | Failure#
+    | Error# e
+    | Success# a State#
+
+instance (Eq e, Eq a) => Eq (Result# e a) where
+    Utf8Error# l0 == Utf8Error# l1 = l0 `locEq#` l1
+    Failure# == Failure# = True
+    Error# e0 == Error# e1 = e0 == e1
+    Success# a0 s0 == Success# a1 s1 = a0 == a1 && s0 `stateEq#` s1
+    _ == _ = False
+
+instance (Show e, Show a) => Show (Result# e a) where
+    show = \case
+        Utf8Error# l -> "utf8 error at " ++ locShow# l
+        Failure# -> "failure"
+        Error# e -> "error: " ++ show e
+        Success# a (State# i _) ->
+            if BS.null i
+                then printf "success: %s" (show a)
+                else printf "success (%d bytes remaining): %s" (BS.length i) (show a)
+
+instance Functor (Result# e) where
+    fmap f = \case
+        Utf8Error# l -> Utf8Error# l
+        Failure# -> Failure#
+        Error# e -> Error# e
+        Success# a s -> Success# (f a) s
+
+--------------------------------------------------------------------------------
+-- Result
+
+-- TODO?
+
+--------------------------------------------------------------------------------
+-- ParseT
+
+-- | ParseT is a monad transformer for parsing. It effectively has a ByteString
+-- and Loc state effect for the input, and an error effect (of some type e) for
+-- parsing errors. Parsing *errors* are distinct from parsing *failures* in
+-- that only the latter trigger backtracking in the Alternative instance.
+newtype ParseT e m a = ParseT (State# -> m (Result# e a))
+
+runParseT# :: (Monad m) => ParseT e m a -> State# -> m (Result# e a)
+runParseT# (ParseT f) = f
+
+runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result# e a)
+runParseT p i l = p `runParseT#` State# i (locUnbox l)
+
+mapParseT ::
+    (Monad m, Monad m') =>
+    (m (Result# e a) -> m' (Result# e' a')) ->
+    ParseT e m a ->
+    ParseT e' m' a'
+mapParseT f p = ParseT \s -> f $ p `runParseT#` s
+
+--------------------------------------------------------------------------------
+-- Parse
+
+type Parse e a = ParseT e Identity a
+
+runParse# :: Parse e a -> State# -> Result# e a
+runParse# p s = runIdentity $ p `runParseT#` s
+
+runParse :: Parse e a -> ByteString -> Loc -> Result# e a
+runParse p i l = p `runParse#` State# i (locUnbox l)
+
+mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a'
+mapParse f = mapParseT (Identity . f . runIdentity)
+
+--------------------------------------------------------------------------------
+-- Core combinators
+
+fail :: (Monad m) => ParseT e m a
+fail = ParseT \_ -> pure Failure#
+
+err :: (Monad m) => e -> ParseT e m a
+err e = ParseT \_ -> pure $ Error# e
+
+succeed :: (Monad m) => a -> ParseT e m a
+succeed a = ParseT \s -> pure $ Success# a s
+
+continue :: (Monad m) => ParseT f m b -> (e -> ParseT f m b) -> (a -> ParseT f m b) -> ParseT e m a -> ParseT f m b
+continue kf ke ks p = ParseT \s ->
+    p `runParseT#` s >>= \case
+        Utf8Error# l -> pure $ Utf8Error# l
+        Failure# -> kf `runParseT#` s
+        Error# e -> ke e `runParseT#` s
+        Success# a s' -> ks a `runParseT#` s'
+
+get# :: (Monad m) => (State# -> ParseT e m a) -> ParseT e m a
+get# f = ParseT \s -> f s `runParseT#` s
+
+put# :: (Monad m) => State# -> ParseT e m a -> ParseT e m a
+put# s p = ParseT \_ -> p `runParseT#` s
+
+-- | Read the entire input without consuming it.
+-- getInput :: (Monad m) => ParseT e m ByteString
+-- getInput = ParseT \s -> pure $ Success# (stateInput# s) s
+
+-- | Replace the entire input without affecting the current location.
+-- putInput :: (Monad m) => ByteString -> ParseT e m ()
+-- putInput i = ParseT \(State# _ l) -> pure $ Success# () (State# i l)
+
+-- modifyInput :: (Monad m) => (ByteString -> ByteString) -> Parse e m ()
+-- modifyInput f = ParseT \(State# i l) -> pure $ Success# () (State (f i) l)
+
+-- getLoc :: (Monad m) => ParseT e m Loc
+-- getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s
+
+-- putLoc :: (Monad m) => Loc -> ParseT e m ()
+-- putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l))
+
+-- modifyLoc# :: (Monad m) => (Loc# -> Loc#) -> ParseT e m ()
+-- modifyLoc# f = ParseT \(State# i l) -> pure $ Success# () (State# i (f l))
+
+-- modifyLoc :: (Monad m) => (Loc -> Loc) -> ParseT e m ()
+-- modifyLoc f = ParseT \(State# i l) -> pure $ Success# () (State# i (unboxLoc . f . boxLoc $ l))
+
+-- save :: (Monad m) => ParseT e m State
+-- save = ParseT \s -> pure $ Success# (stateBox s) s
+
+-- load :: (Monad m) => State -> ParseT e m ()
+-- load s = ParseT \_ -> pure $ Success# () (stateUnbox s)
+
+--------------------------------------------------------------------------------
+-- Instances
+
+instance (Monad m) => Functor (ParseT e m) where
+    fmap = liftM
+
+instance (Monad m) => Applicative (ParseT e m) where
+    pure = succeed
+    (<*>) = ap
+
+instance (Monad m) => Monad (ParseT e m) where
+    p >>= k = continue fail err k p
+
+instance (Monad m) => Alternative (ParseT e m) where
+    empty = fail
+    p <|> q = continue q err succeed p
+
+instance MonadTrans (ParseT e) where
+    lift m = ParseT \s -> (`Success#` s) <$> m
+
+instance (MonadReader r m) => MonadReader r (ParseT e m) where
+    ask = lift ask
+    local f p = ParseT \s -> local f $ p `runParseT#` s
+
+instance (MonadState s m) => MonadState s (ParseT e m) where
+    get = lift get
+    put = lift . put
+
+--------------------------------------------------------------------------------
+-- General combinators
+
+-- TODO: some_, many_
+-- TODO: Boxed variants of functions
+-- TODO: Non-CPS variants of functions
+
+-- | Turn a failure into a success and vice versa.
+not :: (Monad m) => ParseT e m a -> ParseT e m ()
+not = continue (succeed ()) err (const fail)
+
+-- | Convert an error into a failure.
+try :: (Monad m) => ParseT e m a -> ParseT e m a
+try = continue fail (const fail) succeed
+
+-- | Convert a failure into an error.
+cut :: (Monad m) => ParseT e m a -> e -> ParseT e m a
+p `cut` e = continue (err e) err succeed p
+
+-- | Catch an error.
+catch :: (Monad m) => ParseT e m a -> (e -> ParseT f m a) -> ParseT f m a
+p `catch` h = continue fail h succeed p
+
+-- | `branch p ks kf` runs p; if it succeeds, it continues with ks; if it fails,
+-- it continues with kf.
+branch :: (Monad m) => ParseT e m a -> (a -> ParseT e m b) -> ParseT e m b -> ParseT e m b
+branch p ks kf = continue kf err ks p
+
+-- | Run a parser without changing the parser state.
+lookahead :: (Monad m) => ParseT e m a -> ParseT e m a
+lookahead p = get# \s -> p <* put# s (pure ())
+
+withSpan :: (Monad m) => (a -> Span# -> ParseT e m b) -> ParseT e m a -> ParseT e m b
+withSpan k p =
+    get# \(State# _ beg) ->
+        p >>= \a ->
+            get# \(State# _ end) ->
+                k a (Span# beg end)
+
+withConsumed :: (Monad m) => (a -> ByteString -> ParseT e m b) -> ParseT e m a -> ParseT e m b
+withConsumed k p =
+    get# \(State# i (Loc# b0 _ _)) ->
+        p >>= \a ->
+            get# \(State# _ (Loc# b1 _ _)) ->
+                k a (BS.take (I# (b1 -# b0)) i)
+
+-- | Iterate a parsing function until it fails. I.e.,
+-- @iter f a = pure a >>= f >>= f >>= ...@
+-- where the chain of binds is as long as possible without failure.
+iter :: (Monad m) => (a -> ParseT e m a) -> a -> ParseT e m a
+iter f a = (f a >>= iter f) <|> pure a
+
+-- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested
+-- fashion.
+chainl :: (Monad m) => (b -> a -> b) -> ParseT e m b -> ParseT e m a -> ParseT e m b
+chainl f pb pa = pb >>= iter \b -> f b <$> pa
+
+-- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a
+-- right-nested fashion.
+chainr :: (Monad m) => (a -> b -> b) -> ParseT e m a -> ParseT e m b -> ParseT e m b
+chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb
+
+many_ :: (Monad m) => ParseT e m a -> ParseT e m ()
+many_ p = some_ p <|> pure ()
+
+some_ :: (Monad m) => ParseT e m a -> ParseT e m ()
+some_ p = p *> many_ p
+
+manySepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+manySepBy p sep = someSepBy p sep <|> pure []
+
+someSepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+someSepBy p sep = (:) <$> p <*> many (sep *> p)
+
+choose :: (Monad m, Foldable t) => t (ParseT e m a) -> ParseT e m a
+choose = asum
+
+-- | `match scrut cases` binds scrut to the first non-failing case and returns
+-- the result.
+match ::
+    (Monad m, Functor t, Foldable t) =>
+    ParseT e m a ->
+    t (a -> ParseT e m b) ->
+    ParseT e m b
+match scrut cases = choose $ (scrut >>=) <$> cases
+
+--------------------------------------------------------------------------------
+-- Utf8 combinators
+
+-- TODO: Try writing custom utf8 decoder that doesn't allocate Maybes.
+-- TODO: Non-CPS version of next?
+
+next :: (Monad m) => ParseT e m a -> (Char -> ParseT e m a) -> ParseT e m a
+next ke kc = ParseT \s@(State# i l) ->
+    case UTF8.decode i of
+        Just (c, w)
+            | c == '\xFFFD' -> pure $ Utf8Error# l
+            | otherwise -> kc c `runParseT#` State# (BS.drop w i) (locAdvance# c w l)
+        Nothing -> ke `runParseT#` s
+
+eof :: (Monad m) => ParseT e m ()
+eof = get# \(State# i _) -> guard (BS.null i)
+
+anyChar :: (Monad m) => ParseT e m Char
+anyChar = next fail succeed
+
+charIf :: (Monad m) => (Char -> Bool) -> ParseT e m Char
+charIf want = next fail (\c -> guard (want c) $> c)
+
+char :: (Monad m) => Char -> ParseT e m ()
+char c = void $ charIf (== c)
+
+string :: (Monad m) => String -> ParseT e m ()
+string = traverse_ char
+
+natural :: (Monad m) => Int -> ParseT e m Natural
+natural = \case
+    0 ->
+        let prefix l u = char '0' *> (char l <|> char u)
+        in prefix 'b' 'B' *> natural 2
+            <|> prefix 'q' 'Q' *> natural 4
+            <|> prefix 'o' 'O' *> natural 8
+            <|> prefix 'x' 'X' *> natural 16
+            <|> natural 10
+    b
+        | 0 < b && b <= 36 ->
+            let nat :: Int -> Natural
+                nat = fromIntegral
+
+                maxn = min (chr $ ord '0' + b) (succ '9')
+                maxu = min (chr $ ord 'A' + b - 10) (succ 'Z')
+                maxl = min (chr $ ord 'a' + b - 10) (succ 'z')
+
+                numeral =
+                    anyChar >>= \c ->
+                        if
+                            | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0')
+                            | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10)
+                            | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10)
+                            | otherwise -> fail
+
+                underscores = many (char '_')
+            in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral))
+    _ -> error "natural: invalid base"
diff --git a/src/Sparsec/Char.hs b/src/Sparsec/Char.hs
@@ -0,0 +1,38 @@
+module Sparsec.Char (
+    charIsUpperLatin,
+    charIsLowerLatin,
+    charIsLatin,
+    charIsUpperGreek,
+    charIsLowerGreek,
+    charIsGreek,
+    charIsSafeUpperGreek,
+    charIsSafeLowerGreek,
+    charIsSafeGreek,
+    charIsNum2,
+    charIsNum4,
+    charIsNum8,
+    charIsNum10,
+    charIsNum16,
+) where
+
+charIsUpperLatin, charIsLowerLatin, charIsLatin :: Char -> Bool
+charIsUpperLatin c = 'A' <= c && c <= 'Z'
+charIsLowerLatin c = 'a' <= c && c <= 'z'
+charIsLatin c = charIsUpperLatin c || charIsLowerLatin c
+
+charIsUpperGreek, charIsLowerGreek, charIsGreek :: Char -> Bool
+charIsUpperGreek c = 'Α' <= c && c <= 'Ω'
+charIsLowerGreek c = 'α' <= c && c <= 'ω'
+charIsGreek c = charIsUpperGreek c || charIsLowerGreek c
+
+charIsSafeUpperGreek, charIsSafeLowerGreek, charIsSafeGreek :: Char -> Bool
+charIsSafeUpperGreek c = c == 'Γ' || c == 'Δ' || c == 'Θ' || c == 'Λ' || c == 'Ξ' || c == 'Π' || c == 'Σ' || c == 'Φ' || c == 'Ψ' || c == 'Ω'
+charIsSafeLowerGreek c = charIsLowerGreek c && c /= 'ο'
+charIsSafeGreek c = charIsSafeUpperGreek c || charIsSafeLowerGreek c
+
+charIsNum2, charIsNum4, charIsNum8, charIsNum10, charIsNum16 :: Char -> Bool
+charIsNum2 c = '0' <= c && c <= '1'
+charIsNum4 c = '0' <= c && c <= '3'
+charIsNum8 c = '0' <= c && c <= '7'
+charIsNum10 c = '0' <= c && c <= '9'
+charIsNum16 c = '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'
diff --git a/src/Sparsec/Loc.hs b/src/Sparsec/Loc.hs
@@ -0,0 +1,62 @@
+module Sparsec.Loc (
+    Loc#,
+    pattern Loc#,
+    locByte#,
+    locRow#,
+    locCol#,
+    locEq#,
+    locShow#,
+    locAdvance#,
+    Loc (..),
+    pattern Loc,
+    locZero,
+) where
+
+import GHC.Exts (Int (..), Int#, (+#), (==#))
+
+intEq# :: Int# -> Int# -> Bool
+intEq# x y = case x ==# y of
+    0# -> False
+    _ -> True
+
+newtype Loc# = Loc_# (# Int#, Int#, Int# #)
+
+pattern Loc# :: Int# -> Int# -> Int# -> Loc#
+pattern Loc# b r c = Loc_# (# b, r, c #)
+{-# COMPLETE Loc# #-}
+
+locByte#, locRow#, locCol# :: Loc# -> Int#
+locByte# (Loc# b _ _) = b
+locRow# (Loc# _ r _) = r
+locCol# (Loc# _ _ c) = c
+
+locEq# :: Loc# -> Loc# -> Bool
+locEq# (Loc# b0 r0 c0) (Loc# b1 r1 c1) =
+    b0 `intEq#` b1 && r0 `intEq#` r1 && c0 `intEq#` c1
+
+locShow# :: Loc# -> String
+locShow# (Loc# b r c) = show (I# b) ++ "@" ++ show (I# r) ++ ":" ++ show (I# c)
+
+locAdvance# :: Char -> Int -> Loc# -> Loc#
+locAdvance# x (I# w) (Loc# b r c) = case x of
+    '\n' -> Loc# (b +# w) (r +# 1#) 0#
+    _ -> Loc# (b +# w) r (c +# 1#)
+
+-- | Loc is a source code location, tracking byte offset, row (number of '\n'
+-- characters seen so far), and column (number of characters into the current
+-- row). All three of these quantities are 0-based. The column tracking is
+-- somewhat stupid in that it treats each Unicode codepoint as one column.
+data Loc = LocBox {locUnbox :: Loc#}
+
+pattern Loc :: Int# -> Int# -> Int# -> Loc
+pattern Loc b r c = LocBox (Loc# b r c)
+{-# COMPLETE Loc #-}
+
+locZero :: Loc
+locZero = Loc 0# 0# 0#
+
+instance Eq Loc where
+    LocBox l0 == LocBox l1 = l0 `locEq#` l1
+
+instance Show Loc where
+    show (LocBox l) = locShow# l
diff --git a/src/Sparsec/Span.hs b/src/Sparsec/Span.hs
@@ -0,0 +1,43 @@
+module Sparsec.Span (
+    Span#,
+    pattern Span#,
+    spanBeg#,
+    spanEnd#,
+    spanEq#,
+    spanShow#,
+    Span (..),
+    pattern Span,
+) where
+
+import Sparsec.Loc
+
+newtype Span# = Span_# (# Loc#, Loc# #)
+
+pattern Span# :: Loc# -> Loc# -> Span#
+pattern Span# beg end = Span_# (# beg, end #)
+{-# COMPLETE Span# #-}
+
+spanBeg#, spanEnd# :: Span# -> Loc#
+spanBeg# (Span# beg _) = beg
+spanEnd# (Span# _ end) = end
+
+spanEq# :: Span# -> Span# -> Bool
+spanEq# (Span# beg0 end0) (Span# beg1 end1) =
+    beg0 `locEq#` beg1 && end0 `locEq#` end1
+
+spanShow# :: Span# -> String
+spanShow# (Span# beg end) = locShow# beg ++ "--" ++ locShow# end
+
+-- | Span is a pair of Loc's specifying a range in the input. The beginning/left
+-- Loc is inclusive and the ending/right Loc is exclusive.
+data Span = SpanBox {spanUnbox :: Span#}
+
+pattern Span :: Loc# -> Loc# -> Span
+pattern Span beg end = SpanBox (Span# beg end)
+{-# COMPLETE Span #-}
+
+instance Eq Span where
+    SpanBox span0 == SpanBox span1 = span0 `spanEq#` span1
+
+instance Show Span where
+    show (SpanBox span) = spanShow# span

	sparsec simple Haskell parser combinator library
	git clone git://git.rr3.xyz/sparsec
	Log \| Files \| Refs \| README \| LICENSE

D	Sparsec.hs	\|	628	-------------------------------------------------------------------------------
M	package.yaml	\|	17	++++++++---------
A	src/Sparsec.hs	\|	407	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/Sparsec/Char.hs	\|	38	++++++++++++++++++++++++++++++++++++++
A	src/Sparsec/Loc.hs	\|	62	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/Sparsec/Span.hs	\|	43	+++++++++++++++++++++++++++++++++++++++++++