commit a9b161714eae5633acb6095afb28058b8a21bf14
parent d4de0fa6982ad9b4be8024e371e1b25dcb99275a
Author: Robert Russell <robert@rr3.xyz>
Date: Sun, 21 Dec 2025 21:52:53 -0800
Start huge refactor
Diffstat:
6 files changed, 558 insertions(+), 637 deletions(-)
diff --git a/Sparsec.hs b/Sparsec.hs
@@ -1,628 +0,0 @@
-module Sparsec (
- -- Char
- charIsUpperLatin,
- charIsLowerLatin,
- charIsLatin,
- charIsUpperGreek,
- charIsLowerGreek,
- charIsGreek,
- charIsSafeUpperGreek,
- charIsSafeLowerGreek,
- charIsSafeGreek,
- charIsNum2,
- charIsNum4,
- charIsNum8,
- charIsNum10,
- charIsNum16,
- -- Rune
- Rune (..),
- runeElim,
- runeIsUpperLatin,
- runeIsLowerLatin,
- runeIsLatin,
- runeIsUpperGreek,
- runeIsLowerGreek,
- runeIsGreek,
- runeIsSafeUpperGreek,
- runeIsSafeLowerGreek,
- runeIsSafeGreek,
- runeIsNum2,
- runeIsNum4,
- runeIsNum8,
- runeIsNum10,
- runeIsNum16,
- -- Loc
- Loc#,
- pattern Loc#,
- locByte#,
- locRow#,
- locCol#,
- locEq#,
- Loc (..),
- locZero,
- locBox,
- locUnbox,
- -- Span
- Span#,
- pattern Span#,
- spanBeg#,
- spanEnd#,
- spanEq#,
- Span (..),
- spanBox,
- spanUnbox,
- -- State
- State#,
- pattern State#,
- stateInput#,
- stateLoc#,
- stateEq#,
- State (..),
- stateBox,
- stateUnbox,
- -- Result
- Result# (..),
- -- Parse monad
- ParseT (..),
- runParseT#,
- runParseT,
- mapParseT,
- Parse,
- runParse#,
- runParse,
- mapParse,
- -- Core combinators
- fail,
- err,
- succeed,
- continue,
- read,
- write,
- getLoc,
- putLoc,
- save,
- load,
- -- Derived combinators
- not,
- catch,
- eof,
- lookahead,
- spanOf,
- bytesOf,
- cut,
- try,
- iter,
- chainl,
- chainr,
- branch,
- choice,
- match,
- someSepBy,
- manySepBy,
- -- TODO: Byte combinators
- -- Utf8 combinators
- Utf8Error (..),
- nextRune,
- nextChar,
- runeIfM,
- runeIf,
- charIfM,
- charIf,
- runeWhileM,
- runeWhile,
- charWhileM,
- charWhile,
- rune,
- char,
- string,
- natural,
-) where
-
-import Control.Applicative
-import Control.Monad hiding (fail)
-import Control.Monad.Identity (Identity (..))
-import Control.Monad.Reader (MonadReader (..))
-import Control.Monad.State (MonadState (..))
-import Control.Monad.Trans (MonadTrans (..))
-import Data.ByteString (ByteString)
-import Data.ByteString qualified as BS
-import Data.ByteString.UTF8 qualified as UTF8
-import Data.Char
-import Data.Foldable
-import Data.Functor
-import GHC.Exts (Int (..), Int#, (==#))
-import Numeric.Natural
-import Text.Printf
-import Prelude hiding (fail, not, read)
-
---------------------------------------------------------------------------------
--- Util
-
-intEq# :: Int# -> Int# -> Bool
-intEq# x y = case x ==# y of
- 0# -> False
- _ -> True
-
---------------------------------------------------------------------------------
--- Char
-
-charIsUpperLatin, charIsLowerLatin, charIsLatin :: Char -> Bool
-charIsUpperLatin c = 'A' <= c && c <= 'Z'
-charIsLowerLatin c = 'a' <= c && c <= 'z'
-charIsLatin c = charIsUpperLatin c || charIsLowerLatin c
-
-charIsUpperGreek, charIsLowerGreek, charIsGreek :: Char -> Bool
-charIsUpperGreek c = 'Α' <= c && c <= 'Ω'
-charIsLowerGreek c = 'α' <= c && c <= 'ω'
-charIsGreek c = charIsUpperGreek c || charIsLowerGreek c
-
-charIsSafeUpperGreek, charIsSafeLowerGreek, charIsSafeGreek :: Char -> Bool
-charIsSafeUpperGreek c = c == 'Γ' || c == 'Δ' || c == 'Θ' || c == 'Λ' || c == 'Ξ' || c == 'Π' || c == 'Σ' || c == 'Φ' || c == 'Ψ' || c == 'Ω'
-charIsSafeLowerGreek c = charIsLowerGreek c && c /= 'ο'
-charIsSafeGreek c = charIsSafeUpperGreek c || charIsSafeLowerGreek c
-
-charIsNum2, charIsNum4, charIsNum8, charIsNum10, charIsNum16 :: Char -> Bool
-charIsNum2 c = '0' <= c && c <= '1'
-charIsNum4 c = '0' <= c && c <= '3'
-charIsNum8 c = '0' <= c && c <= '7'
-charIsNum10 c = '0' <= c && c <= '9'
-charIsNum16 c = '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'
-
---------------------------------------------------------------------------------
--- Rune
--- TODO: Are runes bloat? Should we delete them?
-
-data Rune = RuneEof | RuneChar Char deriving (Eq)
-
-instance Show Rune where
- show = \case
- RuneEof -> "EOF"
- RuneChar c -> show c
-
-runeElim :: a -> (Char -> a) -> Rune -> a
-runeElim eof chr = \case
- RuneEof -> eof
- RuneChar c -> chr c
-
-runeIsUpperLatin, runeIsLowerLatin, runeIsLatin :: Rune -> Bool
-runeIsUpperLatin = runeElim False charIsUpperLatin
-runeIsLowerLatin = runeElim False charIsLowerLatin
-runeIsLatin = runeElim False charIsLatin
-
-runeIsUpperGreek, runeIsLowerGreek, runeIsGreek :: Rune -> Bool
-runeIsUpperGreek = runeElim False charIsUpperGreek
-runeIsLowerGreek = runeElim False charIsLowerGreek
-runeIsGreek = runeElim False charIsGreek
-
-runeIsSafeUpperGreek, runeIsSafeLowerGreek, runeIsSafeGreek :: Rune -> Bool
-runeIsSafeUpperGreek = runeElim False charIsSafeUpperGreek
-runeIsSafeLowerGreek = runeElim False charIsSafeLowerGreek
-runeIsSafeGreek = runeElim False charIsSafeGreek
-
-runeIsNum2, runeIsNum4, runeIsNum8, runeIsNum10, runeIsNum16 :: Rune -> Bool
-runeIsNum2 = runeElim False charIsNum2
-runeIsNum4 = runeElim False charIsNum4
-runeIsNum8 = runeElim False charIsNum8
-runeIsNum10 = runeElim False charIsNum10
-runeIsNum16 = runeElim False charIsNum16
-
---------------------------------------------------------------------------------
--- Loc
-
-newtype Loc# = Loc_# (# Int#, Int#, Int# #)
-pattern Loc# :: Int# -> Int# -> Int# -> Loc#
-pattern Loc# b r c = Loc_# (# b, r, c #)
-{-# COMPLETE Loc# #-}
-
-locByte#, locRow#, locCol# :: Loc# -> Int#
-locByte# (Loc# b _ _) = b
-locRow# (Loc# _ r _) = r
-locCol# (Loc# _ _ c) = c
-
-locEq# :: Loc# -> Loc# -> Bool
-locEq# (Loc# b0 r0 c0) (Loc# b1 r1 c1) =
- intEq# b0 b1 && intEq# r0 r1 && intEq# c0 c1
-
--- | Loc is a source code location, tracking byte offset, row (number of '\n'
--- characters seen so far), and column (number of characters into the current
--- row). All three of these quantities are 0-based. The column tracking is
--- somewhat stupid in that it treats each Unicode codepoint as one column.
-data Loc = Loc {locByte :: Int, locRow :: Int, locCol :: Int} deriving (Eq)
-
-locZero :: Loc
-locZero = Loc 0 0 0
-
--- Weird Show instance intended for debugging
-instance Show Loc where
- show (Loc b r c) = printf "%d:%d#%d" r c b
-
-locBox :: Loc# -> Loc
-locBox (Loc# b r c) = Loc (I# b) (I# r) (I# c)
-
-locUnbox :: Loc -> Loc#
-locUnbox (Loc (I# b) (I# r) (I# c)) = Loc# b r c
-
---------------------------------------------------------------------------------
--- Span
-
-newtype Span# = Span_# (# Loc#, Loc# #)
-pattern Span# :: Loc# -> Loc# -> Span#
-pattern Span# beg end = Span_# (# beg, end #)
-{-# COMPLETE Span# #-}
-
-spanBeg#, spanEnd# :: Span# -> Loc#
-spanBeg# (Span# beg _) = beg
-spanEnd# (Span# _ end) = end
-
-spanEq# :: Span# -> Span# -> Bool
-spanEq# (Span# beg0 end0) (Span# beg1 end1) =
- locEq# beg0 beg1 && locEq# end0 end1
-
--- | Span is a pair of Loc's specifying a range in the input. The beginning/left
--- Loc is inclusive and the ending/right Loc is exclusive.
-data Span = Span {spanBeg :: Loc, spanEnd :: Loc} deriving (Eq)
-
--- Weird Show instance intended for debugging
-instance Show Span where
- show (Span beg end) = printf "%s--%s" (show beg) (show end)
-
-spanBox :: Span# -> Span
-spanBox (Span# beg end) = Span (locBox beg) (locBox end)
-
-spanUnbox :: Span -> Span#
-spanUnbox (Span beg end) = Span# (locUnbox beg) (locUnbox end)
-
---------------------------------------------------------------------------------
--- State
-
-newtype State# = State_# (# ByteString, Loc# #)
-pattern State# :: ByteString -> Loc# -> State#
-pattern State# i l = State_# (# i, l #)
-{-# COMPLETE State# #-}
-
-stateInput# :: State# -> ByteString
-stateInput# (State# i _) = i
-
-stateLoc# :: State# -> Loc#
-stateLoc# (State# _ l) = l
-
-stateEq# :: State# -> State# -> Bool
-stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && locEq# l0 l1
-
-data State = State {stateInput :: ByteString, stateLoc :: Loc} deriving (Eq)
-
-stateBox :: State# -> State
-stateBox (State# i l) = State i (locBox l)
-
-stateUnbox :: State -> State#
-stateUnbox (State i l) = State# i (locUnbox l)
-
---------------------------------------------------------------------------------
--- Result
-
--- TODO: Result (no hash)
-
-data Result# e a = Failure# | Error# e | Success# a State#
-
-instance (Eq e, Eq a) => Eq (Result# e a) where
- Failure# == Failure# = True
- Error# e0 == Error# e1 = e0 == e1
- Success# a0 s0 == Success# a1 s1 = a0 == a1 && stateEq# s0 s1
- _ == _ = False
-
-instance (Show e, Show a) => Show (Result# e a) where
- show = \case
- Failure# -> "failure"
- Error# e -> "error: " ++ show e
- Success# a (State# i _) ->
- if BS.null i
- then
- printf "success: %s" (show a)
- else
- printf
- "success (%d bytes remaining): %s"
- (BS.length i)
- (show a)
-
-instance Functor (Result# e) where
- fmap f = \case
- Failure# -> Failure#
- Error# e -> Error# e
- Success# a s -> Success# (f a) s
-
---------------------------------------------------------------------------------
--- ParseT
-
--- | ParseT is a monad transformer for parsing. It effectively has a ByteString
--- and Loc state effect for the input, and an error effect (of some type e) for
--- parsing errors. Parsing *errors* are distinct from parsing *failures* in
--- that only the latter trigger backtracking in the Alternative instance.
-newtype ParseT e m a = ParseT (State# -> m (Result# e a))
-
-runParseT# :: (Monad m) => ParseT e m a -> State# -> m (Result# e a)
-runParseT# (ParseT f) = f
-
-runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result# e a)
-runParseT p i l = p `runParseT#` State# i (locUnbox l)
-
-mapParseT ::
- (Monad m, Monad m') =>
- (m (Result# e a) -> m' (Result# e' a')) ->
- ParseT e m a ->
- ParseT e' m' a'
-mapParseT f p = ParseT \s -> f $ p `runParseT#` s
-
---------------------------------------------------------------------------------
--- Parse
-
-type Parse e a = ParseT e Identity a
-
-runParse# :: Parse e a -> State# -> Result# e a
-runParse# p s = runIdentity $ p `runParseT#` s
-
-runParse :: Parse e a -> ByteString -> Loc -> Result# e a
-runParse p i l = p `runParse#` State# i (locUnbox l)
-
-mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a'
-mapParse f = mapParseT (Identity . f . runIdentity)
-
---------------------------------------------------------------------------------
--- Core combinators
-
-fail :: (Monad m) => ParseT e m a
-fail = ParseT \_ -> pure Failure#
-
-err :: (Monad m) => e -> ParseT e m a
-err e = ParseT \_ -> pure $ Error# e
-
-succeed :: (Monad m) => a -> ParseT e m a
-succeed a = ParseT \s -> pure $ Success# a s
-
-continue :: (Monad m) => ParseT e' m a' -> (e -> ParseT e' m a') -> (a -> ParseT e' m a') -> ParseT e m a -> ParseT e' m a'
-continue kf ke ks p = ParseT \s ->
- p `runParseT#` s >>= \case
- Failure# -> kf `runParseT#` s
- Error# e -> ke e `runParseT#` s
- Success# a s' -> ks a `runParseT#` s'
-
--- | Read the entire input without consuming it.
-read :: (Monad m) => ParseT e m ByteString
-read = ParseT \s -> pure $ Success# (stateInput# s) s
-
--- | Replace the entire input without affecting the current location.
-write :: (Monad m) => ByteString -> ParseT e m ()
-write i = ParseT \(State# _ l) -> pure $ Success# () (State# i l)
-
-getLoc :: (Monad m) => ParseT e m Loc
-getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s
-
-putLoc :: (Monad m) => Loc -> ParseT e m ()
-putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l))
-
-save :: (Monad m) => ParseT e m State
-save = ParseT \s -> pure $ Success# (stateBox s) s
-
-load :: (Monad m) => State -> ParseT e m ()
-load s = ParseT \_ -> pure $ Success# () (stateUnbox s)
-
---------------------------------------------------------------------------------
--- Instances
-
-instance (Monad m) => Functor (ParseT e m) where
- fmap = liftM
-
-instance (Monad m) => Applicative (ParseT e m) where
- pure = succeed
- (<*>) = ap
-
-instance (Monad m) => Monad (ParseT e m) where
- p >>= k = continue fail err k p
-
-instance (Monad m) => Alternative (ParseT e m) where
- empty = fail
- p <|> q = continue q err succeed p
-
-instance MonadTrans (ParseT e) where
- lift m = ParseT \s -> (`Success#` s) <$> m
-
-instance (MonadReader r m) => MonadReader r (ParseT e m) where
- ask = lift ask
- local f p = ParseT \s -> local f $ p `runParseT#` s
-
-instance (MonadState s m) => MonadState s (ParseT e m) where
- get = lift get
- put = lift . put
-
---------------------------------------------------------------------------------
--- General combinators
-
--- | Turn a failure into a success and vice versa.
-not :: (Monad m) => ParseT e m a -> ParseT e m ()
-not = continue (succeed ()) err (const fail)
-
--- | Catch an error.
-catch :: (Monad m) => ParseT e m a -> (e -> ParseT e' m a) -> ParseT e' m a
-p `catch` h = continue fail h succeed p
-
-eof :: (Monad m) => ParseT e m ()
-eof = guard . BS.null =<< read
-
--- | Run a parser without changing the parser state.
-lookahead :: (Monad m) => ParseT e m a -> ParseT e m a
-lookahead p = do
- s <- save
- a <- p
- load s
- pure a
-
-spanOf :: (Monad m) => ParseT e m a -> ParseT e m (a, Span)
-spanOf p = do
- beg <- getLoc
- a <- p
- end <- getLoc
- pure (a, Span beg end)
-
-bytesOf :: (Monad m) => ParseT e m a -> ParseT e m (a, ByteString)
-bytesOf p = do
- State i beg <- save
- a <- p
- end <- getLoc
- pure (a, BS.take (locByte end - locByte beg) i)
-
--- | Convert a failure into an error.
---
--- TODO: Why do we call this "cut"?
-cut :: (Monad m) => ParseT e m a -> e -> ParseT e m a
-p `cut` e = p <|> err e
-
--- | Convert an error into a failure.
-try :: (Monad m) => ParseT e m a -> ParseT e m a
-try p = p `catch` const fail
-
--- | Iterate a parsing function until it fails. I.e.,
--- @iter f a = pure a >>= f >>= f >>= ...@
--- where the chain of binds is as long as possible without failure.
-iter :: (Monad m) => (a -> ParseT e m a) -> a -> ParseT e m a
-iter f a = (f a >>= iter f) <|> pure a
-
--- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested
--- fashion.
-chainl :: (Monad m) => (b -> a -> b) -> ParseT e m b -> ParseT e m a -> ParseT e m b
-chainl f pb pa = pb >>= iter \b -> f b <$> pa
-
--- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a
--- right-nested fashion.
-chainr :: (Monad m) => (a -> b -> b) -> ParseT e m a -> ParseT e m b -> ParseT e m b
-chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb
-
--- | `branch pc pt pf` runs pc; if it succeeds, it continues with pt, and
--- otherwise it continues with pf.
-branch :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m b -> ParseT e m b
-branch p kt kf = continue kf err (const kt) p
-
-choice :: (Monad m, Foldable t) => t (ParseT e m a) -> ParseT e m a
-choice = asum
-
--- | `match scrut cases` binds scrut to the first non-failing case and returns
--- the result.
-match ::
- (Monad m, Functor t, Foldable t) =>
- ParseT e m a ->
- t (a -> ParseT e m b) ->
- ParseT e m b
-match scrut cases = choice $ (scrut >>=) <$> cases
-
-someSepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a]
-someSepBy p sep = (:) <$> p <*> many (sep *> p)
-
-manySepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a]
-manySepBy p sep = someSepBy p sep <|> pure []
-
---------------------------------------------------------------------------------
--- Byte-wise parse combinators
-
--- TODO
-
---------------------------------------------------------------------------------
--- Utf8 parse combinators
-
--- All these combinators involve UTF-8 decoding and therefore require that
--- the error type e supports UTF-8 errors.
-
-class Utf8Error e where
- utf8Error :: Loc -> e
-
-nextRune :: forall e m. (Utf8Error e, Monad m) => ParseT e m Rune
-nextRune = toRune . UTF8.decode =<< read
- where
- toRune :: Maybe (Char, Int) -> ParseT e m Rune
- toRune = \case
- Just (c, w)
- | c == UTF8.replacement_char -> err . utf8Error =<< getLoc
- | otherwise -> updateState c w $> RuneChar c
- Nothing -> pure RuneEof
-
- updateState :: Char -> Int -> ParseT e m ()
- updateState c w = do
- write . BS.drop w =<< read
- loc <- getLoc
- putLoc case c of
- '\n' -> Loc (locByte loc + w) (locRow loc + 1) 0
- _ -> Loc (locByte loc + w) (locRow loc) (locCol loc + 1)
-
-nextChar :: (Utf8Error e, Monad m) => ParseT e m Char
-nextChar =
- nextRune >>= \case
- RuneEof -> fail
- RuneChar c -> pure c
-
-runeIfM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m Rune
-runeIfM want = do
- r <- nextRune
- lift (want r) >>= guard
- pure r
-
-runeIf :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m Rune
-runeIf want = runeIfM (pure . want)
-
-charIfM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m Char
-charIfM want = do
- c <- nextChar
- lift (want c) >>= guard
- pure c
-
-charIf :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m Char
-charIf want = charIfM (pure . want)
-
-runeWhileM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m ByteString
-runeWhileM want = snd <$> bytesOf (many (runeIfM want))
-
-runeWhile :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m ByteString
-runeWhile want = runeWhileM (pure . want)
-
-charWhileM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m ByteString
-charWhileM want = runeWhileM \case
- RuneEof -> pure False
- RuneChar c -> want c
-
-charWhile :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m ByteString
-charWhile want = charWhileM (pure . want)
-
-rune :: (Utf8Error e, Monad m) => Rune -> ParseT e m ()
-rune r = void $ runeIf (== r)
-
-char :: (Utf8Error e, Monad m) => Char -> ParseT e m ()
-char c = void $ charIf (== c)
-
-string :: (Utf8Error e, Monad m) => String -> ParseT e m ()
-string = traverse_ char
-
-natural :: (Utf8Error e, Monad m) => Int -> ParseT e m Natural
-natural = \case
- 0 ->
- let prefix l u = char '0' *> (char l <|> char u)
- in prefix 'b' 'B' *> natural 2
- <|> prefix 'q' 'Q' *> natural 4
- <|> prefix 'o' 'O' *> natural 8
- <|> prefix 'x' 'X' *> natural 16
- <|> natural 10
- b
- | 0 < b && b <= 36 ->
- let nat :: Int -> Natural
- nat = fromIntegral
-
- maxn = min (chr $ ord '0' + b) (succ '9')
- maxu = min (chr $ ord 'A' + b - 10) (succ 'Z')
- maxl = min (chr $ ord 'a' + b - 10) (succ 'z')
-
- numeral =
- nextChar >>= \c ->
- if
- | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0')
- | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10)
- | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10)
- | otherwise -> fail
-
- underscores = many (char '_')
- in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral))
- _ -> error "natural: invalid base"
diff --git a/package.yaml b/package.yaml
@@ -1,5 +1,5 @@
name: sparsec
-version: 0.1.1.0
+version: 0.2.0.0
author: "Robert Russell"
license: ISC
@@ -26,14 +26,13 @@ dependencies:
- bytestring
library:
- source-dirs: .
+ source-dirs: src
dependencies:
- mtl
- utf8-string
-
-executables:
- example:
- source-dirs: example
- main: Main.hs
- dependencies:
- - sparsec
+# executables:
+# example:
+# source-dirs: example
+# main: Main.hs
+# dependencies:
+# - sparsec
diff --git a/src/Sparsec.hs b/src/Sparsec.hs
@@ -0,0 +1,407 @@
+module Sparsec (
+ -- State
+ State#,
+ pattern State#,
+ stateInput#,
+ stateLoc#,
+ stateEq#,
+ State (..),
+ pattern State,
+ -- Result
+ Result# (..),
+ -- Parse monad
+ ParseT (..),
+ runParseT#,
+ runParseT,
+ mapParseT,
+ Parse,
+ runParse#,
+ runParse,
+ mapParse,
+ -- Core combinators
+ fail,
+ err,
+ succeed,
+ continue,
+ get#,
+ put#,
+ -- General combinators
+ not,
+ try,
+ cut,
+ catch,
+ branch,
+ lookahead,
+ withSpan,
+ withConsumed,
+ iter,
+ chainl,
+ chainr,
+ many,
+ some,
+ many_,
+ some_,
+ manySepBy,
+ someSepBy,
+ choose,
+ match,
+ -- Utf8 combinators
+ next,
+ eof,
+ anyChar,
+ charIf,
+ char,
+ string,
+ natural,
+) where
+
+import Control.Applicative
+import Control.Monad hiding (fail)
+import Control.Monad.Identity (Identity (..))
+import Control.Monad.Reader (MonadReader (..))
+import Control.Monad.State (MonadState (..))
+import Control.Monad.Trans (MonadTrans (..))
+import Data.ByteString (ByteString)
+import Data.ByteString qualified as BS
+import Data.ByteString.UTF8 qualified as UTF8
+import Data.Char
+import Data.Foldable
+import Data.Functor
+import GHC.Exts (Int (..), (-#))
+import Numeric.Natural
+import Text.Printf
+import Prelude hiding (fail, not, read)
+
+import Sparsec.Loc
+import Sparsec.Span
+
+--------------------------------------------------------------------------------
+-- State#
+
+-- TODO: Try making ByteString part strict.
+
+newtype State# = State_# (# ByteString, Loc# #)
+
+pattern State# :: ByteString -> Loc# -> State#
+pattern State# i l = State_# (# i, l #)
+{-# COMPLETE State# #-}
+
+stateInput# :: State# -> ByteString
+stateInput# (State# i _) = i
+
+stateLoc# :: State# -> Loc#
+stateLoc# (State# _ l) = l
+
+stateEq# :: State# -> State# -> Bool
+stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && l0 `locEq#` l1
+
+data State = StateBox {stateUnbox :: State#}
+
+pattern State :: ByteString -> Loc# -> State
+pattern State i l = StateBox (State# i l)
+{-# COMPLETE State #-}
+
+instance Eq State where
+ StateBox s0 == StateBox s1 = s0 `stateEq#` s1
+
+--------------------------------------------------------------------------------
+-- Result#
+
+data Result# e a
+ = Utf8Error# Loc#
+ | Failure#
+ | Error# e
+ | Success# a State#
+
+instance (Eq e, Eq a) => Eq (Result# e a) where
+ Utf8Error# l0 == Utf8Error# l1 = l0 `locEq#` l1
+ Failure# == Failure# = True
+ Error# e0 == Error# e1 = e0 == e1
+ Success# a0 s0 == Success# a1 s1 = a0 == a1 && s0 `stateEq#` s1
+ _ == _ = False
+
+instance (Show e, Show a) => Show (Result# e a) where
+ show = \case
+ Utf8Error# l -> "utf8 error at " ++ locShow# l
+ Failure# -> "failure"
+ Error# e -> "error: " ++ show e
+ Success# a (State# i _) ->
+ if BS.null i
+ then printf "success: %s" (show a)
+ else printf "success (%d bytes remaining): %s" (BS.length i) (show a)
+
+instance Functor (Result# e) where
+ fmap f = \case
+ Utf8Error# l -> Utf8Error# l
+ Failure# -> Failure#
+ Error# e -> Error# e
+ Success# a s -> Success# (f a) s
+
+--------------------------------------------------------------------------------
+-- Result
+
+-- TODO?
+
+--------------------------------------------------------------------------------
+-- ParseT
+
+-- | ParseT is a monad transformer for parsing. It effectively has a ByteString
+-- and Loc state effect for the input, and an error effect (of some type e) for
+-- parsing errors. Parsing *errors* are distinct from parsing *failures* in
+-- that only the latter trigger backtracking in the Alternative instance.
+newtype ParseT e m a = ParseT (State# -> m (Result# e a))
+
+runParseT# :: (Monad m) => ParseT e m a -> State# -> m (Result# e a)
+runParseT# (ParseT f) = f
+
+runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result# e a)
+runParseT p i l = p `runParseT#` State# i (locUnbox l)
+
+mapParseT ::
+ (Monad m, Monad m') =>
+ (m (Result# e a) -> m' (Result# e' a')) ->
+ ParseT e m a ->
+ ParseT e' m' a'
+mapParseT f p = ParseT \s -> f $ p `runParseT#` s
+
+--------------------------------------------------------------------------------
+-- Parse
+
+type Parse e a = ParseT e Identity a
+
+runParse# :: Parse e a -> State# -> Result# e a
+runParse# p s = runIdentity $ p `runParseT#` s
+
+runParse :: Parse e a -> ByteString -> Loc -> Result# e a
+runParse p i l = p `runParse#` State# i (locUnbox l)
+
+mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a'
+mapParse f = mapParseT (Identity . f . runIdentity)
+
+--------------------------------------------------------------------------------
+-- Core combinators
+
+fail :: (Monad m) => ParseT e m a
+fail = ParseT \_ -> pure Failure#
+
+err :: (Monad m) => e -> ParseT e m a
+err e = ParseT \_ -> pure $ Error# e
+
+succeed :: (Monad m) => a -> ParseT e m a
+succeed a = ParseT \s -> pure $ Success# a s
+
+continue :: (Monad m) => ParseT f m b -> (e -> ParseT f m b) -> (a -> ParseT f m b) -> ParseT e m a -> ParseT f m b
+continue kf ke ks p = ParseT \s ->
+ p `runParseT#` s >>= \case
+ Utf8Error# l -> pure $ Utf8Error# l
+ Failure# -> kf `runParseT#` s
+ Error# e -> ke e `runParseT#` s
+ Success# a s' -> ks a `runParseT#` s'
+
+get# :: (Monad m) => (State# -> ParseT e m a) -> ParseT e m a
+get# f = ParseT \s -> f s `runParseT#` s
+
+put# :: (Monad m) => State# -> ParseT e m a -> ParseT e m a
+put# s p = ParseT \_ -> p `runParseT#` s
+
+-- | Read the entire input without consuming it.
+-- getInput :: (Monad m) => ParseT e m ByteString
+-- getInput = ParseT \s -> pure $ Success# (stateInput# s) s
+
+-- | Replace the entire input without affecting the current location.
+-- putInput :: (Monad m) => ByteString -> ParseT e m ()
+-- putInput i = ParseT \(State# _ l) -> pure $ Success# () (State# i l)
+
+-- modifyInput :: (Monad m) => (ByteString -> ByteString) -> Parse e m ()
+-- modifyInput f = ParseT \(State# i l) -> pure $ Success# () (State (f i) l)
+
+-- getLoc :: (Monad m) => ParseT e m Loc
+-- getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s
+
+-- putLoc :: (Monad m) => Loc -> ParseT e m ()
+-- putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l))
+
+-- modifyLoc# :: (Monad m) => (Loc# -> Loc#) -> ParseT e m ()
+-- modifyLoc# f = ParseT \(State# i l) -> pure $ Success# () (State# i (f l))
+
+-- modifyLoc :: (Monad m) => (Loc -> Loc) -> ParseT e m ()
+-- modifyLoc f = ParseT \(State# i l) -> pure $ Success# () (State# i (unboxLoc . f . boxLoc $ l))
+
+-- save :: (Monad m) => ParseT e m State
+-- save = ParseT \s -> pure $ Success# (stateBox s) s
+
+-- load :: (Monad m) => State -> ParseT e m ()
+-- load s = ParseT \_ -> pure $ Success# () (stateUnbox s)
+
+--------------------------------------------------------------------------------
+-- Instances
+
+instance (Monad m) => Functor (ParseT e m) where
+ fmap = liftM
+
+instance (Monad m) => Applicative (ParseT e m) where
+ pure = succeed
+ (<*>) = ap
+
+instance (Monad m) => Monad (ParseT e m) where
+ p >>= k = continue fail err k p
+
+instance (Monad m) => Alternative (ParseT e m) where
+ empty = fail
+ p <|> q = continue q err succeed p
+
+instance MonadTrans (ParseT e) where
+ lift m = ParseT \s -> (`Success#` s) <$> m
+
+instance (MonadReader r m) => MonadReader r (ParseT e m) where
+ ask = lift ask
+ local f p = ParseT \s -> local f $ p `runParseT#` s
+
+instance (MonadState s m) => MonadState s (ParseT e m) where
+ get = lift get
+ put = lift . put
+
+--------------------------------------------------------------------------------
+-- General combinators
+
+-- TODO: some_, many_
+-- TODO: Boxed variants of functions
+-- TODO: Non-CPS variants of functions
+
+-- | Turn a failure into a success and vice versa.
+not :: (Monad m) => ParseT e m a -> ParseT e m ()
+not = continue (succeed ()) err (const fail)
+
+-- | Convert an error into a failure.
+try :: (Monad m) => ParseT e m a -> ParseT e m a
+try = continue fail (const fail) succeed
+
+-- | Convert a failure into an error.
+cut :: (Monad m) => ParseT e m a -> e -> ParseT e m a
+p `cut` e = continue (err e) err succeed p
+
+-- | Catch an error.
+catch :: (Monad m) => ParseT e m a -> (e -> ParseT f m a) -> ParseT f m a
+p `catch` h = continue fail h succeed p
+
+-- | `branch p ks kf` runs p; if it succeeds, it continues with ks; if it fails,
+-- it continues with kf.
+branch :: (Monad m) => ParseT e m a -> (a -> ParseT e m b) -> ParseT e m b -> ParseT e m b
+branch p ks kf = continue kf err ks p
+
+-- | Run a parser without changing the parser state.
+lookahead :: (Monad m) => ParseT e m a -> ParseT e m a
+lookahead p = get# \s -> p <* put# s (pure ())
+
+withSpan :: (Monad m) => (a -> Span# -> ParseT e m b) -> ParseT e m a -> ParseT e m b
+withSpan k p =
+ get# \(State# _ beg) ->
+ p >>= \a ->
+ get# \(State# _ end) ->
+ k a (Span# beg end)
+
+withConsumed :: (Monad m) => (a -> ByteString -> ParseT e m b) -> ParseT e m a -> ParseT e m b
+withConsumed k p =
+ get# \(State# i (Loc# b0 _ _)) ->
+ p >>= \a ->
+ get# \(State# _ (Loc# b1 _ _)) ->
+ k a (BS.take (I# (b1 -# b0)) i)
+
+-- | Iterate a parsing function until it fails. I.e.,
+-- @iter f a = pure a >>= f >>= f >>= ...@
+-- where the chain of binds is as long as possible without failure.
+iter :: (Monad m) => (a -> ParseT e m a) -> a -> ParseT e m a
+iter f a = (f a >>= iter f) <|> pure a
+
+-- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested
+-- fashion.
+chainl :: (Monad m) => (b -> a -> b) -> ParseT e m b -> ParseT e m a -> ParseT e m b
+chainl f pb pa = pb >>= iter \b -> f b <$> pa
+
+-- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a
+-- right-nested fashion.
+chainr :: (Monad m) => (a -> b -> b) -> ParseT e m a -> ParseT e m b -> ParseT e m b
+chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb
+
+many_ :: (Monad m) => ParseT e m a -> ParseT e m ()
+many_ p = some_ p <|> pure ()
+
+some_ :: (Monad m) => ParseT e m a -> ParseT e m ()
+some_ p = p *> many_ p
+
+manySepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+manySepBy p sep = someSepBy p sep <|> pure []
+
+someSepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+someSepBy p sep = (:) <$> p <*> many (sep *> p)
+
+choose :: (Monad m, Foldable t) => t (ParseT e m a) -> ParseT e m a
+choose = asum
+
+-- | `match scrut cases` binds scrut to the first non-failing case and returns
+-- the result.
+match ::
+ (Monad m, Functor t, Foldable t) =>
+ ParseT e m a ->
+ t (a -> ParseT e m b) ->
+ ParseT e m b
+match scrut cases = choose $ (scrut >>=) <$> cases
+
+--------------------------------------------------------------------------------
+-- Utf8 combinators
+
+-- TODO: Try writing custom utf8 decoder that doesn't allocate Maybes.
+-- TODO: Non-CPS version of next?
+
+next :: (Monad m) => ParseT e m a -> (Char -> ParseT e m a) -> ParseT e m a
+next ke kc = ParseT \s@(State# i l) ->
+ case UTF8.decode i of
+ Just (c, w)
+ | c == '\xFFFD' -> pure $ Utf8Error# l
+ | otherwise -> kc c `runParseT#` State# (BS.drop w i) (locAdvance# c w l)
+ Nothing -> ke `runParseT#` s
+
+eof :: (Monad m) => ParseT e m ()
+eof = get# \(State# i _) -> guard (BS.null i)
+
+anyChar :: (Monad m) => ParseT e m Char
+anyChar = next fail succeed
+
+charIf :: (Monad m) => (Char -> Bool) -> ParseT e m Char
+charIf want = next fail (\c -> guard (want c) $> c)
+
+char :: (Monad m) => Char -> ParseT e m ()
+char c = void $ charIf (== c)
+
+string :: (Monad m) => String -> ParseT e m ()
+string = traverse_ char
+
+natural :: (Monad m) => Int -> ParseT e m Natural
+natural = \case
+ 0 ->
+ let prefix l u = char '0' *> (char l <|> char u)
+ in prefix 'b' 'B' *> natural 2
+ <|> prefix 'q' 'Q' *> natural 4
+ <|> prefix 'o' 'O' *> natural 8
+ <|> prefix 'x' 'X' *> natural 16
+ <|> natural 10
+ b
+ | 0 < b && b <= 36 ->
+ let nat :: Int -> Natural
+ nat = fromIntegral
+
+ maxn = min (chr $ ord '0' + b) (succ '9')
+ maxu = min (chr $ ord 'A' + b - 10) (succ 'Z')
+ maxl = min (chr $ ord 'a' + b - 10) (succ 'z')
+
+ numeral =
+ anyChar >>= \c ->
+ if
+ | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0')
+ | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10)
+ | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10)
+ | otherwise -> fail
+
+ underscores = many (char '_')
+ in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral))
+ _ -> error "natural: invalid base"
diff --git a/src/Sparsec/Char.hs b/src/Sparsec/Char.hs
@@ -0,0 +1,38 @@
+module Sparsec.Char (
+ charIsUpperLatin,
+ charIsLowerLatin,
+ charIsLatin,
+ charIsUpperGreek,
+ charIsLowerGreek,
+ charIsGreek,
+ charIsSafeUpperGreek,
+ charIsSafeLowerGreek,
+ charIsSafeGreek,
+ charIsNum2,
+ charIsNum4,
+ charIsNum8,
+ charIsNum10,
+ charIsNum16,
+) where
+
+charIsUpperLatin, charIsLowerLatin, charIsLatin :: Char -> Bool
+charIsUpperLatin c = 'A' <= c && c <= 'Z'
+charIsLowerLatin c = 'a' <= c && c <= 'z'
+charIsLatin c = charIsUpperLatin c || charIsLowerLatin c
+
+charIsUpperGreek, charIsLowerGreek, charIsGreek :: Char -> Bool
+charIsUpperGreek c = 'Α' <= c && c <= 'Ω'
+charIsLowerGreek c = 'α' <= c && c <= 'ω'
+charIsGreek c = charIsUpperGreek c || charIsLowerGreek c
+
+charIsSafeUpperGreek, charIsSafeLowerGreek, charIsSafeGreek :: Char -> Bool
+charIsSafeUpperGreek c = c == 'Γ' || c == 'Δ' || c == 'Θ' || c == 'Λ' || c == 'Ξ' || c == 'Π' || c == 'Σ' || c == 'Φ' || c == 'Ψ' || c == 'Ω'
+charIsSafeLowerGreek c = charIsLowerGreek c && c /= 'ο'
+charIsSafeGreek c = charIsSafeUpperGreek c || charIsSafeLowerGreek c
+
+charIsNum2, charIsNum4, charIsNum8, charIsNum10, charIsNum16 :: Char -> Bool
+charIsNum2 c = '0' <= c && c <= '1'
+charIsNum4 c = '0' <= c && c <= '3'
+charIsNum8 c = '0' <= c && c <= '7'
+charIsNum10 c = '0' <= c && c <= '9'
+charIsNum16 c = '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'
diff --git a/src/Sparsec/Loc.hs b/src/Sparsec/Loc.hs
@@ -0,0 +1,62 @@
+module Sparsec.Loc (
+ Loc#,
+ pattern Loc#,
+ locByte#,
+ locRow#,
+ locCol#,
+ locEq#,
+ locShow#,
+ locAdvance#,
+ Loc (..),
+ pattern Loc,
+ locZero,
+) where
+
+import GHC.Exts (Int (..), Int#, (+#), (==#))
+
+intEq# :: Int# -> Int# -> Bool
+intEq# x y = case x ==# y of
+ 0# -> False
+ _ -> True
+
+newtype Loc# = Loc_# (# Int#, Int#, Int# #)
+
+pattern Loc# :: Int# -> Int# -> Int# -> Loc#
+pattern Loc# b r c = Loc_# (# b, r, c #)
+{-# COMPLETE Loc# #-}
+
+locByte#, locRow#, locCol# :: Loc# -> Int#
+locByte# (Loc# b _ _) = b
+locRow# (Loc# _ r _) = r
+locCol# (Loc# _ _ c) = c
+
+locEq# :: Loc# -> Loc# -> Bool
+locEq# (Loc# b0 r0 c0) (Loc# b1 r1 c1) =
+ b0 `intEq#` b1 && r0 `intEq#` r1 && c0 `intEq#` c1
+
+locShow# :: Loc# -> String
+locShow# (Loc# b r c) = show (I# b) ++ "@" ++ show (I# r) ++ ":" ++ show (I# c)
+
+locAdvance# :: Char -> Int -> Loc# -> Loc#
+locAdvance# x (I# w) (Loc# b r c) = case x of
+ '\n' -> Loc# (b +# w) (r +# 1#) 0#
+ _ -> Loc# (b +# w) r (c +# 1#)
+
+-- | Loc is a source code location, tracking byte offset, row (number of '\n'
+-- characters seen so far), and column (number of characters into the current
+-- row). All three of these quantities are 0-based. The column tracking is
+-- somewhat stupid in that it treats each Unicode codepoint as one column.
+data Loc = LocBox {locUnbox :: Loc#}
+
+pattern Loc :: Int# -> Int# -> Int# -> Loc
+pattern Loc b r c = LocBox (Loc# b r c)
+{-# COMPLETE Loc #-}
+
+locZero :: Loc
+locZero = Loc 0# 0# 0#
+
+instance Eq Loc where
+ LocBox l0 == LocBox l1 = l0 `locEq#` l1
+
+instance Show Loc where
+ show (LocBox l) = locShow# l
diff --git a/src/Sparsec/Span.hs b/src/Sparsec/Span.hs
@@ -0,0 +1,43 @@
+module Sparsec.Span (
+ Span#,
+ pattern Span#,
+ spanBeg#,
+ spanEnd#,
+ spanEq#,
+ spanShow#,
+ Span (..),
+ pattern Span,
+) where
+
+import Sparsec.Loc
+
+newtype Span# = Span_# (# Loc#, Loc# #)
+
+pattern Span# :: Loc# -> Loc# -> Span#
+pattern Span# beg end = Span_# (# beg, end #)
+{-# COMPLETE Span# #-}
+
+spanBeg#, spanEnd# :: Span# -> Loc#
+spanBeg# (Span# beg _) = beg
+spanEnd# (Span# _ end) = end
+
+spanEq# :: Span# -> Span# -> Bool
+spanEq# (Span# beg0 end0) (Span# beg1 end1) =
+ beg0 `locEq#` beg1 && end0 `locEq#` end1
+
+spanShow# :: Span# -> String
+spanShow# (Span# beg end) = locShow# beg ++ "--" ++ locShow# end
+
+-- | Span is a pair of Loc's specifying a range in the input. The beginning/left
+-- Loc is inclusive and the ending/right Loc is exclusive.
+data Span = SpanBox {spanUnbox :: Span#}
+
+pattern Span :: Loc# -> Loc# -> Span
+pattern Span beg end = SpanBox (Span# beg end)
+{-# COMPLETE Span #-}
+
+instance Eq Span where
+ SpanBox span0 == SpanBox span1 = span0 `spanEq#` span1
+
+instance Show Span where
+ show (SpanBox span) = spanShow# span