sparsec

simple Haskell parser combinator library
git clone git://git.rr3.xyz/sparsec
Log | Files | Refs | README | LICENSE

commit d4de0fa6982ad9b4be8024e371e1b25dcb99275a
parent 0ef3f55b8e3aaf39074c713feedc96a04b48a43c
Author: Robert Russell <robert@rr3.xyz>
Date:   Sun, 14 Dec 2025 13:56:53 -0800

Unbox parser state

Diffstat:
MSparsec.hs | 255++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
Mpackage.yaml | 4++++
2 files changed, 191 insertions(+), 68 deletions(-)

diff --git a/Sparsec.hs b/Sparsec.hs @@ -32,31 +32,59 @@ module Sparsec ( runeIsNum10, runeIsNum16, -- Loc + Loc#, + pattern Loc#, + locByte#, + locRow#, + locCol#, + locEq#, Loc (..), locZero, + locBox, + locUnbox, -- Span + Span#, + pattern Span#, + spanBeg#, + spanEnd#, + spanEq#, Span (..), + spanBox, + spanUnbox, + -- State + State#, + pattern State#, + stateInput#, + stateLoc#, + stateEq#, + State (..), + stateBox, + stateUnbox, -- Result - Result (..), + Result# (..), -- Parse monad ParseT (..), + runParseT#, runParseT, mapParseT, Parse, + runParse#, runParse, mapParse, -- Core combinators + fail, + err, + succeed, + continue, read, write, getLoc, putLoc, save, load, - fail, + -- Derived combinators not, - err, catch, - -- Derived combinators eof, lookahead, spanOf, @@ -92,20 +120,30 @@ module Sparsec ( import Control.Applicative import Control.Monad hiding (fail) -import Control.Monad.Identity -import Control.Monad.Reader -import Control.Monad.State +import Control.Monad.Identity (Identity (..)) +import Control.Monad.Reader (MonadReader (..)) +import Control.Monad.State (MonadState (..)) +import Control.Monad.Trans (MonadTrans (..)) import Data.ByteString (ByteString) import Data.ByteString qualified as BS import Data.ByteString.UTF8 qualified as UTF8 import Data.Char import Data.Foldable import Data.Functor +import GHC.Exts (Int (..), Int#, (==#)) import Numeric.Natural import Text.Printf import Prelude hiding (fail, not, read) -------------------------------------------------------------------------------- +-- Util + +intEq# :: Int# -> Int# -> Bool +intEq# x y = case x ==# y of + 0# -> False + _ -> True + +-------------------------------------------------------------------------------- -- Char charIsUpperLatin, charIsLowerLatin, charIsLatin :: Char -> Bool @@ -171,6 +209,20 @@ runeIsNum16 = runeElim False charIsNum16 -------------------------------------------------------------------------------- -- Loc +newtype Loc# = Loc_# (# Int#, Int#, Int# #) +pattern Loc# :: Int# -> Int# -> Int# -> Loc# +pattern Loc# b r c = Loc_# (# b, r, c #) +{-# COMPLETE Loc# #-} + +locByte#, locRow#, locCol# :: Loc# -> Int# +locByte# (Loc# b _ _) = b +locRow# (Loc# _ r _) = r +locCol# (Loc# _ _ c) = c + +locEq# :: Loc# -> Loc# -> Bool +locEq# (Loc# b0 r0 c0) (Loc# b1 r1 c1) = + intEq# b0 b1 && intEq# r0 r1 && intEq# c0 c1 + -- | Loc is a source code location, tracking byte offset, row (number of '\n' -- characters seen so far), and column (number of characters into the current -- row). All three of these quantities are 0-based. The column tracking is @@ -184,88 +236,177 @@ locZero = Loc 0 0 0 instance Show Loc where show (Loc b r c) = printf "%d:%d#%d" r c b +locBox :: Loc# -> Loc +locBox (Loc# b r c) = Loc (I# b) (I# r) (I# c) + +locUnbox :: Loc -> Loc# +locUnbox (Loc (I# b) (I# r) (I# c)) = Loc# b r c + -------------------------------------------------------------------------------- -- Span +newtype Span# = Span_# (# Loc#, Loc# #) +pattern Span# :: Loc# -> Loc# -> Span# +pattern Span# beg end = Span_# (# beg, end #) +{-# COMPLETE Span# #-} + +spanBeg#, spanEnd# :: Span# -> Loc# +spanBeg# (Span# beg _) = beg +spanEnd# (Span# _ end) = end + +spanEq# :: Span# -> Span# -> Bool +spanEq# (Span# beg0 end0) (Span# beg1 end1) = + locEq# beg0 beg1 && locEq# end0 end1 + -- | Span is a pair of Loc's specifying a range in the input. The beginning/left -- Loc is inclusive and the ending/right Loc is exclusive. -data Span = Span Loc Loc deriving (Eq) +data Span = Span {spanBeg :: Loc, spanEnd :: Loc} deriving (Eq) -- Weird Show instance intended for debugging instance Show Span where show (Span beg end) = printf "%s--%s" (show beg) (show end) +spanBox :: Span# -> Span +spanBox (Span# beg end) = Span (locBox beg) (locBox end) + +spanUnbox :: Span -> Span# +spanUnbox (Span beg end) = Span# (locUnbox beg) (locUnbox end) + +-------------------------------------------------------------------------------- +-- State + +newtype State# = State_# (# ByteString, Loc# #) +pattern State# :: ByteString -> Loc# -> State# +pattern State# i l = State_# (# i, l #) +{-# COMPLETE State# #-} + +stateInput# :: State# -> ByteString +stateInput# (State# i _) = i + +stateLoc# :: State# -> Loc# +stateLoc# (State# _ l) = l + +stateEq# :: State# -> State# -> Bool +stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && locEq# l0 l1 + +data State = State {stateInput :: ByteString, stateLoc :: Loc} deriving (Eq) + +stateBox :: State# -> State +stateBox (State# i l) = State i (locBox l) + +stateUnbox :: State -> State# +stateUnbox (State i l) = State# i (locUnbox l) + -------------------------------------------------------------------------------- -- Result -data Result e a - = Failure - | Error e - | Success a ByteString Loc - deriving (Eq) +-- TODO: Result (no hash) + +data Result# e a = Failure# | Error# e | Success# a State# -instance (Show e, Show a) => Show (Result e a) where +instance (Eq e, Eq a) => Eq (Result# e a) where + Failure# == Failure# = True + Error# e0 == Error# e1 = e0 == e1 + Success# a0 s0 == Success# a1 s1 = a0 == a1 && stateEq# s0 s1 + _ == _ = False + +instance (Show e, Show a) => Show (Result# e a) where show = \case - Failure -> "failure" - Error e -> "error: " ++ show e - Success a i _ -> + Failure# -> "failure" + Error# e -> "error: " ++ show e + Success# a (State# i _) -> if BS.null i then - printf "ok: %s" (show a) + printf "success: %s" (show a) else printf - "ok (%d bytes remaining): %s" + "success (%d bytes remaining): %s" (BS.length i) (show a) -instance Functor (Result e) where +instance Functor (Result# e) where fmap f = \case - Failure -> Failure - Error e -> Error e - Success a i l -> Success (f a) i l + Failure# -> Failure# + Error# e -> Error# e + Success# a s -> Success# (f a) s -------------------------------------------------------------------------------- --- Parse monad and core combinators +-- ParseT -- | ParseT is a monad transformer for parsing. It effectively has a ByteString -- and Loc state effect for the input, and an error effect (of some type e) for -- parsing errors. Parsing *errors* are distinct from parsing *failures* in -- that only the latter trigger backtracking in the Alternative instance. -newtype ParseT e m a = ParseT (ByteString -> Loc -> m (Result e a)) +newtype ParseT e m a = ParseT (State# -> m (Result# e a)) + +runParseT# :: (Monad m) => ParseT e m a -> State# -> m (Result# e a) +runParseT# (ParseT f) = f -runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result e a) -runParseT (ParseT f) = f +runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result# e a) +runParseT p i l = p `runParseT#` State# i (locUnbox l) mapParseT :: (Monad m, Monad m') => - (m (Result e a) -> m' (Result e' a')) -> + (m (Result# e a) -> m' (Result# e' a')) -> ParseT e m a -> ParseT e' m' a' -mapParseT f p = ParseT \i l -> f $ runParseT p i l +mapParseT f p = ParseT \s -> f $ p `runParseT#` s + +-------------------------------------------------------------------------------- +-- Parse type Parse e a = ParseT e Identity a -runParse :: Parse e a -> ByteString -> Loc -> Result e a -runParse (ParseT f) i l = runIdentity $ f i l +runParse# :: Parse e a -> State# -> Result# e a +runParse# p s = runIdentity $ p `runParseT#` s -mapParse :: (Result e a -> Result e' a') -> Parse e a -> Parse e' a' +runParse :: Parse e a -> ByteString -> Loc -> Result# e a +runParse p i l = p `runParse#` State# i (locUnbox l) + +mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a' mapParse f = mapParseT (Identity . f . runIdentity) +-------------------------------------------------------------------------------- +-- Core combinators + fail :: (Monad m) => ParseT e m a -fail = ParseT \_ _ -> pure Failure +fail = ParseT \_ -> pure Failure# err :: (Monad m) => e -> ParseT e m a -err e = ParseT \_ _ -> pure $ Error e +err e = ParseT \_ -> pure $ Error# e succeed :: (Monad m) => a -> ParseT e m a -succeed a = ParseT \i l -> pure $ Success a i l +succeed a = ParseT \s -> pure $ Success# a s continue :: (Monad m) => ParseT e' m a' -> (e -> ParseT e' m a') -> (a -> ParseT e' m a') -> ParseT e m a -> ParseT e' m a' -continue kf ke ks p = ParseT \i l -> - runParseT p i l >>= \case - Failure -> runParseT kf i l - Error e -> runParseT (ke e) i l - Success a i' l' -> runParseT (ks a) i' l' +continue kf ke ks p = ParseT \s -> + p `runParseT#` s >>= \case + Failure# -> kf `runParseT#` s + Error# e -> ke e `runParseT#` s + Success# a s' -> ks a `runParseT#` s' + +-- | Read the entire input without consuming it. +read :: (Monad m) => ParseT e m ByteString +read = ParseT \s -> pure $ Success# (stateInput# s) s + +-- | Replace the entire input without affecting the current location. +write :: (Monad m) => ByteString -> ParseT e m () +write i = ParseT \(State# _ l) -> pure $ Success# () (State# i l) + +getLoc :: (Monad m) => ParseT e m Loc +getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s + +putLoc :: (Monad m) => Loc -> ParseT e m () +putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l)) + +save :: (Monad m) => ParseT e m State +save = ParseT \s -> pure $ Success# (stateBox s) s + +load :: (Monad m) => State -> ParseT e m () +load s = ParseT \_ -> pure $ Success# () (stateUnbox s) + +-------------------------------------------------------------------------------- +-- Instances instance (Monad m) => Functor (ParseT e m) where fmap = liftM @@ -282,37 +423,18 @@ instance (Monad m) => Alternative (ParseT e m) where p <|> q = continue q err succeed p instance MonadTrans (ParseT e) where - lift m = ParseT \i l -> (\a -> Success a i l) <$> m + lift m = ParseT \s -> (`Success#` s) <$> m instance (MonadReader r m) => MonadReader r (ParseT e m) where ask = lift ask - local f p = ParseT \i l -> local f $ runParseT p i l + local f p = ParseT \s -> local f $ p `runParseT#` s instance (MonadState s m) => MonadState s (ParseT e m) where get = lift get put = lift . put --- | Read the entire input without consuming it. -read :: (Monad m) => ParseT e m ByteString -read = ParseT \i l -> pure $ Success i i l - --- | Replace the entire input without affecting the current location. -write :: (Monad m) => ByteString -> ParseT e m () -write i = ParseT \_ l -> pure $ Success () i l - -getLoc :: (Monad m) => ParseT e m Loc -getLoc = ParseT \i l -> pure $ Success l i l - -putLoc :: (Monad m) => Loc -> ParseT e m () -putLoc l = ParseT \i _ -> pure $ Success () i l - --- | Save parsing state. This is effectively a combination of read and getLoc. -save :: (Monad m) => ParseT e m (ByteString, Loc) -save = ParseT \i l -> pure $ Success (i, l) i l - --- | Load parsing state. This is effectively a combination of write and putLoc. -load :: (Monad m) => (ByteString, Loc) -> ParseT e m () -load (i, l) = ParseT \_ _ -> pure $ Success () i l +-------------------------------------------------------------------------------- +-- General combinators -- | Turn a failure into a success and vice versa. not :: (Monad m) => ParseT e m a -> ParseT e m () @@ -322,9 +444,6 @@ not = continue (succeed ()) err (const fail) catch :: (Monad m) => ParseT e m a -> (e -> ParseT e' m a) -> ParseT e' m a p `catch` h = continue fail h succeed p --------------------------------------------------------------------------------- --- General combinators - eof :: (Monad m) => ParseT e m () eof = guard . BS.null =<< read @@ -345,10 +464,10 @@ spanOf p = do bytesOf :: (Monad m) => ParseT e m a -> ParseT e m (a, ByteString) bytesOf p = do - (input, beg) <- save + State i beg <- save a <- p end <- getLoc - pure (a, BS.take (locByte end - locByte beg) input) + pure (a, BS.take (locByte end - locByte beg) i) -- | Convert a failure into an error. -- diff --git a/package.yaml b/package.yaml @@ -13,9 +13,13 @@ language: GHC2024 default-extensions: - BlockArguments + - MagicHash - MultiWayIf - OverloadedStrings + - PatternSynonyms + - UnboxedTuples - UndecidableInstances + - UnliftedNewtypes dependencies: - base >= 4.20 && < 5