commit 4039f027a606a2a044a862a610eca7e4dc934020
parent b13648356cab32b00ed3defa8983bf8efdf93ea8
Author: Robert Russell <robert@rr3.xyz>
Date: Tue, 27 Aug 2024 00:00:06 -0700
Move Sparsec from System Q
Diffstat:
4 files changed, 435 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.stack-work
+*.lock
+*.cabal
diff --git a/Sparsec.hs b/Sparsec.hs
@@ -0,0 +1,393 @@
+module Sparsec where
+import Control.Applicative
+import Control.Monad hiding (fail)
+import Control.Monad.Identity
+import Control.Monad.Reader
+import Control.Monad.State
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as ByteString
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.Char
+import Data.Foldable
+import Numeric.Natural
+import Prelude hiding (fail, lex, read)
+import Text.Printf
+
+--------------------------------------------------------------------------------
+-- Rune
+
+data Rune = RuneEof | RuneChar Char deriving Eq
+
+instance Show Rune where
+ show = \case
+ RuneEof -> "EOF"
+ RuneChar c -> show c
+
+runeCase :: a -> (Char -> a) -> Rune -> a
+runeCase eof chr = \case
+ RuneEof -> eof
+ RuneChar c -> chr c
+
+runeIsSpace :: Rune -> Bool
+runeIsSpace = runeCase False isSpace
+
+runeIsLetter :: Rune -> Bool
+runeIsLetter = runeCase False isLetter
+
+runeIsDigit :: Rune -> Bool
+runeIsDigit = runeCase False isDigit
+
+--------------------------------------------------------------------------------
+-- Locations
+
+-- Loc is a source code location, tracking byte offset, row (number of '\n'
+-- characters seen so far), and column (number of characters into the current
+-- row). All three of these quantities are 0-based. The column tracking is
+-- somewhat stupid in that it treats each Unicode codepoint as one column.
+data Loc = Loc { byte :: Int, row :: Int, col :: Int } deriving Eq
+
+locZero :: Loc
+locZero = Loc 0 0 0
+
+-- Weird Show instance intended for debugging
+instance Show Loc where
+ show (Loc b r c) = printf "%d:%d#%d" r c b
+
+--------------------------------------------------------------------------------
+-- Spans
+
+-- Span is a pair of Loc's specifying a range in the input. The beginning/left
+-- Loc is inclusive and the ending/right Loc is exclusive.
+data Span = Span Loc Loc deriving Eq
+
+-- Weird Show instance intended for debugging
+instance Show Span where
+ show (Span beg end) = printf "%s--%s" (show beg) (show end)
+
+--------------------------------------------------------------------------------
+-- Result
+
+data Result e a
+ = Fail
+ | Error e
+ | Ok a ByteString Loc
+ deriving Eq
+
+instance (Show e, Show a) => Show (Result e a) where
+ show = \case
+ Fail -> "fail"
+ Error e -> "error: " ++ show e
+ Ok a rest _ ->
+ if ByteString.null rest then
+ printf "ok: %s" (show a)
+ else
+ printf "ok (%d bytes remaining): %s"
+ (ByteString.length rest) (show a)
+
+instance Functor (Result e) where
+ fmap f = \case
+ Fail -> Fail
+ Error e -> Error e
+ Ok a rest loc -> Ok (f a) rest loc
+
+--------------------------------------------------------------------------------
+-- Parse monad
+
+-- ParseT is a monad transformer for parsing. It effectively has a ByteString
+-- and Loc state effect for the input, and an error effect (of some type e) for
+-- parsing errors. Parsing *errors* are distinct from parsing *failures* in
+-- that only the latter trigger backtracking in the Alternative instance.
+newtype ParseT e m a = ParseT (ByteString -> Loc -> m (Result e a))
+
+runParseT :: Monad m => ParseT e m a -> ByteString -> Loc -> m (Result e a)
+runParseT (ParseT f) = f
+
+mapParseT :: (Monad m, Monad m')
+ => (m (Result e a) -> m' (Result e' a'))
+ -> ParseT e m a
+ -> ParseT e' m' a'
+mapParseT f p = ParseT \input loc -> f $ runParseT p input loc
+
+type Parse e a = ParseT e Identity a
+
+instance Monad m => Functor (ParseT e m) where
+ fmap = liftM
+
+instance Monad m => Applicative (ParseT e m) where
+ pure a = ParseT \input loc -> pure $ Ok a input loc
+
+ (<*>) = ap
+
+instance Monad m => Monad (ParseT e m) where
+ p >>= k = ParseT \input loc ->
+ runParseT p input loc >>= \case
+ Fail -> pure $ Fail
+ Error e -> pure $ Error e
+ Ok a input' loc' -> runParseT (k a) input' loc'
+
+instance Monad m => Alternative (ParseT e m) where
+ empty = ParseT \_ _ -> pure Fail
+
+ p <|> q = ParseT \input loc ->
+ runParseT p input loc >>= \case
+ Fail -> runParseT q input loc
+ Error e -> pure $ Error e
+ Ok a input' loc' -> pure $ Ok a input' loc'
+
+instance MonadTrans (ParseT e) where
+ lift m = ParseT \input loc -> do
+ a <- m
+ pure $ Ok a input loc
+
+instance MonadReader r m => MonadReader r (ParseT e m) where
+ ask = lift ask
+
+ local f p = ParseT \input loc -> local f $ runParseT p input loc
+
+instance MonadState s m => MonadState s (ParseT e m) where
+ get = lift get
+
+ put = lift . put
+
+-- TODO: Separate a MTL-like MonadParse typeclass with (a subset of) the
+-- following methods?
+
+-- Read the entire input without consuming it.
+read :: Monad m => ParseT e m ByteString
+read = ParseT \input loc -> pure $ Ok input input loc
+
+-- Replace the entire input without affecting the current location.
+write :: Monad m => ByteString -> ParseT e m ()
+write input = ParseT \_ loc -> pure $ Ok () input loc
+
+getLoc :: Monad m => ParseT e m Loc
+getLoc = ParseT \input loc -> pure $ Ok loc input loc
+
+putLoc :: Monad m => Loc -> ParseT e m ()
+putLoc loc = ParseT \input _ -> pure $ Ok () input loc
+
+-- Save parsing state. This is effectively a combination of read and getLoc.
+save :: Monad m => ParseT e m (ByteString, Loc)
+save = ParseT \input loc -> pure $ Ok (input, loc) input loc
+
+-- Load parsing state. This is effectively a combination of write and putLoc.
+load :: Monad m => (ByteString, Loc) -> ParseT e m ()
+load (input, loc) = ParseT \_ _ -> pure $ Ok () input loc
+
+fail :: Monad m => ParseT e m a
+fail = empty
+
+-- Turn a failure into a success and vice versa.
+notp :: Monad m => ParseT e m a -> ParseT e m ()
+notp p = ParseT \input loc ->
+ runParseT p input loc >>= \case
+ Fail -> pure $ Ok () input loc
+ Error e -> pure $ Error e
+ Ok _ _ _ -> pure Fail
+
+err :: Monad m => e -> ParseT e m a
+err e = ParseT \_ _ -> pure $ Error e
+
+-- Catch an error.
+catch :: Monad m => ParseT e m a -> (e -> ParseT e m a) -> ParseT e m a
+p `catch` h = ParseT \input loc ->
+ runParseT p input loc >>= \case
+ Fail -> pure Fail
+ Error e -> runParseT (h e) input loc
+ Ok a input' loc' -> pure $ Ok a input' loc'
+
+--------------------------------------------------------------------------------
+-- General parse combinators
+
+-- Run a parser without changing the parser state.
+lookahead :: Monad m => ParseT e m a -> ParseT e m a
+lookahead p = do
+ s <- save
+ a <- p
+ load s
+ pure a
+
+spanOf :: Monad m => ParseT e m a -> ParseT e m Span
+spanOf p = Span <$> getLoc <*> lookahead (p *> getLoc)
+
+bytesOf :: Monad m => ParseT e m a -> ParseT e m ByteString
+bytesOf p = do
+ (input, beg) <- save
+ end <- lookahead (p *> getLoc)
+ pure $ ByteString.drop (end.byte - beg.byte) input
+
+-- Convert a failure into an error.
+-- TODO: Why do we call this "cut"?
+cut :: Monad m => ParseT e m a -> e -> ParseT e m a
+p `cut` e = p <|> err e
+
+-- Convert an error into a failure.
+try :: Monad m => ParseT e m a -> ParseT e m a
+try p = p `catch` \_ -> fail
+
+opt :: Monad m => ParseT e m a -> ParseT e m (Maybe a)
+opt p = (Just <$> p) <|> pure Nothing
+
+eof :: Monad m => ParseT e m ()
+eof = (ByteString.null <$> read) >>= guard
+
+iter :: Monad m => (a -> ParseT e m a) -> a -> ParseT e m a
+iter f a = (f a >>= iter f) <|> pure a
+
+someUntil :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+someUntil p end = some (notp end *> p)
+
+manyUntil :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+manyUntil p end = many (notp end *> p)
+
+someSepBy :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+someSepBy p sep = (:) <$> p <*> many (sep *> p)
+
+manySepBy :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+manySepBy p sep = someSepBy p sep <|> pure []
+
+--------------------------------------------------------------------------------
+-- Byte-wise parse combinators
+
+-- TODO
+
+--------------------------------------------------------------------------------
+-- Utf8 parse combinators
+
+-- All these combinators involve UTF-8 decoding and therefore require that
+-- the error type e supports UTF-8 errors.
+
+class Utf8Error e where
+ utf8Error :: Loc -> e
+
+nextRune :: forall e m. (Utf8Error e, Monad m) => ParseT e m Rune
+nextRune =
+ (UTF8.decode <$> read) >>= \case
+ Just (c, w)
+ | c == UTF8.replacement_char -> getLoc >>= (err . utf8Error)
+ | otherwise -> updateState c w *> pure (RuneChar c)
+ Nothing -> pure RuneEof
+ where
+ updateState :: Char -> Int -> ParseT e m ()
+ updateState c w = do
+ (ByteString.drop w <$> read) >>= write
+ loc <- getLoc
+ putLoc case c of
+ '\n' -> loc { byte = loc.byte + w, row = loc.row + 1, col = 0 }
+ _ -> loc { byte = loc.byte + w, col = loc.col + 1 }
+
+nextChar :: (Utf8Error e, Monad m) => ParseT e m Char
+nextChar = nextRune >>= \case
+ RuneEof -> fail
+ RuneChar c -> pure c
+
+oneRuneM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m Rune
+oneRuneM want = do
+ r <- nextRune
+ (lift $ want r) >>= guard
+ pure r
+
+oneRune :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m Rune
+oneRune want = oneRuneM (pure . want)
+
+oneCharM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m Char
+oneCharM want = do
+ c <- nextChar
+ (lift $ want c) >>= guard
+ pure c
+
+oneChar :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m Char
+oneChar want = oneCharM (pure . want)
+
+manyRuneM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m ByteString
+manyRuneM want =
+ fst <$> lexM () \((), r) ->
+ want r >>= \case
+ False -> pure $ LexRejectEmit ()
+ True -> pure $ LexAcceptCont ()
+
+manyRune :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m ByteString
+manyRune want = manyRuneM (pure . want)
+
+manyCharM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m ByteString
+manyCharM want = manyRuneM \case
+ RuneEof -> pure False
+ RuneChar c -> want c
+
+manyChar :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m ByteString
+manyChar want = manyCharM (pure . want)
+
+rune :: (Utf8Error e, Monad m) => Rune -> ParseT e m ()
+rune r = oneRune (== r) *> pure ()
+
+char :: (Utf8Error e, Monad m) => Char -> ParseT e m ()
+char c = oneChar (== c) *> pure ()
+
+string :: (Utf8Error e, Monad m) => String -> ParseT e m ()
+string = traverse_ char
+
+natural :: (Utf8Error e, Monad m) => Int -> ParseT e m Natural
+natural = \case
+ 0 ->
+ let prefix l u = char '0' *> (char l <|> char u) in
+ prefix 'b' 'B' *> natural 2
+ <|> prefix 'q' 'Q' *> natural 4
+ <|> prefix 'o' 'O' *> natural 8
+ <|> prefix 'x' 'X' *> natural 16
+ <|> natural 10
+
+ b | 0 < b && b <= 36 ->
+ let nat :: Int -> Natural
+ nat = fromIntegral
+
+ maxn = min (chr $ ord '0' + b) (succ '9')
+ maxu = min (chr $ ord 'A' + b - 10) (succ 'Z')
+ maxl = min (chr $ ord 'a' + b - 10) (succ 'z')
+
+ numeral = nextChar >>= \c -> if
+ | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0')
+ | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10)
+ | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10)
+ | otherwise -> fail
+
+ underscores = many (char '_') in
+
+ numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral))
+
+ _ -> error "natural: invalid base"
+
+data LexAction e t s
+ = LexFail
+ | LexError e
+ | LexRejectEmit t -- Reject rune and emit token.
+ | LexAcceptCont s -- Accept rune and continue lexing in the given state.
+
+-- TODO: Delete this.
+-- lexM (and it's non-monadic counterpart lex) lexes a token (of some type t)
+-- in the manner of an automaton. That is, given a start state (of some type
+-- s) and a transition function that maps (s, Rune) pairs to "actions", lexM
+-- tracks the current state and repeatedly calls the transition function with
+-- runes from the input until it returns an action indicating it should stop.
+lexM :: forall m e t s. (Utf8Error e, Monad m)
+ => s
+ -> ((s, Rune) -> m (LexAction e t s))
+ -> ParseT e m (ByteString, t)
+lexM startState trans = do
+ (input, beg) <- save
+ let gobble :: s -> ParseT e m t
+ gobble s = do
+ r <- lookahead nextRune
+ (lift $ trans (s, r)) >>= \case
+ LexFail -> fail
+ LexError e -> err e
+ LexRejectEmit t -> pure t
+ LexAcceptCont s -> nextRune *> gobble s
+ t <- gobble startState
+ end <- getLoc
+ pure (ByteString.take (end.byte - beg.byte) input, t)
+
+lex :: (Utf8Error e, Monad m)
+ => s
+ -> ((s, Rune) -> LexAction e t s)
+ -> ParseT e m (ByteString, t)
+lex startState trans = lexM startState (pure . trans)
diff --git a/package.yaml b/package.yaml
@@ -0,0 +1,38 @@
+name: sparsec
+version: 0.1.0.0
+author: "Robert Russell"
+license: ISC
+
+ghc-options:
+ - -Wall
+ - -Wno-name-shadowing
+
+default-extensions:
+ - DuplicateRecordFields
+ - NoFieldSelectors
+ - OverloadedRecordDot
+
+ - BlockArguments
+ - LambdaCase
+ - MultiWayIf
+
+ - ScopedTypeVariables
+
+ - FlexibleInstances
+ - MultiParamTypeClasses
+ - UndecidableInstances
+
+dependencies:
+ - base >= 4.18 && < 5
+
+library:
+ source-dirs: .
+ dependencies:
+ - mtl
+ - bytestring
+ - utf8-string
+
+executables:
+ example:
+ source-dirs: example
+ main: Main.hs
diff --git a/stack.yaml b/stack.yaml
@@ -0,0 +1 @@
+resolver: lts-22.23