Move Sparsec from System Q - sparsec - simple Haskell parser combinator library

commit 4039f027a606a2a044a862a610eca7e4dc934020
parent b13648356cab32b00ed3defa8983bf8efdf93ea8
Author: Robert Russell <robert@rr3.xyz>
Date:   Tue, 27 Aug 2024 00:00:06 -0700

Move Sparsec from System Q

Diffstat:
A .gitignore  | 3 +++
A Sparsec.hs  | 393 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A package.yaml  | 38 ++++++++++++++++++++++++++++++++++++++
A stack.yaml  | 1 +

4 files changed, 435 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.stack-work
+*.lock
+*.cabal
diff --git a/Sparsec.hs b/Sparsec.hs
@@ -0,0 +1,393 @@
+module Sparsec where
+import Control.Applicative
+import Control.Monad hiding (fail)
+import Control.Monad.Identity
+import Control.Monad.Reader
+import Control.Monad.State
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as ByteString
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.Char
+import Data.Foldable
+import Numeric.Natural
+import Prelude hiding (fail, lex, read)
+import Text.Printf
+
+--------------------------------------------------------------------------------
+-- Rune
+
+data Rune = RuneEof | RuneChar Char deriving Eq
+
+instance Show Rune where
+    show = \case
+        RuneEof    -> "EOF"
+        RuneChar c -> show c
+
+runeCase :: a -> (Char -> a) -> Rune -> a
+runeCase eof chr = \case
+    RuneEof    -> eof
+    RuneChar c -> chr c
+
+runeIsSpace :: Rune -> Bool
+runeIsSpace = runeCase False isSpace
+
+runeIsLetter :: Rune -> Bool
+runeIsLetter = runeCase False isLetter
+
+runeIsDigit :: Rune -> Bool
+runeIsDigit = runeCase False isDigit
+
+--------------------------------------------------------------------------------
+-- Locations
+
+-- Loc is a source code location, tracking byte offset, row (number of '\n'
+-- characters seen so far), and column (number of characters into the current
+-- row). All three of these quantities are 0-based. The column tracking is
+-- somewhat stupid in that it treats each Unicode codepoint as one column.
+data Loc = Loc { byte :: Int, row :: Int, col :: Int } deriving Eq
+
+locZero :: Loc
+locZero = Loc 0 0 0
+
+-- Weird Show instance intended for debugging
+instance Show Loc where
+    show (Loc b r c) = printf "%d:%d#%d" r c b
+
+--------------------------------------------------------------------------------
+-- Spans
+
+-- Span is a pair of Loc's specifying a range in the input. The beginning/left
+-- Loc is inclusive and the ending/right Loc is exclusive.
+data Span = Span Loc Loc deriving Eq
+
+-- Weird Show instance intended for debugging
+instance Show Span where
+    show (Span beg end) = printf "%s--%s" (show beg) (show end)
+
+--------------------------------------------------------------------------------
+-- Result
+
+data Result e a
+    = Fail
+    | Error e
+    | Ok a ByteString Loc
+    deriving Eq
+
+instance (Show e, Show a) => Show (Result e a) where
+    show = \case
+        Fail -> "fail"
+        Error e -> "error: " ++ show e
+        Ok a rest _ ->
+            if ByteString.null rest then
+                printf "ok: %s" (show a)
+            else
+                printf "ok (%d bytes remaining): %s"
+                    (ByteString.length rest) (show a)
+
+instance Functor (Result e) where
+    fmap f = \case
+        Fail -> Fail
+        Error e -> Error e
+        Ok a rest loc -> Ok (f a) rest loc
+
+--------------------------------------------------------------------------------
+-- Parse monad
+
+-- ParseT is a monad transformer for parsing. It effectively has a ByteString
+-- and Loc state effect for the input, and an error effect (of some type e) for
+-- parsing errors. Parsing *errors* are distinct from parsing *failures* in
+-- that only the latter trigger backtracking in the Alternative instance.
+newtype ParseT e m a = ParseT (ByteString -> Loc -> m (Result e a))
+
+runParseT :: Monad m => ParseT e m a -> ByteString -> Loc -> m (Result e a)
+runParseT (ParseT f) = f
+
+mapParseT :: (Monad m, Monad m')
+          => (m (Result e a) -> m' (Result e' a'))
+          -> ParseT e m a
+          -> ParseT e' m' a'
+mapParseT f p = ParseT \input loc -> f $ runParseT p input loc
+
+type Parse e a = ParseT e Identity a
+
+instance Monad m => Functor (ParseT e m) where
+    fmap = liftM
+
+instance Monad m => Applicative (ParseT e m) where
+    pure a = ParseT \input loc -> pure $ Ok a input loc
+
+    (<*>) = ap
+
+instance Monad m => Monad (ParseT e m) where
+    p >>= k = ParseT \input loc ->
+        runParseT p input loc >>= \case
+            Fail             -> pure $ Fail
+            Error e          -> pure $ Error e
+            Ok a input' loc' -> runParseT (k a) input' loc'
+
+instance Monad m => Alternative (ParseT e m) where
+    empty = ParseT \_ _ -> pure Fail
+
+    p <|> q = ParseT \input loc ->
+        runParseT p input loc >>= \case
+            Fail             -> runParseT q input loc
+            Error e          -> pure $ Error e
+            Ok a input' loc' -> pure $ Ok a input' loc'
+
+instance MonadTrans (ParseT e) where
+    lift m = ParseT \input loc -> do
+        a <- m
+        pure $ Ok a input loc
+
+instance MonadReader r m => MonadReader r (ParseT e m) where
+    ask = lift ask
+
+    local f p = ParseT \input loc -> local f $ runParseT p input loc
+
+instance MonadState s m => MonadState s (ParseT e m) where
+    get = lift get
+
+    put = lift . put
+
+-- TODO: Separate a MTL-like MonadParse typeclass with (a subset of) the
+-- following methods?
+
+-- Read the entire input without consuming it.
+read :: Monad m => ParseT e m ByteString
+read = ParseT \input loc -> pure $ Ok input input loc
+
+-- Replace the entire input without affecting the current location.
+write :: Monad m => ByteString -> ParseT e m ()
+write input = ParseT \_ loc -> pure $ Ok () input loc
+
+getLoc :: Monad m => ParseT e m Loc
+getLoc = ParseT \input loc -> pure $ Ok loc input loc
+
+putLoc :: Monad m => Loc -> ParseT e m ()
+putLoc loc = ParseT \input _ -> pure $ Ok () input loc
+
+-- Save parsing state. This is effectively a combination of read and getLoc.
+save :: Monad m => ParseT e m (ByteString, Loc)
+save = ParseT \input loc -> pure $ Ok (input, loc) input loc
+
+-- Load parsing state. This is effectively a combination of write and putLoc.
+load :: Monad m => (ByteString, Loc) -> ParseT e m ()
+load (input, loc) = ParseT \_ _ -> pure $ Ok () input loc
+
+fail :: Monad m => ParseT e m a
+fail = empty
+
+-- Turn a failure into a success and vice versa.
+notp :: Monad m => ParseT e m a -> ParseT e m ()
+notp p = ParseT \input loc ->
+    runParseT p input loc >>= \case
+        Fail     -> pure $ Ok () input loc
+        Error e  -> pure $ Error e
+        Ok _ _ _ -> pure Fail
+
+err :: Monad m => e -> ParseT e m a
+err e = ParseT \_ _ -> pure $ Error e
+
+-- Catch an error.
+catch :: Monad m => ParseT e m a -> (e -> ParseT e m a) -> ParseT e m a
+p `catch` h = ParseT \input loc ->
+    runParseT p input loc >>= \case
+        Fail             -> pure Fail
+        Error e          -> runParseT (h e) input loc
+        Ok a input' loc' -> pure $ Ok a input' loc'
+
+--------------------------------------------------------------------------------
+-- General parse combinators
+
+-- Run a parser without changing the parser state.
+lookahead :: Monad m => ParseT e m a -> ParseT e m a
+lookahead p = do
+    s <- save
+    a <- p
+    load s
+    pure a
+
+spanOf :: Monad m => ParseT e m a -> ParseT e m Span
+spanOf p = Span <$> getLoc <*> lookahead (p *> getLoc)
+
+bytesOf :: Monad m => ParseT e m a -> ParseT e m ByteString
+bytesOf p = do
+    (input, beg) <- save
+    end <- lookahead (p *> getLoc)
+    pure $ ByteString.drop (end.byte - beg.byte) input
+
+-- Convert a failure into an error.
+-- TODO: Why do we call this "cut"?
+cut :: Monad m => ParseT e m a -> e -> ParseT e m a
+p `cut` e = p <|> err e
+
+-- Convert an error into a failure.
+try :: Monad m => ParseT e m a -> ParseT e m a
+try p = p `catch` \_ -> fail
+
+opt :: Monad m => ParseT e m a -> ParseT e m (Maybe a)
+opt p = (Just <$> p) <|> pure Nothing
+
+eof :: Monad m => ParseT e m ()
+eof = (ByteString.null <$> read) >>= guard
+
+iter :: Monad m => (a -> ParseT e m a) -> a -> ParseT e m a
+iter f a = (f a >>= iter f) <|> pure a
+
+someUntil :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+someUntil p end = some (notp end *> p)
+
+manyUntil :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+manyUntil p end = many (notp end *> p)
+
+someSepBy :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+someSepBy p sep = (:) <$> p <*> many (sep *> p)
+
+manySepBy :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a]
+manySepBy p sep = someSepBy p sep <|> pure []
+
+--------------------------------------------------------------------------------
+-- Byte-wise parse combinators
+
+-- TODO
+
+--------------------------------------------------------------------------------
+-- Utf8 parse combinators
+
+-- All these combinators involve UTF-8 decoding and therefore require that
+-- the error type e supports UTF-8 errors.
+
+class Utf8Error e where
+    utf8Error :: Loc -> e
+
+nextRune :: forall e m. (Utf8Error e, Monad m) => ParseT e m Rune
+nextRune =
+    (UTF8.decode <$> read) >>= \case
+        Just (c, w)
+            | c == UTF8.replacement_char -> getLoc >>= (err . utf8Error)
+            | otherwise -> updateState c w *> pure (RuneChar c)
+        Nothing -> pure RuneEof
+    where
+        updateState :: Char -> Int -> ParseT e m ()
+        updateState c w = do
+            (ByteString.drop w <$> read) >>= write
+            loc <- getLoc
+            putLoc case c of
+                '\n' -> loc { byte = loc.byte + w, row = loc.row + 1, col = 0 }
+                _    -> loc { byte = loc.byte + w, col = loc.col + 1 }
+
+nextChar :: (Utf8Error e, Monad m) => ParseT e m Char
+nextChar = nextRune >>= \case
+    RuneEof -> fail
+    RuneChar c -> pure c
+
+oneRuneM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m Rune
+oneRuneM want = do
+    r <- nextRune
+    (lift $ want r) >>= guard
+    pure r
+
+oneRune :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m Rune
+oneRune want = oneRuneM (pure . want)
+
+oneCharM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m Char
+oneCharM want = do
+    c <- nextChar
+    (lift $ want c) >>= guard
+    pure c
+
+oneChar :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m Char
+oneChar want = oneCharM (pure . want)
+
+manyRuneM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m ByteString
+manyRuneM want =
+    fst <$> lexM () \((), r) ->
+        want r >>= \case
+            False -> pure $ LexRejectEmit ()
+            True  -> pure $ LexAcceptCont ()
+
+manyRune :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m ByteString
+manyRune want = manyRuneM (pure . want)
+
+manyCharM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m ByteString
+manyCharM want = manyRuneM \case
+    RuneEof -> pure False
+    RuneChar c -> want c
+
+manyChar :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m ByteString
+manyChar want = manyCharM (pure . want)
+
+rune :: (Utf8Error e, Monad m) => Rune -> ParseT e m ()
+rune r = oneRune (== r) *> pure ()
+
+char :: (Utf8Error e, Monad m) => Char -> ParseT e m ()
+char c = oneChar (== c) *> pure ()
+
+string :: (Utf8Error e, Monad m) => String -> ParseT e m ()
+string = traverse_ char
+
+natural :: (Utf8Error e, Monad m) => Int -> ParseT e m Natural
+natural = \case
+     0 ->
+        let prefix l u = char '0' *> (char l <|> char u) in
+        prefix 'b' 'B' *> natural 2
+            <|> prefix 'q' 'Q' *> natural 4
+            <|> prefix 'o' 'O' *> natural 8
+            <|> prefix 'x' 'X' *> natural 16
+            <|> natural 10
+
+     b | 0 < b && b <= 36 ->
+        let nat :: Int -> Natural
+            nat = fromIntegral
+
+            maxn = min (chr $ ord '0' + b) (succ '9')
+            maxu = min (chr $ ord 'A' + b - 10) (succ 'Z')
+            maxl = min (chr $ ord 'a' + b - 10) (succ 'z')
+
+            numeral = nextChar >>= \c -> if
+                | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0')
+                | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10)
+                | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10)
+                | otherwise            -> fail
+
+            underscores = many (char '_') in
+
+        numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral))
+
+     _ -> error "natural: invalid base"
+
+data LexAction e t s
+    = LexFail
+    | LexError e
+    | LexRejectEmit t -- Reject rune and emit token.
+    | LexAcceptCont s -- Accept rune and continue lexing in the given state.
+
+-- TODO: Delete this.
+-- lexM (and it's non-monadic counterpart lex) lexes a token (of some type t)
+-- in the manner of an automaton. That is, given a start state (of some type
+-- s) and a transition function that maps (s, Rune) pairs to "actions", lexM
+-- tracks the current state and repeatedly calls the transition function with
+-- runes from the input until it returns an action indicating it should stop.
+lexM :: forall m e t s. (Utf8Error e, Monad m)
+     => s
+     -> ((s, Rune) -> m (LexAction e t s))
+     -> ParseT e m (ByteString, t)
+lexM startState trans = do
+    (input, beg) <- save
+    let gobble :: s -> ParseT e m t
+        gobble s = do
+            r <- lookahead nextRune
+            (lift $ trans (s, r)) >>= \case
+                LexFail         -> fail
+                LexError e      -> err e
+                LexRejectEmit t -> pure t
+                LexAcceptCont s -> nextRune *> gobble s
+    t <- gobble startState
+    end <- getLoc
+    pure (ByteString.take (end.byte - beg.byte) input, t)
+
+lex :: (Utf8Error e, Monad m)
+    => s
+    -> ((s, Rune) -> LexAction e t s)
+    -> ParseT e m (ByteString, t)
+lex startState trans = lexM startState (pure . trans)
diff --git a/package.yaml b/package.yaml
@@ -0,0 +1,38 @@
+name: sparsec
+version: 0.1.0.0
+author: "Robert Russell"
+license: ISC
+
+ghc-options:
+  - -Wall
+  - -Wno-name-shadowing
+
+default-extensions:
+  - DuplicateRecordFields
+  - NoFieldSelectors
+  - OverloadedRecordDot
+
+  - BlockArguments
+  - LambdaCase
+  - MultiWayIf
+
+  - ScopedTypeVariables
+
+  - FlexibleInstances
+  - MultiParamTypeClasses
+  - UndecidableInstances
+
+dependencies:
+  - base >= 4.18 && < 5
+
+library:
+  source-dirs: .
+  dependencies:
+    - mtl
+    - bytestring
+    - utf8-string
+
+executables:
+  example:
+    source-dirs: example
+    main: Main.hs
diff --git a/stack.yaml b/stack.yaml
@@ -0,0 +1 @@
+resolver: lts-22.23

	sparsec simple Haskell parser combinator library
	git clone git://git.rr3.xyz/sparsec
	Log \| Files \| Refs \| README \| LICENSE

A	.gitignore	\|	3	+++
A	Sparsec.hs	\|	393	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	package.yaml	\|	38	++++++++++++++++++++++++++++++++++++++
A	stack.yaml	\|	1	+