sparsec

simple Haskell parser combinator library
git clone git://git.rr3.xyz/sparsec
Log | Files | Refs | README | LICENSE

commit 4039f027a606a2a044a862a610eca7e4dc934020
parent b13648356cab32b00ed3defa8983bf8efdf93ea8
Author: Robert Russell <robert@rr3.xyz>
Date:   Tue, 27 Aug 2024 00:00:06 -0700

Move Sparsec from System Q

Diffstat:
A.gitignore | 3+++
ASparsec.hs | 393+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apackage.yaml | 38++++++++++++++++++++++++++++++++++++++
Astack.yaml | 1+
4 files changed, 435 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -0,0 +1,3 @@ +.stack-work +*.lock +*.cabal diff --git a/Sparsec.hs b/Sparsec.hs @@ -0,0 +1,393 @@ +module Sparsec where +import Control.Applicative +import Control.Monad hiding (fail) +import Control.Monad.Identity +import Control.Monad.Reader +import Control.Monad.State +import Data.ByteString (ByteString) +import qualified Data.ByteString as ByteString +import qualified Data.ByteString.UTF8 as UTF8 +import Data.Char +import Data.Foldable +import Numeric.Natural +import Prelude hiding (fail, lex, read) +import Text.Printf + +-------------------------------------------------------------------------------- +-- Rune + +data Rune = RuneEof | RuneChar Char deriving Eq + +instance Show Rune where + show = \case + RuneEof -> "EOF" + RuneChar c -> show c + +runeCase :: a -> (Char -> a) -> Rune -> a +runeCase eof chr = \case + RuneEof -> eof + RuneChar c -> chr c + +runeIsSpace :: Rune -> Bool +runeIsSpace = runeCase False isSpace + +runeIsLetter :: Rune -> Bool +runeIsLetter = runeCase False isLetter + +runeIsDigit :: Rune -> Bool +runeIsDigit = runeCase False isDigit + +-------------------------------------------------------------------------------- +-- Locations + +-- Loc is a source code location, tracking byte offset, row (number of '\n' +-- characters seen so far), and column (number of characters into the current +-- row). All three of these quantities are 0-based. The column tracking is +-- somewhat stupid in that it treats each Unicode codepoint as one column. +data Loc = Loc { byte :: Int, row :: Int, col :: Int } deriving Eq + +locZero :: Loc +locZero = Loc 0 0 0 + +-- Weird Show instance intended for debugging +instance Show Loc where + show (Loc b r c) = printf "%d:%d#%d" r c b + +-------------------------------------------------------------------------------- +-- Spans + +-- Span is a pair of Loc's specifying a range in the input. The beginning/left +-- Loc is inclusive and the ending/right Loc is exclusive. +data Span = Span Loc Loc deriving Eq + +-- Weird Show instance intended for debugging +instance Show Span where + show (Span beg end) = printf "%s--%s" (show beg) (show end) + +-------------------------------------------------------------------------------- +-- Result + +data Result e a + = Fail + | Error e + | Ok a ByteString Loc + deriving Eq + +instance (Show e, Show a) => Show (Result e a) where + show = \case + Fail -> "fail" + Error e -> "error: " ++ show e + Ok a rest _ -> + if ByteString.null rest then + printf "ok: %s" (show a) + else + printf "ok (%d bytes remaining): %s" + (ByteString.length rest) (show a) + +instance Functor (Result e) where + fmap f = \case + Fail -> Fail + Error e -> Error e + Ok a rest loc -> Ok (f a) rest loc + +-------------------------------------------------------------------------------- +-- Parse monad + +-- ParseT is a monad transformer for parsing. It effectively has a ByteString +-- and Loc state effect for the input, and an error effect (of some type e) for +-- parsing errors. Parsing *errors* are distinct from parsing *failures* in +-- that only the latter trigger backtracking in the Alternative instance. +newtype ParseT e m a = ParseT (ByteString -> Loc -> m (Result e a)) + +runParseT :: Monad m => ParseT e m a -> ByteString -> Loc -> m (Result e a) +runParseT (ParseT f) = f + +mapParseT :: (Monad m, Monad m') + => (m (Result e a) -> m' (Result e' a')) + -> ParseT e m a + -> ParseT e' m' a' +mapParseT f p = ParseT \input loc -> f $ runParseT p input loc + +type Parse e a = ParseT e Identity a + +instance Monad m => Functor (ParseT e m) where + fmap = liftM + +instance Monad m => Applicative (ParseT e m) where + pure a = ParseT \input loc -> pure $ Ok a input loc + + (<*>) = ap + +instance Monad m => Monad (ParseT e m) where + p >>= k = ParseT \input loc -> + runParseT p input loc >>= \case + Fail -> pure $ Fail + Error e -> pure $ Error e + Ok a input' loc' -> runParseT (k a) input' loc' + +instance Monad m => Alternative (ParseT e m) where + empty = ParseT \_ _ -> pure Fail + + p <|> q = ParseT \input loc -> + runParseT p input loc >>= \case + Fail -> runParseT q input loc + Error e -> pure $ Error e + Ok a input' loc' -> pure $ Ok a input' loc' + +instance MonadTrans (ParseT e) where + lift m = ParseT \input loc -> do + a <- m + pure $ Ok a input loc + +instance MonadReader r m => MonadReader r (ParseT e m) where + ask = lift ask + + local f p = ParseT \input loc -> local f $ runParseT p input loc + +instance MonadState s m => MonadState s (ParseT e m) where + get = lift get + + put = lift . put + +-- TODO: Separate a MTL-like MonadParse typeclass with (a subset of) the +-- following methods? + +-- Read the entire input without consuming it. +read :: Monad m => ParseT e m ByteString +read = ParseT \input loc -> pure $ Ok input input loc + +-- Replace the entire input without affecting the current location. +write :: Monad m => ByteString -> ParseT e m () +write input = ParseT \_ loc -> pure $ Ok () input loc + +getLoc :: Monad m => ParseT e m Loc +getLoc = ParseT \input loc -> pure $ Ok loc input loc + +putLoc :: Monad m => Loc -> ParseT e m () +putLoc loc = ParseT \input _ -> pure $ Ok () input loc + +-- Save parsing state. This is effectively a combination of read and getLoc. +save :: Monad m => ParseT e m (ByteString, Loc) +save = ParseT \input loc -> pure $ Ok (input, loc) input loc + +-- Load parsing state. This is effectively a combination of write and putLoc. +load :: Monad m => (ByteString, Loc) -> ParseT e m () +load (input, loc) = ParseT \_ _ -> pure $ Ok () input loc + +fail :: Monad m => ParseT e m a +fail = empty + +-- Turn a failure into a success and vice versa. +notp :: Monad m => ParseT e m a -> ParseT e m () +notp p = ParseT \input loc -> + runParseT p input loc >>= \case + Fail -> pure $ Ok () input loc + Error e -> pure $ Error e + Ok _ _ _ -> pure Fail + +err :: Monad m => e -> ParseT e m a +err e = ParseT \_ _ -> pure $ Error e + +-- Catch an error. +catch :: Monad m => ParseT e m a -> (e -> ParseT e m a) -> ParseT e m a +p `catch` h = ParseT \input loc -> + runParseT p input loc >>= \case + Fail -> pure Fail + Error e -> runParseT (h e) input loc + Ok a input' loc' -> pure $ Ok a input' loc' + +-------------------------------------------------------------------------------- +-- General parse combinators + +-- Run a parser without changing the parser state. +lookahead :: Monad m => ParseT e m a -> ParseT e m a +lookahead p = do + s <- save + a <- p + load s + pure a + +spanOf :: Monad m => ParseT e m a -> ParseT e m Span +spanOf p = Span <$> getLoc <*> lookahead (p *> getLoc) + +bytesOf :: Monad m => ParseT e m a -> ParseT e m ByteString +bytesOf p = do + (input, beg) <- save + end <- lookahead (p *> getLoc) + pure $ ByteString.drop (end.byte - beg.byte) input + +-- Convert a failure into an error. +-- TODO: Why do we call this "cut"? +cut :: Monad m => ParseT e m a -> e -> ParseT e m a +p `cut` e = p <|> err e + +-- Convert an error into a failure. +try :: Monad m => ParseT e m a -> ParseT e m a +try p = p `catch` \_ -> fail + +opt :: Monad m => ParseT e m a -> ParseT e m (Maybe a) +opt p = (Just <$> p) <|> pure Nothing + +eof :: Monad m => ParseT e m () +eof = (ByteString.null <$> read) >>= guard + +iter :: Monad m => (a -> ParseT e m a) -> a -> ParseT e m a +iter f a = (f a >>= iter f) <|> pure a + +someUntil :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a] +someUntil p end = some (notp end *> p) + +manyUntil :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a] +manyUntil p end = many (notp end *> p) + +someSepBy :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a] +someSepBy p sep = (:) <$> p <*> many (sep *> p) + +manySepBy :: Monad m => ParseT e m a -> ParseT e m b -> ParseT e m [a] +manySepBy p sep = someSepBy p sep <|> pure [] + +-------------------------------------------------------------------------------- +-- Byte-wise parse combinators + +-- TODO + +-------------------------------------------------------------------------------- +-- Utf8 parse combinators + +-- All these combinators involve UTF-8 decoding and therefore require that +-- the error type e supports UTF-8 errors. + +class Utf8Error e where + utf8Error :: Loc -> e + +nextRune :: forall e m. (Utf8Error e, Monad m) => ParseT e m Rune +nextRune = + (UTF8.decode <$> read) >>= \case + Just (c, w) + | c == UTF8.replacement_char -> getLoc >>= (err . utf8Error) + | otherwise -> updateState c w *> pure (RuneChar c) + Nothing -> pure RuneEof + where + updateState :: Char -> Int -> ParseT e m () + updateState c w = do + (ByteString.drop w <$> read) >>= write + loc <- getLoc + putLoc case c of + '\n' -> loc { byte = loc.byte + w, row = loc.row + 1, col = 0 } + _ -> loc { byte = loc.byte + w, col = loc.col + 1 } + +nextChar :: (Utf8Error e, Monad m) => ParseT e m Char +nextChar = nextRune >>= \case + RuneEof -> fail + RuneChar c -> pure c + +oneRuneM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m Rune +oneRuneM want = do + r <- nextRune + (lift $ want r) >>= guard + pure r + +oneRune :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m Rune +oneRune want = oneRuneM (pure . want) + +oneCharM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m Char +oneCharM want = do + c <- nextChar + (lift $ want c) >>= guard + pure c + +oneChar :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m Char +oneChar want = oneCharM (pure . want) + +manyRuneM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m ByteString +manyRuneM want = + fst <$> lexM () \((), r) -> + want r >>= \case + False -> pure $ LexRejectEmit () + True -> pure $ LexAcceptCont () + +manyRune :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m ByteString +manyRune want = manyRuneM (pure . want) + +manyCharM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m ByteString +manyCharM want = manyRuneM \case + RuneEof -> pure False + RuneChar c -> want c + +manyChar :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m ByteString +manyChar want = manyCharM (pure . want) + +rune :: (Utf8Error e, Monad m) => Rune -> ParseT e m () +rune r = oneRune (== r) *> pure () + +char :: (Utf8Error e, Monad m) => Char -> ParseT e m () +char c = oneChar (== c) *> pure () + +string :: (Utf8Error e, Monad m) => String -> ParseT e m () +string = traverse_ char + +natural :: (Utf8Error e, Monad m) => Int -> ParseT e m Natural +natural = \case + 0 -> + let prefix l u = char '0' *> (char l <|> char u) in + prefix 'b' 'B' *> natural 2 + <|> prefix 'q' 'Q' *> natural 4 + <|> prefix 'o' 'O' *> natural 8 + <|> prefix 'x' 'X' *> natural 16 + <|> natural 10 + + b | 0 < b && b <= 36 -> + let nat :: Int -> Natural + nat = fromIntegral + + maxn = min (chr $ ord '0' + b) (succ '9') + maxu = min (chr $ ord 'A' + b - 10) (succ 'Z') + maxl = min (chr $ ord 'a' + b - 10) (succ 'z') + + numeral = nextChar >>= \c -> if + | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0') + | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10) + | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10) + | otherwise -> fail + + underscores = many (char '_') in + + numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral)) + + _ -> error "natural: invalid base" + +data LexAction e t s + = LexFail + | LexError e + | LexRejectEmit t -- Reject rune and emit token. + | LexAcceptCont s -- Accept rune and continue lexing in the given state. + +-- TODO: Delete this. +-- lexM (and it's non-monadic counterpart lex) lexes a token (of some type t) +-- in the manner of an automaton. That is, given a start state (of some type +-- s) and a transition function that maps (s, Rune) pairs to "actions", lexM +-- tracks the current state and repeatedly calls the transition function with +-- runes from the input until it returns an action indicating it should stop. +lexM :: forall m e t s. (Utf8Error e, Monad m) + => s + -> ((s, Rune) -> m (LexAction e t s)) + -> ParseT e m (ByteString, t) +lexM startState trans = do + (input, beg) <- save + let gobble :: s -> ParseT e m t + gobble s = do + r <- lookahead nextRune + (lift $ trans (s, r)) >>= \case + LexFail -> fail + LexError e -> err e + LexRejectEmit t -> pure t + LexAcceptCont s -> nextRune *> gobble s + t <- gobble startState + end <- getLoc + pure (ByteString.take (end.byte - beg.byte) input, t) + +lex :: (Utf8Error e, Monad m) + => s + -> ((s, Rune) -> LexAction e t s) + -> ParseT e m (ByteString, t) +lex startState trans = lexM startState (pure . trans) diff --git a/package.yaml b/package.yaml @@ -0,0 +1,38 @@ +name: sparsec +version: 0.1.0.0 +author: "Robert Russell" +license: ISC + +ghc-options: + - -Wall + - -Wno-name-shadowing + +default-extensions: + - DuplicateRecordFields + - NoFieldSelectors + - OverloadedRecordDot + + - BlockArguments + - LambdaCase + - MultiWayIf + + - ScopedTypeVariables + + - FlexibleInstances + - MultiParamTypeClasses + - UndecidableInstances + +dependencies: + - base >= 4.18 && < 5 + +library: + source-dirs: . + dependencies: + - mtl + - bytestring + - utf8-string + +executables: + example: + source-dirs: example + main: Main.hs diff --git a/stack.yaml b/stack.yaml @@ -0,0 +1 @@ +resolver: lts-22.23