(*
A simple combinator-style parsing library for F#.
Inspired by the Hutton & Meijer paper as well as the FParsec
combinator library. Other than being much smaller, this
library trades away performance for simplicity. If you need
a fast library, look at FParsec.
Version: 1.10 (2023-11-01)
*)
module Combinator
open System
open System.Text.RegularExpressions
///
/// A 3-tuple representing a "rich string" that the parser needs for normal operation.
/// First element: the input string
/// Second element: the current position in the parse
/// Third element: a boolean which is true if debugging is enabled
///
type Input = string * int * bool
///
/// Use this to prepare a rich string (an Input) for normal (non-debug)
/// parsing operation.
///
/// An input string.
/// Returns an Input.
let prepare(input: string) : Input = input, 0, false
///
/// Use this to prepare a rich string (an Input) for debug-mode
/// parsing operation.
///
/// An input string.
/// Returns an Input.
let debug(input: string) : Input = input, 0, true
///
/// Extracts the string input from an Input tuple.
///
/// An Input.
/// The input string.
let input i =
let (e,_,_) = i
e
///
/// Extracts the current position from an Input tuple.
///
/// An Input.
/// The position int.
let position i =
let (_,e,_) = i
e
///
/// Returns true if the Input's current position
/// is at the end of the input string ("end of file").
///
/// An Input.
/// true iff the position is EOF.
let isEOF i =
let pos = position i
let len = String.length (input i)
pos >= len
///
/// Returns true if the Input is running in
/// debug mode.
///
/// An Input.
/// true iff debug mode enabled.
let isDebug i =
let (_,_,e) = i
e
/// Represents the result of running a Parser<'a>.
type Outcome<'a> =
| Success of result: 'a * remaining: Input
| Failure of fail_pos: int * rule: string
/// A Parser<'a> is a function from Input to
/// Outcome<'a>.
type Parser<'a> = Input -> Outcome<'a>
///
/// recparser is used to declare a parser before it is
/// defined. The primary use case is when defining recursive
/// parsers, e.g., parsers of the form e ::= ... e ....
///
/// A tuple containing a simple parser that calls an
/// implementation stored in a mutable reference cell, and a
/// mutable reference cell initialized to hold a dummy
/// implementation.
let recparser() =
let dumbparser = fun (input: Input) -> failwith "You forgot to initialize your recursive parser."
let r = ref dumbparser
(fun (input: Input) -> !r input), r
// suggested refactoring in RFC FS-1111 due to ref cell deprecation
// https://github.com/fsharp/fslang-design/blob/main/FSharp-6.0/FS-1111-refcell-op-information-messages.md
// to be enabled CSCI 334, Spring 2024
// type 'a RefCell = { Value: 'a }
// let recparser() =
// let dumbparser = fun (input: Input) -> failwith "You forgot to initialize your recursive parser."
// let r = { Value = dumbparser }
// (fun (input: Input) -> r.Value input), r
///
/// Returns the hexadecimal character code for the given character.
///
/// A char.
/// A string representing a char code, in hex.
let cToHex(c: char) = "0x" + System.Convert.ToByte(c).ToString("x2");;
///
/// A debug parser. Prints debug information for the given parser
/// p as a side effect.
///
let ()(p: Parser<'a>)(label: string)(i: Input) : Outcome<'a> =
// if debugging is enabled...
if (isDebug i) then
let nextText = (input i).Substring(position i)
if (input i).Length - (position i) > 0 then
eprintfn "[attempting: %s on \"%s\", next char: %s]" label nextText (cToHex (input i).[0])
else
eprintfn "[attempting: %s on \"%s\", next char: EOF]" label nextText
let o = p i
match o with
| Success(a, i') ->
let i1pos = position i
let i2pos = position i'
let istr = input i
let nconsumed = i2pos - i1pos
let iconsumed = istr.Substring(i1pos, i2pos - i1pos)
let rem = istr.[i2pos..]
if istr.Length - i2pos > 0 then
eprintfn "[success: %s, consumed: \"%s\", remaining: \"%s\", next char: %s]" label iconsumed rem (cToHex rem.[0])
else
eprintfn "[success: %s, consumed: \"%s\", remaining: \"%s\", next char: EOF]" label iconsumed rem
| Failure(pos,rule) ->
let rem = (input i).[pos..]
if rem.Length > 0 then
eprintfn "[failure at pos %d in rule [%s]: %s, remaining input: \"%s\", next char: %s]" pos rule label rem (cToHex rem.[0])
else
eprintfn "[failure at pos %d in rule [%s]: %s, remaining input: \"%s\", next char: EOF]" pos rule label rem
o
// if debugging is disabled
else
p i
///
/// Returns true if the given regular expression rgx matches s.
///
/// A string.
/// A string representing a C# regular expression.
/// true iff rgx matches s.
let is_regexp(s: string)(rgx: string) =
Regex.Match(s, rgx).Success
///
/// Returns true if the given character is whitespace.
///
/// A char.
/// true iff c is whitespace.
let is_whitespace(c: char) = is_regexp (c.ToString()) @"\s"
///
/// Returns true if the given character is whitespace,
/// not including newline characters.
///
/// A char.
/// true iff c is whitespace but not newline.
let is_whitespace_no_nl(c: char) = is_regexp (c.ToString()) @"\t| "
///
/// Returns true if the given character is uppercase.
///
/// A char.
/// true iff c is uppercase.
let is_upper(c: char) = is_regexp (c.ToString()) @"[A-Z]"
///
/// Returns true if the given character is lowercase.
///
/// A char.
/// true iff c is lowercase.
let is_lower(c: char) = is_regexp (c.ToString()) @"[a-z]"
///
/// Returns true if the given character is a letter.
///
/// A char.
/// true iff c is a letter.
let is_letter(c: char) = is_upper c || is_lower c
///
/// Returns true if the given character is a numeric digit.
///
/// A char.
/// true iff c is a numeric digit.
let is_digit(c: char) = is_regexp (c.ToString()) @"[0-9]"
///
/// Consumes nothing from the given Input, returning a.
///
/// Any value.
/// An Input.
/// Returns an Outcome that is always Success(a).
let presult(a: 'a)(i: Input) : Outcome<'a> = Success(a,i)
///
/// Consumes nothing from the given Input and fails.
///
/// An Input.
/// Returns an Outcome<'a> that is always Failure.
let pzero(i: Input) : Outcome<'a> = Failure((position i), "pzero")
///
/// Consumes a single character from the given Input.
///
/// An Input.
/// Returns a Parser that succeeds with a single char.
let pitem(i: Input) : Outcome =
let pos = position i
let istr = input i
if pos >= String.length istr then
Failure ((position i),"pitem")
else
let debug = isDebug i
let pos = position i
Success (istr.[pos], (istr, pos + 1, debug))
///
/// Runs p and then calls f on the result, yielding
/// a new parser that is a function of the first parser's result.
/// If an Input is also given, also runs the second parser.
///
let pbind(p: Parser<'a>)(f: 'a -> Parser<'b>)(i: Input) : Outcome<'b> =
match p i with
| Success(a,i') -> f a i'
| Failure(pos,rule) -> Failure(pos,rule)
///
/// Runs p1 and, if it succeeds, runs p2 on the
/// remaining input. If both p1 and p2 succeed,
/// runs f on the pair of results.
///
let pseq(p1: Parser<'a>)(p2: Parser<'b>)(f: 'a*'b -> 'c) : Parser<'c> =
pbind p1 (fun a ->
pbind p2 (fun b ->
presult (f (a,b))
)
)
///
/// Overrides the failure cause returned by a failing parser.
///
let cause(p: Parser<'a>)(rule: String)(i: Input) : Outcome<'a> =
let o = p i
match o with
| Success _ -> o
| Failure(pos,_) -> Failure(pos, rule)
///
/// Checks whether the current character matches a predicate.
/// Useful for checking whether a character matches a set of characters.
///
let psat(f: char -> bool) : Parser =
cause
(pbind pitem (fun c -> if (f c) then presult c else pzero))
"psat"
///
/// Checks whether the current character matches a given character.
///
let pchar(c: char) : Parser =
cause
(psat (fun c' -> c' = c))
(sprintf "pchar '%c'" c)
///
/// Checks whether the current character is a letter.
///
let pletter : Parser =
cause
(psat is_letter)
"is_letter"
///
/// Checks whether the current character is a numeric digit.
///
let pdigit : Parser =
cause
(psat is_digit)
"is_digit"
///
/// Checks whether the current character is an uppercase letter.
///
let pupper : Parser =
cause
(psat is_upper)
"is_upper"
///
/// Checks whether the current character is a lowercase letter.
///
let plower : Parser =
cause
(psat is_lower)
"is_lower"
///
/// Allows parsing alternatives. First tries p1 and if that
/// fails, tries p2. Returns Success if either
/// p1 or p2 succeeds, and failure otherwise. Note that
/// both parser alternatives must return the same type.
///
let (<|>)(p1: Parser<'a>)(p2: Parser<'a>)(i: Input) : Outcome<'a> =
let o = p1 i
match o with
| Success(_,_) -> o
| Failure(pos,rule) ->
let o2 = p2 i
match o2 with
| Success(_,_) -> o2
| Failure(pos2,rule2) ->
// return the furthest failure
if pos >= pos2 then
Failure(pos,rule)
else
Failure(pos2,rule2)
///
/// Runs p, and when it succeeds, runs a function f
/// to transform the output of p.
///
let pfun(p: Parser<'a>)(f: 'a -> 'b)(i: Input) : Outcome<'b> =
let o = p i
match o with
| Success(a,i') -> Success(f a, i')
| Failure(pos,rule) -> Failure(pos,rule)
///
/// Runs p, and when it succeeds, runs a function f
/// to transform the output of p. This is syntactic sugar
/// for the pfun function so that pfun can be used
/// inline, ala p |>> f.
///
let (|>>)(p: Parser<'a>)(f: 'a -> 'b) : Parser<'b> = pfun p f
///
/// The parser equivalent of a constant function. Runs p and if it
/// succeeds, returns x.
///
let pfresult(p: Parser<'a>)(x: 'b) : Parser<'b> =
pbind p (fun _ -> presult x)
///
/// Runs p zero or more times. Always runs until p fails at
/// least once. If p is incapable of failing, this will loop forever,
/// so don't do that.
///
let rec pmany0(p: Parser<'a>)(i: Input) : Outcome<'a list> =
let rec pm0(xs: 'a list)(i: Input) : Outcome<'a list> =
match p i with
| Failure(pos,rule) -> Success(xs, i)
| Success(a, i') ->
if i = i' then
failwith "pmany parser loops infinitely!"
pm0 (a::xs) i'
match pm0 [] i with
| Success(xs,i') -> Success(List.rev xs, i')
| Failure(pos,rule) -> Failure(pos,rule)
///
/// Runs p one or more times. Always runs until p fails at
/// least once. If p is incapable of failing, this will loop forever,
/// so don't do that.
///
let pmany1(p: Parser<'a>) : Parser<'a list> =
pseq p (pmany0 p) (fun (x,xs) -> x :: xs)
///
/// Consumes zero or more whitespace characters, excluding newlines.
///
let pwsNoNL0 : Parser = pmany0 (psat is_whitespace_no_nl)
///
/// Consumes one or more whitespace characters, excluding newlines.
///
let pwsNoNL1 : Parser = pmany1 (psat is_whitespace_no_nl)
///
/// Consumes zero or more whitespace characters.
///
let pws0 : Parser =
cause
(pmany0 (psat is_whitespace))
"pws0"
///
/// Consumes one or more whitespace characters.
///
let pws1 : Parser =
cause
(pmany1 (psat is_whitespace))
"pws1"
///
/// Consumes the given string.
///
let pstr(s: string) : Parser =
cause
(s.ToCharArray()
|> Array.fold (fun pacc c ->
pseq pacc (pchar c) (fun (s,ch) -> s + ch.ToString())
) (presult ""))
(sprintf "pstr \"%s\"" s)
///
/// Consumes only the newline character. Should work for both UNIX and
/// Windows line endings.
///
let pnl : Parser =
cause
((psat (fun c -> c = '\n') |>> (fun c -> c.ToString()))
<|> (pstr "\r\n"))
"pnl"
///
/// Consumes the end of file. Run this to ensure that the entire
/// input has been parsed.
///
let peof(i: Input) : Outcome =
match pitem i with
| Failure(pos,rule) ->
if isEOF i then
Success(true, i)
else
Failure(pos, rule)
| Success(_,_) -> Failure((position i), "peof")
///
/// Runs pleft and pright, returning only the result of pleft if
/// both parsers succeed.
///
let pleft(pleft: Parser<'a>)(pright: Parser<'b>) : Parser<'a> =
pbind pleft (fun a -> pfresult pright a)
///
/// Runs pleft and pright, returning only the result of pright if
/// both parsers succeed.
///
let pright(pleft: Parser<'a>)(pright: Parser<'b>) : Parser<'b> =
pbind pleft (fun _ -> pright)
///
/// Runs popen, then p, the pclose, returning only the result of p if
/// all three parsers succeed.
///
let pbetween(popen: Parser<'a>)(p: Parser<'b>)(pclose: Parser<'c>) : Parser<'b> =
pright popen (pleft p pclose)
///
/// Turns a list of characters into a string.
///
let stringify(cs: char list) : string = String.Join("", cs)
(* do not call directly *)
let rec leftpad str ch num =
if num > 0 then
leftpad (ch.ToString() + str) ch (num - 1)
else
str
(* do not call directly *)
let windowLeftIndex(window_sz: int)(failure_pos: int) : int =
if failure_pos - window_sz < 0 then
0
else
failure_pos - window_sz
(* do not call directly *)
let windowRightIndex(window_sz: int)(failure_pos: int)(buffer_len: int) : int =
if failure_pos + window_sz >= buffer_len then
buffer_len - 1
else
failure_pos + window_sz
(* do not call directly *)
let indexOfLastNewlineLeftWindow(left_index: int)(failure_pos: int)(buffer: string) : int =
// search for last occurrence of '\n'
let rec searchBackward(pos: int) : int option =
if pos <= left_index then
None
else if buffer.[pos] = '\n' then
Some pos
else
searchBackward (pos - 1)
match searchBackward (failure_pos - 1) with
| Some idx -> idx
| None -> left_index
(* do not call directly *)
let indexOfFirstNewlineRightWindow(right_index: int)(failure_pos: int)(buffer: string) : int =
// search for first occurrence of '\n'
let rec searchForward(pos: int) : int option =
if pos >= right_index then
None
else if buffer.[pos] = '\n' then
Some pos
else
searchForward (pos + 1)
match searchForward (failure_pos + 1) with
| Some idx -> idx
| None -> right_index
///
/// Produce a diagnostic message for a parser failure.
///
/// The amount of context (in chars) to show to the left and right of the failure position.
/// Where the parse failed.
/// The input stream.
/// The error message.
/// Returns a diagnostic string.
let diagnosticMessage(window_sz: int)(failure_pos: int)(buffer: string)(err: string) : string =
// compute window
let left_idx = windowLeftIndex window_sz failure_pos
let right_idx = windowRightIndex window_sz failure_pos buffer.Length
let last_nl_left = indexOfLastNewlineLeftWindow left_idx failure_pos buffer
let first_nl_right = indexOfFirstNewlineRightWindow right_idx failure_pos buffer
// find caret position in last line
let caret_pos = failure_pos - last_nl_left + 1
// create window string
let window = buffer.Substring(left_idx, failure_pos - left_idx + 1 + right_idx - failure_pos)
// augment with diagnostic info
let diag = err + "\n\n" + window + "\n" + (leftpad "^" ' ' (caret_pos - 1)) + "\n"
diag