(* A simple combinator-style parsing library for F#. Inspired by the Hutton & Meijer paper as well as the FParsec combinator library. Other than being much smaller, this library trades away performance for simplicity. If you need a fast library, look at FParsec. Version: 1.10 (2023-11-01) *) module Combinator open System open System.Text.RegularExpressions /// /// A 3-tuple representing a "rich string" that the parser needs for normal operation. /// First element: the input string /// Second element: the current position in the parse /// Third element: a boolean which is true if debugging is enabled /// type Input = string * int * bool /// /// Use this to prepare a rich string (an Input) for normal (non-debug) /// parsing operation. /// /// An input string. /// Returns an Input. let prepare(input: string) : Input = input, 0, false /// /// Use this to prepare a rich string (an Input) for debug-mode /// parsing operation. /// /// An input string. /// Returns an Input. let debug(input: string) : Input = input, 0, true /// /// Extracts the string input from an Input tuple. /// /// An Input. /// The input string. let input i = let (e,_,_) = i e /// /// Extracts the current position from an Input tuple. /// /// An Input. /// The position int. let position i = let (_,e,_) = i e /// /// Returns true if the Input's current position /// is at the end of the input string ("end of file"). /// /// An Input. /// true iff the position is EOF. let isEOF i = let pos = position i let len = String.length (input i) pos >= len /// /// Returns true if the Input is running in /// debug mode. /// /// An Input. /// true iff debug mode enabled. let isDebug i = let (_,_,e) = i e /// Represents the result of running a Parser<'a>. type Outcome<'a> = | Success of result: 'a * remaining: Input | Failure of fail_pos: int * rule: string /// A Parser<'a> is a function from Input to /// Outcome<'a>. type Parser<'a> = Input -> Outcome<'a> /// /// recparser is used to declare a parser before it is /// defined. The primary use case is when defining recursive /// parsers, e.g., parsers of the form e ::= ... e .... /// /// A tuple containing a simple parser that calls an /// implementation stored in a mutable reference cell, and a /// mutable reference cell initialized to hold a dummy /// implementation. let recparser() = let dumbparser = fun (input: Input) -> failwith "You forgot to initialize your recursive parser." let r = ref dumbparser (fun (input: Input) -> !r input), r // suggested refactoring in RFC FS-1111 due to ref cell deprecation // https://github.com/fsharp/fslang-design/blob/main/FSharp-6.0/FS-1111-refcell-op-information-messages.md // to be enabled CSCI 334, Spring 2024 // type 'a RefCell = { Value: 'a } // let recparser() = // let dumbparser = fun (input: Input) -> failwith "You forgot to initialize your recursive parser." // let r = { Value = dumbparser } // (fun (input: Input) -> r.Value input), r /// /// Returns the hexadecimal character code for the given character. /// /// A char. /// A string representing a char code, in hex. let cToHex(c: char) = "0x" + System.Convert.ToByte(c).ToString("x2");; /// /// A debug parser. Prints debug information for the given parser /// p as a side effect. /// let ()(p: Parser<'a>)(label: string)(i: Input) : Outcome<'a> = // if debugging is enabled... if (isDebug i) then let nextText = (input i).Substring(position i) if (input i).Length - (position i) > 0 then eprintfn "[attempting: %s on \"%s\", next char: %s]" label nextText (cToHex (input i).[0]) else eprintfn "[attempting: %s on \"%s\", next char: EOF]" label nextText let o = p i match o with | Success(a, i') -> let i1pos = position i let i2pos = position i' let istr = input i let nconsumed = i2pos - i1pos let iconsumed = istr.Substring(i1pos, i2pos - i1pos) let rem = istr.[i2pos..] if istr.Length - i2pos > 0 then eprintfn "[success: %s, consumed: \"%s\", remaining: \"%s\", next char: %s]" label iconsumed rem (cToHex rem.[0]) else eprintfn "[success: %s, consumed: \"%s\", remaining: \"%s\", next char: EOF]" label iconsumed rem | Failure(pos,rule) -> let rem = (input i).[pos..] if rem.Length > 0 then eprintfn "[failure at pos %d in rule [%s]: %s, remaining input: \"%s\", next char: %s]" pos rule label rem (cToHex rem.[0]) else eprintfn "[failure at pos %d in rule [%s]: %s, remaining input: \"%s\", next char: EOF]" pos rule label rem o // if debugging is disabled else p i /// /// Returns true if the given regular expression rgx matches s. /// /// A string. /// A string representing a C# regular expression. /// true iff rgx matches s. let is_regexp(s: string)(rgx: string) = Regex.Match(s, rgx).Success /// /// Returns true if the given character is whitespace. /// /// A char. /// true iff c is whitespace. let is_whitespace(c: char) = is_regexp (c.ToString()) @"\s" /// /// Returns true if the given character is whitespace, /// not including newline characters. /// /// A char. /// true iff c is whitespace but not newline. let is_whitespace_no_nl(c: char) = is_regexp (c.ToString()) @"\t| " /// /// Returns true if the given character is uppercase. /// /// A char. /// true iff c is uppercase. let is_upper(c: char) = is_regexp (c.ToString()) @"[A-Z]" /// /// Returns true if the given character is lowercase. /// /// A char. /// true iff c is lowercase. let is_lower(c: char) = is_regexp (c.ToString()) @"[a-z]" /// /// Returns true if the given character is a letter. /// /// A char. /// true iff c is a letter. let is_letter(c: char) = is_upper c || is_lower c /// /// Returns true if the given character is a numeric digit. /// /// A char. /// true iff c is a numeric digit. let is_digit(c: char) = is_regexp (c.ToString()) @"[0-9]" /// /// Consumes nothing from the given Input, returning a. /// /// Any value. /// An Input. /// Returns an Outcome that is always Success(a). let presult(a: 'a)(i: Input) : Outcome<'a> = Success(a,i) /// /// Consumes nothing from the given Input and fails. /// /// An Input. /// Returns an Outcome<'a> that is always Failure. let pzero(i: Input) : Outcome<'a> = Failure((position i), "pzero") /// /// Consumes a single character from the given Input. /// /// An Input. /// Returns a Parser that succeeds with a single char. let pitem(i: Input) : Outcome = let pos = position i let istr = input i if pos >= String.length istr then Failure ((position i),"pitem") else let debug = isDebug i let pos = position i Success (istr.[pos], (istr, pos + 1, debug)) /// /// Runs p and then calls f on the result, yielding /// a new parser that is a function of the first parser's result. /// If an Input is also given, also runs the second parser. /// let pbind(p: Parser<'a>)(f: 'a -> Parser<'b>)(i: Input) : Outcome<'b> = match p i with | Success(a,i') -> f a i' | Failure(pos,rule) -> Failure(pos,rule) /// /// Runs p1 and, if it succeeds, runs p2 on the /// remaining input. If both p1 and p2 succeed, /// runs f on the pair of results. /// let pseq(p1: Parser<'a>)(p2: Parser<'b>)(f: 'a*'b -> 'c) : Parser<'c> = pbind p1 (fun a -> pbind p2 (fun b -> presult (f (a,b)) ) ) /// /// Overrides the failure cause returned by a failing parser. /// let cause(p: Parser<'a>)(rule: String)(i: Input) : Outcome<'a> = let o = p i match o with | Success _ -> o | Failure(pos,_) -> Failure(pos, rule) /// /// Checks whether the current character matches a predicate. /// Useful for checking whether a character matches a set of characters. /// let psat(f: char -> bool) : Parser = cause (pbind pitem (fun c -> if (f c) then presult c else pzero)) "psat" /// /// Checks whether the current character matches a given character. /// let pchar(c: char) : Parser = cause (psat (fun c' -> c' = c)) (sprintf "pchar '%c'" c) /// /// Checks whether the current character is a letter. /// let pletter : Parser = cause (psat is_letter) "is_letter" /// /// Checks whether the current character is a numeric digit. /// let pdigit : Parser = cause (psat is_digit) "is_digit" /// /// Checks whether the current character is an uppercase letter. /// let pupper : Parser = cause (psat is_upper) "is_upper" /// /// Checks whether the current character is a lowercase letter. /// let plower : Parser = cause (psat is_lower) "is_lower" /// /// Allows parsing alternatives. First tries p1 and if that /// fails, tries p2. Returns Success if either /// p1 or p2 succeeds, and failure otherwise. Note that /// both parser alternatives must return the same type. /// let (<|>)(p1: Parser<'a>)(p2: Parser<'a>)(i: Input) : Outcome<'a> = let o = p1 i match o with | Success(_,_) -> o | Failure(pos,rule) -> let o2 = p2 i match o2 with | Success(_,_) -> o2 | Failure(pos2,rule2) -> // return the furthest failure if pos >= pos2 then Failure(pos,rule) else Failure(pos2,rule2) /// /// Runs p, and when it succeeds, runs a function f /// to transform the output of p. /// let pfun(p: Parser<'a>)(f: 'a -> 'b)(i: Input) : Outcome<'b> = let o = p i match o with | Success(a,i') -> Success(f a, i') | Failure(pos,rule) -> Failure(pos,rule) /// /// Runs p, and when it succeeds, runs a function f /// to transform the output of p. This is syntactic sugar /// for the pfun function so that pfun can be used /// inline, ala p |>> f. /// let (|>>)(p: Parser<'a>)(f: 'a -> 'b) : Parser<'b> = pfun p f /// /// The parser equivalent of a constant function. Runs p and if it /// succeeds, returns x. /// let pfresult(p: Parser<'a>)(x: 'b) : Parser<'b> = pbind p (fun _ -> presult x) /// /// Runs p zero or more times. Always runs until p fails at /// least once. If p is incapable of failing, this will loop forever, /// so don't do that. /// let rec pmany0(p: Parser<'a>)(i: Input) : Outcome<'a list> = let rec pm0(xs: 'a list)(i: Input) : Outcome<'a list> = match p i with | Failure(pos,rule) -> Success(xs, i) | Success(a, i') -> if i = i' then failwith "pmany parser loops infinitely!" pm0 (a::xs) i' match pm0 [] i with | Success(xs,i') -> Success(List.rev xs, i') | Failure(pos,rule) -> Failure(pos,rule) /// /// Runs p one or more times. Always runs until p fails at /// least once. If p is incapable of failing, this will loop forever, /// so don't do that. /// let pmany1(p: Parser<'a>) : Parser<'a list> = pseq p (pmany0 p) (fun (x,xs) -> x :: xs) /// /// Consumes zero or more whitespace characters, excluding newlines. /// let pwsNoNL0 : Parser = pmany0 (psat is_whitespace_no_nl) /// /// Consumes one or more whitespace characters, excluding newlines. /// let pwsNoNL1 : Parser = pmany1 (psat is_whitespace_no_nl) /// /// Consumes zero or more whitespace characters. /// let pws0 : Parser = cause (pmany0 (psat is_whitespace)) "pws0" /// /// Consumes one or more whitespace characters. /// let pws1 : Parser = cause (pmany1 (psat is_whitespace)) "pws1" /// /// Consumes the given string. /// let pstr(s: string) : Parser = cause (s.ToCharArray() |> Array.fold (fun pacc c -> pseq pacc (pchar c) (fun (s,ch) -> s + ch.ToString()) ) (presult "")) (sprintf "pstr \"%s\"" s) /// /// Consumes only the newline character. Should work for both UNIX and /// Windows line endings. /// let pnl : Parser = cause ((psat (fun c -> c = '\n') |>> (fun c -> c.ToString())) <|> (pstr "\r\n")) "pnl" /// /// Consumes the end of file. Run this to ensure that the entire /// input has been parsed. /// let peof(i: Input) : Outcome = match pitem i with | Failure(pos,rule) -> if isEOF i then Success(true, i) else Failure(pos, rule) | Success(_,_) -> Failure((position i), "peof") /// /// Runs pleft and pright, returning only the result of pleft if /// both parsers succeed. /// let pleft(pleft: Parser<'a>)(pright: Parser<'b>) : Parser<'a> = pbind pleft (fun a -> pfresult pright a) /// /// Runs pleft and pright, returning only the result of pright if /// both parsers succeed. /// let pright(pleft: Parser<'a>)(pright: Parser<'b>) : Parser<'b> = pbind pleft (fun _ -> pright) /// /// Runs popen, then p, the pclose, returning only the result of p if /// all three parsers succeed. /// let pbetween(popen: Parser<'a>)(p: Parser<'b>)(pclose: Parser<'c>) : Parser<'b> = pright popen (pleft p pclose) /// /// Turns a list of characters into a string. /// let stringify(cs: char list) : string = String.Join("", cs) (* do not call directly *) let rec leftpad str ch num = if num > 0 then leftpad (ch.ToString() + str) ch (num - 1) else str (* do not call directly *) let windowLeftIndex(window_sz: int)(failure_pos: int) : int = if failure_pos - window_sz < 0 then 0 else failure_pos - window_sz (* do not call directly *) let windowRightIndex(window_sz: int)(failure_pos: int)(buffer_len: int) : int = if failure_pos + window_sz >= buffer_len then buffer_len - 1 else failure_pos + window_sz (* do not call directly *) let indexOfLastNewlineLeftWindow(left_index: int)(failure_pos: int)(buffer: string) : int = // search for last occurrence of '\n' let rec searchBackward(pos: int) : int option = if pos <= left_index then None else if buffer.[pos] = '\n' then Some pos else searchBackward (pos - 1) match searchBackward (failure_pos - 1) with | Some idx -> idx | None -> left_index (* do not call directly *) let indexOfFirstNewlineRightWindow(right_index: int)(failure_pos: int)(buffer: string) : int = // search for first occurrence of '\n' let rec searchForward(pos: int) : int option = if pos >= right_index then None else if buffer.[pos] = '\n' then Some pos else searchForward (pos + 1) match searchForward (failure_pos + 1) with | Some idx -> idx | None -> right_index /// /// Produce a diagnostic message for a parser failure. /// /// The amount of context (in chars) to show to the left and right of the failure position. /// Where the parse failed. /// The input stream. /// The error message. /// Returns a diagnostic string. let diagnosticMessage(window_sz: int)(failure_pos: int)(buffer: string)(err: string) : string = // compute window let left_idx = windowLeftIndex window_sz failure_pos let right_idx = windowRightIndex window_sz failure_pos buffer.Length let last_nl_left = indexOfLastNewlineLeftWindow left_idx failure_pos buffer let first_nl_right = indexOfFirstNewlineRightWindow right_idx failure_pos buffer // find caret position in last line let caret_pos = failure_pos - last_nl_left + 1 // create window string let window = buffer.Substring(left_idx, failure_pos - left_idx + 1 + right_idx - failure_pos) // augment with diagnostic info let diag = err + "\n\n" + window + "\n" + (leftpad "^" ' ' (caret_pos - 1)) + "\n" diag