diff --git a/.gitignore b/.gitignore index 0a37b21..814ced8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ __pycache__/ bin/ + +# TEMP: used while debugging +# (and cause I'm super duper lazy) +src/nlx diff --git a/lang/demo/single_toks.no b/lang/demo/single_toks.no new file mode 100644 index 0000000..683090a --- /dev/null +++ b/lang/demo/single_toks.no @@ -0,0 +1,2 @@ +[a]b(#) +(c)d[e] diff --git a/py/m.py b/py/m.py index e2b60c5..9576f4c 100644 --- a/py/m.py +++ b/py/m.py @@ -2,7 +2,7 @@ import sys import readline -from noether.math import * +from noether.lib.math import * from noether.cli import * diff --git a/src/ddemo b/src/ddemo new file mode 100755 index 0000000..af30039 --- /dev/null +++ b/src/ddemo @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -e + +if [ -z "$1" ]; then + echo "Usage: ddemo DEMOFILE" + echo "Demo files are located in lang/demo" + exit 1 +fi + +nim c nlx.nim +./nlx ../lang/demo/$1 diff --git a/src/nlx.nim b/src/nlx.nim index 4b0c678..e145943 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,22 +1,29 @@ import os -import noether/lexer/tok -import noether/lexer/tokstream -import noether/parser/parser +import noether/lib/io +import noether/lexer/lex +import noether/parser/parse +{.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" - if paramCount() > 0: - let filename = paramStr(1) - var tokStream = newTokStream(filename, isFile=true) - - # # DumpTok - # var tok: nlTok - # while tokStream.nextTok(tok): - # echo tok + # really lazy argparse implementation (temporary) + let + paramC = paramCount() + cmd = if paramC > 2: paramStr 1 + else: "tok" + var stream = if paramC > 0: streamFile(paramStr paramC) + else: streamString(readAll stdin) + + var lexer = newLexer(stream) + if cmd == "tok": + # DumpTok + while lexer.progress(): + echo lexer.tok + elif cmd == "tree": + discard # DumpTree - discard parse(tokStream) - + # discard parse(tokStream) else: - echo "usage: nlx filename" + echo "Usage: nlx [tok|tree] \n demo files are accessible at lang/demo" diff --git a/src/noether.nim b/src/noether.nim index 509b123..83c2b1b 100644 --- a/src/noether.nim +++ b/src/noether.nim @@ -2,4 +2,4 @@ # uses this file as the main entry point of the application. when isMainModule: - echo "Noether Lang" + echo "Noether Lang v0.1.0" diff --git a/src/noether/lexer/lex.nim b/src/noether/lexer/lex.nim new file mode 100644 index 0000000..8f81b86 --- /dev/null +++ b/src/noether/lexer/lex.nim @@ -0,0 +1,178 @@ +import + streams, + options + +import tok +export tok + +type + # Abstracts the "building process" (lexing) + # of nlTok objects from a given Stream of characters. + nlLexer* = object + stream: Stream + done*: bool + # store current token and upcoming (build) token + tok*: nlTok # current token + btok: nlTok # the build token + # save char and pos and its token type + char: char + cTKind: nlTokKind + # track line number, line content, etc + line: string + lineNum: int + pos: int + +proc atEOL(lexer: nlLexer): bool {.inline.} = + result = (lexer.char == '\n') +proc atEOF(lexer: nlLexer): bool {.inline.} = + result = (lexer.char == '\0') + +# Initialise a new lexer +proc newLexer*(stream: var Stream): nlLexer = + result = nlLexer( + stream: stream, + done: false, + tok: emptyTok(0), + btok: emptyTok(0), + line: "", + lineNum: 1, + pos: -1, # after initial readChar this -> 0 + char: '\0', # use \0 as initial invalid char + cTKind: tkNONE, + ) + +# Classifies the current character to its nlTokKind +proc classifyTok*(lexer: nlLexer): nlTokKind {.inline.} = + case lexer.char: + of '\0': + result = tkEOF + of '\r', '\n': + result = tkEOL + of ' ', '\t': + result = tkWTSP + of '(': + result = tkLPAR + of ')': + result = tkRPAR + of '{': + result = tkLBRA + of '}': + result = tkRBRA + of '[': + result = tkLSQB + of ']': + result = tkRSQB + of '\'': + result = tkSQUO + of '\"': + result = tkDQUO + of '`': + result = tkGRVA + of '#': + result = tkHASH + else: + result = tkWORD + + +#[ ====================================================== ] + | nlLexer Internal Interface for Token Construction ] + ]# + +# Reset the build token to be "empty" +proc resetBuild(lexer: var nlLexer) = + lexer.btok = emptyTok(lexer.pos) + +# "Finishes" the build token by setting various properties +proc finishBuild(lexer: var nlLexer) = + lexer.btok.lineNum = lexer.lineNum + lexer.btok.endPos = lexer.pos + lexer.btok.lit = lexer.line[lexer.btok.startPos ..< lexer.line.high] + +# Finish, return, and reset the build token +proc flushBuild(lexer: var nlLexer): nlTok = + finishBuild(lexer) + result = lexer.btok + resetBuild(lexer) + +# Is the build token "compatible" with the current char? (if not then flushbuild) +# NOTE: This implicitly handles Windows CRLF, Unix LF, & Mac OS CR compatability +# NOTE: since atEOL => '\n', but '\r' and '\n' are both tkEOL so they both flush. +proc isIncompatibleBuild(lexer: nlLexer): bool = + result = (lexer.cTKind != lexer.btok.kind or lexer.atEOL()) + +# Inherit the build token's type from current char +proc inherit(lexer: var nlLexer) = + lexer.btok.kind = lexer.cTKind + +# Add a character to the nlLexer's build token. +# Flushes and returns the build token if finished. +proc appendBuild(lexer: var nlLexer): Option[nlTok] = + # untyped build tokens inherit type immediately + if lexer.btok.isUntyped(): + lexer.inherit() + + # check character and build token compatability + if isIncompatibleBuild(lexer): + # flush old build token, the new one inherits type + result = some(flushBuild(lexer)) + lexer.inherit() + else: + result = none(nlTok) + +#[ ========================================= ] + | nlLexer Internal Char Streaming Interface ] + ]# + +# Read the next char in the stream +# NOTE: readChar raises IOError on error, returns \0 on EOF +proc readChar(lexer: var nlLexer): bool = + if lexer.atEOL(): + inc lexer.lineNum + # sets lexer.char to '\0' if EOF + lexer.char = lexer.stream.readChar() + lexer.cTKind = lexer.classifyTok() + lexer.line.add(lexer.char) + inc lexer.pos + result = lexer.atEOF() + +#[ ======================== + | nlLexer Public Interface + ]# + +# Read until EOL and return the current line +# NOTE: Does NOT update the lexer's state (unsafe) +# NOTE: ONLY call if a lex/parse error needs displaying +proc unsafeGetLine*(lexer: var nlLexer): string = + while not lexer.atEOL() and lexer.readChar(): + discard + result = lexer.line + +# Lexes and returns the next token in the "token stream" +# via repeatedly calling readChar() and appendBuild(). +# Returns a boolean indicating whether EOF has been reached. +# NOTE: access the new token via `stream.tok` +proc progress*(lexer: var nlLexer): bool = + # Return prematurely if already closed + if lexer.done: + return false + while true: + let + atEOF = lexer.readChar() + flushedTok = lexer.appendBuild() + newTokBuilt = flushedTok.isSome + + if newTokBuilt: + lexer.tok = flushedTok.get() + # if canProgress and atEOF: + # if atEOF: + # if newTokBuilt: + # stream.isClosed = true + # return newTokBuilt + # elif newTokBuilt: + # return true + if newTokBuilt: + if atEOF: + lexer.done = true + return true + elif atEOF: + return false diff --git a/src/noether/lexer/lstream.nim b/src/noether/lexer/lstream.nim deleted file mode 100644 index 034f48b..0000000 --- a/src/noether/lexer/lstream.nim +++ /dev/null @@ -1,66 +0,0 @@ -import std/streams -import std/options - -import tok -export tok - -type - # Character streaming for the nlTokStream - nlLStream = object - stream: Stream - # row/column positions - line*: string - lineNum*: Natural - pos*: Natural - -proc streamFile*(filename: string): FileStream = - result = newFileStream(filename, fmRead) - -proc streamString*(str: string): StringStream = - result = newStringStream(str) - -proc newLStream*(content: string, isFile: bool = false): nlLStream = - result = nlLStream( - stream: if isFile: streamFile(content) else: streamString(content), - line: "", - lineNum: Natural 0, - pos: Natural 0, - ) - -# Checks whether we've reached EOL -# NOTE: also checks if we've surpassed it (ie invalid lstream.pos) -proc atEOL*(lstream: nlLStream): bool = - result = (lstream.pos >= lstream.line.len - 1) - -# Checks whether we are EXACTLY at EOL, but not surpassed -proc exactlyEOL*(lstream: nlLStream): bool = - result = (lstream.pos == lstream.line.len - 1) - -# Checks whether we have surpassed EOL -proc outOfBounds*(lstream: nlLStream): bool = - result = (lstream.pos > lstream.line.len - 1) - -# Progress the lex stream to the next line (if available) -proc progLine*(lstream: var nlLStream): bool = - if lstream.stream.readLine(lstream.line): - inc lstream.lineNum - lstream.pos = Natural 0 - return true - return false - -# Progress the lex stream to the next character in the line -# forcefully (aka does NOT check if we reached EOL) -proc forceProgChar*(lstream: var nlLStream) = - inc lstream.pos - -# Progress the lex stream to the next character (if available) -proc progress*(lstream: var nlLStream): bool = - if not lstream.atEOL(): - lstream.forceProgChar() - result = true - else: - # attempt to progress next line past EOL - result = lstream.progLine() - -proc currChar*(lstream: nlLStream): char = - result = lstream.line[lstream.pos] diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index fb3067c..b19c341 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -1,40 +1,53 @@ -include toktype +type + # nlTokKind allows primitive nlToks to be typed, + # the nlTokKind enum should never be directly + # accessed. Use the interface in this file instead. + nlTokKind* = enum + tkNONE, # Placeholder Value + + tkEOF, # End of File + tkEOL, # End of Line (\0 --> EOL) + + tkWORD, # Alphanumeric token + tkSYMB, # Symbolic token + + tkLNFD, # \r \n Line-Feed + tkWTSP, # ' ' \t Whitespace + + # RESERVED SYMBOLS + tkLPAR, # ( Left Parenthesis + tkRPAR, # ) Right Parenthesis + tkLBRA, # { Left Brace + tkRBRA, # } Right Brace + tkLSQB, # [ Left Square Bracket + tkRSQB, # ] Right Square Bracket + # tkLANB, # < Left Angle Bracket + # tkRANB, # > Right Angle Bracket + tkSQUO, # ' Single Quotation Marking + tkDQUO, # " Double Quotation Marking + tkGRVA, # ` Grave Accent + tkHASH, # # Number Sign (Hashtag) type - nlTok* = object - tType*: nlTokType - lit*: string - lineNum*: Natural - startPos*: Natural - endPos*: Natural + nlTok* = tuple + # NOTE: nlTokBuilder will mutate nlTok.kind + kind: nlTokKind + lit: string + lineNum: int + startPos: int + endPos: int # Generates an "empty" nlTok with only a startPos, # all other fields are expected to be filled out later. -proc emptyTok*(startPos: int): nlTok = - result = nlTok( - tType: nlTokType.NONE, +proc emptyTok*(startPos: int): nlTok {.inline.} = + result = ( + kind: tkNONE, lit: "", - startPos: Natural startPos, + lineNum: 0, + startPos: startPos, + endPos: startPos, ) -# Checks if an nlTok has nlTokType.NONE -proc isTokUntyped*(tType: nlTokType): bool = - result = (tType == nlTokType.NONE) - -# Checks if an nlTok has nlTokType.EOL -proc isTokEOL*(tok: nlTok): bool = - result = (tok.tType == nlTokType.EOL) - - - -# This method is only used to convert null -# terminator nlToks into line-feed ones. -# Returns a copy of an nlTok, changing its type -proc tokTermToLineFeed*(tok: nlTok): nlTok = - result = nlTok( - tType: nlTokType.LNFD, - lit: tok.lit, - lineNum: tok.lineNum, - startPos: tok.startPos, - endPos: tok.endPos, - ) +# Checks if an nlTok has tkNONE +proc isUntyped*(tok: nlTok): bool {.inline.} = + result = (tok.kind == tkNONE) diff --git a/src/noether/lexer/tokbuilding.nim b/src/noether/lexer/tokbuilding.nim deleted file mode 100644 index 99022ee..0000000 --- a/src/noether/lexer/tokbuilding.nim +++ /dev/null @@ -1,86 +0,0 @@ -include lstream - -type - # Provides a stream-like interface for lexing nlToks - # Internally reliant on the functionality of nlLStream - nlTokStream* = object - lstream: nlLStream - build: nlTok # the build token - currTok*: nlTok # the current token - closed: bool # EOF + all tokens built - -# Generates an EOL token for the nlTokStream's state -proc EOLTok(tokStream: nlTokStream): nlTok = - result = nlTok( - tType: nlTokType.EOL, - lit: "\0", - lineNum: Natural tokStream.lstream.lineNum, - startPos: Natural tokStream.lstream.pos, - endPos: Natural tokStream.lstream.pos, - ) - -# Resets the build token to an "empty" nlTok -proc resetBuild(tokStream: var nlTokStream) = - tokStream.build = emptyTok(tokStream.lstream.pos) - -# Completes a token generated by emptyTok() -# based on the nlTokStream's nlLStream's -# current line and character positions -proc finishBuild(ts: var nlTokStream) = - ts.build.lineNum = Natural ts.lstream.lineNum - ts.build.endPos = Natural ts.lstream.pos - ts.build.lit = ts.lstream.line[ts.build.startPos ..< ts.build.endPos] - -# Returns the nlTokStream's build token and -# empties the build token's contents. -proc flushBuild(tokStream: var nlTokStream): nlTok = - finishBuild(tokStream) - result = tokStream.build - resetBuild(tokStream) - -# Returns whether the build token has a set type yet. -# This indicates that the build token should inherit -# the nlTokType of the nlLStream's next character. -proc isUntypedBuild(tokStream: nlTokStream): bool = - result = isTokUntyped(tokStream.build.tType) - -# Check whether an nlTokType is "compatible" with the build token. -# NOTE: flushBuild() should be called when an incompatible token is discovered. -proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool = - result = (tType == tokStream.build.tType) - -# Add a character to the nlTokStream's build token. -# Flushes and returns the build token if "fully built", -# and a boolean indicating whether the nlTokStream can progress. -proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = - # the "pos > EOL" invalid state is used intentionally - # to indicate all tokens have been built, and return EOL Token - if tokStream.lstream.outOfBounds(): - buildTok = some(EOLTok(tokStream)) - return true # can progress once more - - let tType = getTokType(tokStream.lstream.currChar()) - # untyped build tokens must inherited immediately - if isUntypedBuild(tokStream): - tokStream.build.tType = tType - - # check if EOL reached - if tokStream.lstream.atEOL(): - # flush old build token, the new one can be left untyped - let compatible = isCompatibleBuild(tokStream, tType) - result = false # DO NOT PROGRESS - if compatible: - # force the lstream into an invalid state by progressing beyond EOL - # we can then detect this state on the next progBuild and return - # an EOL character (very unsafe implementation but it works well) - tokStream.lstream.forceProgChar() - buildTok = some(flushBuild(tokStream)) - # check character and build token compatability - elif not isCompatibleBuild(tokStream, tType): - # flush old build token, the new one inherits type - buildTok = some(flushBuild(tokStream)) - tokStream.build.tType = tType - result = true # can progress - else: - buildTok = none(nlTok) - result = true # can progress diff --git a/src/noether/lexer/tokkind.nim b/src/noether/lexer/tokkind.nim new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/noether/lexer/tokkind.nim @@ -0,0 +1 @@ + diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim deleted file mode 100644 index c3fb3f2..0000000 --- a/src/noether/lexer/tokstream.nim +++ /dev/null @@ -1,53 +0,0 @@ -include tokbuilding - -# Initialises a new nlTokStream on a string or file -proc newTokStream*(content: string, isFile: bool = false): nlTokStream = - result = nlTokStream( - lstream: newLStream(content, isFile=isFile), - closed: false, - ) - # 1. initialise an empty build token - # 2. progress to the first line - result.resetBuild() - discard result.lstream.progLine() - -# Defines a short-hand notation for getting the current line -proc currLine*(tokStream: nlTokStream): string = - result = tokStream.lstream.line - -# Reimplements nlLStream.progress() for nlTokStream -# to account for additional structure (ie the build token) -proc progChar(tokStream: var nlTokStream): bool = - if not tokStream.lstream.atEOL(): - tokStream.lstream.forceProgChar() - result = true - else: - # attempt to progress to next line past EOL - result = tokStream.lstream.progLine() - tokStream.resetBuild() - -# Generates and sets (by reference) the next token in the stream, -# via repeatedly calling progBuild() and progChar(). -# Returns a boolean indicating whether EOF has been reached. -# NOTE: progBuild adds lstream's current char to the build token -# NOTE: progChar progresses to lstream's next char -proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool = - # Return prematurely if already closed - if tokStream.closed: - return false - while true: - var flushedTok: Option[nlTok] - let - canProgress = tokStream.progBuild(flushedTok) - buildComplete = flushedTok.isSome - # canProgress & EOF reached => no more tokens to build :) - # NOTE: reachedEOF and not canProgress => more tokens unwrapping - if buildComplete: - # return the finished build token, and save it as the current token - tok = flushedTok.get() - tokStream.currTok = tok - if canProgress and not tokStream.progChar(): - tokStream.closed = true - return buildComplete - elif buildComplete: - return true diff --git a/src/noether/lexer/toktype.nim b/src/noether/lexer/toktype.nim deleted file mode 100644 index 0f40023..0000000 --- a/src/noether/lexer/toktype.nim +++ /dev/null @@ -1,54 +0,0 @@ -type - # nlTokType allows primitive nlToks to be typed, - # the nlTokType enum should never be directly - # accessed. Use the interface in this file instead. - nlTokType* = enum - NONE, # Placeholder Value - EOF, # End of File - EOL, # End of Line (\0 --> EOL) - WORD, # Alphanumeric token - SYMB, # Symbolic token - LNFD, # \r \n Line-Feed - WTSP, # ' ' \t Whitespace - LPAR, # ( Left Parenthesis - RPAR, # ) Right Parenthesis - LBRA, # { Left Brace - RBRA, # } Right Brace - LSQB, # [ Left Square Bracket - RSQB, # ] Right Square Bracket - # LANB, # < Left Angle Bracket - # RANB, # > Right Angle Bracket - SQUO, # ' Single Quotation Marking - DQUO, # " Double Quotation Marking - GRVA, # ` Grave Accent - HASH, # # Number Sign (Hashtag) - -# Classifies a character to its nlTokType -proc getTokType*(c: char): nlTokType = - case c: - of '\0', '\r', '\n': - result = nlTokType.EOL - of ' ', '\t': - result = nlTokType.WTSP - of '(': - result = nlTokType.LPAR - of ')': - result = nlTokType.RPAR - of '{': - result = nlTokType.LBRA - of '}': - result = nlTokType.RBRA - of '[': - result = nlTokType.LSQB - of ']': - result = nlTokType.RSQB - of '\'': - result = nlTokType.SQUO - of '\"': - result = nlTokType.DQUO - of '`': - result = nlTokType.GRVA - of '#': - result = nlTokType.HASH - else: - result = nlTokType.WORD diff --git a/src/noether/lib/err.nim b/src/noether/lib/err.nim new file mode 100644 index 0000000..ec4c848 --- /dev/null +++ b/src/noether/lib/err.nim @@ -0,0 +1 @@ +proc echoErrorHeader(): = diff --git a/src/noether/lib/io.nim b/src/noether/lib/io.nim new file mode 100644 index 0000000..c7eb0eb --- /dev/null +++ b/src/noether/lib/io.nim @@ -0,0 +1,7 @@ +import std/streams + +proc streamFile*(filename: string): Stream {.inline.} = + result = newFileStream(filename, fmRead) + +proc streamString*(str: string): Stream {.inline.} = + result = newStringStream(str) diff --git a/src/noether/parser/err.nim b/src/noether/parser/err.nim new file mode 100644 index 0000000..9cc5a73 --- /dev/null +++ b/src/noether/parser/err.nim @@ -0,0 +1,8 @@ +#[ Error codes and messaging directly associated with + | nlParser and its procedures is written here. + | General error functionality is in src/noether/lib/err.nim + ]# + +import parser + + diff --git a/src/noether/parser/nodes.nim b/src/noether/parser/nodes.nim index 23cf742..bd737c6 100644 --- a/src/noether/parser/nodes.nim +++ b/src/noether/parser/nodes.nim @@ -1,18 +1,44 @@ +import std/options from ../lexer/tok import nlTok -# from ../lexer/tokstream import type - # NOTE: by the end of parsing NO nodes should - # NOTE: have nlNodeType.NONE - nlNodeType* = enum - NONE, # Placeholder Value - TERM, # Indicates the tree has terminated - STRL, # String Literal - CHRL, # Character Literal + # NOTE: by the end of parsing NO nodes should have nkNone + nlNodeKind* = enum + nkNone, # Placeholder Value + + nkStrLit, # String Literal + nkChrLit, # Character Literal + + # NOTE: always check parent != nil when traversing the tree nlNode* {.acyclic.} = ref object of RootObj - nType*: nlNodeType - toks*: seq[nlTok] # nodes store the tokens that build them - # left, right: nlNode + nKind*: nlNodeKind + toks*: seq[nlTok] # nodes (may) store the tokens that build them + parent*: nlNode + + # Purely abstract type that all nlNode objects + # with children are expected to inherit from. + nlBranchNode* {.acyclic.} = ref object of nlNode + child: UncheckedArray[nlNode] + + nlBiNode* {.acyclic.} = ref object of nlBranchNode + +proc childCount*(node: nlNode): int {.inline.} = 0 +proc childCount*(node: nlBiNode): int {.inline.} = 2 + +proc getChild*(node: nlNode, i: int): Option[nlNode] {.inline.} = + result = none(nlNode) +proc getChild*(node: nlBranchNode, i: int): Option[nlNode] {.inline.} = + result = some(node.child[i]) + +proc newNode*(nKind: nlNodeKind): nlNode = + result = nlNode( + nKind: nKind, + ) + +proc newBiNode*(nKind: nlNodeKind): nlNode = + result = nlBiNode( + nKind: nKind, + ) # Short-hand way of appending a token to a node's token sequence proc addTok*(node: nlNode, tok: nlTok) = diff --git a/src/noether/parser/parse.nim b/src/noether/parser/parse.nim new file mode 100644 index 0000000..0ecd14b --- /dev/null +++ b/src/noether/parser/parse.nim @@ -0,0 +1,58 @@ +import strutils +include parser + +# NOTE: Matching between two tokens will fill `node` with everything +# NOTE: between those two tokens EXCLUDING the two tokens themselves. +proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat = + result = greed( + parser, + satisfyMatch(matchType), + ) +proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = + result = greedLine( + parser, + satisfyMatch(matchType), + ) + +proc parseStrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatchLine(tkDQUO) + +proc parseChrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatchLine(tkSQUO) + +proc parseStmt(parser: var nlParser): nlParseStat = + while parser.progressStream(): + echo "----- Current Token: ", parser.currTok + case parser.currTok.tKind + of tkDQUO: + # Attempt to parse string literal + if parser.parseStrLit() != nlParseStat.OK: + echo "Unmatched Double Quotation! Malformed String Literal" + echo parser.line + echo repeat(" ", parser.currTok.startPos), '^', '\n' + else: + echo "Parsed String Literal" + echo parser.bnode[], '\n' + of tkSQUO: + # Attempt to parse string literal + if parser.parseChrLit() != nlParseStat.OK: + echo "Unmatched Single Quotation! Malformed Character Literal" + echo parser.line + echo repeat(" ", parser.currTok.startPos), '^', '\n' + else: + echo "Parsed Character Literal" + echo parser.bnode[], '\n' + of tkEOL: + # TODO: handle this case, don't just discard + discard + else: + echo "blah blah unhandled case\n" + result = nlParseStat.OK + +# Attempt to parse nlAST from nlTokStream +proc parse*(tokStream: var nlTokStream): nlAST = + var parser = newParser(tokStream) + echo ' ' + discard parser.parseStmt() + + result = parser.ast diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim index 0598075..7047e6d 100644 --- a/src/noether/parser/parser.nim +++ b/src/noether/parser/parser.nim @@ -1,63 +1,90 @@ -import strutils -include parseutil +import nodes +import ../lexer/lex -# NOTE: Matching between two tokens will fill `node` with everything -# NOTE: between those two tokens EXCLUDING the two tokens themselves. -proc parseMatch(tokStream: var nlTokStream, - node: var nlNode, - matchType: nlTokType): nlParseStat = - result = greed( - tokStream, - node.toks, - satisfyMatch(matchType), - ) -proc parseMatchLine(tokStream: var nlTokStream, - node: var nlNode, - matchType: nlTokType): nlParseStat = - result = greed( - tokStream, - node.toks, - satisfyMatchEOL(matchType), +type + # NOTE1: Values above MARKER_FAIL indicate a failed state + # NOTE2: nlParseStat is marked pure out of habit that's all + nlParseStat* {.pure.} = enum + OK, + MARKER_FAIL, + UNMATCHED, + TOOBIG, + + nlAST* = object + root: nlNode + + nlParser* = object + stream: nlTokStream + ast: nlAST + # the "build node" is a reference to the AST node + # the parser is currently modifying/building from + # NOTE: bnode changes frequently, it is NOT the root + bnode: nlNode + # flag indicating whether the parser is at + # the start of a new line (aka checking indentation) + inIndent: bool + + +proc `*`(stat: nlParseStat, b: bool): nlParseStat = + result = if b: stat else: nlParseStat.OK + +proc isFail*(stat: nlParseStat): bool = + result = (stat >= nlParseStat.MARKER_FAIL) + +proc newParser*(tokStream: var nlTokStream): nlParser = + let rootNode = newNode(nkNone) + result = nlParser( + stream: tokStream, + ast: nlAST( + root: rootNode + ), + bnode: rootNode, ) -proc parseStrL(tokStream: var nlTokStream, node: var nlNode): nlParseStat = - node = nlNode( - nType: nlNodeType.STRL - ) - node.addTok(tokStream.currTok) - result = nlParseStat.UNCLOSED * not greedEOL(tokStream, node.toks, nlTokType.DQUO) +# Exposes a subset of the nlTokStream interface +proc currTok(parser: var nlParser): nlTok = parser.stream.currTok +proc line(parser: var nlParser): string = parser.stream.line -proc parseChrL(tokStream: var nlTokStream, node: var nlNode): bool = - node = nlNode( - nType: nlNodeType.CHRL - ) - node.addTok(tokStream.currTok) - # TWO ERRORS ARE POSSIBLE, 1: content too big, 2: never closed - result = greedEOL(tokStream, node.toks, nlTokType.SQUO) +# Extends upon the functionality of nlTokStream.progress() +proc progressStream*(parser: var nlParser): bool = + result = parser.stream.progress() + if result and parser.currTok.tKind == tkEOL: + parser.inIndent = true + if -# Attempt to form an nlAST from a nlTokStream -proc parse*(tokStream: var nlTokStream): nlNode = - var tok: nlTok - var node: nlNode - while tokStream.nextTok(tok): - case tok.tType: - of nlTokType.DQUO: - # Attempt to parse string literal - if not parseStrL(tokStream, node): - echo "Unmatched Double Quotation! Malformed String Literal" - echo tokStream.currLine() - echo repeat(" ", tok.startPos), '^' - else: - echo "Parsed String Literal" - echo node[] - of nlTokType.SQUO: - # Attempt to parse string literal - if not parseChrL(tokStream, node): - echo "Unmatched Single Quotation! Malformed Character Literal" - echo tokStream.currLine() - echo repeat(" ", tok.startPos), '^' - else: - echo "Parsed String Literal" - echo node[] - else: - echo "blah blah unhandled case" +proc setNewLine() + +#[ "Greed" refers to something I mentioned in my discussion on + | Noether's grammar (in an EBNF-like language). Greed just + | means "everything until a condition is satisified". + | That condition should be supplied by a Nim procedural type. + ]# + +# Greed will consume anything until a condition is satisfied +# Returns false if the greed was never satisfied (OMG!!) +proc greed(parser: var nlParser, + satisfy: proc(tok: nlTok): bool): nlParseStat = + while parser.progressStream(): + if satisfy(parser.currTok): + return nlParseStat.OK + # NOTE: the matched token is currently excluded + parser.bnode.addTok(parser.currTok) + result = nlParseStat.UNMATCHED + +proc greedLine(parser: var nlParser, + satisfy: proc(tok: nlTok): bool): nlParseStat = + while parser.progressStream(): + if satisfy(parser.currTok): + return nlParseStat.OK + # NOTE: the matched token is currently excluded + parser.bnode.addTok(parser.currTok) + if parser.currTok.tKind == tkEOL: + return nlParseStat.UNMATCHED + result = nlParseStat.UNMATCHED + +#[ Templates for generating greed satisfying conditions. + ]# + +# Satisfied if it finds nlTok of type matchType +template satisfyMatch(matchType: nlTokKind): untyped = + (proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType)) diff --git a/src/noether/parser/parseutil.nim b/src/noether/parser/parseutil.nim deleted file mode 100644 index 6fa1243..0000000 --- a/src/noether/parser/parseutil.nim +++ /dev/null @@ -1,58 +0,0 @@ -import nodes -import ../lexer/tokstream - -type - # NOTE: Values above __FAIL__ indicate a failed state - nlParseStat* = enum - OK, - __FAIL__, - MIDAS, # Greedy search was never satisfied - UNMATCHED, - TOOBIG, - -proc `*`(stat: nlParseStat, b: bool): nlParseStat = - result = if b: stat else: nlParseStat.OK - -proc isFail*(stat: nlParseStat): bool = - result = (stat >= nlParseStat.__FAIL__) - - -#[ "Greed" refers to something I mentioned in my discussion on - | Noether's grammar (in an EBNF-like language). Greed just - | means "everything until a condition is satisified". - | That condition should be supplied by a Nim procedural type. - ]# - -# Greed will consume anything until a condition is satisfied -# Returns false if the greed was never satisfied (OMG!!) -proc greed(tokStream: var nlTokStream, - toks: var seq[nlTok], - satisfy: proc(tok: nlTok): bool, - ): nlParseStat = - var tok: nlTok - while tokStream.nextTok(tok): - toks.add(tok) - if satisfy(tok): - return nlParseStat.OK - result = nlParseStat.UNMATCHED - -proc greedLine(tokStream: var nlTokStream, - toks: var seq[nlTok], - satisfy: proc(tok: nlTok): bool): nlParseStat = - var tok: nlTok - while tokStream.nextTok(tok): - toks.add(tok) - if satisfy(tok): - return true - result = - -#[ Templates for generating greed satisfying conditions. - ]# - -# Satisfied if it finds nlTok of type matchType -template satisfyMatch(matchType: nlTokType) = - proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType) - -# Satisfied if it finds nlTok of type matchType or EOL reached -template satisfyMatchEOL(matchType: nlTokType) = - proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType or tok.tType == nlTokType.EOL)