diff --git a/.gitignore b/.gitignore index 814ced8..0a37b21 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,2 @@ __pycache__/ bin/ - -# TEMP: used while debugging -# (and cause I'm super duper lazy) -src/nlx diff --git a/lang/demo/single_toks.no b/lang/demo/single_toks.no deleted file mode 100644 index 683090a..0000000 --- a/lang/demo/single_toks.no +++ /dev/null @@ -1,2 +0,0 @@ -[a]b(#) -(c)d[e] diff --git a/py/m.py b/py/m.py index 9576f4c..e2b60c5 100644 --- a/py/m.py +++ b/py/m.py @@ -2,7 +2,7 @@ import sys import readline -from noether.lib.math import * +from noether.math import * from noether.cli import * diff --git a/src/ddemo b/src/ddemo deleted file mode 100755 index af30039..0000000 --- a/src/ddemo +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -set -e - -if [ -z "$1" ]; then - echo "Usage: ddemo DEMOFILE" - echo "Demo files are located in lang/demo" - exit 1 -fi - -nim c nlx.nim -./nlx ../lang/demo/$1 diff --git a/src/nlx.nim b/src/nlx.nim index e145943..4b0c678 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,29 +1,22 @@ import os -import noether/lib/io -import noether/lexer/lex -import noether/parser/parse +import noether/lexer/tok +import noether/lexer/tokstream +import noether/parser/parser -{.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" - # really lazy argparse implementation (temporary) - let - paramC = paramCount() - cmd = if paramC > 2: paramStr 1 - else: "tok" + if paramCount() > 0: + let filename = paramStr(1) + var tokStream = newTokStream(filename, isFile=true) + + # # DumpTok + # var tok: nlTok + # while tokStream.nextTok(tok): + # echo tok - var stream = if paramC > 0: streamFile(paramStr paramC) - else: streamString(readAll stdin) - - var lexer = newLexer(stream) - if cmd == "tok": - # DumpTok - while lexer.progress(): - echo lexer.tok - elif cmd == "tree": - discard # DumpTree - # discard parse(tokStream) + discard parse(tokStream) + else: - echo "Usage: nlx [tok|tree] \n demo files are accessible at lang/demo" + echo "usage: nlx filename" diff --git a/src/noether.nim b/src/noether.nim index 83c2b1b..509b123 100644 --- a/src/noether.nim +++ b/src/noether.nim @@ -2,4 +2,4 @@ # uses this file as the main entry point of the application. when isMainModule: - echo "Noether Lang v0.1.0" + echo "Noether Lang" diff --git a/src/noether/lexer/lex.nim b/src/noether/lexer/lex.nim deleted file mode 100644 index 8f81b86..0000000 --- a/src/noether/lexer/lex.nim +++ /dev/null @@ -1,178 +0,0 @@ -import - streams, - options - -import tok -export tok - -type - # Abstracts the "building process" (lexing) - # of nlTok objects from a given Stream of characters. - nlLexer* = object - stream: Stream - done*: bool - # store current token and upcoming (build) token - tok*: nlTok # current token - btok: nlTok # the build token - # save char and pos and its token type - char: char - cTKind: nlTokKind - # track line number, line content, etc - line: string - lineNum: int - pos: int - -proc atEOL(lexer: nlLexer): bool {.inline.} = - result = (lexer.char == '\n') -proc atEOF(lexer: nlLexer): bool {.inline.} = - result = (lexer.char == '\0') - -# Initialise a new lexer -proc newLexer*(stream: var Stream): nlLexer = - result = nlLexer( - stream: stream, - done: false, - tok: emptyTok(0), - btok: emptyTok(0), - line: "", - lineNum: 1, - pos: -1, # after initial readChar this -> 0 - char: '\0', # use \0 as initial invalid char - cTKind: tkNONE, - ) - -# Classifies the current character to its nlTokKind -proc classifyTok*(lexer: nlLexer): nlTokKind {.inline.} = - case lexer.char: - of '\0': - result = tkEOF - of '\r', '\n': - result = tkEOL - of ' ', '\t': - result = tkWTSP - of '(': - result = tkLPAR - of ')': - result = tkRPAR - of '{': - result = tkLBRA - of '}': - result = tkRBRA - of '[': - result = tkLSQB - of ']': - result = tkRSQB - of '\'': - result = tkSQUO - of '\"': - result = tkDQUO - of '`': - result = tkGRVA - of '#': - result = tkHASH - else: - result = tkWORD - - -#[ ====================================================== ] - | nlLexer Internal Interface for Token Construction ] - ]# - -# Reset the build token to be "empty" -proc resetBuild(lexer: var nlLexer) = - lexer.btok = emptyTok(lexer.pos) - -# "Finishes" the build token by setting various properties -proc finishBuild(lexer: var nlLexer) = - lexer.btok.lineNum = lexer.lineNum - lexer.btok.endPos = lexer.pos - lexer.btok.lit = lexer.line[lexer.btok.startPos ..< lexer.line.high] - -# Finish, return, and reset the build token -proc flushBuild(lexer: var nlLexer): nlTok = - finishBuild(lexer) - result = lexer.btok - resetBuild(lexer) - -# Is the build token "compatible" with the current char? (if not then flushbuild) -# NOTE: This implicitly handles Windows CRLF, Unix LF, & Mac OS CR compatability -# NOTE: since atEOL => '\n', but '\r' and '\n' are both tkEOL so they both flush. -proc isIncompatibleBuild(lexer: nlLexer): bool = - result = (lexer.cTKind != lexer.btok.kind or lexer.atEOL()) - -# Inherit the build token's type from current char -proc inherit(lexer: var nlLexer) = - lexer.btok.kind = lexer.cTKind - -# Add a character to the nlLexer's build token. -# Flushes and returns the build token if finished. -proc appendBuild(lexer: var nlLexer): Option[nlTok] = - # untyped build tokens inherit type immediately - if lexer.btok.isUntyped(): - lexer.inherit() - - # check character and build token compatability - if isIncompatibleBuild(lexer): - # flush old build token, the new one inherits type - result = some(flushBuild(lexer)) - lexer.inherit() - else: - result = none(nlTok) - -#[ ========================================= ] - | nlLexer Internal Char Streaming Interface ] - ]# - -# Read the next char in the stream -# NOTE: readChar raises IOError on error, returns \0 on EOF -proc readChar(lexer: var nlLexer): bool = - if lexer.atEOL(): - inc lexer.lineNum - # sets lexer.char to '\0' if EOF - lexer.char = lexer.stream.readChar() - lexer.cTKind = lexer.classifyTok() - lexer.line.add(lexer.char) - inc lexer.pos - result = lexer.atEOF() - -#[ ======================== - | nlLexer Public Interface - ]# - -# Read until EOL and return the current line -# NOTE: Does NOT update the lexer's state (unsafe) -# NOTE: ONLY call if a lex/parse error needs displaying -proc unsafeGetLine*(lexer: var nlLexer): string = - while not lexer.atEOL() and lexer.readChar(): - discard - result = lexer.line - -# Lexes and returns the next token in the "token stream" -# via repeatedly calling readChar() and appendBuild(). -# Returns a boolean indicating whether EOF has been reached. -# NOTE: access the new token via `stream.tok` -proc progress*(lexer: var nlLexer): bool = - # Return prematurely if already closed - if lexer.done: - return false - while true: - let - atEOF = lexer.readChar() - flushedTok = lexer.appendBuild() - newTokBuilt = flushedTok.isSome - - if newTokBuilt: - lexer.tok = flushedTok.get() - # if canProgress and atEOF: - # if atEOF: - # if newTokBuilt: - # stream.isClosed = true - # return newTokBuilt - # elif newTokBuilt: - # return true - if newTokBuilt: - if atEOF: - lexer.done = true - return true - elif atEOF: - return false diff --git a/src/noether/lexer/lstream.nim b/src/noether/lexer/lstream.nim new file mode 100644 index 0000000..034f48b --- /dev/null +++ b/src/noether/lexer/lstream.nim @@ -0,0 +1,66 @@ +import std/streams +import std/options + +import tok +export tok + +type + # Character streaming for the nlTokStream + nlLStream = object + stream: Stream + # row/column positions + line*: string + lineNum*: Natural + pos*: Natural + +proc streamFile*(filename: string): FileStream = + result = newFileStream(filename, fmRead) + +proc streamString*(str: string): StringStream = + result = newStringStream(str) + +proc newLStream*(content: string, isFile: bool = false): nlLStream = + result = nlLStream( + stream: if isFile: streamFile(content) else: streamString(content), + line: "", + lineNum: Natural 0, + pos: Natural 0, + ) + +# Checks whether we've reached EOL +# NOTE: also checks if we've surpassed it (ie invalid lstream.pos) +proc atEOL*(lstream: nlLStream): bool = + result = (lstream.pos >= lstream.line.len - 1) + +# Checks whether we are EXACTLY at EOL, but not surpassed +proc exactlyEOL*(lstream: nlLStream): bool = + result = (lstream.pos == lstream.line.len - 1) + +# Checks whether we have surpassed EOL +proc outOfBounds*(lstream: nlLStream): bool = + result = (lstream.pos > lstream.line.len - 1) + +# Progress the lex stream to the next line (if available) +proc progLine*(lstream: var nlLStream): bool = + if lstream.stream.readLine(lstream.line): + inc lstream.lineNum + lstream.pos = Natural 0 + return true + return false + +# Progress the lex stream to the next character in the line +# forcefully (aka does NOT check if we reached EOL) +proc forceProgChar*(lstream: var nlLStream) = + inc lstream.pos + +# Progress the lex stream to the next character (if available) +proc progress*(lstream: var nlLStream): bool = + if not lstream.atEOL(): + lstream.forceProgChar() + result = true + else: + # attempt to progress next line past EOL + result = lstream.progLine() + +proc currChar*(lstream: nlLStream): char = + result = lstream.line[lstream.pos] diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index b19c341..fb3067c 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -1,53 +1,40 @@ -type - # nlTokKind allows primitive nlToks to be typed, - # the nlTokKind enum should never be directly - # accessed. Use the interface in this file instead. - nlTokKind* = enum - tkNONE, # Placeholder Value - - tkEOF, # End of File - tkEOL, # End of Line (\0 --> EOL) - - tkWORD, # Alphanumeric token - tkSYMB, # Symbolic token - - tkLNFD, # \r \n Line-Feed - tkWTSP, # ' ' \t Whitespace - - # RESERVED SYMBOLS - tkLPAR, # ( Left Parenthesis - tkRPAR, # ) Right Parenthesis - tkLBRA, # { Left Brace - tkRBRA, # } Right Brace - tkLSQB, # [ Left Square Bracket - tkRSQB, # ] Right Square Bracket - # tkLANB, # < Left Angle Bracket - # tkRANB, # > Right Angle Bracket - tkSQUO, # ' Single Quotation Marking - tkDQUO, # " Double Quotation Marking - tkGRVA, # ` Grave Accent - tkHASH, # # Number Sign (Hashtag) +include toktype type - nlTok* = tuple - # NOTE: nlTokBuilder will mutate nlTok.kind - kind: nlTokKind - lit: string - lineNum: int - startPos: int - endPos: int + nlTok* = object + tType*: nlTokType + lit*: string + lineNum*: Natural + startPos*: Natural + endPos*: Natural # Generates an "empty" nlTok with only a startPos, # all other fields are expected to be filled out later. -proc emptyTok*(startPos: int): nlTok {.inline.} = - result = ( - kind: tkNONE, +proc emptyTok*(startPos: int): nlTok = + result = nlTok( + tType: nlTokType.NONE, lit: "", - lineNum: 0, - startPos: startPos, - endPos: startPos, + startPos: Natural startPos, ) -# Checks if an nlTok has tkNONE -proc isUntyped*(tok: nlTok): bool {.inline.} = - result = (tok.kind == tkNONE) +# Checks if an nlTok has nlTokType.NONE +proc isTokUntyped*(tType: nlTokType): bool = + result = (tType == nlTokType.NONE) + +# Checks if an nlTok has nlTokType.EOL +proc isTokEOL*(tok: nlTok): bool = + result = (tok.tType == nlTokType.EOL) + + + +# This method is only used to convert null +# terminator nlToks into line-feed ones. +# Returns a copy of an nlTok, changing its type +proc tokTermToLineFeed*(tok: nlTok): nlTok = + result = nlTok( + tType: nlTokType.LNFD, + lit: tok.lit, + lineNum: tok.lineNum, + startPos: tok.startPos, + endPos: tok.endPos, + ) diff --git a/src/noether/lexer/tokbuilding.nim b/src/noether/lexer/tokbuilding.nim new file mode 100644 index 0000000..99022ee --- /dev/null +++ b/src/noether/lexer/tokbuilding.nim @@ -0,0 +1,86 @@ +include lstream + +type + # Provides a stream-like interface for lexing nlToks + # Internally reliant on the functionality of nlLStream + nlTokStream* = object + lstream: nlLStream + build: nlTok # the build token + currTok*: nlTok # the current token + closed: bool # EOF + all tokens built + +# Generates an EOL token for the nlTokStream's state +proc EOLTok(tokStream: nlTokStream): nlTok = + result = nlTok( + tType: nlTokType.EOL, + lit: "\0", + lineNum: Natural tokStream.lstream.lineNum, + startPos: Natural tokStream.lstream.pos, + endPos: Natural tokStream.lstream.pos, + ) + +# Resets the build token to an "empty" nlTok +proc resetBuild(tokStream: var nlTokStream) = + tokStream.build = emptyTok(tokStream.lstream.pos) + +# Completes a token generated by emptyTok() +# based on the nlTokStream's nlLStream's +# current line and character positions +proc finishBuild(ts: var nlTokStream) = + ts.build.lineNum = Natural ts.lstream.lineNum + ts.build.endPos = Natural ts.lstream.pos + ts.build.lit = ts.lstream.line[ts.build.startPos ..< ts.build.endPos] + +# Returns the nlTokStream's build token and +# empties the build token's contents. +proc flushBuild(tokStream: var nlTokStream): nlTok = + finishBuild(tokStream) + result = tokStream.build + resetBuild(tokStream) + +# Returns whether the build token has a set type yet. +# This indicates that the build token should inherit +# the nlTokType of the nlLStream's next character. +proc isUntypedBuild(tokStream: nlTokStream): bool = + result = isTokUntyped(tokStream.build.tType) + +# Check whether an nlTokType is "compatible" with the build token. +# NOTE: flushBuild() should be called when an incompatible token is discovered. +proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool = + result = (tType == tokStream.build.tType) + +# Add a character to the nlTokStream's build token. +# Flushes and returns the build token if "fully built", +# and a boolean indicating whether the nlTokStream can progress. +proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = + # the "pos > EOL" invalid state is used intentionally + # to indicate all tokens have been built, and return EOL Token + if tokStream.lstream.outOfBounds(): + buildTok = some(EOLTok(tokStream)) + return true # can progress once more + + let tType = getTokType(tokStream.lstream.currChar()) + # untyped build tokens must inherited immediately + if isUntypedBuild(tokStream): + tokStream.build.tType = tType + + # check if EOL reached + if tokStream.lstream.atEOL(): + # flush old build token, the new one can be left untyped + let compatible = isCompatibleBuild(tokStream, tType) + result = false # DO NOT PROGRESS + if compatible: + # force the lstream into an invalid state by progressing beyond EOL + # we can then detect this state on the next progBuild and return + # an EOL character (very unsafe implementation but it works well) + tokStream.lstream.forceProgChar() + buildTok = some(flushBuild(tokStream)) + # check character and build token compatability + elif not isCompatibleBuild(tokStream, tType): + # flush old build token, the new one inherits type + buildTok = some(flushBuild(tokStream)) + tokStream.build.tType = tType + result = true # can progress + else: + buildTok = none(nlTok) + result = true # can progress diff --git a/src/noether/lexer/tokkind.nim b/src/noether/lexer/tokkind.nim deleted file mode 100644 index 8b13789..0000000 --- a/src/noether/lexer/tokkind.nim +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim new file mode 100644 index 0000000..c3fb3f2 --- /dev/null +++ b/src/noether/lexer/tokstream.nim @@ -0,0 +1,53 @@ +include tokbuilding + +# Initialises a new nlTokStream on a string or file +proc newTokStream*(content: string, isFile: bool = false): nlTokStream = + result = nlTokStream( + lstream: newLStream(content, isFile=isFile), + closed: false, + ) + # 1. initialise an empty build token + # 2. progress to the first line + result.resetBuild() + discard result.lstream.progLine() + +# Defines a short-hand notation for getting the current line +proc currLine*(tokStream: nlTokStream): string = + result = tokStream.lstream.line + +# Reimplements nlLStream.progress() for nlTokStream +# to account for additional structure (ie the build token) +proc progChar(tokStream: var nlTokStream): bool = + if not tokStream.lstream.atEOL(): + tokStream.lstream.forceProgChar() + result = true + else: + # attempt to progress to next line past EOL + result = tokStream.lstream.progLine() + tokStream.resetBuild() + +# Generates and sets (by reference) the next token in the stream, +# via repeatedly calling progBuild() and progChar(). +# Returns a boolean indicating whether EOF has been reached. +# NOTE: progBuild adds lstream's current char to the build token +# NOTE: progChar progresses to lstream's next char +proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool = + # Return prematurely if already closed + if tokStream.closed: + return false + while true: + var flushedTok: Option[nlTok] + let + canProgress = tokStream.progBuild(flushedTok) + buildComplete = flushedTok.isSome + # canProgress & EOF reached => no more tokens to build :) + # NOTE: reachedEOF and not canProgress => more tokens unwrapping + if buildComplete: + # return the finished build token, and save it as the current token + tok = flushedTok.get() + tokStream.currTok = tok + if canProgress and not tokStream.progChar(): + tokStream.closed = true + return buildComplete + elif buildComplete: + return true diff --git a/src/noether/lexer/toktype.nim b/src/noether/lexer/toktype.nim new file mode 100644 index 0000000..0f40023 --- /dev/null +++ b/src/noether/lexer/toktype.nim @@ -0,0 +1,54 @@ +type + # nlTokType allows primitive nlToks to be typed, + # the nlTokType enum should never be directly + # accessed. Use the interface in this file instead. + nlTokType* = enum + NONE, # Placeholder Value + EOF, # End of File + EOL, # End of Line (\0 --> EOL) + WORD, # Alphanumeric token + SYMB, # Symbolic token + LNFD, # \r \n Line-Feed + WTSP, # ' ' \t Whitespace + LPAR, # ( Left Parenthesis + RPAR, # ) Right Parenthesis + LBRA, # { Left Brace + RBRA, # } Right Brace + LSQB, # [ Left Square Bracket + RSQB, # ] Right Square Bracket + # LANB, # < Left Angle Bracket + # RANB, # > Right Angle Bracket + SQUO, # ' Single Quotation Marking + DQUO, # " Double Quotation Marking + GRVA, # ` Grave Accent + HASH, # # Number Sign (Hashtag) + +# Classifies a character to its nlTokType +proc getTokType*(c: char): nlTokType = + case c: + of '\0', '\r', '\n': + result = nlTokType.EOL + of ' ', '\t': + result = nlTokType.WTSP + of '(': + result = nlTokType.LPAR + of ')': + result = nlTokType.RPAR + of '{': + result = nlTokType.LBRA + of '}': + result = nlTokType.RBRA + of '[': + result = nlTokType.LSQB + of ']': + result = nlTokType.RSQB + of '\'': + result = nlTokType.SQUO + of '\"': + result = nlTokType.DQUO + of '`': + result = nlTokType.GRVA + of '#': + result = nlTokType.HASH + else: + result = nlTokType.WORD diff --git a/src/noether/lib/err.nim b/src/noether/lib/err.nim deleted file mode 100644 index ec4c848..0000000 --- a/src/noether/lib/err.nim +++ /dev/null @@ -1 +0,0 @@ -proc echoErrorHeader(): = diff --git a/src/noether/lib/io.nim b/src/noether/lib/io.nim deleted file mode 100644 index c7eb0eb..0000000 --- a/src/noether/lib/io.nim +++ /dev/null @@ -1,7 +0,0 @@ -import std/streams - -proc streamFile*(filename: string): Stream {.inline.} = - result = newFileStream(filename, fmRead) - -proc streamString*(str: string): Stream {.inline.} = - result = newStringStream(str) diff --git a/src/noether/parser/err.nim b/src/noether/parser/err.nim deleted file mode 100644 index 9cc5a73..0000000 --- a/src/noether/parser/err.nim +++ /dev/null @@ -1,8 +0,0 @@ -#[ Error codes and messaging directly associated with - | nlParser and its procedures is written here. - | General error functionality is in src/noether/lib/err.nim - ]# - -import parser - - diff --git a/src/noether/parser/nodes.nim b/src/noether/parser/nodes.nim index bd737c6..23cf742 100644 --- a/src/noether/parser/nodes.nim +++ b/src/noether/parser/nodes.nim @@ -1,44 +1,18 @@ -import std/options from ../lexer/tok import nlTok +# from ../lexer/tokstream import type - # NOTE: by the end of parsing NO nodes should have nkNone - nlNodeKind* = enum - nkNone, # Placeholder Value - - nkStrLit, # String Literal - nkChrLit, # Character Literal - - # NOTE: always check parent != nil when traversing the tree + # NOTE: by the end of parsing NO nodes should + # NOTE: have nlNodeType.NONE + nlNodeType* = enum + NONE, # Placeholder Value + TERM, # Indicates the tree has terminated + STRL, # String Literal + CHRL, # Character Literal nlNode* {.acyclic.} = ref object of RootObj - nKind*: nlNodeKind - toks*: seq[nlTok] # nodes (may) store the tokens that build them - parent*: nlNode - - # Purely abstract type that all nlNode objects - # with children are expected to inherit from. - nlBranchNode* {.acyclic.} = ref object of nlNode - child: UncheckedArray[nlNode] - - nlBiNode* {.acyclic.} = ref object of nlBranchNode - -proc childCount*(node: nlNode): int {.inline.} = 0 -proc childCount*(node: nlBiNode): int {.inline.} = 2 - -proc getChild*(node: nlNode, i: int): Option[nlNode] {.inline.} = - result = none(nlNode) -proc getChild*(node: nlBranchNode, i: int): Option[nlNode] {.inline.} = - result = some(node.child[i]) - -proc newNode*(nKind: nlNodeKind): nlNode = - result = nlNode( - nKind: nKind, - ) - -proc newBiNode*(nKind: nlNodeKind): nlNode = - result = nlBiNode( - nKind: nKind, - ) + nType*: nlNodeType + toks*: seq[nlTok] # nodes store the tokens that build them + # left, right: nlNode # Short-hand way of appending a token to a node's token sequence proc addTok*(node: nlNode, tok: nlTok) = diff --git a/src/noether/parser/parse.nim b/src/noether/parser/parse.nim deleted file mode 100644 index 0ecd14b..0000000 --- a/src/noether/parser/parse.nim +++ /dev/null @@ -1,58 +0,0 @@ -import strutils -include parser - -# NOTE: Matching between two tokens will fill `node` with everything -# NOTE: between those two tokens EXCLUDING the two tokens themselves. -proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat = - result = greed( - parser, - satisfyMatch(matchType), - ) -proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = - result = greedLine( - parser, - satisfyMatch(matchType), - ) - -proc parseStrLit(parser: var nlParser): nlParseStat = - result = parser.parseMatchLine(tkDQUO) - -proc parseChrLit(parser: var nlParser): nlParseStat = - result = parser.parseMatchLine(tkSQUO) - -proc parseStmt(parser: var nlParser): nlParseStat = - while parser.progressStream(): - echo "----- Current Token: ", parser.currTok - case parser.currTok.tKind - of tkDQUO: - # Attempt to parse string literal - if parser.parseStrLit() != nlParseStat.OK: - echo "Unmatched Double Quotation! Malformed String Literal" - echo parser.line - echo repeat(" ", parser.currTok.startPos), '^', '\n' - else: - echo "Parsed String Literal" - echo parser.bnode[], '\n' - of tkSQUO: - # Attempt to parse string literal - if parser.parseChrLit() != nlParseStat.OK: - echo "Unmatched Single Quotation! Malformed Character Literal" - echo parser.line - echo repeat(" ", parser.currTok.startPos), '^', '\n' - else: - echo "Parsed Character Literal" - echo parser.bnode[], '\n' - of tkEOL: - # TODO: handle this case, don't just discard - discard - else: - echo "blah blah unhandled case\n" - result = nlParseStat.OK - -# Attempt to parse nlAST from nlTokStream -proc parse*(tokStream: var nlTokStream): nlAST = - var parser = newParser(tokStream) - echo ' ' - discard parser.parseStmt() - - result = parser.ast diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim index 7047e6d..0598075 100644 --- a/src/noether/parser/parser.nim +++ b/src/noether/parser/parser.nim @@ -1,90 +1,63 @@ -import nodes -import ../lexer/lex +import strutils +include parseutil -type - # NOTE1: Values above MARKER_FAIL indicate a failed state - # NOTE2: nlParseStat is marked pure out of habit that's all - nlParseStat* {.pure.} = enum - OK, - MARKER_FAIL, - UNMATCHED, - TOOBIG, - - nlAST* = object - root: nlNode - - nlParser* = object - stream: nlTokStream - ast: nlAST - # the "build node" is a reference to the AST node - # the parser is currently modifying/building from - # NOTE: bnode changes frequently, it is NOT the root - bnode: nlNode - # flag indicating whether the parser is at - # the start of a new line (aka checking indentation) - inIndent: bool - - -proc `*`(stat: nlParseStat, b: bool): nlParseStat = - result = if b: stat else: nlParseStat.OK - -proc isFail*(stat: nlParseStat): bool = - result = (stat >= nlParseStat.MARKER_FAIL) - -proc newParser*(tokStream: var nlTokStream): nlParser = - let rootNode = newNode(nkNone) - result = nlParser( - stream: tokStream, - ast: nlAST( - root: rootNode - ), - bnode: rootNode, +# NOTE: Matching between two tokens will fill `node` with everything +# NOTE: between those two tokens EXCLUDING the two tokens themselves. +proc parseMatch(tokStream: var nlTokStream, + node: var nlNode, + matchType: nlTokType): nlParseStat = + result = greed( + tokStream, + node.toks, + satisfyMatch(matchType), + ) +proc parseMatchLine(tokStream: var nlTokStream, + node: var nlNode, + matchType: nlTokType): nlParseStat = + result = greed( + tokStream, + node.toks, + satisfyMatchEOL(matchType), ) -# Exposes a subset of the nlTokStream interface -proc currTok(parser: var nlParser): nlTok = parser.stream.currTok -proc line(parser: var nlParser): string = parser.stream.line +proc parseStrL(tokStream: var nlTokStream, node: var nlNode): nlParseStat = + node = nlNode( + nType: nlNodeType.STRL + ) + node.addTok(tokStream.currTok) + result = nlParseStat.UNCLOSED * not greedEOL(tokStream, node.toks, nlTokType.DQUO) -# Extends upon the functionality of nlTokStream.progress() -proc progressStream*(parser: var nlParser): bool = - result = parser.stream.progress() - if result and parser.currTok.tKind == tkEOL: - parser.inIndent = true - if +proc parseChrL(tokStream: var nlTokStream, node: var nlNode): bool = + node = nlNode( + nType: nlNodeType.CHRL + ) + node.addTok(tokStream.currTok) + # TWO ERRORS ARE POSSIBLE, 1: content too big, 2: never closed + result = greedEOL(tokStream, node.toks, nlTokType.SQUO) -proc setNewLine() - -#[ "Greed" refers to something I mentioned in my discussion on - | Noether's grammar (in an EBNF-like language). Greed just - | means "everything until a condition is satisified". - | That condition should be supplied by a Nim procedural type. - ]# - -# Greed will consume anything until a condition is satisfied -# Returns false if the greed was never satisfied (OMG!!) -proc greed(parser: var nlParser, - satisfy: proc(tok: nlTok): bool): nlParseStat = - while parser.progressStream(): - if satisfy(parser.currTok): - return nlParseStat.OK - # NOTE: the matched token is currently excluded - parser.bnode.addTok(parser.currTok) - result = nlParseStat.UNMATCHED - -proc greedLine(parser: var nlParser, - satisfy: proc(tok: nlTok): bool): nlParseStat = - while parser.progressStream(): - if satisfy(parser.currTok): - return nlParseStat.OK - # NOTE: the matched token is currently excluded - parser.bnode.addTok(parser.currTok) - if parser.currTok.tKind == tkEOL: - return nlParseStat.UNMATCHED - result = nlParseStat.UNMATCHED - -#[ Templates for generating greed satisfying conditions. - ]# - -# Satisfied if it finds nlTok of type matchType -template satisfyMatch(matchType: nlTokKind): untyped = - (proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType)) +# Attempt to form an nlAST from a nlTokStream +proc parse*(tokStream: var nlTokStream): nlNode = + var tok: nlTok + var node: nlNode + while tokStream.nextTok(tok): + case tok.tType: + of nlTokType.DQUO: + # Attempt to parse string literal + if not parseStrL(tokStream, node): + echo "Unmatched Double Quotation! Malformed String Literal" + echo tokStream.currLine() + echo repeat(" ", tok.startPos), '^' + else: + echo "Parsed String Literal" + echo node[] + of nlTokType.SQUO: + # Attempt to parse string literal + if not parseChrL(tokStream, node): + echo "Unmatched Single Quotation! Malformed Character Literal" + echo tokStream.currLine() + echo repeat(" ", tok.startPos), '^' + else: + echo "Parsed String Literal" + echo node[] + else: + echo "blah blah unhandled case" diff --git a/src/noether/parser/parseutil.nim b/src/noether/parser/parseutil.nim new file mode 100644 index 0000000..6fa1243 --- /dev/null +++ b/src/noether/parser/parseutil.nim @@ -0,0 +1,58 @@ +import nodes +import ../lexer/tokstream + +type + # NOTE: Values above __FAIL__ indicate a failed state + nlParseStat* = enum + OK, + __FAIL__, + MIDAS, # Greedy search was never satisfied + UNMATCHED, + TOOBIG, + +proc `*`(stat: nlParseStat, b: bool): nlParseStat = + result = if b: stat else: nlParseStat.OK + +proc isFail*(stat: nlParseStat): bool = + result = (stat >= nlParseStat.__FAIL__) + + +#[ "Greed" refers to something I mentioned in my discussion on + | Noether's grammar (in an EBNF-like language). Greed just + | means "everything until a condition is satisified". + | That condition should be supplied by a Nim procedural type. + ]# + +# Greed will consume anything until a condition is satisfied +# Returns false if the greed was never satisfied (OMG!!) +proc greed(tokStream: var nlTokStream, + toks: var seq[nlTok], + satisfy: proc(tok: nlTok): bool, + ): nlParseStat = + var tok: nlTok + while tokStream.nextTok(tok): + toks.add(tok) + if satisfy(tok): + return nlParseStat.OK + result = nlParseStat.UNMATCHED + +proc greedLine(tokStream: var nlTokStream, + toks: var seq[nlTok], + satisfy: proc(tok: nlTok): bool): nlParseStat = + var tok: nlTok + while tokStream.nextTok(tok): + toks.add(tok) + if satisfy(tok): + return true + result = + +#[ Templates for generating greed satisfying conditions. + ]# + +# Satisfied if it finds nlTok of type matchType +template satisfyMatch(matchType: nlTokType) = + proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType) + +# Satisfied if it finds nlTok of type matchType or EOL reached +template satisfyMatchEOL(matchType: nlTokType) = + proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType or tok.tType == nlTokType.EOL)