From 1181ea97434788914cf37951e26f2f930d4a04ca Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 12:51:03 +1000 Subject: [PATCH] Restructure attempt #087 :( --- src/nlx.nim | 26 ++++-- src/noether/lexer/lex.nim | 44 ++++++++-- src/noether/lexer/tok.nim | 30 ++++++- src/noether/lexer/tokkind.nim | 60 -------------- src/noether/parser/parse.nim | 58 +++++++++++++ src/noether/parser/parser.nim | 134 +++++++++++++++++++------------ src/noether/parser/parseutil.nim | 90 --------------------- 7 files changed, 227 insertions(+), 215 deletions(-) create mode 100644 src/noether/parser/parse.nim delete mode 100644 src/noether/parser/parseutil.nim diff --git a/src/nlx.nim b/src/nlx.nim index c7ef1d9..e145943 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,19 +1,29 @@ import os import noether/lib/io import noether/lexer/lex -# import noether/parser/parser +import noether/parser/parse {.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" - var stream = if paramCount() > 0: streamFile(paramStr 1) + # really lazy argparse implementation (temporary) + let + paramC = paramCount() + cmd = if paramC > 2: paramStr 1 + else: "tok" + + var stream = if paramC > 0: streamFile(paramStr paramC) else: streamString(readAll stdin) var lexer = newLexer(stream) - # # DumpTok - while lexer.progress(): - echo lexer.tok - - # DumpTree - # discard parse(tokStream) + if cmd == "tok": + # DumpTok + while lexer.progress(): + echo lexer.tok + elif cmd == "tree": + discard + # DumpTree + # discard parse(tokStream) + else: + echo "Usage: nlx [tok|tree] \n demo files are accessible at lang/demo" diff --git a/src/noether/lexer/lex.nim b/src/noether/lexer/lex.nim index 46e3b00..8f81b86 100644 --- a/src/noether/lexer/lex.nim +++ b/src/noether/lexer/lex.nim @@ -11,15 +11,16 @@ type nlLexer* = object stream: Stream done*: bool - tok*: nlTok # new finished token + # store current token and upcoming (build) token + tok*: nlTok # current 
token btok: nlTok # the build token + # save char and pos and its token type + char: char + cTKind: nlTokKind # track line number, line content, etc line: string lineNum: int pos: int - # save char and pos and its token type - char: char - cTKind: nlTokKind proc atEOL(lexer: nlLexer): bool {.inline.} = result = (lexer.char == '\n') @@ -37,8 +38,41 @@ proc newLexer*(stream: var Stream): nlLexer = lineNum: 1, pos: -1, # after initial readChar this -> 0 char: '\0', # use \0 as initial invalid char + cTKind: tkNONE, ) +# Classifies the current character to its nlTokKind +proc classifyTok*(lexer: nlLexer): nlTokKind {.inline.} = + case lexer.char: + of '\0': + result = tkEOF + of '\r', '\n': + result = tkEOL + of ' ', '\t': + result = tkWTSP + of '(': + result = tkLPAR + of ')': + result = tkRPAR + of '{': + result = tkLBRA + of '}': + result = tkRBRA + of '[': + result = tkLSQB + of ']': + result = tkRSQB + of '\'': + result = tkSQUO + of '\"': + result = tkDQUO + of '`': + result = tkGRVA + of '#': + result = tkHASH + else: + result = tkWORD + #[ ====================================================== ] | nlLexer Internal Interface for Token Construction ] @@ -96,7 +130,7 @@ proc readChar(lexer: var nlLexer): bool = inc lexer.lineNum # sets lexer.char to '\0' if EOF lexer.char = lexer.stream.readChar() - lexer.cTKind = getTokKind(lexer.char) + lexer.cTKind = lexer.classifyTok() lexer.line.add(lexer.char) inc lexer.pos result = lexer.atEOF() diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index 08aba66..b19c341 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -1,4 +1,32 @@ -include tokkind +type + # nlTokKind allows primitive nlToks to be typed, + # the nlTokKind enum should never be directly + # accessed. Use the interface in this file instead. 
+ nlTokKind* = enum + tkNONE, # Placeholder Value + + tkEOF, # End of File + tkEOL, # End of Line (\0 --> EOL) + + tkWORD, # Alphanumeric token + tkSYMB, # Symbolic token + + tkLNFD, # \r \n Line-Feed + tkWTSP, # ' ' \t Whitespace + + # RESERVED SYMBOLS + tkLPAR, # ( Left Parenthesis + tkRPAR, # ) Right Parenthesis + tkLBRA, # { Left Brace + tkRBRA, # } Right Brace + tkLSQB, # [ Left Square Bracket + tkRSQB, # ] Right Square Bracket + # tkLANB, # < Left Angle Bracket + # tkRANB, # > Right Angle Bracket + tkSQUO, # ' Single Quotation Marking + tkDQUO, # " Double Quotation Marking + tkGRVA, # ` Grave Accent + tkHASH, # # Number Sign (Hashtag) type nlTok* = tuple diff --git a/src/noether/lexer/tokkind.nim b/src/noether/lexer/tokkind.nim index 3d1d7b6..8b13789 100644 --- a/src/noether/lexer/tokkind.nim +++ b/src/noether/lexer/tokkind.nim @@ -1,61 +1 @@ -type - # nlTokKind allows primitive nlToks to be typed, - # the nlTokKind enum should never be directly - # accessed. Use the interface in this file instead. 
- nlTokKind* = enum - tkNONE, # Placeholder Value - tkEOF, # End of File - tkEOL, # End of Line (\0 --> EOL) - - tkWORD, # Alphanumeric token - tkSYMB, # Symbolic token - - tkLNFD, # \r \n Line-Feed - tkWTSP, # ' ' \t Whitespace - - # RESERVED SYMBOLS - tkLPAR, # ( Left Parenthesis - tkRPAR, # ) Right Parenthesis - tkLBRA, # { Left Brace - tkRBRA, # } Right Brace - tkLSQB, # [ Left Square Bracket - tkRSQB, # ] Right Square Bracket - # tkLANB, # < Left Angle Bracket - # tkRANB, # > Right Angle Bracket - tkSQUO, # ' Single Quotation Marking - tkDQUO, # " Double Quotation Marking - tkGRVA, # ` Grave Accent - tkHASH, # # Number Sign (Hashtag) - -# Classifies a character to its nlTokKind -proc getTokKind*(c: char): nlTokKind = - case c: - of '\0': - result = tkEOF - of '\r', '\n': - result = tkEOL - of ' ', '\t': - result = tkWTSP - of '(': - result = tkLPAR - of ')': - result = tkRPAR - of '{': - result = tkLBRA - of '}': - result = tkRBRA - of '[': - result = tkLSQB - of ']': - result = tkRSQB - of '\'': - result = tkSQUO - of '\"': - result = tkDQUO - of '`': - result = tkGRVA - of '#': - result = tkHASH - else: - result = tkWORD diff --git a/src/noether/parser/parse.nim b/src/noether/parser/parse.nim new file mode 100644 index 0000000..0ecd14b --- /dev/null +++ b/src/noether/parser/parse.nim @@ -0,0 +1,58 @@ +import strutils +include parser + +# NOTE: Matching between two tokens will fill `node` with everything +# NOTE: between those two tokens EXCLUDING the two tokens themselves. 
+proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat = + result = greed( + parser, + satisfyMatch(matchType), + ) +proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = + result = greedLine( + parser, + satisfyMatch(matchType), + ) + +proc parseStrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatchLine(tkDQUO) + +proc parseChrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatchLine(tkSQUO) + +proc parseStmt(parser: var nlParser): nlParseStat = + while parser.progressStream(): + echo "----- Current Token: ", parser.currTok + case parser.currTok.tKind + of tkDQUO: + # Attempt to parse string literal + if parser.parseStrLit() != nlParseStat.OK: + echo "Unmatched Double Quotation! Malformed String Literal" + echo parser.line + echo repeat(" ", parser.currTok.startPos), '^', '\n' + else: + echo "Parsed String Literal" + echo parser.bnode[], '\n' + of tkSQUO: + # Attempt to parse character literal + if parser.parseChrLit() != nlParseStat.OK: + echo "Unmatched Single Quotation! Malformed Character Literal" + echo parser.line + echo repeat(" ", parser.currTok.startPos), '^', '\n' + else: + echo "Parsed Character Literal" + echo parser.bnode[], '\n' + of tkEOL: + # TODO: handle this case, don't just discard + discard + else: + echo "blah blah unhandled case\n" + result = nlParseStat.OK + +# Attempt to parse nlAST from nlTokStream +proc parse*(tokStream: var nlTokStream): nlAST = + var parser = newParser(tokStream) + echo ' ' + discard parser.parseStmt() + + result = parser.ast diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim index 7daf91b..7047e6d 100644 --- a/src/noether/parser/parser.nim +++ b/src/noether/parser/parser.nim @@ -1,58 +1,90 @@ -import strutils -include parseutil +import nodes +import ../lexer/lex -# NOTE: Matching between two tokens will fill `node` with everything -# NOTE: between those two tokens EXCLUDING the two tokens themselves.
-proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat = - result = greed( - parser, - satisfyMatch(matchType), - ) -proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = - result = greedLine( - parser, - satisfyMatch(matchType), +type + # NOTE1: Values above MARKER_FAIL indicate a failed state + # NOTE2: nlParseStat is marked pure out of habit that's all + nlParseStat* {.pure.} = enum + OK, + MARKER_FAIL, + UNMATCHED, + TOOBIG, + + nlAST* = object + root: nlNode + + nlParser* = object + stream: nlTokStream + ast: nlAST + # the "build node" is a reference to the AST node + # the parser is currently modifying/building from + # NOTE: bnode changes frequently, it is NOT the root + bnode: nlNode + # flag indicating whether the parser is at + # the start of a new line (aka checking indentation) + inIndent: bool + + +proc `*`(stat: nlParseStat, b: bool): nlParseStat = + result = if b: stat else: nlParseStat.OK + +proc isFail*(stat: nlParseStat): bool = + result = (stat >= nlParseStat.MARKER_FAIL) + +proc newParser*(tokStream: var nlTokStream): nlParser = + let rootNode = newNode(nkNone) + result = nlParser( + stream: tokStream, + ast: nlAST( + root: rootNode + ), + bnode: rootNode, ) -proc parseStrLit(parser: var nlParser): nlParseStat = - result = parser.parseMatchLine(tkDQUO) +# Exposes a subset of the nlTokStream interface +proc currTok(parser: var nlParser): nlTok = parser.stream.currTok +proc line(parser: var nlParser): string = parser.stream.line -proc parseChrLit(parser: var nlParser): nlParseStat = - result = parser.parseMatchLine(tkSQUO) +# Extends upon the functionality of nlTokStream.progress() +proc progressStream*(parser: var nlParser): bool = + result = parser.stream.progress() + if result and parser.currTok.tKind == tkEOL: + parser.inIndent = true + if + +proc setNewLine() -proc parseStmt(parser: var nlParser): nlParseStat = +#[ "Greed" refers to something I mentioned in my discussion on + | Noether's grammar (in 
an EBNF-like language). Greed just + | means "everything until a condition is satisfied". + | That condition should be supplied by a Nim procedural type. + ]# + +# Greed will consume anything until a condition is satisfied +# Returns nlParseStat.UNMATCHED if the greed was never satisfied (OMG!!) +proc greed(parser: var nlParser, + satisfy: proc(tok: nlTok): bool): nlParseStat = while parser.progressStream(): - echo "----- Current Token: ", parser.currTok - case parser.currTok.tKind - of tkDQUO: - # Attempt to parse string literal - if parser.parseStrLit() != nlParseStat.OK: - echo "Unmatched Double Quotation! Malformed String Literal" - echo parser.line - echo repeat(" ", parser.currTok.startPos), '^', '\n' - else: - echo "Parsed String Literal" - echo parser.bnode[], '\n' - of tkSQUO: - # Attempt to parse string literal - if parser.parseChrLit() != nlParseStat.OK: - echo "Unmatched Single Quotation! Malformed Character Literal" - echo parser.line - echo repeat(" ", parser.currTok.startPos), '^', '\n' - else: - echo "Parsed Character Literal" - echo parser.bnode[], '\n' - of tkEOL: - # TODO: handle this case, don't just discard - discard - else: - echo "blah blah unhandled case\n" - result = nlParseStat.OK - -# Attempt to parse nlAST from nlTokStream -proc parse*(tokStream: var nlTokStream): nlAST = - var parser = newParser(tokStream) - echo ' ' - discard parser.parseStmt() + if satisfy(parser.currTok): + return nlParseStat.OK + # NOTE: the matched token is currently excluded + parser.bnode.addTok(parser.currTok) + result = nlParseStat.UNMATCHED - result = parser.ast +proc greedLine(parser: var nlParser, + satisfy: proc(tok: nlTok): bool): nlParseStat = + while parser.progressStream(): + if satisfy(parser.currTok): + return nlParseStat.OK + # NOTE: the matched token is currently excluded + parser.bnode.addTok(parser.currTok) + if parser.currTok.tKind == tkEOL: + return nlParseStat.UNMATCHED + result = nlParseStat.UNMATCHED + +#[ Templates for generating greed satisfying
conditions. + ]# + +# Satisfied if it finds nlTok of type matchType +template satisfyMatch(matchType: nlTokKind): untyped = + (proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType)) diff --git a/src/noether/parser/parseutil.nim b/src/noether/parser/parseutil.nim deleted file mode 100644 index d531490..0000000 --- a/src/noether/parser/parseutil.nim +++ /dev/null @@ -1,90 +0,0 @@ -import nodes -import ../lexer/tokstream - -type - # NOTE1: Values above MARKER_FAIL indicate a failed state - # NOTE2: nlParseStat is marked pure out of habit that's all - nlParseStat* {.pure.} = enum - OK, - MARKER_FAIL, - UNMATCHED, - TOOBIG, - - nlAST* = object - root: nlNode - - nlParser* = object - stream: nlTokStream - ast: nlAST - # the "build node" is a reference to the AST node - # the parser is currently modifying/building from - # NOTE: bnode changes frequently, it is NOT the root - bnode: nlNode - # flag indicating whether the parser is at - # the start of a new line (aka checking indentation) - inIndent: bool - - -proc `*`(stat: nlParseStat, b: bool): nlParseStat = - result = if b: stat else: nlParseStat.OK - -proc isFail*(stat: nlParseStat): bool = - result = (stat >= nlParseStat.MARKER_FAIL) - -proc newParser*(tokStream: var nlTokStream): nlParser = - let rootNode = newNode(nkNone) - result = nlParser( - stream: tokStream, - ast: nlAST( - root: rootNode - ), - bnode: rootNode, - ) - -# Exposes a subset of the nlTokStream interface -proc currTok(parser: var nlParser): nlTok = parser.stream.currTok -proc line(parser: var nlParser): string = parser.stream.line - -# Extends upon the functionality of nlTokStream.progress() -proc progressStream*(parser: var nlParser): bool = - result = parser.stream.progress() - if result and parser.currTok.tKind == tkEOL: - parser.inIndent = true - if - -proc setNewLine() - -#[ "Greed" refers to something I mentioned in my discussion on - | Noether's grammar (in an EBNF-like language). 
Greed just - | means "everything until a condition is satisified". - | That condition should be supplied by a Nim procedural type. - ]# - -# Greed will consume anything until a condition is satisfied -# Returns false if the greed was never satisfied (OMG!!) -proc greed(parser: var nlParser, - satisfy: proc(tok: nlTok): bool): nlParseStat = - while parser.progressStream(): - if satisfy(parser.currTok): - return nlParseStat.OK - # NOTE: the matched token is currently excluded - parser.bnode.addTok(parser.currTok) - result = nlParseStat.UNMATCHED - -proc greedLine(parser: var nlParser, - satisfy: proc(tok: nlTok): bool): nlParseStat = - while parser.progressStream(): - if satisfy(parser.currTok): - return nlParseStat.OK - # NOTE: the matched token is currently excluded - parser.bnode.addTok(parser.currTok) - if parser.currTok.tKind == tkEOL: - return nlParseStat.UNMATCHED - result = nlParseStat.UNMATCHED - -#[ Templates for generating greed satisfying conditions. - ]# - -# Satisfied if it finds nlTok of type matchType -template satisfyMatch(matchType: nlTokKind): untyped = - (proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType))