From f8697bd66297dcfd3cbff6f492332d0cb1890094 Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 02:09:43 +1000 Subject: [PATCH] 1 gazillion changes (mostly documenting my insanity optimizing + naming) --- src/nlx.nim | 6 +-- src/noether/lexer/lstream.nim | 20 ++++---- src/noether/lexer/tok.nim | 28 ++--------- src/noether/lexer/tokbuilding.nim | 28 +++++------ src/noether/lexer/tokstream.nim | 29 ++++++----- src/noether/lexer/toktype.nim | 79 ++++++++++++++++-------------- src/noether/parser/nodes.nim | 51 +++++++++++++++----- src/noether/parser/parser.nim | 80 ++++++++++++++----------------- src/noether/parser/parseutil.nim | 68 +++++++++++++++----------- 9 files changed, 206 insertions(+), 183 deletions(-) diff --git a/src/nlx.nim b/src/nlx.nim index 4b0c678..75f59d4 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -3,6 +3,7 @@ import noether/lexer/tok import noether/lexer/tokstream import noether/parser/parser +{.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" @@ -11,9 +12,8 @@ when isMainModule: var tokStream = newTokStream(filename, isFile=true) # # DumpTok - # var tok: nlTok - # while tokStream.nextTok(tok): - # echo tok + # while tokStream.progress(): + # echo tokStream.currTok # DumpTree discard parse(tokStream) diff --git a/src/noether/lexer/lstream.nim b/src/noether/lexer/lstream.nim index 034f48b..44138e0 100644 --- a/src/noether/lexer/lstream.nim +++ b/src/noether/lexer/lstream.nim @@ -41,7 +41,7 @@ proc outOfBounds*(lstream: nlLStream): bool = result = (lstream.pos > lstream.line.len - 1) # Progress the lex stream to the next line (if available) -proc progLine*(lstream: var nlLStream): bool = +proc progressLine*(lstream: var nlLStream): bool = if lstream.stream.readLine(lstream.line): inc lstream.lineNum lstream.pos = Natural 0 @@ -50,17 +50,17 @@ proc progLine*(lstream: var nlLStream): bool = # Progress the lex stream to the next character in the line # forcefully (aka does NOT check if we reached EOL) -proc forceProgChar*(lstream: var nlLStream) = +proc forceProgressChar*(lstream: var nlLStream) = inc lstream.pos -# Progress the lex stream to the next character (if available) -proc progress*(lstream: var nlLStream): bool = - if not lstream.atEOL(): - lstream.forceProgChar() - result = true - else: - # attempt to progress next line past EOL - result = lstream.progLine() +# # Progress the lex stream to the next character (if available) +# proc progressChar*(lstream: var nlLStream): bool = +# if not lstream.atEOL(): +# lstream.forceProgressChar() +# result = true +# else: +# # attempt to progress next line past EOL +# result = lstream.progressLine() proc currChar*(lstream: nlLStream): char = result = lstream.line[lstream.pos] diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index fb3067c..7715b8f 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -2,7 +2,7 @@ include toktype type nlTok* = object - tType*: nlTokType + tKind*: nlTokKind lit*: string lineNum*: Natural startPos*: Natural @@ -12,29 +12,11 @@ type # all other fields are expected to be filled out later. proc emptyTok*(startPos: int): nlTok = result = nlTok( - tType: nlTokType.NONE, + tKind: tkNONE, lit: "", startPos: Natural startPos, ) -# Checks if an nlTok has nlTokType.NONE -proc isTokUntyped*(tType: nlTokType): bool = - result = (tType == nlTokType.NONE) - -# Checks if an nlTok has nlTokType.EOL -proc isTokEOL*(tok: nlTok): bool = - result = (tok.tType == nlTokType.EOL) - - - -# This method is only used to convert null -# terminator nlToks into line-feed ones. -# Returns a copy of an nlTok, changing its type -proc tokTermToLineFeed*(tok: nlTok): nlTok = - result = nlTok( - tType: nlTokType.LNFD, - lit: tok.lit, - lineNum: tok.lineNum, - startPos: tok.startPos, - endPos: tok.endPos, - ) +# Checks if an nlTok has tkNONE +proc isUntyped*(tKind: nlTokKind): bool = + result = (tKind == tkNONE) diff --git a/src/noether/lexer/tokbuilding.nim b/src/noether/lexer/tokbuilding.nim index 99022ee..46a2222 100644 --- a/src/noether/lexer/tokbuilding.nim +++ b/src/noether/lexer/tokbuilding.nim @@ -12,7 +12,7 @@ type # Generates an EOL token for the nlTokStream's state proc EOLTok(tokStream: nlTokStream): nlTok = result = nlTok( - tType: nlTokType.EOL, + tKind: tkEOL, lit: "\0", lineNum: Natural tokStream.lstream.lineNum, startPos: Natural tokStream.lstream.pos, @@ -40,46 +40,46 @@ proc flushBuild(tokStream: var nlTokStream): nlTok = # Returns whether the build token has a set type yet. # This indicates that the build token should inherit -# the nlTokType of the nlLStream's next character. +# the nlTokKind of the nlLStream's next character. proc isUntypedBuild(tokStream: nlTokStream): bool = - result = isTokUntyped(tokStream.build.tType) + result = tokStream.build.tKind.isUntyped() -# Check whether an nlTokType is "compatible" with the build token. +# Check whether an nlTokKind is "compatible" with the build token. # NOTE: flushBuild() should be called when an incompatible token is discovered. -proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool = - result = (tType == tokStream.build.tType) +proc isCompatibleBuild(tokStream: nlTokStream, tKind: nlTokKind): bool = + result = (tKind == tokStream.build.tKind) # Add a character to the nlTokStream's build token. # Flushes and returns the build token if "fully built", # and a boolean indicating whether the nlTokStream can progress. -proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = +proc progressBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = # the "pos > EOL" invalid state is used intentionally # to indicate all tokens have been built, and return EOL Token if tokStream.lstream.outOfBounds(): buildTok = some(EOLTok(tokStream)) return true # can progress once more - let tType = getTokType(tokStream.lstream.currChar()) + let tKind = getTokType(tokStream.lstream.currChar()) # untyped build tokens must inherited immediately if isUntypedBuild(tokStream): - tokStream.build.tType = tType + tokStream.build.tKind = tKind # check if EOL reached if tokStream.lstream.atEOL(): # flush old build token, the new one can be left untyped - let compatible = isCompatibleBuild(tokStream, tType) + let compatible = isCompatibleBuild(tokStream, tKind) result = false # DO NOT PROGRESS if compatible: # force the lstream into an invalid state by progressing beyond EOL - # we can then detect this state on the next progBuild and return + # we can then detect this state on the next progressBuild and return # an EOL character (very unsafe implementation but it works well) - tokStream.lstream.forceProgChar() + tokStream.lstream.forceProgressChar() buildTok = some(flushBuild(tokStream)) # check character and build token compatability - elif not isCompatibleBuild(tokStream, tType): + elif not isCompatibleBuild(tokStream, tKind): # flush old build token, the new one inherits type buildTok = some(flushBuild(tokStream)) - tokStream.build.tType = tType + tokStream.build.tKind = tKind result = true # can progress else: buildTok = none(nlTok) diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim index c3fb3f2..02a045e 100644 --- a/src/noether/lexer/tokstream.nim +++ b/src/noether/lexer/tokstream.nim @@ -9,44 +9,43 @@ proc newTokStream*(content: string, isFile: bool = false): nlTokStream = # 1. initialise an empty build token # 2. progress to the first line result.resetBuild() - discard result.lstream.progLine() + discard result.lstream.progressLine() # Defines a short-hand notation for getting the current line -proc currLine*(tokStream: nlTokStream): string = +proc line*(tokStream: nlTokStream): string = result = tokStream.lstream.line -# Reimplements nlLStream.progress() for nlTokStream +# Reimplements nlLStream.progressChar for nlTokStream # to account for additional structure (ie the build token) -proc progChar(tokStream: var nlTokStream): bool = +# NOTE: progressChar progresses to lstream's next char +proc progressChar(tokStream: var nlTokStream): bool = if not tokStream.lstream.atEOL(): - tokStream.lstream.forceProgChar() + tokStream.lstream.forceProgressChar() result = true else: # attempt to progress to next line past EOL - result = tokStream.lstream.progLine() + result = tokStream.lstream.progressLine() tokStream.resetBuild() -# Generates and sets (by reference) the next token in the stream, -# via repeatedly calling progBuild() and progChar(). +# Generates and progress the next token in the nlTokStream. +# via repeatedly calling progressBuild() and progressChar(). # Returns a boolean indicating whether EOF has been reached. -# NOTE: progBuild adds lstream's current char to the build token -# NOTE: progChar progresses to lstream's next char -proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool = +# NOTE: access the new token via `tokStream.tok` +proc progress*(tokStream: var nlTokStream): bool = # Return prematurely if already closed if tokStream.closed: return false while true: var flushedTok: Option[nlTok] let - canProgress = tokStream.progBuild(flushedTok) + canProgress = tokStream.progressBuild(flushedTok) buildComplete = flushedTok.isSome # canProgress & EOF reached => no more tokens to build :) # NOTE: reachedEOF and not canProgress => more tokens unwrapping if buildComplete: # return the finished build token, and save it as the current token - tok = flushedTok.get() - tokStream.currTok = tok - if canProgress and not tokStream.progChar(): + tokStream.currTok = flushedTok.get() + if canProgress and not tokStream.progressChar(): tokStream.closed = true return buildComplete elif buildComplete: diff --git a/src/noether/lexer/toktype.nim b/src/noether/lexer/toktype.nim index 0f40023..49add5b 100644 --- a/src/noether/lexer/toktype.nim +++ b/src/noether/lexer/toktype.nim @@ -1,54 +1,59 @@ type - # nlTokType allows primitive nlToks to be typed, - # the nlTokType enum should never be directly + # nlTokKind allows primitive nlToks to be typed, + # the nlTokKind enum should never be directly # accessed. Use the interface in this file instead. - nlTokType* = enum - NONE, # Placeholder Value - EOF, # End of File - EOL, # End of Line (\0 --> EOL) - WORD, # Alphanumeric token - SYMB, # Symbolic token - LNFD, # \r \n Line-Feed - WTSP, # ' ' \t Whitespace - LPAR, # ( Left Parenthesis - RPAR, # ) Right Parenthesis - LBRA, # { Left Brace - RBRA, # } Right Brace - LSQB, # [ Left Square Bracket - RSQB, # ] Right Square Bracket - # LANB, # < Left Angle Bracket - # RANB, # > Right Angle Bracket - SQUO, # ' Single Quotation Marking - DQUO, # " Double Quotation Marking - GRVA, # ` Grave Accent - HASH, # # Number Sign (Hashtag) + nlTokKind* = enum + tkNONE, # Placeholder Value + + tkEOF, # End of File + tkEOL, # End of Line (\0 --> EOL) + + tkWORD, # Alphanumeric token + tkSYMB, # Symbolic token + + tkLNFD, # \r \n Line-Feed + tkWTSP, # ' ' \t Whitespace + + # RESERVED SYMBOLS + tkLPAR, # ( Left Parenthesis + tkRPAR, # ) Right Parenthesis + tkLBRA, # { Left Brace + tkRBRA, # } Right Brace + tkLSQB, # [ Left Square Bracket + tkRSQB, # ] Right Square Bracket + # tkLANB, # < Left Angle Bracket + # tkRANB, # > Right Angle Bracket + tkSQUO, # ' Single Quotation Marking + tkDQUO, # " Double Quotation Marking + tkGRVA, # ` Grave Accent + tkHASH, # # Number Sign (Hashtag) -# Classifies a character to its nlTokType -proc getTokType*(c: char): nlTokType = +# Classifies a character to its nlTokKind +proc getTokType*(c: char): nlTokKind = case c: of '\0', '\r', '\n': - result = nlTokType.EOL + result = tkEOL of ' ', '\t': - result = nlTokType.WTSP + result = tkWTSP of '(': - result = nlTokType.LPAR + result = tkLPAR of ')': - result = nlTokType.RPAR + result = tkRPAR of '{': - result = nlTokType.LBRA + result = tkLBRA of '}': - result = nlTokType.RBRA + result = tkRBRA of '[': - result = nlTokType.LSQB + result = tkLSQB of ']': - result = nlTokType.RSQB + result = tkRSQB of '\'': - result = nlTokType.SQUO + result = tkSQUO of '\"': - result = nlTokType.DQUO + result = tkDQUO of '`': - result = nlTokType.GRVA + result = tkGRVA of '#': - result = nlTokType.HASH + result = tkHASH else: - result = nlTokType.WORD + result = tkWORD diff --git a/src/noether/parser/nodes.nim b/src/noether/parser/nodes.nim index 23cf742..c31285e 100644 --- a/src/noether/parser/nodes.nim +++ b/src/noether/parser/nodes.nim @@ -1,19 +1,48 @@ +import std/options from ../lexer/tok import nlTok -# from ../lexer/tokstream import type - # NOTE: by the end of parsing NO nodes should - # NOTE: have nlNodeType.NONE - nlNodeType* = enum - NONE, # Placeholder Value - TERM, # Indicates the tree has terminated - STRL, # String Literal - CHRL, # Character Literal + # NOTE: by the end of parsing NO nodes should have nkNone + nlNodeKind* = enum + nkNone, # Placeholder Value + + nkStrLit, # String Literal + nkChrLit, # Character Literal + + # NOTE: always check parent != nil when traversing the tree nlNode* {.acyclic.} = ref object of RootObj - nType*: nlNodeType - toks*: seq[nlTok] # nodes store the tokens that build them - # left, right: nlNode + nKind*: nlNodeKind + toks*: seq[nlTok] # nodes (may) store the tokens that build them + parent*: nlNode + + # Purely abstract type that all nlNode objects + # with children are expected to inherit from. + nlBranchNode* {.acyclic.} = ref object of nlNode + child: UncheckedArray[nlNode] + + nlBiNode* {.acyclic.} = ref object of nlBranchNode + +proc childCount*(node: nlNode): int {.inline.} = 0 +proc childCount*(node: nlBiNode): int {.inline.} = 2 + +proc getChild*(node: nlNode, i: int): Option[nlNode] {.inline.} = + result = none(nlNode) +proc getChild*(node: nlBranchNode, i: int): Option[nlNode] {.inline.} = + result = some(node.child[i]) + +proc newNode*(nKind: nlNodeKind): nlNode = + result = nlNode( + nKind: nKind, + ) + +proc newBiNode*(nKind: nlNodeKind): nlNode = + result = nlBiNode( + nKind: nKind, + ) # Short-hand way of appending a token to a node's token sequence proc addTok*(node: nlNode, tok: nlTok) = + echo "AM I HERE?" + echo node[] + echo node.toks node.toks.add(tok) diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim index 0598075..4654fb3 100644 --- a/src/noether/parser/parser.nim +++ b/src/noether/parser/parser.nim @@ -3,61 +3,55 @@ include parseutil # NOTE: Matching between two tokens will fill `node` with everything # NOTE: between those two tokens EXCLUDING the two tokens themselves. -proc parseMatch(tokStream: var nlTokStream, - node: var nlNode, - matchType: nlTokType): nlParseStat = +proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat = result = greed( - tokStream, - node.toks, + parser, satisfyMatch(matchType), ) -proc parseMatchLine(tokStream: var nlTokStream, - node: var nlNode, - matchType: nlTokType): nlParseStat = - result = greed( - tokStream, - node.toks, - satisfyMatchEOL(matchType), +proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = + result = greedLine( + parser, + satisfyMatch(matchType), ) -proc parseStrL(tokStream: var nlTokStream, node: var nlNode): nlParseStat = - node = nlNode( - nType: nlNodeType.STRL - ) - node.addTok(tokStream.currTok) - result = nlParseStat.UNCLOSED * not greedEOL(tokStream, node.toks, nlTokType.DQUO) +proc parseStrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatch(tkDQUO) -proc parseChrL(tokStream: var nlTokStream, node: var nlNode): bool = - node = nlNode( - nType: nlNodeType.CHRL - ) - node.addTok(tokStream.currTok) - # TWO ERRORS ARE POSSIBLE, 1: content too big, 2: never closed - result = greedEOL(tokStream, node.toks, nlTokType.SQUO) - -# Attempt to form an nlAST from a nlTokStream -proc parse*(tokStream: var nlTokStream): nlNode = - var tok: nlTok - var node: nlNode - while tokStream.nextTok(tok): - case tok.tType: - of nlTokType.DQUO: +proc parseChrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatch(tkSQUO) + +proc parseStmt(parser: var nlParser): nlParseStat = + # initialise build node as none just for the hell of it + + while parser.stream.progress(): + echo parser.stream.currTok + case parser.stream.currTok.tKind + of tkDQUO: # Attempt to parse string literal - if not parseStrL(tokStream, node): + if parser.parseStrLit() != nlParseStat.OK: echo "Unmatched Double Quotation! Malformed String Literal" - echo tokStream.currLine() - echo repeat(" ", tok.startPos), '^' + echo parser.stream.line + echo repeat(" ", parser.stream.currTok.startPos), '^' else: echo "Parsed String Literal" - echo node[] - of nlTokType.SQUO: + echo parser.bnode[] + of tkSQUO: # Attempt to parse string literal - if not parseChrL(tokStream, node): + if parser.parseChrLit() != nlParseStat.OK: echo "Unmatched Single Quotation! Malformed Character Literal" - echo tokStream.currLine() - echo repeat(" ", tok.startPos), '^' + echo parser.stream.line + echo repeat(" ", parser.stream.currTok.startPos), '^' else: - echo "Parsed String Literal" - echo node[] + echo "Parsed Character Literal" + echo parser.bnode[] else: echo "blah blah unhandled case" + result = nlParseStat.OK + +# Attempt to parse nlAST from nlTokStream +proc parse*(tokStream: var nlTokStream): nlAST = + var parser = newParser(tokStream) + echo ' ' + discard parser.parseStmt() + + result = parser.ast diff --git a/src/noether/parser/parseutil.nim b/src/noether/parser/parseutil.nim index 6fa1243..4d9deb6 100644 --- a/src/noether/parser/parseutil.nim +++ b/src/noether/parser/parseutil.nim @@ -2,21 +2,41 @@ import nodes import ../lexer/tokstream type - # NOTE: Values above __FAIL__ indicate a failed state - nlParseStat* = enum + # NOTE1: Values above MARKER_FAIL indicate a failed state + # NOTE2: nlParseStat is marked pure out of habit that's all + nlParseStat* {.pure.} = enum OK, - __FAIL__, - MIDAS, # Greedy search was never satisfied + MARKER_FAIL, UNMATCHED, TOOBIG, + nlAST* = object + root: nlNode + + nlParser* = object + stream: nlTokStream + ast: nlAST + # the "build node" is a reference to the AST node + # the parser is currently modifying/building from + # NOTE: bnode changes frequently, it is NOT the root + bnode: nlNode + + proc `*`(stat: nlParseStat, b: bool): nlParseStat = result = if b: stat else: nlParseStat.OK proc isFail*(stat: nlParseStat): bool = - result = (stat >= nlParseStat.__FAIL__) + result = (stat >= nlParseStat.MARKER_FAIL) +proc newParser*(tokStream: var nlTokStream): nlParser = + let rootNode = newNode(nkNone) + result = nlParser( + stream: tokStream, + ast: rootNode, + bnode: rootNode, + ) + #[ "Greed" refers to something I mentioned in my discussion on | Noether's grammar (in an EBNF-like language). Greed just | means "everything until a condition is satisified". @@ -25,34 +45,28 @@ proc isFail*(stat: nlParseStat): bool = # Greed will consume anything until a condition is satisfied # Returns false if the greed was never satisfied (OMG!!) -proc greed(tokStream: var nlTokStream, - toks: var seq[nlTok], - satisfy: proc(tok: nlTok): bool, - ): nlParseStat = - var tok: nlTok - while tokStream.nextTok(tok): - toks.add(tok) - if satisfy(tok): +proc greed(parser: var nlParser, + satisfy: proc(tok: nlTok): bool): nlParseStat = + while parser.stream.progress(): + echo "im definitely here!" + parser.bnode.addTok(parser.stream.currTok) + if satisfy(parser.stream.currTok): return nlParseStat.OK result = nlParseStat.UNMATCHED -proc greedLine(tokStream: var nlTokStream, - toks: var seq[nlTok], +proc greedLine(parser: var nlParser, satisfy: proc(tok: nlTok): bool): nlParseStat = - var tok: nlTok - while tokStream.nextTok(tok): - toks.add(tok) - if satisfy(tok): - return true - result = + while parser.stream.progress(): + parser.bnode.addTok(parser.stream.currTok) + if satisfy(parser.stream.currTok): + return nlParseStat.OK + elif parser.stream.currTok.tKind == tkEOL: + return nlParseStat.UNMATCHED + result = nlParseStat.UNMATCHED #[ Templates for generating greed satisfying conditions. ]# # Satisfied if it finds nlTok of type matchType -template satisfyMatch(matchType: nlTokType) = - proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType) - -# Satisfied if it finds nlTok of type matchType or EOL reached -template satisfyMatchEOL(matchType: nlTokType) = - proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType or tok.tType == nlTokType.EOL) +template satisfyMatch(matchType: nlTokKind): untyped = + (proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType))