From f8697bd66297dcfd3cbff6f492332d0cb1890094 Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 02:09:43 +1000 Subject: [PATCH 01/12] 1 gazillion changes (mostly documenting my insanity optimizing + naming) --- src/nlx.nim | 6 +-- src/noether/lexer/lstream.nim | 20 ++++---- src/noether/lexer/tok.nim | 28 ++--------- src/noether/lexer/tokbuilding.nim | 28 +++++------ src/noether/lexer/tokstream.nim | 29 ++++++----- src/noether/lexer/toktype.nim | 79 ++++++++++++++++-------------- src/noether/parser/nodes.nim | 51 +++++++++++++++----- src/noether/parser/parser.nim | 80 ++++++++++++++----------------- src/noether/parser/parseutil.nim | 68 +++++++++++++++----------- 9 files changed, 206 insertions(+), 183 deletions(-) diff --git a/src/nlx.nim b/src/nlx.nim index 4b0c678..75f59d4 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -3,6 +3,7 @@ import noether/lexer/tok import noether/lexer/tokstream import noether/parser/parser +{.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" @@ -11,9 +12,8 @@ when isMainModule: var tokStream = newTokStream(filename, isFile=true) # # DumpTok - # var tok: nlTok - # while tokStream.nextTok(tok): - # echo tok + # while tokStream.progress(): + # echo tokStream.currTok # DumpTree discard parse(tokStream) diff --git a/src/noether/lexer/lstream.nim b/src/noether/lexer/lstream.nim index 034f48b..44138e0 100644 --- a/src/noether/lexer/lstream.nim +++ b/src/noether/lexer/lstream.nim @@ -41,7 +41,7 @@ proc outOfBounds*(lstream: nlLStream): bool = result = (lstream.pos > lstream.line.len - 1) # Progress the lex stream to the next line (if available) -proc progLine*(lstream: var nlLStream): bool = +proc progressLine*(lstream: var nlLStream): bool = if lstream.stream.readLine(lstream.line): inc lstream.lineNum lstream.pos = Natural 0 @@ -50,17 +50,17 @@ proc progLine*(lstream: var nlLStream): bool = # Progress the lex stream to the next character in the line # forcefully (aka does NOT check if we reached EOL) -proc forceProgChar*(lstream: var nlLStream) = +proc forceProgressChar*(lstream: var nlLStream) = inc lstream.pos -# Progress the lex stream to the next character (if available) -proc progress*(lstream: var nlLStream): bool = - if not lstream.atEOL(): - lstream.forceProgChar() - result = true - else: - # attempt to progress next line past EOL - result = lstream.progLine() +# # Progress the lex stream to the next character (if available) +# proc progressChar*(lstream: var nlLStream): bool = +# if not lstream.atEOL(): +# lstream.forceProgressChar() +# result = true +# else: +# # attempt to progress next line past EOL +# result = lstream.progressLine() proc currChar*(lstream: nlLStream): char = result = lstream.line[lstream.pos] diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index fb3067c..7715b8f 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -2,7 +2,7 @@ include toktype type nlTok* = object - tType*: nlTokType + tKind*: nlTokKind lit*: string lineNum*: Natural startPos*: Natural @@ -12,29 +12,11 @@ type # all other fields are expected to be filled out later. proc emptyTok*(startPos: int): nlTok = result = nlTok( - tType: nlTokType.NONE, + tKind: tkNONE, lit: "", startPos: Natural startPos, ) -# Checks if an nlTok has nlTokType.NONE -proc isTokUntyped*(tType: nlTokType): bool = - result = (tType == nlTokType.NONE) - -# Checks if an nlTok has nlTokType.EOL -proc isTokEOL*(tok: nlTok): bool = - result = (tok.tType == nlTokType.EOL) - - - -# This method is only used to convert null -# terminator nlToks into line-feed ones. -# Returns a copy of an nlTok, changing its type -proc tokTermToLineFeed*(tok: nlTok): nlTok = - result = nlTok( - tType: nlTokType.LNFD, - lit: tok.lit, - lineNum: tok.lineNum, - startPos: tok.startPos, - endPos: tok.endPos, - ) +# Checks if an nlTok has tkNONE +proc isUntyped*(tKind: nlTokKind): bool = + result = (tKind == tkNONE) diff --git a/src/noether/lexer/tokbuilding.nim b/src/noether/lexer/tokbuilding.nim index 99022ee..46a2222 100644 --- a/src/noether/lexer/tokbuilding.nim +++ b/src/noether/lexer/tokbuilding.nim @@ -12,7 +12,7 @@ type # Generates an EOL token for the nlTokStream's state proc EOLTok(tokStream: nlTokStream): nlTok = result = nlTok( - tType: nlTokType.EOL, + tKind: tkEOL, lit: "\0", lineNum: Natural tokStream.lstream.lineNum, startPos: Natural tokStream.lstream.pos, @@ -40,46 +40,46 @@ proc flushBuild(tokStream: var nlTokStream): nlTok = # Returns whether the build token has a set type yet. # This indicates that the build token should inherit -# the nlTokType of the nlLStream's next character. +# the nlTokKind of the nlLStream's next character. proc isUntypedBuild(tokStream: nlTokStream): bool = - result = isTokUntyped(tokStream.build.tType) + result = tokStream.build.tKind.isUntyped() -# Check whether an nlTokType is "compatible" with the build token. +# Check whether an nlTokKind is "compatible" with the build token. # NOTE: flushBuild() should be called when an incompatible token is discovered. -proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool = - result = (tType == tokStream.build.tType) +proc isCompatibleBuild(tokStream: nlTokStream, tKind: nlTokKind): bool = + result = (tKind == tokStream.build.tKind) # Add a character to the nlTokStream's build token. # Flushes and returns the build token if "fully built", # and a boolean indicating whether the nlTokStream can progress. -proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = +proc progressBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = # the "pos > EOL" invalid state is used intentionally # to indicate all tokens have been built, and return EOL Token if tokStream.lstream.outOfBounds(): buildTok = some(EOLTok(tokStream)) return true # can progress once more - let tType = getTokType(tokStream.lstream.currChar()) + let tKind = getTokType(tokStream.lstream.currChar()) # untyped build tokens must inherited immediately if isUntypedBuild(tokStream): - tokStream.build.tType = tType + tokStream.build.tKind = tKind # check if EOL reached if tokStream.lstream.atEOL(): # flush old build token, the new one can be left untyped - let compatible = isCompatibleBuild(tokStream, tType) + let compatible = isCompatibleBuild(tokStream, tKind) result = false # DO NOT PROGRESS if compatible: # force the lstream into an invalid state by progressing beyond EOL - # we can then detect this state on the next progBuild and return + # we can then detect this state on the next progressBuild and return # an EOL character (very unsafe implementation but it works well) - tokStream.lstream.forceProgChar() + tokStream.lstream.forceProgressChar() buildTok = some(flushBuild(tokStream)) # check character and build token compatability - elif not isCompatibleBuild(tokStream, tType): + elif not isCompatibleBuild(tokStream, tKind): # flush old build token, the new one inherits type buildTok = some(flushBuild(tokStream)) - tokStream.build.tType = tType + tokStream.build.tKind = tKind result = true # can progress else: buildTok = none(nlTok) diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim index c3fb3f2..02a045e 100644 --- a/src/noether/lexer/tokstream.nim +++ b/src/noether/lexer/tokstream.nim @@ -9,44 +9,43 @@ proc newTokStream*(content: string, isFile: bool = false): nlTokStream = # 1. initialise an empty build token # 2. progress to the first line result.resetBuild() - discard result.lstream.progLine() + discard result.lstream.progressLine() # Defines a short-hand notation for getting the current line -proc currLine*(tokStream: nlTokStream): string = +proc line*(tokStream: nlTokStream): string = result = tokStream.lstream.line -# Reimplements nlLStream.progress() for nlTokStream +# Reimplements nlLStream.progressChar for nlTokStream # to account for additional structure (ie the build token) -proc progChar(tokStream: var nlTokStream): bool = +# NOTE: progressChar progresses to lstream's next char +proc progressChar(tokStream: var nlTokStream): bool = if not tokStream.lstream.atEOL(): - tokStream.lstream.forceProgChar() + tokStream.lstream.forceProgressChar() result = true else: # attempt to progress to next line past EOL - result = tokStream.lstream.progLine() + result = tokStream.lstream.progressLine() tokStream.resetBuild() -# Generates and sets (by reference) the next token in the stream, -# via repeatedly calling progBuild() and progChar(). +# Generates and progress the next token in the nlTokStream. +# via repeatedly calling progressBuild() and progressChar(). # Returns a boolean indicating whether EOF has been reached. -# NOTE: progBuild adds lstream's current char to the build token -# NOTE: progChar progresses to lstream's next char -proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool = +# NOTE: access the new token via `tokStream.tok` +proc progress*(tokStream: var nlTokStream): bool = # Return prematurely if already closed if tokStream.closed: return false while true: var flushedTok: Option[nlTok] let - canProgress = tokStream.progBuild(flushedTok) + canProgress = tokStream.progressBuild(flushedTok) buildComplete = flushedTok.isSome # canProgress & EOF reached => no more tokens to build :) # NOTE: reachedEOF and not canProgress => more tokens unwrapping if buildComplete: # return the finished build token, and save it as the current token - tok = flushedTok.get() - tokStream.currTok = tok - if canProgress and not tokStream.progChar(): + tokStream.currTok = flushedTok.get() + if canProgress and not tokStream.progressChar(): tokStream.closed = true return buildComplete elif buildComplete: diff --git a/src/noether/lexer/toktype.nim b/src/noether/lexer/toktype.nim index 0f40023..49add5b 100644 --- a/src/noether/lexer/toktype.nim +++ b/src/noether/lexer/toktype.nim @@ -1,54 +1,59 @@ type - # nlTokType allows primitive nlToks to be typed, - # the nlTokType enum should never be directly + # nlTokKind allows primitive nlToks to be typed, + # the nlTokKind enum should never be directly # accessed. Use the interface in this file instead. - nlTokType* = enum - NONE, # Placeholder Value - EOF, # End of File - EOL, # End of Line (\0 --> EOL) - WORD, # Alphanumeric token - SYMB, # Symbolic token - LNFD, # \r \n Line-Feed - WTSP, # ' ' \t Whitespace - LPAR, # ( Left Parenthesis - RPAR, # ) Right Parenthesis - LBRA, # { Left Brace - RBRA, # } Right Brace - LSQB, # [ Left Square Bracket - RSQB, # ] Right Square Bracket - # LANB, # < Left Angle Bracket - # RANB, # > Right Angle Bracket - SQUO, # ' Single Quotation Marking - DQUO, # " Double Quotation Marking - GRVA, # ` Grave Accent - HASH, # # Number Sign (Hashtag) + nlTokKind* = enum + tkNONE, # Placeholder Value + + tkEOF, # End of File + tkEOL, # End of Line (\0 --> EOL) + + tkWORD, # Alphanumeric token + tkSYMB, # Symbolic token + + tkLNFD, # \r \n Line-Feed + tkWTSP, # ' ' \t Whitespace + + # RESERVED SYMBOLS + tkLPAR, # ( Left Parenthesis + tkRPAR, # ) Right Parenthesis + tkLBRA, # { Left Brace + tkRBRA, # } Right Brace + tkLSQB, # [ Left Square Bracket + tkRSQB, # ] Right Square Bracket + # tkLANB, # < Left Angle Bracket + # tkRANB, # > Right Angle Bracket + tkSQUO, # ' Single Quotation Marking + tkDQUO, # " Double Quotation Marking + tkGRVA, # ` Grave Accent + tkHASH, # # Number Sign (Hashtag) -# Classifies a character to its nlTokType -proc getTokType*(c: char): nlTokType = +# Classifies a character to its nlTokKind +proc getTokType*(c: char): nlTokKind = case c: of '\0', '\r', '\n': - result = nlTokType.EOL + result = tkEOL of ' ', '\t': - result = nlTokType.WTSP + result = tkWTSP of '(': - result = nlTokType.LPAR + result = tkLPAR of ')': - result = nlTokType.RPAR + result = tkRPAR of '{': - result = nlTokType.LBRA + result = tkLBRA of '}': - result = nlTokType.RBRA + result = tkRBRA of '[': - result = nlTokType.LSQB + result = tkLSQB of ']': - result = nlTokType.RSQB + result = tkRSQB of '\'': - result = nlTokType.SQUO + result = tkSQUO of '\"': - result = nlTokType.DQUO + result = tkDQUO of '`': - result = nlTokType.GRVA + result = tkGRVA of '#': - result = nlTokType.HASH + result = tkHASH else: - result = nlTokType.WORD + result = tkWORD diff --git a/src/noether/parser/nodes.nim b/src/noether/parser/nodes.nim index 23cf742..c31285e 100644 --- a/src/noether/parser/nodes.nim +++ b/src/noether/parser/nodes.nim @@ -1,19 +1,48 @@ +import std/options from ../lexer/tok import nlTok -# from ../lexer/tokstream import type - # NOTE: by the end of parsing NO nodes should - # NOTE: have nlNodeType.NONE - nlNodeType* = enum - NONE, # Placeholder Value - TERM, # Indicates the tree has terminated - STRL, # String Literal - CHRL, # Character Literal + # NOTE: by the end of parsing NO nodes should have nkNone + nlNodeKind* = enum + nkNone, # Placeholder Value + + nkStrLit, # String Literal + nkChrLit, # Character Literal + + # NOTE: always check parent != nil when traversing the tree nlNode* {.acyclic.} = ref object of RootObj - nType*: nlNodeType - toks*: seq[nlTok] # nodes store the tokens that build them - # left, right: nlNode + nKind*: nlNodeKind + toks*: seq[nlTok] # nodes (may) store the tokens that build them + parent*: nlNode + + # Purely abstract type that all nlNode objects + # with children are expected to inherit from. + nlBranchNode* {.acyclic.} = ref object of nlNode + child: UncheckedArray[nlNode] + + nlBiNode* {.acyclic.} = ref object of nlBranchNode + +proc childCount*(node: nlNode): int {.inline.} = 0 +proc childCount*(node: nlBiNode): int {.inline.} = 2 + +proc getChild*(node: nlNode, i: int): Option[nlNode] {.inline.} = + result = none(nlNode) +proc getChild*(node: nlBranchNode, i: int): Option[nlNode] {.inline.} = + result = some(node.child[i]) + +proc newNode*(nKind: nlNodeKind): nlNode = + result = nlNode( + nKind: nKind, + ) + +proc newBiNode*(nKind: nlNodeKind): nlNode = + result = nlBiNode( + nKind: nKind, + ) # Short-hand way of appending a token to a node's token sequence proc addTok*(node: nlNode, tok: nlTok) = + echo "AM I HERE?" + echo node[] + echo node.toks node.toks.add(tok) diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim index 0598075..4654fb3 100644 --- a/src/noether/parser/parser.nim +++ b/src/noether/parser/parser.nim @@ -3,61 +3,55 @@ include parseutil # NOTE: Matching between two tokens will fill `node` with everything # NOTE: between those two tokens EXCLUDING the two tokens themselves. -proc parseMatch(tokStream: var nlTokStream, - node: var nlNode, - matchType: nlTokType): nlParseStat = +proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat = result = greed( - tokStream, - node.toks, + parser, satisfyMatch(matchType), ) -proc parseMatchLine(tokStream: var nlTokStream, - node: var nlNode, - matchType: nlTokType): nlParseStat = - result = greed( - tokStream, - node.toks, - satisfyMatchEOL(matchType), +proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = + result = greedLine( + parser, + satisfyMatch(matchType), ) -proc parseStrL(tokStream: var nlTokStream, node: var nlNode): nlParseStat = - node = nlNode( - nType: nlNodeType.STRL - ) - node.addTok(tokStream.currTok) - result = nlParseStat.UNCLOSED * not greedEOL(tokStream, node.toks, nlTokType.DQUO) +proc parseStrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatch(tkDQUO) -proc parseChrL(tokStream: var nlTokStream, node: var nlNode): bool = - node = nlNode( - nType: nlNodeType.CHRL - ) - node.addTok(tokStream.currTok) - # TWO ERRORS ARE POSSIBLE, 1: content too big, 2: never closed - result = greedEOL(tokStream, node.toks, nlTokType.SQUO) - -# Attempt to form an nlAST from a nlTokStream -proc parse*(tokStream: var nlTokStream): nlNode = - var tok: nlTok - var node: nlNode - while tokStream.nextTok(tok): - case tok.tType: - of nlTokType.DQUO: +proc parseChrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatch(tkSQUO) + +proc parseStmt(parser: var nlParser): nlParseStat = + # initialise build node as none just for the hell of it + + while parser.stream.progress(): + echo parser.stream.currTok + case parser.stream.currTok.tKind + of tkDQUO: # Attempt to parse string literal - if not parseStrL(tokStream, node): + if parser.parseStrLit() != nlParseStat.OK: echo "Unmatched Double Quotation! Malformed String Literal" - echo tokStream.currLine() - echo repeat(" ", tok.startPos), '^' + echo parser.stream.line + echo repeat(" ", parser.stream.currTok.startPos), '^' else: echo "Parsed String Literal" - echo node[] - of nlTokType.SQUO: + echo parser.bnode[] + of tkSQUO: # Attempt to parse string literal - if not parseChrL(tokStream, node): + if parser.parseChrLit() != nlParseStat.OK: echo "Unmatched Single Quotation! Malformed Character Literal" - echo tokStream.currLine() - echo repeat(" ", tok.startPos), '^' + echo parser.stream.line + echo repeat(" ", parser.stream.currTok.startPos), '^' else: - echo "Parsed String Literal" - echo node[] + echo "Parsed Character Literal" + echo parser.bnode[] else: echo "blah blah unhandled case" + result = nlParseStat.OK + +# Attempt to parse nlAST from nlTokStream +proc parse*(tokStream: var nlTokStream): nlAST = + var parser = newParser(tokStream) + echo ' ' + discard parser.parseStmt() + + result = parser.ast diff --git a/src/noether/parser/parseutil.nim b/src/noether/parser/parseutil.nim index 6fa1243..4d9deb6 100644 --- a/src/noether/parser/parseutil.nim +++ b/src/noether/parser/parseutil.nim @@ -2,21 +2,41 @@ import nodes import ../lexer/tokstream type - # NOTE: Values above __FAIL__ indicate a failed state - nlParseStat* = enum + # NOTE1: Values above MARKER_FAIL indicate a failed state + # NOTE2: nlParseStat is marked pure out of habit that's all + nlParseStat* {.pure.} = enum OK, - __FAIL__, - MIDAS, # Greedy search was never satisfied + MARKER_FAIL, UNMATCHED, TOOBIG, + nlAST* = object + root: nlNode + + nlParser* = object + stream: nlTokStream + ast: nlAST + # the "build node" is a reference to the AST node + # the parser is currently modifying/building from + # NOTE: bnode changes frequently, it is NOT the root + bnode: nlNode + + proc `*`(stat: nlParseStat, b: bool): nlParseStat = result = if b: stat else: nlParseStat.OK proc isFail*(stat: nlParseStat): bool = - result = (stat >= nlParseStat.__FAIL__) + result = (stat >= nlParseStat.MARKER_FAIL) +proc newParser*(tokStream: var nlTokStream): nlParser = + let rootNode = newNode(nkNone) + result = nlParser( + stream: tokStream, + ast: rootNode, + bnode: rootNode, + ) + #[ "Greed" refers to something I mentioned in my discussion on | Noether's grammar (in an EBNF-like language). Greed just | means "everything until a condition is satisified". @@ -25,34 +45,28 @@ proc isFail*(stat: nlParseStat): bool = # Greed will consume anything until a condition is satisfied # Returns false if the greed was never satisfied (OMG!!) -proc greed(tokStream: var nlTokStream, - toks: var seq[nlTok], - satisfy: proc(tok: nlTok): bool, - ): nlParseStat = - var tok: nlTok - while tokStream.nextTok(tok): - toks.add(tok) - if satisfy(tok): +proc greed(parser: var nlParser, + satisfy: proc(tok: nlTok): bool): nlParseStat = + while parser.stream.progress(): + echo "im definitely here!" + parser.bnode.addTok(parser.stream.currTok) + if satisfy(parser.stream.currTok): return nlParseStat.OK result = nlParseStat.UNMATCHED -proc greedLine(tokStream: var nlTokStream, - toks: var seq[nlTok], +proc greedLine(parser: var nlParser, satisfy: proc(tok: nlTok): bool): nlParseStat = - var tok: nlTok - while tokStream.nextTok(tok): - toks.add(tok) - if satisfy(tok): - return true - result = + while parser.stream.progress(): + parser.bnode.addTok(parser.stream.currTok) + if satisfy(parser.stream.currTok): + return nlParseStat.OK + elif parser.stream.currTok.tKind == tkEOL: + return nlParseStat.UNMATCHED + result = nlParseStat.UNMATCHED #[ Templates for generating greed satisfying conditions. ]# # Satisfied if it finds nlTok of type matchType -template satisfyMatch(matchType: nlTokType) = - proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType) - -# Satisfied if it finds nlTok of type matchType or EOL reached -template satisfyMatchEOL(matchType: nlTokType) = - proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType or tok.tType == nlTokType.EOL) +template satisfyMatch(matchType: nlTokKind): untyped = + (proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType)) From 2af3000c2ec80230adbcf74d82ec353cd994037f Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 02:11:52 +1000 Subject: [PATCH 02/12] eeeeekkk typo :( --- src/noether/parser/nodes.nim | 1 - src/noether/parser/parseutil.nim | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/noether/parser/nodes.nim b/src/noether/parser/nodes.nim index c31285e..a50eee0 100644 --- a/src/noether/parser/nodes.nim +++ b/src/noether/parser/nodes.nim @@ -42,7 +42,6 @@ proc newBiNode*(nKind: nlNodeKind): nlNode = # Short-hand way of appending a token to a node's token sequence proc addTok*(node: nlNode, tok: nlTok) = - echo "AM I HERE?" echo node[] echo node.toks node.toks.add(tok) diff --git a/src/noether/parser/parseutil.nim b/src/noether/parser/parseutil.nim index 4d9deb6..386b03a 100644 --- a/src/noether/parser/parseutil.nim +++ b/src/noether/parser/parseutil.nim @@ -32,7 +32,9 @@ proc newParser*(tokStream: var nlTokStream): nlParser = let rootNode = newNode(nkNone) result = nlParser( stream: tokStream, - ast: rootNode, + ast: nlAST( + root: rootNode + ), bnode: rootNode, ) @@ -48,7 +50,6 @@ proc newParser*(tokStream: var nlTokStream): nlParser = proc greed(parser: var nlParser, satisfy: proc(tok: nlTok): bool): nlParseStat = while parser.stream.progress(): - echo "im definitely here!" parser.bnode.addTok(parser.stream.currTok) if satisfy(parser.stream.currTok): return nlParseStat.OK From 4a8f44d23f17a628a96efc438fcc3eaa8cb72b93 Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 02:23:54 +1000 Subject: [PATCH 03/12] Fixed parseStmt called on uninitialized nlParser.ast Also parseStmt now discards nlTokKind.tkEOL (this shouldn't be left in long term, just a temporary solution) --- src/noether.nim | 2 +- src/noether/parser/nodes.nim | 2 -- src/noether/parser/parser.nim | 15 +++++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/noether.nim b/src/noether.nim index 509b123..83c2b1b 100644 --- a/src/noether.nim +++ b/src/noether.nim @@ -2,4 +2,4 @@ # uses this file as the main entry point of the application. when isMainModule: - echo "Noether Lang" + echo "Noether Lang v0.1.0" diff --git a/src/noether/parser/nodes.nim b/src/noether/parser/nodes.nim index a50eee0..bd737c6 100644 --- a/src/noether/parser/nodes.nim +++ b/src/noether/parser/nodes.nim @@ -42,6 +42,4 @@ proc newBiNode*(nKind: nlNodeKind): nlNode = # Short-hand way of appending a token to a node's token sequence proc addTok*(node: nlNode, tok: nlTok) = - echo node[] - echo node.toks node.toks.add(tok) diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim index 4654fb3..dcdcc06 100644 --- a/src/noether/parser/parser.nim +++ b/src/noether/parser/parser.nim @@ -24,28 +24,31 @@ proc parseStmt(parser: var nlParser): nlParseStat = # initialise build node as none just for the hell of it while parser.stream.progress(): - echo parser.stream.currTok + echo "Current Token: ", parser.stream.currTok case parser.stream.currTok.tKind of tkDQUO: # Attempt to parse string literal if parser.parseStrLit() != nlParseStat.OK: echo "Unmatched Double Quotation! Malformed String Literal" echo parser.stream.line - echo repeat(" ", parser.stream.currTok.startPos), '^' + echo repeat(" ", parser.stream.currTok.startPos), '^', '\n' else: echo "Parsed String Literal" - echo parser.bnode[] + echo parser.bnode[], '\n' of tkSQUO: # Attempt to parse string literal if parser.parseChrLit() != nlParseStat.OK: echo "Unmatched Single Quotation! Malformed Character Literal" echo parser.stream.line - echo repeat(" ", parser.stream.currTok.startPos), '^' + echo repeat(" ", parser.stream.currTok.startPos), '^', '\n' else: echo "Parsed Character Literal" - echo parser.bnode[] + echo parser.bnode[], '\n' + of tkEOL: + # TODO: handle this case, don't just discard + discard else: - echo "blah blah unhandled case" + echo "blah blah unhandled case\n" result = nlParseStat.OK # Attempt to parse nlAST from nlTokStream From f8f90fe92daa9330f213ac7148c2a18f0fec9b1f Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 02:24:26 +1000 Subject: [PATCH 04/12] Added ultra simple build script for debug --- .gitignore | 4 ++++ src/ddemo | 3 +++ 2 files changed, 7 insertions(+) create mode 100755 src/ddemo diff --git a/.gitignore b/.gitignore index 0a37b21..814ced8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ __pycache__/ bin/ + +# TEMP: used while debugging +# (and cause I'm super duper lazy) +src/nlx diff --git a/src/ddemo b/src/ddemo new file mode 100755 index 0000000..35c7af2 --- /dev/null +++ b/src/ddemo @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +nim c nlx.nim +./nlx ../lang/demo/$1 From 8e6c0bbbfc6cd312530e5bea5e2d9ef209e8b28d Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 02:33:13 +1000 Subject: [PATCH 05/12] Fixed StrLit + ChrLit matching beyond EOL, also greed excludes satisfier --- src/noether/parser/parser.nim | 8 +++----- src/noether/parser/parseutil.nim | 8 +++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim index dcdcc06..7da349f 100644 --- a/src/noether/parser/parser.nim +++ b/src/noether/parser/parser.nim @@ -15,16 +15,14 @@ proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = ) proc parseStrLit(parser: var nlParser): nlParseStat = - result = parser.parseMatch(tkDQUO) + result = parser.parseMatchLine(tkDQUO) proc parseChrLit(parser: var nlParser): nlParseStat = - result = parser.parseMatch(tkSQUO) + result = parser.parseMatchLine(tkSQUO) proc parseStmt(parser: var nlParser): nlParseStat = - # initialise build node as none just for the hell of it - while parser.stream.progress(): - echo "Current Token: ", parser.stream.currTok + echo "----- Current Token: ", parser.stream.currTok case parser.stream.currTok.tKind of tkDQUO: # Attempt to parse string literal diff --git a/src/noether/parser/parseutil.nim b/src/noether/parser/parseutil.nim index 386b03a..8b9ef20 100644 --- a/src/noether/parser/parseutil.nim +++ b/src/noether/parser/parseutil.nim @@ -50,18 +50,20 @@ proc newParser*(tokStream: var nlTokStream): nlParser = proc greed(parser: var nlParser, satisfy: proc(tok: nlTok): bool): nlParseStat = while parser.stream.progress(): - parser.bnode.addTok(parser.stream.currTok) if satisfy(parser.stream.currTok): return nlParseStat.OK + # NOTE: the matched token is currently excluded + parser.bnode.addTok(parser.stream.currTok) result = nlParseStat.UNMATCHED proc greedLine(parser: var nlParser, satisfy: proc(tok: nlTok): bool): nlParseStat = while parser.stream.progress(): - parser.bnode.addTok(parser.stream.currTok) if satisfy(parser.stream.currTok): return nlParseStat.OK - elif parser.stream.currTok.tKind == tkEOL: + # NOTE: the matched token is currently excluded + parser.bnode.addTok(parser.stream.currTok) + if parser.stream.currTok.tKind == tkEOL: return nlParseStat.UNMATCHED result = nlParseStat.UNMATCHED From 72a6075123cbb2749a58908766e718ccd2dc0991 Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 03:26:22 +1000 Subject: [PATCH 06/12] nlParser now exposes a subset of the nlTokStream interface --- src/noether/parser/parser.nim | 14 +++++++------- src/noether/parser/parseutil.nim | 29 ++++++++++++++++++++++------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim index 7da349f..7daf91b 100644 --- a/src/noether/parser/parser.nim +++ b/src/noether/parser/parser.nim @@ -21,15 +21,15 @@ proc parseChrLit(parser: var nlParser): nlParseStat = result = parser.parseMatchLine(tkSQUO) proc parseStmt(parser: var nlParser): nlParseStat = - while parser.stream.progress(): - echo "----- Current Token: ", parser.stream.currTok - case parser.stream.currTok.tKind + while parser.progressStream(): + echo "----- Current Token: ", parser.currTok + case parser.currTok.tKind of tkDQUO: # Attempt to parse string literal if parser.parseStrLit() != nlParseStat.OK: echo "Unmatched Double Quotation! Malformed String Literal" - echo parser.stream.line - echo repeat(" ", parser.stream.currTok.startPos), '^', '\n' + echo parser.line + echo repeat(" ", parser.currTok.startPos), '^', '\n' else: echo "Parsed String Literal" echo parser.bnode[], '\n' @@ -37,8 +37,8 @@ proc parseStmt(parser: var nlParser): nlParseStat = # Attempt to parse string literal if parser.parseChrLit() != nlParseStat.OK: echo "Unmatched Single Quotation! Malformed Character Literal" - echo parser.stream.line - echo repeat(" ", parser.stream.currTok.startPos), '^', '\n' + echo parser.line + echo repeat(" ", parser.currTok.startPos), '^', '\n' else: echo "Parsed Character Literal" echo parser.bnode[], '\n' diff --git a/src/noether/parser/parseutil.nim b/src/noether/parser/parseutil.nim index 8b9ef20..d531490 100644 --- a/src/noether/parser/parseutil.nim +++ b/src/noether/parser/parseutil.nim @@ -20,6 +20,9 @@ type # the parser is currently modifying/building from # NOTE: bnode changes frequently, it is NOT the root bnode: nlNode + # flag indicating whether the parser is at + # the start of a new line (aka checking indentation) + inIndent: bool proc `*`(stat: nlParseStat, b: bool): nlParseStat = @@ -38,6 +41,18 @@ proc newParser*(tokStream: var nlTokStream): nlParser = bnode: rootNode, ) +# Exposes a subset of the nlTokStream interface +proc currTok(parser: var nlParser): nlTok = parser.stream.currTok +proc line(parser: var nlParser): string = parser.stream.line + +# Extends upon the functionality of nlTokStream.progress() +proc progressStream*(parser: var nlParser): bool = + result = parser.stream.progress() + if result and parser.currTok.tKind == tkEOL: + parser.inIndent = true + if + +proc setNewLine() #[ "Greed" refers to something I mentioned in my discussion on | Noether's grammar (in an EBNF-like language). Greed just @@ -49,21 +64,21 @@ proc newParser*(tokStream: var nlTokStream): nlParser = # Returns false if the greed was never satisfied (OMG!!) proc greed(parser: var nlParser, satisfy: proc(tok: nlTok): bool): nlParseStat = - while parser.stream.progress(): - if satisfy(parser.stream.currTok): + while parser.progressStream(): + if satisfy(parser.currTok): return nlParseStat.OK # NOTE: the matched token is currently excluded - parser.bnode.addTok(parser.stream.currTok) + parser.bnode.addTok(parser.currTok) result = nlParseStat.UNMATCHED proc greedLine(parser: var nlParser, satisfy: proc(tok: nlTok): bool): nlParseStat = - while parser.stream.progress(): - if satisfy(parser.stream.currTok): + while parser.progressStream(): + if satisfy(parser.currTok): return nlParseStat.OK # NOTE: the matched token is currently excluded - parser.bnode.addTok(parser.stream.currTok) - if parser.stream.currTok.tKind == tkEOL: + parser.bnode.addTok(parser.currTok) + if parser.currTok.tKind == tkEOL: return nlParseStat.UNMATCHED result = nlParseStat.UNMATCHED From 99db57dcfdf43e118f2812be560399e544acd0fb Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 08:48:31 +1000 Subject: [PATCH 07/12] YALR (Yet Another Lexer Refactor) --- lang/demo/single_toks.no | 2 + src/ddemo | 8 ++ src/nlx.nim | 27 ++-- src/noether/lexer/lstream.nim | 66 ---------- src/noether/lexer/tok.nim | 29 +++-- src/noether/lexer/tokbuilder.nim | 123 ++++++++++++++++++ src/noether/lexer/tokbuilding.nim | 86 ------------ .../lexer/{toktype.nim => tokkind.nim} | 6 +- src/noether/lexer/tokstream.nim | 71 +++++----- src/noether/lib/io.nim | 7 + 10 files changed, 208 insertions(+), 217 deletions(-) create mode 100644 lang/demo/single_toks.no delete mode 100644 src/noether/lexer/lstream.nim create mode 100644 src/noether/lexer/tokbuilder.nim delete mode 100644 src/noether/lexer/tokbuilding.nim rename src/noether/lexer/{toktype.nim => tokkind.nim} (93%) create mode 100644 src/noether/lib/io.nim diff --git a/lang/demo/single_toks.no b/lang/demo/single_toks.no new file mode 100644 index 0000000..683090a --- /dev/null +++ b/lang/demo/single_toks.no @@ -0,0 +1,2 @@ +[a]b(#) +(c)d[e] diff --git a/src/ddemo b/src/ddemo index 35c7af2..af30039 100755 --- a/src/ddemo +++ b/src/ddemo @@ -1,3 +1,11 @@ #!/usr/bin/env bash +set -e + +if [ -z "$1" ]; then + echo "Usage: ddemo DEMOFILE" + echo "Demo files are located in lang/demo" + exit 1 +fi + nim c nlx.nim ./nlx ../lang/demo/$1 diff --git a/src/nlx.nim b/src/nlx.nim index 75f59d4..adf95f0 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,22 +1,19 @@ import os -import noether/lexer/tok -import noether/lexer/tokstream -import noether/parser/parser +import noether/lib/io +import noether/lexer/[tok, tokstream] +# import noether/parser/parser {.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" - if paramCount() > 0: - let filename = paramStr(1) - var tokStream = newTokStream(filename, isFile=true) - - # # DumpTok - # while tokStream.progress(): - # echo tokStream.currTok + var inStream = if paramCount() > 0: streamFile(paramStr 1) + else: streamString(readAll stdin) - # DumpTree - discard parse(tokStream) - - else: - echo "usage: nlx filename" + var stream = newTokStream(inStream) + # # DumpTok + while stream.progress(): + echo stream.tok + + # DumpTree + # discard parse(tokStream) diff --git a/src/noether/lexer/lstream.nim b/src/noether/lexer/lstream.nim deleted file mode 100644 index 44138e0..0000000 --- a/src/noether/lexer/lstream.nim +++ /dev/null @@ -1,66 +0,0 @@ -import std/streams -import std/options - -import tok -export tok - -type - # Character streaming for the nlTokStream - nlLStream = object - stream: Stream - # row/column positions - line*: string - lineNum*: Natural - pos*: Natural - -proc streamFile*(filename: string): FileStream = - result = newFileStream(filename, fmRead) - -proc streamString*(str: string): StringStream = - result = newStringStream(str) - -proc newLStream*(content: string, isFile: bool = false): nlLStream = - result = nlLStream( - stream: if isFile: streamFile(content) else: streamString(content), - line: "", - lineNum: Natural 0, - pos: Natural 0, - ) - -# Checks whether we've reached EOL -# NOTE: also checks if we've surpassed it (ie invalid lstream.pos) -proc atEOL*(lstream: nlLStream): bool = - result = (lstream.pos >= lstream.line.len - 1) - -# Checks whether we are EXACTLY at EOL, but not surpassed -proc exactlyEOL*(lstream: nlLStream): bool = - result = (lstream.pos == lstream.line.len - 1) - -# Checks whether we have surpassed EOL -proc outOfBounds*(lstream: nlLStream): bool = - result = (lstream.pos > lstream.line.len - 1) - -# Progress the lex stream to the next line (if available) -proc progressLine*(lstream: var nlLStream): bool = - if lstream.stream.readLine(lstream.line): - inc lstream.lineNum - lstream.pos = Natural 0 - return true - return false - -# Progress the lex stream to the next character in the line -# forcefully (aka does NOT check if we reached EOL) -proc forceProgressChar*(lstream: var nlLStream) = - inc lstream.pos - -# # Progress the lex stream to the next character (if available) -# proc progressChar*(lstream: var nlLStream): bool = -# if not lstream.atEOL(): -# lstream.forceProgressChar() -# result = true -# else: -# # attempt to progress next line past EOL -# result = lstream.progressLine() - -proc currChar*(lstream: nlLStream): char = - result = lstream.line[lstream.pos] diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index 7715b8f..08aba66 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -1,22 +1,25 @@ -include toktype +include tokkind type - nlTok* = object - tKind*: nlTokKind - lit*: string - lineNum*: Natural - startPos*: Natural - endPos*: Natural + nlTok* = tuple + # NOTE: nlTokBuilder will mutate nlTok.kind + kind: nlTokKind + lit: string + lineNum: int + startPos: int + endPos: int # Generates an "empty" nlTok with only a startPos, # all other fields are expected to be filled out later. -proc emptyTok*(startPos: int): nlTok = - result = nlTok( - tKind: tkNONE, +proc emptyTok*(startPos: int): nlTok {.inline.} = + result = ( + kind: tkNONE, lit: "", - startPos: Natural startPos, + lineNum: 0, + startPos: startPos, + endPos: startPos, ) # Checks if an nlTok has tkNONE -proc isUntyped*(tKind: nlTokKind): bool = - result = (tKind == tkNONE) +proc isUntyped*(tok: nlTok): bool {.inline.} = + result = (tok.kind == tkNONE) diff --git a/src/noether/lexer/tokbuilder.nim b/src/noether/lexer/tokbuilder.nim new file mode 100644 index 0000000..357841a --- /dev/null +++ b/src/noether/lexer/tokbuilder.nim @@ -0,0 +1,123 @@ +import + streams, + options + +import tok +export tok + +type + # Abstracts the "building process" (lexing) + # of nlTok objects from a given Stream of characters. + nlTokBuilder* = object + stream: Stream + tok: nlTok # the build token + # track line number, line content, etc + line: string + lineNum: int + pos: int + # save char and pos and its token type + char: char + cTKind: nlTokKind + +proc atEOL(builder: nlTokBuilder): bool {.inline.} = + result = (builder.char == '\n') +proc atEOF(builder: nlTokBuilder): bool {.inline.} = + result = (builder.char == '\0') + +# Initialise a new token builder +proc newBuilder(stream: var Stream): nlTokBuilder = + # NOTE: initial builder.char value is arbitrary, + # NOTE: but CANNOT be initialised to the default '\0' + result = nlTokBuilder( + stream: stream, + tok: emptyTok(0), + line: "", + lineNum: 1, + pos: -1, # after initial readChar this -> 0 + char: '\0', # use \0 as initial invalid char + ) + + +#[ ====================================================== ] + | nlTokBuilder Internal Interface for Token Construction ] + ]# + +# Reset the build token to be "empty" +proc resetBuild(builder: var nlTokBuilder) = + builder.tok = emptyTok(builder.pos) + +# "Finishes" the build token by setting various properties +proc finishBuild(builder: var nlTokBuilder) = + builder.tok.lineNum = builder.lineNum + builder.tok.endPos = builder.pos + builder.tok.lit = builder.line[builder.tok.startPos ..< builder.line.high] + +# Finish, return, and reset the build token +proc flushBuild(builder: var nlTokBuilder): nlTok = + echo "Flush @", builder.pos + finishBuild(builder) + result = builder.tok + resetBuild(builder) + +# Is the build token "compatible" with the current char? +# NOTE: flushBuild() is called if incompatible +proc isCompatibleBuild(builder: nlTokBuilder): bool = + result = (builder.cTKind == builder.tok.kind) + +# Inherit the build token's type from current char +proc inherit(builder: var nlTokBuilder) = + builder.tok.kind = builder.cTKind + +# Add a character to the nlTokBuilder's build token. +# Flushes and returns the build token if "fully built", +# and a boolean indicating whether the nlTokBuilder can progress. +proc appendBuild(builder: var nlTokBuilder, flushed: var Option[nlTok]): bool = + # untyped build tokens inherit type immediately + if builder.tok.isUntyped(): + builder.inherit() + + # check if EOF reached + # if builder.atEOL(): + # echo "EOL DETECT 1" + # result = false # DO NOT PROGRESS + # flushed = some(flushBuild(builder)) + # check character and build token compatability + if not isCompatibleBuild(builder): + # flush old build token, the new one inherits type + flushed = some(flushBuild(builder)) + builder.inherit() + result = true # can progress + else: + flushed = none(nlTok) + result = true # can progress + +#[ ========================================== ] + | nlTokBuilder Char Stream Reading Interface ] + ]# + +# Read the next char in the stream without +# checking whether it is safe to do so +proc forceReadChar(builder: var nlTokBuilder) {.inline.} = + echo "read" + inc builder.pos + builder.char = builder.stream.readChar() + builder.cTKind = getTokKind(builder.char) + builder.line.add(builder.char) + +# Read the next char in the stream +# NOTE: readChar raises IOError on error, returns \0 on EOF +proc readChar(builder: var nlTokBuilder): bool = + if builder.atEOL(): + echo "EOL DETECT 2" + inc builder.lineNum + # sets builder.char to '\0' if EOF + builder.forceReadChar() + result = builder.atEOF() + +# Read until EOL and return the current line +# NOTE: Does NOT update the builder's state (unsafe) +# NOTE: ONLY call if a lex/parse error needs displaying +proc unsafeGetLine(builder: var nlTokBuilder): string = + while not builder.atEOL() and builder.readChar(): + discard + result = builder.line diff --git a/src/noether/lexer/tokbuilding.nim b/src/noether/lexer/tokbuilding.nim deleted file mode 100644 index 46a2222..0000000 --- a/src/noether/lexer/tokbuilding.nim +++ /dev/null @@ -1,86 +0,0 @@ -include lstream - -type - # Provides a stream-like interface for lexing nlToks - # Internally reliant on the functionality of nlLStream - nlTokStream* = object - lstream: nlLStream - build: nlTok # the build token - currTok*: nlTok # the current token - closed: bool # EOF + all tokens built - -# Generates an EOL token for the nlTokStream's state -proc EOLTok(tokStream: nlTokStream): nlTok = - result = nlTok( - tKind: tkEOL, - lit: "\0", - lineNum: Natural tokStream.lstream.lineNum, - startPos: Natural tokStream.lstream.pos, - endPos: Natural tokStream.lstream.pos, - ) - -# Resets the build token to an "empty" nlTok -proc resetBuild(tokStream: var nlTokStream) = - tokStream.build = emptyTok(tokStream.lstream.pos) - -# Completes a token generated by emptyTok() -# based on the nlTokStream's nlLStream's -# current line and character positions -proc finishBuild(ts: var nlTokStream) = - ts.build.lineNum = Natural ts.lstream.lineNum - ts.build.endPos = Natural ts.lstream.pos - ts.build.lit = ts.lstream.line[ts.build.startPos ..< ts.build.endPos] - -# Returns the nlTokStream's build token and -# empties the build token's contents. -proc flushBuild(tokStream: var nlTokStream): nlTok = - finishBuild(tokStream) - result = tokStream.build - resetBuild(tokStream) - -# Returns whether the build token has a set type yet. -# This indicates that the build token should inherit -# the nlTokKind of the nlLStream's next character. -proc isUntypedBuild(tokStream: nlTokStream): bool = - result = tokStream.build.tKind.isUntyped() - -# Check whether an nlTokKind is "compatible" with the build token. -# NOTE: flushBuild() should be called when an incompatible token is discovered. -proc isCompatibleBuild(tokStream: nlTokStream, tKind: nlTokKind): bool = - result = (tKind == tokStream.build.tKind) - -# Add a character to the nlTokStream's build token. -# Flushes and returns the build token if "fully built", -# and a boolean indicating whether the nlTokStream can progress. -proc progressBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = - # the "pos > EOL" invalid state is used intentionally - # to indicate all tokens have been built, and return EOL Token - if tokStream.lstream.outOfBounds(): - buildTok = some(EOLTok(tokStream)) - return true # can progress once more - - let tKind = getTokType(tokStream.lstream.currChar()) - # untyped build tokens must inherited immediately - if isUntypedBuild(tokStream): - tokStream.build.tKind = tKind - - # check if EOL reached - if tokStream.lstream.atEOL(): - # flush old build token, the new one can be left untyped - let compatible = isCompatibleBuild(tokStream, tKind) - result = false # DO NOT PROGRESS - if compatible: - # force the lstream into an invalid state by progressing beyond EOL - # we can then detect this state on the next progressBuild and return - # an EOL character (very unsafe implementation but it works well) - tokStream.lstream.forceProgressChar() - buildTok = some(flushBuild(tokStream)) - # check character and build token compatability - elif not isCompatibleBuild(tokStream, tKind): - # flush old build token, the new one inherits type - buildTok = some(flushBuild(tokStream)) - tokStream.build.tKind = tKind - result = true # can progress - else: - buildTok = none(nlTok) - result = true # can progress diff --git a/src/noether/lexer/toktype.nim b/src/noether/lexer/tokkind.nim similarity index 93% rename from src/noether/lexer/toktype.nim rename to src/noether/lexer/tokkind.nim index 49add5b..3d1d7b6 100644 --- a/src/noether/lexer/toktype.nim +++ b/src/noether/lexer/tokkind.nim @@ -29,9 +29,11 @@ type tkHASH, # # Number Sign (Hashtag) # Classifies a character to its nlTokKind -proc getTokType*(c: char): nlTokKind = +proc getTokKind*(c: char): nlTokKind = case c: - of '\0', '\r', '\n': + of '\0': + result = tkEOF + of '\r', '\n': result = tkEOL of ' ', '\t': result = tkWTSP diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim index 02a045e..309e9bb 100644 --- a/src/noether/lexer/tokstream.nim +++ b/src/noether/lexer/tokstream.nim @@ -1,52 +1,53 @@ -include tokbuilding +include tokbuilder + +type + # Provides a stream-like interface for lexing. + # Implemented as a wrapper for nlTokBuilder. + nlTokStream* = object + builder: nlTokBuilder + tok*: nlTok # the current token + isClosed: bool # EOF + all tokens built # Initialises a new nlTokStream on a string or file -proc newTokStream*(content: string, isFile: bool = false): nlTokStream = +proc newTokStream*(stream: var Stream): nlTokStream = result = nlTokStream( - lstream: newLStream(content, isFile=isFile), - closed: false, + builder: newBuilder(stream), + tok: emptyTok(0), + isClosed: false, ) - # 1. initialise an empty build token - # 2. progress to the first line - result.resetBuild() - discard result.lstream.progressLine() - -# Defines a short-hand notation for getting the current line -proc line*(tokStream: nlTokStream): string = - result = tokStream.lstream.line - -# Reimplements nlLStream.progressChar for nlTokStream -# to account for additional structure (ie the build token) -# NOTE: progressChar progresses to lstream's next char -proc progressChar(tokStream: var nlTokStream): bool = - if not tokStream.lstream.atEOL(): - tokStream.lstream.forceProgressChar() - result = true - else: - # attempt to progress to next line past EOL - result = tokStream.lstream.progressLine() - tokStream.resetBuild() +# Expose a subset of the nlTokBuilder interface +proc line*(stream: nlTokStream): string = + result = stream.builder.line +proc atEOL*(stream: nlTokStream): bool = + result = stream.builder.atEOL() + # Generates and progress the next token in the nlTokStream. # via repeatedly calling progressBuild() and progressChar(). # Returns a boolean indicating whether EOF has been reached. -# NOTE: access the new token via `tokStream.tok` -proc progress*(tokStream: var nlTokStream): bool = +# NOTE: access the new token via `stream.tok` +proc progress*(stream: var nlTokStream): bool = # Return prematurely if already closed - if tokStream.closed: + if stream.isClosed: return false while true: + # echo "\nProgressing..." var flushedTok: Option[nlTok] let - canProgress = tokStream.progressBuild(flushedTok) - buildComplete = flushedTok.isSome + atEOF = stream.builder.readChar() + newTokBuilt = flushedTok.isSome + discard stream.builder.appendBuild(flushedTok) + echo flushedTok + echo "atEOF: ", atEOF, "\nnewTokBuilt: ", newTokBuilt # canProgress & EOF reached => no more tokens to build :) # NOTE: reachedEOF and not canProgress => more tokens unwrapping - if buildComplete: + if newTokBuilt: # return the finished build token, and save it as the current token - tokStream.currTok = flushedTok.get() - if canProgress and not tokStream.progressChar(): - tokStream.closed = true - return buildComplete - elif buildComplete: + stream.tok = flushedTok.get() + # if canProgress and atEOF: + if atEOF: + if newTokBuilt: + stream.isClosed = true + return newTokBuilt + elif newTokBuilt: return true diff --git a/src/noether/lib/io.nim b/src/noether/lib/io.nim new file mode 100644 index 0000000..c7eb0eb --- /dev/null +++ b/src/noether/lib/io.nim @@ -0,0 +1,7 @@ +import std/streams + +proc streamFile*(filename: string): Stream {.inline.} = + result = newFileStream(filename, fmRead) + +proc streamString*(str: string): Stream {.inline.} = + result = newStringStream(str) From 07a9bda9ba017f1d60eed2e719b55264bdc90dcd Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 09:11:49 +1000 Subject: [PATCH 08/12] Once again fixed EOL handling... --- src/noether/lexer/tokbuilder.nim | 51 +++++++++++--------------------- src/noether/lexer/tokstream.nim | 4 +-- 2 files changed, 19 insertions(+), 36 deletions(-) diff --git a/src/noether/lexer/tokbuilder.nim b/src/noether/lexer/tokbuilder.nim index 357841a..0d2f212 100644 --- a/src/noether/lexer/tokbuilder.nim +++ b/src/noether/lexer/tokbuilder.nim @@ -54,70 +54,55 @@ proc finishBuild(builder: var nlTokBuilder) = # Finish, return, and reset the build token proc flushBuild(builder: var nlTokBuilder): nlTok = - echo "Flush @", builder.pos finishBuild(builder) result = builder.tok resetBuild(builder) -# Is the build token "compatible" with the current char? -# NOTE: flushBuild() is called if incompatible -proc isCompatibleBuild(builder: nlTokBuilder): bool = - result = (builder.cTKind == builder.tok.kind) +# Is the build token "compatible" with the current char? (if not then flushbuild) +# NOTE: This implicitly handles Windows CRLF, Unix LF, & Mac OS CR compatability +# NOTE: since atEOL => '\n', but '\r' and '\n' are both tkEOL so they both flush. +proc isIncompatibleBuild(builder: nlTokBuilder): bool = + result = (builder.cTKind != builder.tok.kind or builder.atEOL()) # Inherit the build token's type from current char proc inherit(builder: var nlTokBuilder) = builder.tok.kind = builder.cTKind # Add a character to the nlTokBuilder's build token. -# Flushes and returns the build token if "fully built", -# and a boolean indicating whether the nlTokBuilder can progress. -proc appendBuild(builder: var nlTokBuilder, flushed: var Option[nlTok]): bool = +# Flushes and returns the build token if finished. +proc appendBuild(builder: var nlTokBuilder): Option[nlTok] = # untyped build tokens inherit type immediately if builder.tok.isUntyped(): builder.inherit() - # check if EOF reached - # if builder.atEOL(): - # echo "EOL DETECT 1" - # result = false # DO NOT PROGRESS - # flushed = some(flushBuild(builder)) # check character and build token compatability - if not isCompatibleBuild(builder): + if isIncompatibleBuild(builder): # flush old build token, the new one inherits type - flushed = some(flushBuild(builder)) + result = some(flushBuild(builder)) builder.inherit() - result = true # can progress else: - flushed = none(nlTok) - result = true # can progress + result = none(nlTok) #[ ========================================== ] | nlTokBuilder Char Stream Reading Interface ] ]# -# Read the next char in the stream without -# checking whether it is safe to do so -proc forceReadChar(builder: var nlTokBuilder) {.inline.} = - echo "read" - inc builder.pos +# Read the next char in the stream +# NOTE: readChar raises IOError on error, returns \0 on EOF +proc readChar*(builder: var nlTokBuilder): bool = + if builder.atEOL(): + inc builder.lineNum + # sets builder.char to '\0' if EOF builder.char = builder.stream.readChar() builder.cTKind = getTokKind(builder.char) builder.line.add(builder.char) - -# Read the next char in the stream -# NOTE: readChar raises IOError on error, returns \0 on EOF -proc readChar(builder: var nlTokBuilder): bool = - if builder.atEOL(): - echo "EOL DETECT 2" - inc builder.lineNum - # sets builder.char to '\0' if EOF - builder.forceReadChar() + inc builder.pos result = builder.atEOF() # Read until EOL and return the current line # NOTE: Does NOT update the builder's state (unsafe) # NOTE: ONLY call if a lex/parse error needs displaying -proc unsafeGetLine(builder: var nlTokBuilder): string = +proc unsafeGetLine*(builder: var nlTokBuilder): string = while not builder.atEOL() and builder.readChar(): discard result = builder.line diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim index 309e9bb..e64f777 100644 --- a/src/noether/lexer/tokstream.nim +++ b/src/noether/lexer/tokstream.nim @@ -31,12 +31,10 @@ proc progress*(stream: var nlTokStream): bool = if stream.isClosed: return false while true: - # echo "\nProgressing..." - var flushedTok: Option[nlTok] let atEOF = stream.builder.readChar() + flushedTok = stream.builder.appendBuild() newTokBuilt = flushedTok.isSome - discard stream.builder.appendBuild(flushedTok) echo flushedTok echo "atEOF: ", atEOF, "\nnewTokBuilt: ", newTokBuilt # canProgress & EOF reached => no more tokens to build :) From d7fb1f0c899a189c7a48acbf5cb2c29d3c968a6d Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 09:38:08 +1000 Subject: [PATCH 09/12] Migrate nlTokBuilder + nlTokStream -> nlLexer --- src/nlx.nim | 12 +-- src/noether/lexer/tokbuilder.nim | 132 ++++++++++++++++++++----------- 2 files changed, 90 insertions(+), 54 deletions(-) diff --git a/src/nlx.nim b/src/nlx.nim index adf95f0..565aaae 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,19 +1,19 @@ import os import noether/lib/io -import noether/lexer/[tok, tokstream] +import noether/lexer/tokbuilder # import noether/parser/parser {.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" - var inStream = if paramCount() > 0: streamFile(paramStr 1) - else: streamString(readAll stdin) + var stream = if paramCount() > 0: streamFile(paramStr 1) + else: streamString(readAll stdin) - var stream = newTokStream(inStream) + var lexer = newLexer(stream) # # DumpTok - while stream.progress(): - echo stream.tok + while lexer.progress(): + echo lexer.tok # DumpTree # discard parse(tokStream) diff --git a/src/noether/lexer/tokbuilder.nim b/src/noether/lexer/tokbuilder.nim index 0d2f212..46e3b00 100644 --- a/src/noether/lexer/tokbuilder.nim +++ b/src/noether/lexer/tokbuilder.nim @@ -8,9 +8,11 @@ export tok type # Abstracts the "building process" (lexing) # of nlTok objects from a given Stream of characters. - nlTokBuilder* = object + nlLexer* = object stream: Stream - tok: nlTok # the build token + done*: bool + tok*: nlTok # new finished token + btok: nlTok # the build token # track line number, line content, etc line: string lineNum: int @@ -19,18 +21,18 @@ type char: char cTKind: nlTokKind -proc atEOL(builder: nlTokBuilder): bool {.inline.} = - result = (builder.char == '\n') -proc atEOF(builder: nlTokBuilder): bool {.inline.} = - result = (builder.char == '\0') +proc atEOL(lexer: nlLexer): bool {.inline.} = + result = (lexer.char == '\n') +proc atEOF(lexer: nlLexer): bool {.inline.} = + result = (lexer.char == '\0') -# Initialise a new token builder -proc newBuilder(stream: var Stream): nlTokBuilder = - # NOTE: initial builder.char value is arbitrary, - # NOTE: but CANNOT be initialised to the default '\0' - result = nlTokBuilder( +# Initialise a new lexer +proc newLexer*(stream: var Stream): nlLexer = + result = nlLexer( stream: stream, + done: false, tok: emptyTok(0), + btok: emptyTok(0), line: "", lineNum: 1, pos: -1, # after initial readChar this -> 0 @@ -39,70 +41,104 @@ proc newBuilder(stream: var Stream): nlTokBuilder = #[ ====================================================== ] - | nlTokBuilder Internal Interface for Token Construction ] + | nlLexer Internal Interface for Token Construction ] ]# # Reset the build token to be "empty" -proc resetBuild(builder: var nlTokBuilder) = - builder.tok = emptyTok(builder.pos) +proc resetBuild(lexer: var nlLexer) = + lexer.btok = emptyTok(lexer.pos) # "Finishes" the build token by setting various properties -proc finishBuild(builder: var nlTokBuilder) = - builder.tok.lineNum = builder.lineNum - builder.tok.endPos = builder.pos - builder.tok.lit = builder.line[builder.tok.startPos ..< builder.line.high] +proc finishBuild(lexer: var nlLexer) = + lexer.btok.lineNum = lexer.lineNum + lexer.btok.endPos = lexer.pos + lexer.btok.lit = lexer.line[lexer.btok.startPos ..< lexer.line.high] # Finish, return, and reset the build token -proc flushBuild(builder: var nlTokBuilder): nlTok = - finishBuild(builder) - result = builder.tok - resetBuild(builder) +proc flushBuild(lexer: var nlLexer): nlTok = + finishBuild(lexer) + result = lexer.btok + resetBuild(lexer) # Is the build token "compatible" with the current char? (if not then flushbuild) # NOTE: This implicitly handles Windows CRLF, Unix LF, & Mac OS CR compatability # NOTE: since atEOL => '\n', but '\r' and '\n' are both tkEOL so they both flush. -proc isIncompatibleBuild(builder: nlTokBuilder): bool = - result = (builder.cTKind != builder.tok.kind or builder.atEOL()) +proc isIncompatibleBuild(lexer: nlLexer): bool = + result = (lexer.cTKind != lexer.btok.kind or lexer.atEOL()) # Inherit the build token's type from current char -proc inherit(builder: var nlTokBuilder) = - builder.tok.kind = builder.cTKind +proc inherit(lexer: var nlLexer) = + lexer.btok.kind = lexer.cTKind -# Add a character to the nlTokBuilder's build token. +# Add a character to the nlLexer's build token. # Flushes and returns the build token if finished. -proc appendBuild(builder: var nlTokBuilder): Option[nlTok] = +proc appendBuild(lexer: var nlLexer): Option[nlTok] = # untyped build tokens inherit type immediately - if builder.tok.isUntyped(): - builder.inherit() + if lexer.btok.isUntyped(): + lexer.inherit() # check character and build token compatability - if isIncompatibleBuild(builder): + if isIncompatibleBuild(lexer): # flush old build token, the new one inherits type - result = some(flushBuild(builder)) - builder.inherit() + result = some(flushBuild(lexer)) + lexer.inherit() else: result = none(nlTok) -#[ ========================================== ] - | nlTokBuilder Char Stream Reading Interface ] +#[ ========================================= ] + | nlLexer Internal Char Streaming Interface ] ]# # Read the next char in the stream # NOTE: readChar raises IOError on error, returns \0 on EOF -proc readChar*(builder: var nlTokBuilder): bool = - if builder.atEOL(): - inc builder.lineNum - # sets builder.char to '\0' if EOF - builder.char = builder.stream.readChar() - builder.cTKind = getTokKind(builder.char) - builder.line.add(builder.char) - inc builder.pos - result = builder.atEOF() +proc readChar(lexer: var nlLexer): bool = + if lexer.atEOL(): + inc lexer.lineNum + # sets lexer.char to '\0' if EOF + lexer.char = lexer.stream.readChar() + lexer.cTKind = getTokKind(lexer.char) + lexer.line.add(lexer.char) + inc lexer.pos + result = lexer.atEOF() +#[ ======================== + | nlLexer Public Interface + ]# + # Read until EOL and return the current line -# NOTE: Does NOT update the builder's state (unsafe) +# NOTE: Does NOT update the lexer's state (unsafe) # NOTE: ONLY call if a lex/parse error needs displaying -proc unsafeGetLine*(builder: var nlTokBuilder): string = - while not builder.atEOL() and builder.readChar(): +proc unsafeGetLine*(lexer: var nlLexer): string = + while not lexer.atEOL() and lexer.readChar(): discard - result = builder.line + result = lexer.line + +# Lexes and returns the next token in the "token stream" +# via repeatedly calling readChar() and appendBuild(). +# Returns a boolean indicating whether EOF has been reached. +# NOTE: access the new token via `stream.tok` +proc progress*(lexer: var nlLexer): bool = + # Return prematurely if already closed + if lexer.done: + return false + while true: + let + atEOF = lexer.readChar() + flushedTok = lexer.appendBuild() + newTokBuilt = flushedTok.isSome + + if newTokBuilt: + lexer.tok = flushedTok.get() + # if canProgress and atEOF: + # if atEOF: + # if newTokBuilt: + # stream.isClosed = true + # return newTokBuilt + # elif newTokBuilt: + # return true + if newTokBuilt: + if atEOF: + lexer.done = true + return true + elif atEOF: + return false From f25e66e9ef7270bfaf1e2c6df5fa53467016e379 Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 09:41:15 +1000 Subject: [PATCH 10/12] Garbage collection *so to speak* --- src/nlx.nim | 2 +- src/noether/lexer/{tokbuilder.nim => lex.nim} | 0 src/noether/lexer/tokstream.nim | 51 ------------------- 3 files changed, 1 insertion(+), 52 deletions(-) rename src/noether/lexer/{tokbuilder.nim => lex.nim} (100%) delete mode 100644 src/noether/lexer/tokstream.nim diff --git a/src/nlx.nim b/src/nlx.nim index 565aaae..c7ef1d9 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,6 +1,6 @@ import os import noether/lib/io -import noether/lexer/tokbuilder +import noether/lexer/lex # import noether/parser/parser {.hint: "Don't forget to drink more water (^_^)".} diff --git a/src/noether/lexer/tokbuilder.nim b/src/noether/lexer/lex.nim similarity index 100% rename from src/noether/lexer/tokbuilder.nim rename to src/noether/lexer/lex.nim diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim deleted file mode 100644 index e64f777..0000000 --- a/src/noether/lexer/tokstream.nim +++ /dev/null @@ -1,51 +0,0 @@ -include tokbuilder - -type - # Provides a stream-like interface for lexing. - # Implemented as a wrapper for nlTokBuilder. - nlTokStream* = object - builder: nlTokBuilder - tok*: nlTok # the current token - isClosed: bool # EOF + all tokens built - -# Initialises a new nlTokStream on a string or file -proc newTokStream*(stream: var Stream): nlTokStream = - result = nlTokStream( - builder: newBuilder(stream), - tok: emptyTok(0), - isClosed: false, - ) - -# Expose a subset of the nlTokBuilder interface -proc line*(stream: nlTokStream): string = - result = stream.builder.line -proc atEOL*(stream: nlTokStream): bool = - result = stream.builder.atEOL() - -# Generates and progress the next token in the nlTokStream. -# via repeatedly calling progressBuild() and progressChar(). -# Returns a boolean indicating whether EOF has been reached. -# NOTE: access the new token via `stream.tok` -proc progress*(stream: var nlTokStream): bool = - # Return prematurely if already closed - if stream.isClosed: - return false - while true: - let - atEOF = stream.builder.readChar() - flushedTok = stream.builder.appendBuild() - newTokBuilt = flushedTok.isSome - echo flushedTok - echo "atEOF: ", atEOF, "\nnewTokBuilt: ", newTokBuilt - # canProgress & EOF reached => no more tokens to build :) - # NOTE: reachedEOF and not canProgress => more tokens unwrapping - if newTokBuilt: - # return the finished build token, and save it as the current token - stream.tok = flushedTok.get() - # if canProgress and atEOF: - if atEOF: - if newTokBuilt: - stream.isClosed = true - return newTokBuilt - elif newTokBuilt: - return true From 1181ea97434788914cf37951e26f2f930d4a04ca Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 12:51:03 +1000 Subject: [PATCH 11/12] Restructure attempt #087 :( --- src/nlx.nim | 26 ++++-- src/noether/lexer/lex.nim | 44 ++++++++-- src/noether/lexer/tok.nim | 30 ++++++- src/noether/lexer/tokkind.nim | 60 -------------- src/noether/parser/parse.nim | 58 +++++++++++++ src/noether/parser/parser.nim | 134 +++++++++++++++++++------------ src/noether/parser/parseutil.nim | 90 --------------------- 7 files changed, 227 insertions(+), 215 deletions(-) create mode 100644 src/noether/parser/parse.nim delete mode 100644 src/noether/parser/parseutil.nim diff --git a/src/nlx.nim b/src/nlx.nim index c7ef1d9..e145943 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,19 +1,29 @@ import os import noether/lib/io import noether/lexer/lex -# import noether/parser/parser +import noether/parser/parse {.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" - var stream = if paramCount() > 0: streamFile(paramStr 1) + # really lazy argparse implementation (temporary) + let + paramC = paramCount() + cmd = if paramC > 2: paramStr 1 + else: "tok" + + var stream = if paramC > 0: streamFile(paramStr paramC) else: streamString(readAll stdin) var lexer = newLexer(stream) - # # DumpTok - while lexer.progress(): - echo lexer.tok - - # DumpTree - # discard parse(tokStream) + if cmd == "tok": + # DumpTok + while lexer.progress(): + echo lexer.tok + elif cmd == "tree": + discard + # DumpTree + # discard parse(tokStream) + else: + echo "Usage: nlx [tok|tree] \n demo files are accessible at lang/demo" diff --git a/src/noether/lexer/lex.nim b/src/noether/lexer/lex.nim index 46e3b00..8f81b86 100644 --- a/src/noether/lexer/lex.nim +++ b/src/noether/lexer/lex.nim @@ -11,15 +11,16 @@ type nlLexer* = object stream: Stream done*: bool - tok*: nlTok # new finished token + # store current token and upcoming (build) token + tok*: nlTok # current token btok: nlTok # the build token + # save char and pos and its token type + char: char + cTKind: nlTokKind # track line number, line content, etc line: string lineNum: int pos: int - # save char and pos and its token type - char: char - cTKind: nlTokKind proc atEOL(lexer: nlLexer): bool {.inline.} = result = (lexer.char == '\n') @@ -37,8 +38,41 @@ proc newLexer*(stream: var Stream): nlLexer = lineNum: 1, pos: -1, # after initial readChar this -> 0 char: '\0', # use \0 as initial invalid char + cTKind: tkNONE, ) +# Classifies the current character to its nlTokKind +proc classifyTok*(lexer: nlLexer): nlTokKind {.inline.} = + case lexer.char: + of '\0': + result = tkEOF + of '\r', '\n': + result = tkEOL + of ' ', '\t': + result = tkWTSP + of '(': + result = tkLPAR + of ')': + result = tkRPAR + of '{': + result = tkLBRA + of '}': + result = tkRBRA + of '[': + result = tkLSQB + of ']': + result = tkRSQB + of '\'': + result = tkSQUO + of '\"': + result = tkDQUO + of '`': + result = tkGRVA + of '#': + result = tkHASH + else: + result = tkWORD + #[ ====================================================== ] | nlLexer Internal Interface for Token Construction ] @@ -96,7 +130,7 @@ proc readChar(lexer: var nlLexer): bool = inc lexer.lineNum # sets lexer.char to '\0' if EOF lexer.char = lexer.stream.readChar() - lexer.cTKind = getTokKind(lexer.char) + lexer.cTKind = lexer.classifyTok() lexer.line.add(lexer.char) inc lexer.pos result = lexer.atEOF() diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index 08aba66..b19c341 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -1,4 +1,32 @@ -include tokkind +type + # nlTokKind allows primitive nlToks to be typed, + # the nlTokKind enum should never be directly + # accessed. Use the interface in this file instead. + nlTokKind* = enum + tkNONE, # Placeholder Value + + tkEOF, # End of File + tkEOL, # End of Line (\0 --> EOL) + + tkWORD, # Alphanumeric token + tkSYMB, # Symbolic token + + tkLNFD, # \r \n Line-Feed + tkWTSP, # ' ' \t Whitespace + + # RESERVED SYMBOLS + tkLPAR, # ( Left Parenthesis + tkRPAR, # ) Right Parenthesis + tkLBRA, # { Left Brace + tkRBRA, # } Right Brace + tkLSQB, # [ Left Square Bracket + tkRSQB, # ] Right Square Bracket + # tkLANB, # < Left Angle Bracket + # tkRANB, # > Right Angle Bracket + tkSQUO, # ' Single Quotation Marking + tkDQUO, # " Double Quotation Marking + tkGRVA, # ` Grave Accent + tkHASH, # # Number Sign (Hashtag) type nlTok* = tuple diff --git a/src/noether/lexer/tokkind.nim b/src/noether/lexer/tokkind.nim index 3d1d7b6..8b13789 100644 --- a/src/noether/lexer/tokkind.nim +++ b/src/noether/lexer/tokkind.nim @@ -1,61 +1 @@ -type - # nlTokKind allows primitive nlToks to be typed, - # the nlTokKind enum should never be directly - # accessed. Use the interface in this file instead. - nlTokKind* = enum - tkNONE, # Placeholder Value - tkEOF, # End of File - tkEOL, # End of Line (\0 --> EOL) - - tkWORD, # Alphanumeric token - tkSYMB, # Symbolic token - - tkLNFD, # \r \n Line-Feed - tkWTSP, # ' ' \t Whitespace - - # RESERVED SYMBOLS - tkLPAR, # ( Left Parenthesis - tkRPAR, # ) Right Parenthesis - tkLBRA, # { Left Brace - tkRBRA, # } Right Brace - tkLSQB, # [ Left Square Bracket - tkRSQB, # ] Right Square Bracket - # tkLANB, # < Left Angle Bracket - # tkRANB, # > Right Angle Bracket - tkSQUO, # ' Single Quotation Marking - tkDQUO, # " Double Quotation Marking - tkGRVA, # ` Grave Accent - tkHASH, # # Number Sign (Hashtag) - -# Classifies a character to its nlTokKind -proc getTokKind*(c: char): nlTokKind = - case c: - of '\0': - result = tkEOF - of '\r', '\n': - result = tkEOL - of ' ', '\t': - result = tkWTSP - of '(': - result = tkLPAR - of ')': - result = tkRPAR - of '{': - result = tkLBRA - of '}': - result = tkRBRA - of '[': - result = tkLSQB - of ']': - result = tkRSQB - of '\'': - result = tkSQUO - of '\"': - result = tkDQUO - of '`': - result = tkGRVA - of '#': - result = tkHASH - else: - result = tkWORD diff --git a/src/noether/parser/parse.nim b/src/noether/parser/parse.nim new file mode 100644 index 0000000..0ecd14b --- /dev/null +++ b/src/noether/parser/parse.nim @@ -0,0 +1,58 @@ +import strutils +include parser + +# NOTE: Matching between two tokens will fill `node` with everything +# NOTE: between those two tokens EXCLUDING the two tokens themselves. +proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat = + result = greed( + parser, + satisfyMatch(matchType), + ) +proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = + result = greedLine( + parser, + satisfyMatch(matchType), + ) + +proc parseStrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatchLine(tkDQUO) + +proc parseChrLit(parser: var nlParser): nlParseStat = + result = parser.parseMatchLine(tkSQUO) + +proc parseStmt(parser: var nlParser): nlParseStat = + while parser.progressStream(): + echo "----- Current Token: ", parser.currTok + case parser.currTok.tKind + of tkDQUO: + # Attempt to parse string literal + if parser.parseStrLit() != nlParseStat.OK: + echo "Unmatched Double Quotation! Malformed String Literal" + echo parser.line + echo repeat(" ", parser.currTok.startPos), '^', '\n' + else: + echo "Parsed String Literal" + echo parser.bnode[], '\n' + of tkSQUO: + # Attempt to parse string literal + if parser.parseChrLit() != nlParseStat.OK: + echo "Unmatched Single Quotation! Malformed Character Literal" + echo parser.line + echo repeat(" ", parser.currTok.startPos), '^', '\n' + else: + echo "Parsed Character Literal" + echo parser.bnode[], '\n' + of tkEOL: + # TODO: handle this case, don't just discard + discard + else: + echo "blah blah unhandled case\n" + result = nlParseStat.OK + +# Attempt to parse nlAST from nlTokStream +proc parse*(tokStream: var nlTokStream): nlAST = + var parser = newParser(tokStream) + echo ' ' + discard parser.parseStmt() + + result = parser.ast diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim index 7daf91b..7047e6d 100644 --- a/src/noether/parser/parser.nim +++ b/src/noether/parser/parser.nim @@ -1,58 +1,90 @@ -import strutils -include parseutil +import nodes +import ../lexer/lex -# NOTE: Matching between two tokens will fill `node` with everything -# NOTE: between those two tokens EXCLUDING the two tokens themselves. -proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat = - result = greed( - parser, - satisfyMatch(matchType), - ) -proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = - result = greedLine( - parser, - satisfyMatch(matchType), +type + # NOTE1: Values above MARKER_FAIL indicate a failed state + # NOTE2: nlParseStat is marked pure out of habit that's all + nlParseStat* {.pure.} = enum + OK, + MARKER_FAIL, + UNMATCHED, + TOOBIG, + + nlAST* = object + root: nlNode + + nlParser* = object + stream: nlTokStream + ast: nlAST + # the "build node" is a reference to the AST node + # the parser is currently modifying/building from + # NOTE: bnode changes frequently, it is NOT the root + bnode: nlNode + # flag indicating whether the parser is at + # the start of a new line (aka checking indentation) + inIndent: bool + + +proc `*`(stat: nlParseStat, b: bool): nlParseStat = + result = if b: stat else: nlParseStat.OK + +proc isFail*(stat: nlParseStat): bool = + result = (stat >= nlParseStat.MARKER_FAIL) + +proc newParser*(tokStream: var nlTokStream): nlParser = + let rootNode = newNode(nkNone) + result = nlParser( + stream: tokStream, + ast: nlAST( + root: rootNode + ), + bnode: rootNode, ) -proc parseStrLit(parser: var nlParser): nlParseStat = - result = parser.parseMatchLine(tkDQUO) +# Exposes a subset of the nlTokStream interface +proc currTok(parser: var nlParser): nlTok = parser.stream.currTok +proc line(parser: var nlParser): string = parser.stream.line -proc parseChrLit(parser: var nlParser): nlParseStat = - result = parser.parseMatchLine(tkSQUO) +# Extends upon the functionality of nlTokStream.progress() +proc progressStream*(parser: var nlParser): bool = + result = parser.stream.progress() + if result and parser.currTok.tKind == tkEOL: + parser.inIndent = true + if + +proc setNewLine() -proc parseStmt(parser: var nlParser): nlParseStat = +#[ "Greed" refers to something I mentioned in my discussion on + | Noether's grammar (in an EBNF-like language). Greed just + | means "everything until a condition is satisified". + | That condition should be supplied by a Nim procedural type. + ]# + +# Greed will consume anything until a condition is satisfied +# Returns false if the greed was never satisfied (OMG!!) +proc greed(parser: var nlParser, + satisfy: proc(tok: nlTok): bool): nlParseStat = while parser.progressStream(): - echo "----- Current Token: ", parser.currTok - case parser.currTok.tKind - of tkDQUO: - # Attempt to parse string literal - if parser.parseStrLit() != nlParseStat.OK: - echo "Unmatched Double Quotation! Malformed String Literal" - echo parser.line - echo repeat(" ", parser.currTok.startPos), '^', '\n' - else: - echo "Parsed String Literal" - echo parser.bnode[], '\n' - of tkSQUO: - # Attempt to parse string literal - if parser.parseChrLit() != nlParseStat.OK: - echo "Unmatched Single Quotation! Malformed Character Literal" - echo parser.line - echo repeat(" ", parser.currTok.startPos), '^', '\n' - else: - echo "Parsed Character Literal" - echo parser.bnode[], '\n' - of tkEOL: - # TODO: handle this case, don't just discard - discard - else: - echo "blah blah unhandled case\n" - result = nlParseStat.OK - -# Attempt to parse nlAST from nlTokStream -proc parse*(tokStream: var nlTokStream): nlAST = - var parser = newParser(tokStream) - echo ' ' - discard parser.parseStmt() + if satisfy(parser.currTok): + return nlParseStat.OK + # NOTE: the matched token is currently excluded + parser.bnode.addTok(parser.currTok) + result = nlParseStat.UNMATCHED - result = parser.ast +proc greedLine(parser: var nlParser, + satisfy: proc(tok: nlTok): bool): nlParseStat = + while parser.progressStream(): + if satisfy(parser.currTok): + return nlParseStat.OK + # NOTE: the matched token is currently excluded + parser.bnode.addTok(parser.currTok) + if parser.currTok.tKind == tkEOL: + return nlParseStat.UNMATCHED + result = nlParseStat.UNMATCHED + +#[ Templates for generating greed satisfying conditions. + ]# + +# Satisfied if it finds nlTok of type matchType +template satisfyMatch(matchType: nlTokKind): untyped = + (proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType)) diff --git a/src/noether/parser/parseutil.nim b/src/noether/parser/parseutil.nim deleted file mode 100644 index d531490..0000000 --- a/src/noether/parser/parseutil.nim +++ /dev/null @@ -1,90 +0,0 @@ -import nodes -import ../lexer/tokstream - -type - # NOTE1: Values above MARKER_FAIL indicate a failed state - # NOTE2: nlParseStat is marked pure out of habit that's all - nlParseStat* {.pure.} = enum - OK, - MARKER_FAIL, - UNMATCHED, - TOOBIG, - - nlAST* = object - root: nlNode - - nlParser* = object - stream: nlTokStream - ast: nlAST - # the "build node" is a reference to the AST node - # the parser is currently modifying/building from - # NOTE: bnode changes frequently, it is NOT the root - bnode: nlNode - # flag indicating whether the parser is at - # the start of a new line (aka checking indentation) - inIndent: bool - - -proc `*`(stat: nlParseStat, b: bool): nlParseStat = - result = if b: stat else: nlParseStat.OK - -proc isFail*(stat: nlParseStat): bool = - result = (stat >= nlParseStat.MARKER_FAIL) - -proc newParser*(tokStream: var nlTokStream): nlParser = - let rootNode = newNode(nkNone) - result = nlParser( - stream: tokStream, - ast: nlAST( - root: rootNode - ), - bnode: rootNode, - ) - -# Exposes a subset of the nlTokStream interface -proc currTok(parser: var nlParser): nlTok = parser.stream.currTok -proc line(parser: var nlParser): string = parser.stream.line - -# Extends upon the functionality of nlTokStream.progress() -proc progressStream*(parser: var nlParser): bool = - result = parser.stream.progress() - if result and parser.currTok.tKind == tkEOL: - parser.inIndent = true - if - -proc setNewLine() - -#[ "Greed" refers to something I mentioned in my discussion on - | Noether's grammar (in an EBNF-like language). Greed just - | means "everything until a condition is satisified". - | That condition should be supplied by a Nim procedural type. - ]# - -# Greed will consume anything until a condition is satisfied -# Returns false if the greed was never satisfied (OMG!!) -proc greed(parser: var nlParser, - satisfy: proc(tok: nlTok): bool): nlParseStat = - while parser.progressStream(): - if satisfy(parser.currTok): - return nlParseStat.OK - # NOTE: the matched token is currently excluded - parser.bnode.addTok(parser.currTok) - result = nlParseStat.UNMATCHED - -proc greedLine(parser: var nlParser, - satisfy: proc(tok: nlTok): bool): nlParseStat = - while parser.progressStream(): - if satisfy(parser.currTok): - return nlParseStat.OK - # NOTE: the matched token is currently excluded - parser.bnode.addTok(parser.currTok) - if parser.currTok.tKind == tkEOL: - return nlParseStat.UNMATCHED - result = nlParseStat.UNMATCHED - -#[ Templates for generating greed satisfying conditions. - ]# - -# Satisfied if it finds nlTok of type matchType -template satisfyMatch(matchType: nlTokKind): untyped = - (proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType)) From bab593a86bead14ec44ef119c744a12a5ae02fa1 Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Sun, 6 Jul 2025 21:42:09 +1000 Subject: [PATCH 12/12] Typo fix + start of error handling --- py/m.py | 2 +- src/noether/lib/err.nim | 1 + src/noether/parser/err.nim | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 src/noether/lib/err.nim create mode 100644 src/noether/parser/err.nim diff --git a/py/m.py b/py/m.py index e2b60c5..9576f4c 100644 --- a/py/m.py +++ b/py/m.py @@ -2,7 +2,7 @@ import sys import readline -from noether.math import * +from noether.lib.math import * from noether.cli import * diff --git a/src/noether/lib/err.nim b/src/noether/lib/err.nim new file mode 100644 index 0000000..ec4c848 --- /dev/null +++ b/src/noether/lib/err.nim @@ -0,0 +1 @@ +proc echoErrorHeader(): = diff --git a/src/noether/parser/err.nim b/src/noether/parser/err.nim new file mode 100644 index 0000000..9cc5a73 --- /dev/null +++ b/src/noether/parser/err.nim @@ -0,0 +1,8 @@ +#[ Error codes and messaging directly associated with + | nlParser and its procedures is written here. + | General error functionality is in src/noether/lib/err.nim + ]# + +import parser + +