From d7fb1f0c899a189c7a48acbf5cb2c29d3c968a6d Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 09:38:08 +1000 Subject: [PATCH] Migrate nlTokBuilder + nlTokStream -> nlLexer --- src/nlx.nim | 12 +-- src/noether/lexer/tokbuilder.nim | 132 ++++++++++++++++++++----------- 2 files changed, 90 insertions(+), 54 deletions(-) diff --git a/src/nlx.nim b/src/nlx.nim index adf95f0..565aaae 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,19 +1,19 @@ import os import noether/lib/io -import noether/lexer/[tok, tokstream] +import noether/lexer/tokbuilder # import noether/parser/parser {.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" - var inStream = if paramCount() > 0: streamFile(paramStr 1) - else: streamString(readAll stdin) + var stream = if paramCount() > 0: streamFile(paramStr 1) + else: streamString(readAll stdin) - var stream = newTokStream(inStream) + var lexer = newLexer(stream) # # DumpTok - while stream.progress(): - echo stream.tok + while lexer.progress(): + echo lexer.tok # DumpTree # discard parse(tokStream) diff --git a/src/noether/lexer/tokbuilder.nim b/src/noether/lexer/tokbuilder.nim index 0d2f212..46e3b00 100644 --- a/src/noether/lexer/tokbuilder.nim +++ b/src/noether/lexer/tokbuilder.nim @@ -8,9 +8,11 @@ export tok type # Abstracts the "building process" (lexing) # of nlTok objects from a given Stream of characters. 
- nlTokBuilder* = object + nlLexer* = object stream: Stream - tok: nlTok # the build token + done*: bool + tok*: nlTok # new finished token + btok: nlTok # the build token # track line number, line content, etc line: string lineNum: int @@ -19,18 +21,18 @@ type char: char cTKind: nlTokKind -proc atEOL(builder: nlTokBuilder): bool {.inline.} = - result = (builder.char == '\n') -proc atEOF(builder: nlTokBuilder): bool {.inline.} = - result = (builder.char == '\0') +proc atEOL(lexer: nlLexer): bool {.inline.} = + result = (lexer.char == '\n') +proc atEOF(lexer: nlLexer): bool {.inline.} = + result = (lexer.char == '\0') -# Initialise a new token builder -proc newBuilder(stream: var Stream): nlTokBuilder = - # NOTE: initial builder.char value is arbitrary, - # NOTE: but CANNOT be initialised to the default '\0' - result = nlTokBuilder( +# Initialise a new lexer +proc newLexer*(stream: var Stream): nlLexer = + result = nlLexer( stream: stream, + done: false, tok: emptyTok(0), + btok: emptyTok(0), line: "", lineNum: 1, pos: -1, # after initial readChar this -> 0 @@ -39,70 +41,104 @@ proc newBuilder(stream: var Stream): nlTokBuilder = #[ ====================================================== ] - | nlTokBuilder Internal Interface for Token Construction ] + | nlLexer Internal Interface for Token Construction ] ]# # Reset the build token to be "empty" -proc resetBuild(builder: var nlTokBuilder) = - builder.tok = emptyTok(builder.pos) +proc resetBuild(lexer: var nlLexer) = + lexer.btok = emptyTok(lexer.pos) # "Finishes" the build token by setting various properties -proc finishBuild(builder: var nlTokBuilder) = - builder.tok.lineNum = builder.lineNum - builder.tok.endPos = builder.pos - builder.tok.lit = builder.line[builder.tok.startPos ..< builder.line.high] +proc finishBuild(lexer: var nlLexer) = + lexer.btok.lineNum = lexer.lineNum + lexer.btok.endPos = lexer.pos + lexer.btok.lit = lexer.line[lexer.btok.startPos ..< lexer.line.high] # Finish, return, and reset the build 
token -proc flushBuild(builder: var nlTokBuilder): nlTok = - finishBuild(builder) - result = builder.tok - resetBuild(builder) +proc flushBuild(lexer: var nlLexer): nlTok = + finishBuild(lexer) + result = lexer.btok + resetBuild(lexer) # Is the build token "compatible" with the current char? (if not then flushbuild) # NOTE: This implicitly handles Windows CRLF, Unix LF, & Mac OS CR compatability # NOTE: since atEOL => '\n', but '\r' and '\n' are both tkEOL so they both flush. -proc isIncompatibleBuild(builder: nlTokBuilder): bool = - result = (builder.cTKind != builder.tok.kind or builder.atEOL()) +proc isIncompatibleBuild(lexer: nlLexer): bool = + result = (lexer.cTKind != lexer.btok.kind or lexer.atEOL()) # Inherit the build token's type from current char -proc inherit(builder: var nlTokBuilder) = - builder.tok.kind = builder.cTKind +proc inherit(lexer: var nlLexer) = + lexer.btok.kind = lexer.cTKind -# Add a character to the nlTokBuilder's build token. +# Add a character to the nlLexer's build token. # Flushes and returns the build token if finished. 
-proc appendBuild(builder: var nlTokBuilder): Option[nlTok] = +proc appendBuild(lexer: var nlLexer): Option[nlTok] = # untyped build tokens inherit type immediately - if builder.tok.isUntyped(): - builder.inherit() + if lexer.btok.isUntyped(): + lexer.inherit() # check character and build token compatability - if isIncompatibleBuild(builder): + if isIncompatibleBuild(lexer): # flush old build token, the new one inherits type - result = some(flushBuild(builder)) - builder.inherit() + result = some(flushBuild(lexer)) + lexer.inherit() else: result = none(nlTok) -#[ ========================================== ] - | nlTokBuilder Char Stream Reading Interface ] +#[ ========================================= ] + | nlLexer Internal Char Streaming Interface ] ]# # Read the next char in the stream # NOTE: readChar raises IOError on error, returns \0 on EOF -proc readChar*(builder: var nlTokBuilder): bool = - if builder.atEOL(): - inc builder.lineNum - # sets builder.char to '\0' if EOF - builder.char = builder.stream.readChar() - builder.cTKind = getTokKind(builder.char) - builder.line.add(builder.char) - inc builder.pos - result = builder.atEOF() +proc readChar(lexer: var nlLexer): bool = + if lexer.atEOL(): + inc lexer.lineNum + # sets lexer.char to '\0' if EOF + lexer.char = lexer.stream.readChar() + lexer.cTKind = getTokKind(lexer.char) + lexer.line.add(lexer.char) + inc lexer.pos + result = lexer.atEOF() +#[ ======================== + | nlLexer Public Interface + ]# + # Read until EOL and return the current line -# NOTE: Does NOT update the builder's state (unsafe) +# NOTE: Does NOT update the lexer's state (unsafe) # NOTE: ONLY call if a lex/parse error needs displaying -proc unsafeGetLine*(builder: var nlTokBuilder): string = - while not builder.atEOL() and builder.readChar(): +proc unsafeGetLine*(lexer: var nlLexer): string = + while not lexer.atEOL() and lexer.readChar(): discard - result = builder.line + result = lexer.line + +# Lexes and returns the next token in 
the "token stream" +# via repeatedly calling readChar() and appendBuild(). +# Returns true when a new token was built; false if the lexer is already done or EOF is reached without one. +# NOTE: access the new token via `lexer.tok` +proc progress*(lexer: var nlLexer): bool = + # Return prematurely if already closed + if lexer.done: + return false + while true: + let + atEOF = lexer.readChar() + flushedTok = lexer.appendBuild() + newTokBuilt = flushedTok.isSome + + if newTokBuilt: + lexer.tok = flushedTok.get() + # if canProgress and atEOF: + # if atEOF: + # if newTokBuilt: + # stream.isClosed = true + # return newTokBuilt + # elif newTokBuilt: + # return true + if newTokBuilt: + if atEOF: + lexer.done = true + return true + elif atEOF: + return false