Migrate nlTokBuilder + nlTokStream -> nlLexer

Emile Clark-Boman 2025-06-19 09:38:08 +10:00
parent 07a9bda9ba
commit d7fb1f0c89
2 changed files with 90 additions and 54 deletions

View file

@@ -1,19 +1,19 @@
 import os
 import noether/lib/io
-import noether/lexer/[tok, tokstream]
+import noether/lexer/tokbuilder
 # import noether/parser/parser
 {.hint: "Don't forget to drink more water (^_^)".}
 when isMainModule:
   echo "Noether Lang Extras v0.1.0 - nlx"
-  var inStream = if paramCount() > 0: streamFile(paramStr 1)
+  var stream = if paramCount() > 0: streamFile(paramStr 1)
                else: streamString(readAll stdin)
-  var stream = newTokStream(inStream)
+  var lexer = newLexer(stream)
   # # DumpTok
-  while stream.progress():
-    echo stream.tok
+  while lexer.progress():
+    echo lexer.tok
   # DumpTree
   # discard parse(tokStream)

View file

@@ -8,9 +8,11 @@ export tok
 type
   # Abstracts the "building process" (lexing)
   # of nlTok objects from a given Stream of characters.
-  nlTokBuilder* = object
+  nlLexer* = object
     stream: Stream
-    tok: nlTok # the build token
+    done*: bool
+    tok*: nlTok # new finished token
+    btok: nlTok # the build token
     # track line number, line content, etc
     line: string
    lineNum: int
@@ -19,18 +21,18 @@ type
     char: char
     cTKind: nlTokKind
-proc atEOL(builder: nlTokBuilder): bool {.inline.} =
-  result = (builder.char == '\n')
+proc atEOL(lexer: nlLexer): bool {.inline.} =
+  result = (lexer.char == '\n')
-proc atEOF(builder: nlTokBuilder): bool {.inline.} =
-  result = (builder.char == '\0')
+proc atEOF(lexer: nlLexer): bool {.inline.} =
+  result = (lexer.char == '\0')
-# Initialise a new token builder
-proc newBuilder(stream: var Stream): nlTokBuilder =
-  # NOTE: initial builder.char value is arbitrary,
-  # NOTE: but CANNOT be initialised to the default '\0'
-  result = nlTokBuilder(
+# Initialise a new lexer
+proc newLexer*(stream: var Stream): nlLexer =
+  result = nlLexer(
     stream: stream,
+    done: false,
     tok: emptyTok(0),
+    btok: emptyTok(0),
     line: "",
     lineNum: 1,
     pos: -1, # after initial readChar this -> 0
@@ -39,70 +41,104 @@ proc newBuilder(stream: var Stream): nlTokBuilder =
 #[ ====================================================== ]
-| nlTokBuilder Internal Interface for Token Construction ]
+| nlLexer Internal Interface for Token Construction ]
 ]#
 # Reset the build token to be "empty"
-proc resetBuild(builder: var nlTokBuilder) =
-  builder.tok = emptyTok(builder.pos)
+proc resetBuild(lexer: var nlLexer) =
+  lexer.btok = emptyTok(lexer.pos)
 # "Finishes" the build token by setting various properties
-proc finishBuild(builder: var nlTokBuilder) =
-  builder.tok.lineNum = builder.lineNum
-  builder.tok.endPos = builder.pos
-  builder.tok.lit = builder.line[builder.tok.startPos ..< builder.line.high]
+proc finishBuild(lexer: var nlLexer) =
+  lexer.btok.lineNum = lexer.lineNum
+  lexer.btok.endPos = lexer.pos
+  lexer.btok.lit = lexer.line[lexer.btok.startPos ..< lexer.line.high]
 # Finish, return, and reset the build token
-proc flushBuild(builder: var nlTokBuilder): nlTok =
-  finishBuild(builder)
-  result = builder.tok
-  resetBuild(builder)
+proc flushBuild(lexer: var nlLexer): nlTok =
+  finishBuild(lexer)
+  result = lexer.btok
+  resetBuild(lexer)
 # Is the build token "compatible" with the current char? (if not then flushbuild)
 # NOTE: This implicitly handles Windows CRLF, Unix LF, & Mac OS CR compatability
 # NOTE: since atEOL => '\n', but '\r' and '\n' are both tkEOL so they both flush.
-proc isIncompatibleBuild(builder: nlTokBuilder): bool =
-  result = (builder.cTKind != builder.tok.kind or builder.atEOL())
+proc isIncompatibleBuild(lexer: nlLexer): bool =
+  result = (lexer.cTKind != lexer.btok.kind or lexer.atEOL())
 # Inherit the build token's type from current char
-proc inherit(builder: var nlTokBuilder) =
-  builder.tok.kind = builder.cTKind
-# Add a character to the nlTokBuilder's build token.
+proc inherit(lexer: var nlLexer) =
+  lexer.btok.kind = lexer.cTKind
+# Add a character to the nlLexer's build token.
 # Flushes and returns the build token if finished.
-proc appendBuild(builder: var nlTokBuilder): Option[nlTok] =
+proc appendBuild(lexer: var nlLexer): Option[nlTok] =
   # untyped build tokens inherit type immediately
-  if builder.tok.isUntyped():
-    builder.inherit()
+  if lexer.btok.isUntyped():
+    lexer.inherit()
   # check character and build token compatability
-  if isIncompatibleBuild(builder):
+  if isIncompatibleBuild(lexer):
     # flush old build token, the new one inherits type
-    result = some(flushBuild(builder))
-    builder.inherit()
+    result = some(flushBuild(lexer))
+    lexer.inherit()
   else:
     result = none(nlTok)
-#[ ========================================== ]
-| nlTokBuilder Char Stream Reading Interface ]
+#[ ========================================= ]
+| nlLexer Internal Char Streaming Interface ]
 ]#
 # Read the next char in the stream
 # NOTE: readChar raises IOError on error, returns \0 on EOF
-proc readChar*(builder: var nlTokBuilder): bool =
-  if builder.atEOL():
-    inc builder.lineNum
-  # sets builder.char to '\0' if EOF
-  builder.char = builder.stream.readChar()
-  builder.cTKind = getTokKind(builder.char)
-  builder.line.add(builder.char)
-  inc builder.pos
-  result = builder.atEOF()
+proc readChar(lexer: var nlLexer): bool =
+  if lexer.atEOL():
+    inc lexer.lineNum
+  # sets lexer.char to '\0' if EOF
+  lexer.char = lexer.stream.readChar()
+  lexer.cTKind = getTokKind(lexer.char)
+  lexer.line.add(lexer.char)
+  inc lexer.pos
+  result = lexer.atEOF()
+#[ ========================
+| nlLexer Public Interface
+]#
 # Read until EOL and return the current line
-# NOTE: Does NOT update the builder's state (unsafe)
+# NOTE: Does NOT update the lexer's state (unsafe)
 # NOTE: ONLY call if a lex/parse error needs displaying
-proc unsafeGetLine*(builder: var nlTokBuilder): string =
-  while not builder.atEOL() and builder.readChar():
+proc unsafeGetLine*(lexer: var nlLexer): string =
+  while not lexer.atEOL() and lexer.readChar():
     discard
-  result = builder.line
+  result = lexer.line
+# Lexes and returns the next token in the "token stream"
+# via repeatedly calling readChar() and appendBuild().
+# Returns a boolean indicating whether EOF has been reached.
+# NOTE: access the new token via `stream.tok`
+proc progress*(lexer: var nlLexer): bool =
+  # Return prematurely if already closed
+  if lexer.done:
+    return false
+  while true:
+    let
+      atEOF = lexer.readChar()
+      flushedTok = lexer.appendBuild()
+      newTokBuilt = flushedTok.isSome
+    if newTokBuilt:
+      lexer.tok = flushedTok.get()
+    # if canProgress and atEOF:
+    # if atEOF:
+    #   if newTokBuilt:
+    #     stream.isClosed = true
+    #   return newTokBuilt
+    # elif newTokBuilt:
+    #   return true
+    if newTokBuilt:
+      if atEOF:
+        lexer.done = true
+      return true
+    elif atEOF:
+      return false
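
For reference, a minimal usage sketch of the nlLexer interface after this commit, pieced together from the two files above. The import paths, the streamString helper, and the newLexer/progress/tok names all come from the diff; the sample input string is purely illustrative.

import noether/lib/io             # streamFile / streamString, as used by the nlx main module
import noether/lexer/tokbuilder   # now defines nlLexer, newLexer, progress, tok

var stream = streamString("let x = 1")  # illustrative input; stdin or a file also works
var lexer = newLexer(stream)

# progress() lexes one token per call and stores it in lexer.tok;
# it returns false once EOF is reached and no further token was produced.
while lexer.progress():
  echo lexer.tok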