Migrate nlTokBuilder + nlTokStream -> nlLexer

This commit is contained in:
Emile Clark-Boman 2025-06-19 09:38:08 +10:00
parent 07a9bda9ba
commit d7fb1f0c89
2 changed files with 90 additions and 54 deletions

View file

@ -1,19 +1,19 @@
import os
import noether/lib/io
import noether/lexer/[tok, tokstream]
import noether/lexer/tokbuilder
# import noether/parser/parser
{.hint: "Don't forget to drink more water (^_^)".}
when isMainModule:
echo "Noether Lang Extras v0.1.0 - nlx"
# Input is the file named by the first command-line argument when one is
# given; otherwise all of stdin is read and streamed as a string.
var inStream = if paramCount() > 0: streamFile(paramStr 1)
else: streamString(readAll stdin)
var stream = if paramCount() > 0: streamFile(paramStr 1)
else: streamString(readAll stdin)
var stream = newTokStream(inStream)
var lexer = newLexer(stream)
# DumpTok: print every token for as long as progress() reports a new one
while stream.progress():
echo stream.tok
while lexer.progress():
echo lexer.tok
# DumpTree (disabled; the parser import above is commented out as well)
# discard parse(tokStream)

View file

@ -8,9 +8,11 @@ export tok
type
# Abstracts the "building process" (lexing)
# of nlTok objects from a given Stream of characters.
nlTokBuilder* = object
nlLexer* = object
stream: Stream
tok: nlTok # the build token
done*: bool
tok*: nlTok # new finished token
btok: nlTok # the build token
# track line number, line content, etc
line: string
lineNum: int
@ -19,18 +21,18 @@ type
char: char
cTKind: nlTokKind
proc atEOL(builder: nlTokBuilder): bool {.inline.} =
result = (builder.char == '\n')
proc atEOF(builder: nlTokBuilder): bool {.inline.} =
result = (builder.char == '\0')
proc atEOL(lexer: nlLexer): bool {.inline.} =
  ## True when the character most recently stored in the lexer is '\n'.
  lexer.char == '\n'
proc atEOF(lexer: nlLexer): bool {.inline.} =
  ## True when the character most recently stored in the lexer is '\0',
  ## the value used in this module to mark end of input.
  lexer.char == '\0'
# Initialise a new token builder
proc newBuilder(stream: var Stream): nlTokBuilder =
# NOTE: initial builder.char value is arbitrary,
# NOTE: but CANNOT be initialised to the default '\0'
result = nlTokBuilder(
# Initialise a new lexer
proc newLexer*(stream: var Stream): nlLexer =
result = nlLexer(
stream: stream,
done: false,
tok: emptyTok(0),
btok: emptyTok(0),
line: "",
lineNum: 1,
pos: -1, # after initial readChar this -> 0
@ -39,70 +41,104 @@ proc newBuilder(stream: var Stream): nlTokBuilder =
#[ ====================================================== ]
| nlTokBuilder Internal Interface for Token Construction ]
| nlLexer Internal Interface for Token Construction ]
]#
# Reset the build token to be "empty"
proc resetBuild(builder: var nlTokBuilder) =
builder.tok = emptyTok(builder.pos)
proc resetBuild(lexer: var nlLexer) =
  ## Replace the build token with a new empty token created by
  ## `emptyTok` from the lexer's current position.
  let fresh = emptyTok(lexer.pos)
  lexer.btok = fresh
# "Finishes" the build token by setting various properties
proc finishBuild(builder: var nlTokBuilder) =
builder.tok.lineNum = builder.lineNum
builder.tok.endPos = builder.pos
builder.tok.lit = builder.line[builder.tok.startPos ..< builder.line.high]
proc finishBuild(lexer: var nlLexer) =
  ## Fill in the closing details of the build token: its literal text,
  ## its end position, and the line number recorded by the lexer.
  # The literal runs from the token's start position up to, but not
  # including, the last character currently held in `line`.
  let
    first = lexer.btok.startPos
    last = lexer.line.high - 1
  lexer.btok.lit = lexer.line[first .. last]
  lexer.btok.endPos = lexer.pos
  lexer.btok.lineNum = lexer.lineNum
# Finish, return, and reset the build token
proc flushBuild(builder: var nlTokBuilder): nlTok =
finishBuild(builder)
result = builder.tok
resetBuild(builder)
proc flushBuild(lexer: var nlLexer): nlTok =
  ## Complete the build token, hand it back to the caller, and leave a
  ## freshly reset build token in its place.
  lexer.finishBuild()
  result = lexer.btok
  lexer.resetBuild()
# Is the build token "compatible" with the current char? (if not then flushbuild)
# NOTE: This implicitly handles Windows CRLF, Unix LF, & Mac OS CR compatibility
# NOTE: since atEOL => '\n', but '\r' and '\n' are both tkEOL so they both flush.
proc isIncompatibleBuild(builder: nlTokBuilder): bool =
result = (builder.cTKind != builder.tok.kind or builder.atEOL())
proc isIncompatibleBuild(lexer: nlLexer): bool =
  ## Whether the current char cannot extend the build token: either its
  ## token kind differs from the build token's kind, or the current char
  ## is '\n'.
  let kindMismatch = lexer.cTKind != lexer.btok.kind
  kindMismatch or lexer.atEOL()
# Inherit the build token's type from current char
proc inherit(builder: var nlTokBuilder) =
builder.tok.kind = builder.cTKind
proc inherit(lexer: var nlLexer) =
  ## Give the build token the token kind of the current char.
  let currentKind = lexer.cTKind
  lexer.btok.kind = currentKind
# Add a character to the nlTokBuilder's build token.
# Add a character to the nlLexer's build token.
# Flushes and returns the build token if finished.
proc appendBuild(builder: var nlTokBuilder): Option[nlTok] =
proc appendBuild(lexer: var nlLexer): Option[nlTok] =
# an untyped build token takes the kind of the current char immediately
if builder.tok.isUntyped():
builder.inherit()
if lexer.btok.isUntyped():
lexer.inherit()
# check whether the current char is compatible with the build token
if isIncompatibleBuild(builder):
if isIncompatibleBuild(lexer):
# flush the old build token and return it wrapped in some();
# the fresh build token then takes the current char's kind
result = some(flushBuild(builder))
builder.inherit()
result = some(flushBuild(lexer))
lexer.inherit()
else:
# still compatible: no token was finished on this char
result = none(nlTok)
#[ ========================================== ]
| nlTokBuilder Char Stream Reading Interface ]
#[ ========================================= ]
| nlLexer Internal Char Streaming Interface ]
]#
# Read the next char in the stream
# NOTE: readChar raises IOError on error, returns \0 on EOF
proc readChar*(builder: var nlTokBuilder): bool =
if builder.atEOL():
inc builder.lineNum
# sets builder.char to '\0' if EOF
builder.char = builder.stream.readChar()
builder.cTKind = getTokKind(builder.char)
builder.line.add(builder.char)
inc builder.pos
result = builder.atEOF()
proc readChar(lexer: var nlLexer): bool =
  ## Advance the lexer by one character of the underlying stream and
  ## return whether that character is the EOF marker ('\0').
  ##
  ## Updates the current char, its token kind, the accumulated line text
  ## and the position. The line number is incremented first when the
  ## previously read char was '\n'.
  if lexer.atEOL():
    inc(lexer.lineNum)
  # per the original note, the stream's readChar yields '\0' at EOF
  let ch = lexer.stream.readChar()
  lexer.char = ch
  lexer.cTKind = getTokKind(ch)
  lexer.line.add(ch)
  inc(lexer.pos)
  lexer.atEOF()
#[ ========================
| nlLexer Public Interface
]#
# Read until EOL and return the current line
# NOTE: Does NOT update the builder's state (unsafe)
# NOTE: Does NOT update the lexer's state (unsafe)
# NOTE: ONLY call if a lex/parse error needs displaying
proc unsafeGetLine*(builder: var nlTokBuilder): string =
while not builder.atEOL() and builder.readChar():
proc unsafeGetLine*(lexer: var nlLexer): string =
  ## Read ahead until a newline has been read or the stream is exhausted,
  ## then return the line text accumulated by the lexer.
  ##
  ## Characters are consumed through `readChar` only; no tokens are built
  ## for them, so the token-building state is not kept in step with the
  ## stream. Only call this when a lex/parse error needs displaying.
  # readChar() returns true once EOF is reached, so reading continues
  # only while it returns false. The previous condition was inverted:
  # it stopped after one ordinary char and looped forever at EOF.
  while not lexer.atEOL() and not lexer.readChar():
    discard
  result = lexer.line
# Lexes the next token in the "token stream" by repeatedly calling
# readChar() and appendBuild() until a token is flushed or EOF is hit.
# Returns true when a new token was produced, and false when EOF was
# reached without producing one or the lexer was already done.
# NOTE: access the new token via `lexer.tok`
proc progress*(lexer: var nlLexer): bool =
  # Return prematurely if already closed
  if lexer.done:
    return false
  while true:
    let
      atEOF = lexer.readChar()
      flushedTok = lexer.appendBuild()
    if atEOF:
      # Nothing more can be read. Record that in every EOF case so later
      # calls return immediately instead of reading past the end again.
      lexer.done = true
    if flushedTok.isSome:
      # publish the finished token for the caller
      lexer.tok = flushedTok.get()
      return true
    if atEOF:
      return false