From 99db57dcfdf43e118f2812be560399e544acd0fb Mon Sep 17 00:00:00 2001 From: Emile Clark-Boman Date: Thu, 19 Jun 2025 08:48:31 +1000 Subject: [PATCH] YALR (Yet Another Lexer Refactor) --- lang/demo/single_toks.no | 2 + src/ddemo | 8 ++ src/nlx.nim | 27 ++-- src/noether/lexer/lstream.nim | 66 ---------- src/noether/lexer/tok.nim | 29 +++-- src/noether/lexer/tokbuilder.nim | 123 ++++++++++++++++++ src/noether/lexer/tokbuilding.nim | 86 ------------ .../lexer/{toktype.nim => tokkind.nim} | 6 +- src/noether/lexer/tokstream.nim | 71 +++++----- src/noether/lib/io.nim | 7 + 10 files changed, 208 insertions(+), 217 deletions(-) create mode 100644 lang/demo/single_toks.no delete mode 100644 src/noether/lexer/lstream.nim create mode 100644 src/noether/lexer/tokbuilder.nim delete mode 100644 src/noether/lexer/tokbuilding.nim rename src/noether/lexer/{toktype.nim => tokkind.nim} (93%) create mode 100644 src/noether/lib/io.nim diff --git a/lang/demo/single_toks.no b/lang/demo/single_toks.no new file mode 100644 index 0000000..683090a --- /dev/null +++ b/lang/demo/single_toks.no @@ -0,0 +1,2 @@ +[a]b(#) +(c)d[e] diff --git a/src/ddemo b/src/ddemo index 35c7af2..af30039 100755 --- a/src/ddemo +++ b/src/ddemo @@ -1,3 +1,11 @@ #!/usr/bin/env bash +set -e + +if [ -z "$1" ]; then + echo "Usage: ddemo DEMOFILE" + echo "Demo files are located in lang/demo" + exit 1 +fi + nim c nlx.nim ./nlx ../lang/demo/$1 diff --git a/src/nlx.nim b/src/nlx.nim index 75f59d4..adf95f0 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,22 +1,19 @@ import os -import noether/lexer/tok -import noether/lexer/tokstream -import noether/parser/parser +import noether/lib/io +import noether/lexer/[tok, tokstream] +# import noether/parser/parser {.hint: "Don't forget to drink more water (^_^)".} when isMainModule: echo "Noether Lang Extras v0.1.0 - nlx" - if paramCount() > 0: - let filename = paramStr(1) - var tokStream = newTokStream(filename, isFile=true) - - # # DumpTok - # while tokStream.progress(): - # echo tokStream.currTok + var inStream = if paramCount() > 0: streamFile(paramStr 1) + else: streamString(readAll stdin) - # DumpTree - discard parse(tokStream) - - else: - echo "usage: nlx filename" + var stream = newTokStream(inStream) + # # DumpTok + while stream.progress(): + echo stream.tok + + # DumpTree + # discard parse(tokStream) diff --git a/src/noether/lexer/lstream.nim b/src/noether/lexer/lstream.nim deleted file mode 100644 index 44138e0..0000000 --- a/src/noether/lexer/lstream.nim +++ /dev/null @@ -1,66 +0,0 @@ -import std/streams -import std/options - -import tok -export tok - -type - # Character streaming for the nlTokStream - nlLStream = object - stream: Stream - # row/column positions - line*: string - lineNum*: Natural - pos*: Natural - -proc streamFile*(filename: string): FileStream = - result = newFileStream(filename, fmRead) - -proc streamString*(str: string): StringStream = - result = newStringStream(str) - -proc newLStream*(content: string, isFile: bool = false): nlLStream = - result = nlLStream( - stream: if isFile: streamFile(content) else: streamString(content), - line: "", - lineNum: Natural 0, - pos: Natural 0, - ) - -# Checks whether we've reached EOL -# NOTE: also checks if we've surpassed it (ie invalid lstream.pos) -proc atEOL*(lstream: nlLStream): bool = - result = (lstream.pos >= lstream.line.len - 1) - -# Checks whether we are EXACTLY at EOL, but not surpassed -proc exactlyEOL*(lstream: nlLStream): bool = - result = (lstream.pos == lstream.line.len - 1) - -# Checks whether we have surpassed EOL -proc outOfBounds*(lstream: nlLStream): bool = - result = (lstream.pos > lstream.line.len - 1) - -# Progress the lex stream to the next line (if available) -proc progressLine*(lstream: var nlLStream): bool = - if lstream.stream.readLine(lstream.line): - inc lstream.lineNum - lstream.pos = Natural 0 - return true - return false - -# Progress the lex stream to the next character in the line -# forcefully (aka does NOT check if we reached EOL) -proc forceProgressChar*(lstream: var nlLStream) = - inc lstream.pos - -# # Progress the lex stream to the next character (if available) -# proc progressChar*(lstream: var nlLStream): bool = -# if not lstream.atEOL(): -# lstream.forceProgressChar() -# result = true -# else: -# # attempt to progress next line past EOL -# result = lstream.progressLine() - -proc currChar*(lstream: nlLStream): char = - result = lstream.line[lstream.pos] diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index 7715b8f..08aba66 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -1,22 +1,25 @@ -include toktype +include tokkind type - nlTok* = object - tKind*: nlTokKind - lit*: string - lineNum*: Natural - startPos*: Natural - endPos*: Natural + nlTok* = tuple + # NOTE: nlTokBuilder will mutate nlTok.kind + kind: nlTokKind + lit: string + lineNum: int + startPos: int + endPos: int # Generates an "empty" nlTok with only a startPos, # all other fields are expected to be filled out later. -proc emptyTok*(startPos: int): nlTok = - result = nlTok( - tKind: tkNONE, +proc emptyTok*(startPos: int): nlTok {.inline.} = + result = ( + kind: tkNONE, lit: "", - startPos: Natural startPos, + lineNum: 0, + startPos: startPos, + endPos: startPos, ) # Checks if an nlTok has tkNONE -proc isUntyped*(tKind: nlTokKind): bool = - result = (tKind == tkNONE) +proc isUntyped*(tok: nlTok): bool {.inline.} = + result = (tok.kind == tkNONE) diff --git a/src/noether/lexer/tokbuilder.nim b/src/noether/lexer/tokbuilder.nim new file mode 100644 index 0000000..357841a --- /dev/null +++ b/src/noether/lexer/tokbuilder.nim @@ -0,0 +1,123 @@ +import + streams, + options + +import tok +export tok + +type + # Abstracts the "building process" (lexing) + # of nlTok objects from a given Stream of characters. + nlTokBuilder* = object + stream: Stream + tok: nlTok # the build token + # track line number, line content, etc + line: string + lineNum: int + pos: int + # save char and pos and its token type + char: char + cTKind: nlTokKind + +proc atEOL(builder: nlTokBuilder): bool {.inline.} = + result = (builder.char == '\n') +proc atEOF(builder: nlTokBuilder): bool {.inline.} = + result = (builder.char == '\0') + +# Initialise a new token builder +proc newBuilder(stream: var Stream): nlTokBuilder = + # NOTE: initial builder.char value is arbitrary, + # NOTE: but CANNOT be initialised to the default '\0' + result = nlTokBuilder( + stream: stream, + tok: emptyTok(0), + line: "", + lineNum: 1, + pos: -1, # after initial readChar this -> 0 + char: '\0', # use \0 as initial invalid char + ) + + +#[ ====================================================== ] + | nlTokBuilder Internal Interface for Token Construction ] + ]# + +# Reset the build token to be "empty" +proc resetBuild(builder: var nlTokBuilder) = + builder.tok = emptyTok(builder.pos) + +# "Finishes" the build token by setting various properties +proc finishBuild(builder: var nlTokBuilder) = + builder.tok.lineNum = builder.lineNum + builder.tok.endPos = builder.pos + builder.tok.lit = builder.line[builder.tok.startPos ..< builder.line.high] + +# Finish, return, and reset the build token +proc flushBuild(builder: var nlTokBuilder): nlTok = + echo "Flush @", builder.pos + finishBuild(builder) + result = builder.tok + resetBuild(builder) + +# Is the build token "compatible" with the current char? +# NOTE: flushBuild() is called if incompatible +proc isCompatibleBuild(builder: nlTokBuilder): bool = + result = (builder.cTKind == builder.tok.kind) + +# Inherit the build token's type from current char +proc inherit(builder: var nlTokBuilder) = + builder.tok.kind = builder.cTKind + +# Add a character to the nlTokBuilder's build token. +# Flushes and returns the build token if "fully built", +# and a boolean indicating whether the nlTokBuilder can progress. +proc appendBuild(builder: var nlTokBuilder, flushed: var Option[nlTok]): bool = + # untyped build tokens inherit type immediately + if builder.tok.isUntyped(): + builder.inherit() + + # check if EOF reached + # if builder.atEOL(): + # echo "EOL DETECT 1" + # result = false # DO NOT PROGRESS + # flushed = some(flushBuild(builder)) + # check character and build token compatability + if not isCompatibleBuild(builder): + # flush old build token, the new one inherits type + flushed = some(flushBuild(builder)) + builder.inherit() + result = true # can progress + else: + flushed = none(nlTok) + result = true # can progress + +#[ ========================================== ] + | nlTokBuilder Char Stream Reading Interface ] + ]# + +# Read the next char in the stream without +# checking whether it is safe to do so +proc forceReadChar(builder: var nlTokBuilder) {.inline.} = + echo "read" + inc builder.pos + builder.char = builder.stream.readChar() + builder.cTKind = getTokKind(builder.char) + builder.line.add(builder.char) + +# Read the next char in the stream +# NOTE: readChar raises IOError on error, returns \0 on EOF +proc readChar(builder: var nlTokBuilder): bool = + if builder.atEOL(): + echo "EOL DETECT 2" + inc builder.lineNum + # sets builder.char to '\0' if EOF + builder.forceReadChar() + result = builder.atEOF() + +# Read until EOL and return the current line +# NOTE: Does NOT update the builder's state (unsafe) +# NOTE: ONLY call if a lex/parse error needs displaying +proc unsafeGetLine(builder: var nlTokBuilder): string = + while not builder.atEOL() and builder.readChar(): + discard + result = builder.line diff --git a/src/noether/lexer/tokbuilding.nim b/src/noether/lexer/tokbuilding.nim deleted file mode 100644 index 46a2222..0000000 --- a/src/noether/lexer/tokbuilding.nim +++ /dev/null @@ -1,86 +0,0 @@ -include lstream - -type - # Provides a stream-like interface for lexing nlToks - # Internally reliant on the functionality of nlLStream - nlTokStream* = object - lstream: nlLStream - build: nlTok # the build token - currTok*: nlTok # the current token - closed: bool # EOF + all tokens built - -# Generates an EOL token for the nlTokStream's state -proc EOLTok(tokStream: nlTokStream): nlTok = - result = nlTok( - tKind: tkEOL, - lit: "\0", - lineNum: Natural tokStream.lstream.lineNum, - startPos: Natural tokStream.lstream.pos, - endPos: Natural tokStream.lstream.pos, - ) - -# Resets the build token to an "empty" nlTok -proc resetBuild(tokStream: var nlTokStream) = - tokStream.build = emptyTok(tokStream.lstream.pos) - -# Completes a token generated by emptyTok() -# based on the nlTokStream's nlLStream's -# current line and character positions -proc finishBuild(ts: var nlTokStream) = - ts.build.lineNum = Natural ts.lstream.lineNum - ts.build.endPos = Natural ts.lstream.pos - ts.build.lit = ts.lstream.line[ts.build.startPos ..< ts.build.endPos] - -# Returns the nlTokStream's build token and -# empties the build token's contents. -proc flushBuild(tokStream: var nlTokStream): nlTok = - finishBuild(tokStream) - result = tokStream.build - resetBuild(tokStream) - -# Returns whether the build token has a set type yet. -# This indicates that the build token should inherit -# the nlTokKind of the nlLStream's next character. -proc isUntypedBuild(tokStream: nlTokStream): bool = - result = tokStream.build.tKind.isUntyped() - -# Check whether an nlTokKind is "compatible" with the build token. -# NOTE: flushBuild() should be called when an incompatible token is discovered. -proc isCompatibleBuild(tokStream: nlTokStream, tKind: nlTokKind): bool = - result = (tKind == tokStream.build.tKind) - -# Add a character to the nlTokStream's build token. -# Flushes and returns the build token if "fully built", -# and a boolean indicating whether the nlTokStream can progress. -proc progressBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = - # the "pos > EOL" invalid state is used intentionally - # to indicate all tokens have been built, and return EOL Token - if tokStream.lstream.outOfBounds(): - buildTok = some(EOLTok(tokStream)) - return true # can progress once more - - let tKind = getTokType(tokStream.lstream.currChar()) - # untyped build tokens must inherited immediately - if isUntypedBuild(tokStream): - tokStream.build.tKind = tKind - - # check if EOL reached - if tokStream.lstream.atEOL(): - # flush old build token, the new one can be left untyped - let compatible = isCompatibleBuild(tokStream, tKind) - result = false # DO NOT PROGRESS - if compatible: - # force the lstream into an invalid state by progressing beyond EOL - # we can then detect this state on the next progressBuild and return - # an EOL character (very unsafe implementation but it works well) - tokStream.lstream.forceProgressChar() - buildTok = some(flushBuild(tokStream)) - # check character and build token compatability - elif not isCompatibleBuild(tokStream, tKind): - # flush old build token, the new one inherits type - buildTok = some(flushBuild(tokStream)) - tokStream.build.tKind = tKind - result = true # can progress - else: - buildTok = none(nlTok) - result = true # can progress diff --git a/src/noether/lexer/toktype.nim b/src/noether/lexer/tokkind.nim similarity index 93% rename from src/noether/lexer/toktype.nim rename to src/noether/lexer/tokkind.nim index 49add5b..3d1d7b6 100644 --- a/src/noether/lexer/toktype.nim +++ b/src/noether/lexer/tokkind.nim @@ -29,9 +29,11 @@ type tkHASH, # # Number Sign (Hashtag) # Classifies a character to its nlTokKind -proc getTokType*(c: char): nlTokKind = +proc getTokKind*(c: char): nlTokKind = case c: - of '\0', '\r', '\n': + of '\0': + result = tkEOF + of '\r', '\n': result = tkEOL of ' ', '\t': result = tkWTSP diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim index 02a045e..309e9bb 100644 --- a/src/noether/lexer/tokstream.nim +++ b/src/noether/lexer/tokstream.nim @@ -1,52 +1,53 @@ -include tokbuilding +include tokbuilder + +type + # Provides a stream-like interface for lexing. + # Implemented as a wrapper for nlTokBuilder. + nlTokStream* = object + builder: nlTokBuilder + tok*: nlTok # the current token + isClosed: bool # EOF + all tokens built # Initialises a new nlTokStream on a string or file -proc newTokStream*(content: string, isFile: bool = false): nlTokStream = +proc newTokStream*(stream: var Stream): nlTokStream = result = nlTokStream( - lstream: newLStream(content, isFile=isFile), - closed: false, + builder: newBuilder(stream), + tok: emptyTok(0), + isClosed: false, ) - # 1. initialise an empty build token - # 2. progress to the first line - result.resetBuild() - discard result.lstream.progressLine() - -# Defines a short-hand notation for getting the current line -proc line*(tokStream: nlTokStream): string = - result = tokStream.lstream.line - -# Reimplements nlLStream.progressChar for nlTokStream -# to account for additional structure (ie the build token) -# NOTE: progressChar progresses to lstream's next char -proc progressChar(tokStream: var nlTokStream): bool = - if not tokStream.lstream.atEOL(): - tokStream.lstream.forceProgressChar() - result = true - else: - # attempt to progress to next line past EOL - result = tokStream.lstream.progressLine() - tokStream.resetBuild() +# Expose a subset of the nlTokBuilder interface +proc line*(stream: nlTokStream): string = + result = stream.builder.line +proc atEOL*(stream: nlTokStream): bool = + result = stream.builder.atEOL() + # Generates and progress the next token in the nlTokStream. # via repeatedly calling progressBuild() and progressChar(). # Returns a boolean indicating whether EOF has been reached. -# NOTE: access the new token via `tokStream.tok` -proc progress*(tokStream: var nlTokStream): bool = +# NOTE: access the new token via `stream.tok` +proc progress*(stream: var nlTokStream): bool = # Return prematurely if already closed - if tokStream.closed: + if stream.isClosed: return false while true: + # echo "\nProgressing..." var flushedTok: Option[nlTok] let - canProgress = tokStream.progressBuild(flushedTok) - buildComplete = flushedTok.isSome + atEOF = stream.builder.readChar() + newTokBuilt = flushedTok.isSome + discard stream.builder.appendBuild(flushedTok) + echo flushedTok + echo "atEOF: ", atEOF, "\nnewTokBuilt: ", newTokBuilt # canProgress & EOF reached => no more tokens to build :) # NOTE: reachedEOF and not canProgress => more tokens unwrapping - if buildComplete: + if newTokBuilt: # return the finished build token, and save it as the current token - tokStream.currTok = flushedTok.get() - if canProgress and not tokStream.progressChar(): - tokStream.closed = true - return buildComplete - elif buildComplete: + stream.tok = flushedTok.get() + # if canProgress and atEOF: + if atEOF: + if newTokBuilt: + stream.isClosed = true + return newTokBuilt + elif newTokBuilt: return true diff --git a/src/noether/lib/io.nim b/src/noether/lib/io.nim new file mode 100644 index 0000000..c7eb0eb --- /dev/null +++ b/src/noether/lib/io.nim @@ -0,0 +1,7 @@ +import std/streams + +proc streamFile*(filename: string): Stream {.inline.} = + result = newFileStream(filename, fmRead) + +proc streamString*(str: string): Stream {.inline.} = + result = newStringStream(str)