YALR (Yet Another Lexer Refactor)

This commit is contained in:
Emile Clark-Boman 2025-06-19 08:48:31 +10:00
parent 72a6075123
commit 99db57dcfd
10 changed files with 208 additions and 217 deletions

2
lang/demo/single_toks.no Normal file
View file

@ -0,0 +1,2 @@
[a]b(#)
(c)d[e]

View file

@ -1,3 +1,11 @@
#!/usr/bin/env bash
# Build nlx and run it against one of the demo files in lang/demo.
# Usage: ddemo DEMOFILE
set -e

# Refuse to run without a demo file name.
if [ -z "$1" ]; then
  echo "Usage: ddemo DEMOFILE"
  echo "Demo files are located in lang/demo"
  exit 1
fi

nim c nlx.nim
# Quote the argument so a name containing spaces or glob characters
# reaches nlx as a single, unexpanded path.
./nlx "../lang/demo/$1"

View file

@ -1,22 +1,19 @@
import os
import noether/lexer/tok
import noether/lexer/tokstream
import noether/parser/parser
import noether/lib/io
import noether/lexer/[tok, tokstream]
# import noether/parser/parser
{.hint: "Don't forget to drink more water (^_^)".}
# Entry point of the nlx tool; the body only runs when this file is the
# main module being compiled.
# NOTE(review): this hunk shows removed and added diff lines interleaved
# without markers, so both the `tokStream`/`parse` path and the
# `inStream`/`stream` token dump appear below - confirm against the
# repository which lines belong to the current version.
when isMainModule:
echo "Noether Lang Extras v0.1.0 - nlx"
if paramCount() > 0:
let filename = paramStr(1)
var tokStream = newTokStream(filename, isFile=true)
# Lex the file named by the first argument, or stdin when none is given.
var inStream = if paramCount() > 0: streamFile(paramStr 1)
else: streamString(readAll stdin)
# # DumpTok
# while tokStream.progress():
# echo tokStream.currTok
var stream = newTokStream(inStream)
# # DumpTok
# Print every token until progress() reports that none are left.
while stream.progress():
echo stream.tok
# DumpTree
discard parse(tokStream)
else:
echo "usage: nlx filename"
# DumpTree
# discard parse(tokStream)

View file

@ -1,66 +0,0 @@
import std/streams
import std/options
import tok
export tok
type
# Character streaming for the nlTokStream.
# Reads the underlying stream one line at a time and tracks a cursor
# inside the current line.
nlLStream = object
# source of characters; read via readLine in progressLine
stream: Stream
# row/column positions
# text of the line most recently read by progressLine
line*: string
# incremented each time progressLine reads a line
lineNum*: Natural
# index into `line` of the current character (see currChar)
pos*: Natural
proc streamFile*(filename: string): FileStream =
  ## Opens `filename` for reading and returns it as a `FileStream`.
  ##
  ## Raises `IOError` when the file cannot be opened. `newFileStream`
  ## returns `nil` in that case, which would only surface later as a nil
  ## dereference on the first read.
  result = openFileStream(filename, fmRead)
proc streamString*(str: string): StringStream =
  ## Wraps `str` in an in-memory `StringStream`.
  newStringStream(str)
proc newLStream*(content: string, isFile: bool = false): nlLStream =
  ## Builds an `nlLStream` over `content`.
  ##
  ## When `isFile` is true `content` is passed to `streamFile`, otherwise
  ## it is passed to `streamString`. The line buffer starts empty and both
  ## the line number and the column start at 0.
  let source = if isFile: streamFile(content) else: streamString(content)
  result = nlLStream(
    stream: source,
    line: "",
    lineNum: Natural 0,
    pos: Natural 0,
  )
proc atEOL*(lstream: nlLStream): bool =
  ## True when the cursor sits on the last character of the current line
  ## or has already moved past it (an invalid `pos`).
  lstream.pos >= lstream.line.high
proc exactlyEOL*(lstream: nlLStream): bool =
  ## True only when the cursor sits exactly on the last character of the
  ## current line, not beyond it.
  lstream.pos == lstream.line.high
proc outOfBounds*(lstream: nlLStream): bool =
  ## True when the cursor has moved past the last character of the
  ## current line.
  lstream.pos > lstream.line.high
proc progressLine*(lstream: var nlLStream): bool =
  ## Reads the next line of the underlying stream into `lstream.line`.
  ##
  ## Returns true when a line was read; in that case the line number is
  ## incremented and the column is reset to 0. Returns false otherwise.
  result = lstream.stream.readLine(lstream.line)
  if result:
    inc lstream.lineNum
    lstream.pos = Natural 0
proc forceProgressChar*(lstream: var nlLStream) =
  ## Moves the cursor one column to the right unconditionally; no check
  ## is made for having reached the end of the line.
  lstream.pos.inc()
# # Progress the lex stream to the next character (if available)
# proc progressChar*(lstream: var nlLStream): bool =
# if not lstream.atEOL():
# lstream.forceProgressChar()
# result = true
# else:
# # attempt to progress next line past EOL
# result = lstream.progressLine()
proc currChar*(lstream: nlLStream): char =
  ## The character of the current line found at the cursor position.
  lstream.line[lstream.pos]

View file

@ -1,22 +1,25 @@
include toktype
include tokkind
# A lexed token: its kind, literal text and position.
# NOTE(review): this hunk shows two definitions of nlTok interleaved
# without diff markers (an object with exported fields and a tuple) -
# confirm against the repository which one is current.
type
nlTok* = object
tKind*: nlTokKind
lit*: string
lineNum*: Natural
startPos*: Natural
endPos*: Natural
nlTok* = tuple
# NOTE: nlTokBuilder will mutate nlTok.kind
kind: nlTokKind
# literal text of the token
lit: string
# line number recorded when the token is finished
lineNum: int
# position at which the token starts
startPos: int
# position recorded when the token is finished
endPos: int
# Generates an "empty" nlTok with only a startPos,
# all other fields are expected to be filled out later.
# NOTE(review): removed and added diff lines are interleaved below (two
# signatures, object-style and tuple-style construction) - confirm
# against the repository which lines belong to the current version.
proc emptyTok*(startPos: int): nlTok =
result = nlTok(
tKind: tkNONE,
proc emptyTok*(startPos: int): nlTok {.inline.} =
result = (
# tkNONE marks the token as not yet typed (see isUntyped)
kind: tkNONE,
lit: "",
startPos: Natural startPos,
lineNum: 0,
startPos: startPos,
# the end starts out equal to the start
endPos: startPos,
)
# Checks if an nlTok has tkNONE
proc isUntyped*(tKind: nlTokKind): bool =
  ## True when `tKind` is the placeholder kind `tkNONE`.
  tKind == tkNONE
proc isUntyped*(tok: nlTok): bool {.inline.} =
  ## True while `tok` still carries the placeholder kind `tkNONE`.
  tok.kind == tkNONE

View file

@ -0,0 +1,123 @@
import
streams,
options
import tok
export tok
type
# Abstracts the "building process" (lexing)
# of nlTok objects from a given Stream of characters.
nlTokBuilder* = object
# source of characters; read one char at a time by forceReadChar
stream: Stream
tok: nlTok # the build token
# track line number, line content, etc
# every char read is appended here by forceReadChar
# NOTE(review): nothing visible in this file clears `line` or resets
# `pos` at a newline - confirm whether that is intended
line: string
# incremented by readChar when the previously read char was '\n'
lineNum: int
# incremented once per char read
pos: int
# save char and pos and its token type
# the most recently read char
char: char
# getTokKind(char), refreshed on every read
cTKind: nlTokKind
proc atEOL(builder: nlTokBuilder): bool {.inline.} =
  ## True when the most recently read char is a newline.
  builder.char == '\n'
proc atEOF(builder: nlTokBuilder): bool {.inline.} =
  ## True when the most recently read char is NUL, which is what the
  ## stream yields once it is exhausted.
  builder.char == '\0'
proc newBuilder(stream: Stream): nlTokBuilder =
  ## Initialises a new token builder that lexes characters from `stream`.
  ##
  ## `stream` is a `ref`, so it is accepted by value; this also allows a
  ## freshly constructed stream to be passed without a named variable.
  # The initial `char` is only a placeholder that the first read
  # overwrites. '\0' is a safe choice: starting on '\n' would make the
  # first readChar() bump lineNum before any input had been consumed.
  result = nlTokBuilder(
    stream: stream,
    tok: emptyTok(0),
    line: "",
    lineNum: 1,
    pos: -1, # the first read increments this to 0
    char: '\0',
  )
#[ ====================================================== ]
| nlTokBuilder Internal Interface for Token Construction ]
]#
proc resetBuild(builder: var nlTokBuilder) =
  ## Replaces the build token with an empty one that starts at the
  ## builder's current position.
  let start = builder.pos
  builder.tok = emptyTok(start)
proc finishBuild(builder: var nlTokBuilder) =
  ## Completes the build token: records the current line number and
  ## position, and copies the literal out of the line buffer.
  builder.tok.lineNum = builder.lineNum
  builder.tok.endPos = builder.pos
  # the last buffered char is excluded from the literal
  let litEnd = builder.line.high
  builder.tok.lit = builder.line[builder.tok.startPos ..< litEnd]
proc flushBuild(builder: var nlTokBuilder): nlTok =
  ## Finishes the build token, hands it back to the caller and leaves a
  ## fresh empty build token in its place.
  echo "Flush @", builder.pos
  builder.finishBuild()
  result = builder.tok
  builder.resetBuild()
proc isCompatibleBuild(builder: nlTokBuilder): bool =
  ## True when the current char has the same kind as the build token.
  ## NOTE: appendBuild flushes the build token when this is false.
  builder.tok.kind == builder.cTKind
proc inherit(builder: var nlTokBuilder) =
  ## Gives the build token the kind of the current char.
  let kindOfChar = builder.cTKind
  builder.tok.kind = kindOfChar
proc appendBuild(builder: var nlTokBuilder, flushed: var Option[nlTok]): bool =
  ## Feeds the current char into the build token.
  ##
  ## `flushed` receives the finished token when the current char's kind
  ## differs from the build token's kind, and `none` otherwise. The
  ## return value says whether the builder can progress; it is always
  ## true here (a separate end-of-line stop is currently disabled).
  # an untyped build token takes the kind of the current char first
  if builder.tok.isUntyped():
    builder.inherit()
  result = true # can progress
  if isCompatibleBuild(builder):
    flushed = none(nlTok)
    return
  # kinds differ: flush the old build token; the new one inherits
  # the kind of the current char
  flushed = some(flushBuild(builder))
  builder.inherit()
#[ ========================================== ]
| nlTokBuilder Char Stream Reading Interface ]
]#
proc forceReadChar(builder: var nlTokBuilder) {.inline.} =
  ## Reads one char from the stream without any safety check, advances
  ## the position, stores the char together with its token kind and
  ## appends it to the line buffer.
  echo "read"
  builder.pos.inc
  let next = builder.stream.readChar()
  builder.char = next
  builder.cTKind = getTokKind(next)
  builder.line.add(next)
proc readChar(builder: var nlTokBuilder): bool =
  ## Reads the next char of the stream and returns true when that read
  ## hit EOF.
  ## NOTE: the stream's readChar raises IOError on error and returns
  ## '\0' on EOF.
  # the previously read char ended a line, so a new line begins now
  if atEOL(builder):
    echo "EOL DETECT 2"
    builder.lineNum.inc
  # leaves builder.char as '\0' on EOF
  forceReadChar(builder)
  atEOF(builder)
proc unsafeGetLine(builder: var nlTokBuilder): string =
  ## Reads ahead until the current char is a newline or EOF is reached,
  ## then returns the contents of the builder's line buffer.
  ##
  ## NOTE: the build token is not kept in sync while reading ahead
  ## (unsafe), so ONLY call this when a lex/parse error needs displaying.
  # readChar() returns true once EOF is reached, so keep reading only
  # while it returns false. The un-negated test stopped after a single
  # char and never terminated once the stream was exhausted.
  while not builder.atEOL() and not builder.readChar():
    discard
  result = builder.line

View file

@ -1,86 +0,0 @@
include lstream
type
# Provides a stream-like interface for lexing nlToks
# Internally reliant on the functionality of nlLStream
nlTokStream* = object
# line-oriented character source with a column cursor
lstream: nlLStream
build: nlTok # the build token
currTok*: nlTok # the current token
closed: bool # EOF + all tokens built
proc EOLTok(tokStream: nlTokStream): nlTok =
  ## Builds an EOL token located at the stream's current line and
  ## column; start and end positions are the same.
  let here = Natural tokStream.lstream.pos
  result = nlTok(
    tKind: tkEOL,
    lit: "\0",
    lineNum: Natural tokStream.lstream.lineNum,
    startPos: here,
    endPos: here,
  )
proc resetBuild(tokStream: var nlTokStream) =
  ## Replaces the build token with an empty one that starts at the
  ## current column of the character stream.
  let column = tokStream.lstream.pos
  tokStream.build = emptyTok(column)
proc finishBuild(ts: var nlTokStream) =
  ## Completes a token created by emptyTok(): records the current line
  ## number and column of the character stream, then copies the literal
  ## spanning startPos up to (not including) endPos from the line.
  ts.build.lineNum = Natural ts.lstream.lineNum
  ts.build.endPos = Natural ts.lstream.pos
  let span = ts.build.startPos ..< ts.build.endPos
  ts.build.lit = ts.lstream.line[span]
proc flushBuild(tokStream: var nlTokStream): nlTok =
  ## Finishes the build token, returns it, and leaves an empty build
  ## token in its place.
  tokStream.finishBuild()
  result = tokStream.build
  tokStream.resetBuild()
proc isUntypedBuild(tokStream: nlTokStream): bool =
  ## True while the build token has no kind assigned yet, meaning it
  ## should inherit the kind of the character stream's next character.
  isUntyped(tokStream.build.tKind)
proc isCompatibleBuild(tokStream: nlTokStream, tKind: nlTokKind): bool =
  ## True when `tKind` matches the kind of the build token.
  ## NOTE: flushBuild() should be called when an incompatible kind is
  ## discovered.
  tokStream.build.tKind == tKind
# Add a character to the nlTokStream's build token.
# Flushes and returns the build token if "fully built",
# and a boolean indicating whether the nlTokStream can progress.
# `buildTok` receives the flushed token (or an EOL token), or none when
# the build token is still being extended.
# NOTE(review): indentation was lost in this view, so the nesting of the
# statements inside the EOL branch cannot be read off the text - confirm
# against the repository.
proc progressBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool =
# the "pos > EOL" invalid state is used intentionally
# to indicate all tokens have been built, and return EOL Token
if tokStream.lstream.outOfBounds():
buildTok = some(EOLTok(tokStream))
return true # can progress once more
# classify the character under the cursor
let tKind = getTokType(tokStream.lstream.currChar())
# untyped build tokens must inherit a kind immediately
if isUntypedBuild(tokStream):
tokStream.build.tKind = tKind
# check if EOL reached
if tokStream.lstream.atEOL():
# flush old build token, the new one can be left untyped
let compatible = isCompatibleBuild(tokStream, tKind)
result = false # DO NOT PROGRESS
if compatible:
# force the lstream into an invalid state by progressing beyond EOL
# we can then detect this state on the next progressBuild and return
# an EOL character (very unsafe implementation but it works well)
tokStream.lstream.forceProgressChar()
buildTok = some(flushBuild(tokStream))
# check character and build token compatibility
elif not isCompatibleBuild(tokStream, tKind):
# flush old build token, the new one inherits type
buildTok = some(flushBuild(tokStream))
tokStream.build.tKind = tKind
result = true # can progress
else:
# same kind as the build token: nothing is flushed yet
buildTok = none(nlTok)
result = true # can progress

View file

@ -29,9 +29,11 @@ type
tkHASH, # # Number Sign (Hashtag)
# Classifies a character to its nlTokKind
proc getTokType*(c: char): nlTokKind =
proc getTokKind*(c: char): nlTokKind =
case c:
of '\0', '\r', '\n':
of '\0':
result = tkEOF
of '\r', '\n':
result = tkEOL
of ' ', '\t':
result = tkWTSP

View file

@ -1,52 +1,53 @@
include tokbuilding
include tokbuilder
type
# Provides a stream-like interface for lexing.
# Implemented as a wrapper for nlTokBuilder.
nlTokStream* = object
# performs the character reading and token building
builder: nlTokBuilder
tok*: nlTok # the current token
isClosed: bool # EOF + all tokens built
# Initialises a new nlTokStream on a string or file
# NOTE(review): removed and added diff lines are interleaved below. The
# `content`/`isFile` signature with the lstream/closed fields and the
# two trailing calls, and the `stream: var Stream` signature with the
# builder/tok/isClosed fields, look like two versions of this proc -
# confirm against the repository which lines are current.
proc newTokStream*(content: string, isFile: bool = false): nlTokStream =
proc newTokStream*(stream: var Stream): nlTokStream =
result = nlTokStream(
lstream: newLStream(content, isFile=isFile),
closed: false,
builder: newBuilder(stream),
tok: emptyTok(0),
isClosed: false,
)
# 1. initialise an empty build token
# 2. progress to the first line
result.resetBuild()
discard result.lstream.progressLine()
proc line*(tokStream: nlTokStream): string =
  ## Short-hand for the current line held by the character stream.
  tokStream.lstream.line
# Reimplements nlLStream.progressChar for nlTokStream
# to account for additional structure (ie the build token)
# NOTE: progressChar progresses to lstream's next char
# Returns true when the cursor moved within the line; at EOL it returns
# whatever progressLine() reports.
# NOTE(review): indentation was lost in this view, so it cannot be read
# off the text whether resetBuild() belongs to the else branch - confirm
# against the repository.
proc progressChar(tokStream: var nlTokStream): bool =
if not tokStream.lstream.atEOL():
tokStream.lstream.forceProgressChar()
result = true
else:
# attempt to progress to next line past EOL
result = tokStream.lstream.progressLine()
tokStream.resetBuild()
# Expose a subset of the nlTokBuilder interface
proc line*(stream: nlTokStream): string =
  ## The text held in the underlying builder's line buffer.
  stream.builder.line
proc atEOL*(stream: nlTokStream): bool =
  ## Forwards to the underlying builder's end-of-line check.
  atEOL(stream.builder)
# Generates and progress the next token in the nlTokStream.
# via repeatedly calling progressBuild() and progressChar().
# Returns a boolean indicating whether EOF has been reached.
# NOTE: access the new token via `tokStream.tok`
# NOTE(review): removed and added diff lines are interleaved below (the
# `tokStream`/progressBuild/currTok lines and the
# `stream`/readChar/appendBuild/tok lines) - confirm against the
# repository which lines are current.
proc progress*(tokStream: var nlTokStream): bool =
# NOTE: access the new token via `stream.tok`
proc progress*(stream: var nlTokStream): bool =
# Return prematurely if already closed
if tokStream.closed:
if stream.isClosed:
return false
while true:
# echo "\nProgressing..."
var flushedTok: Option[nlTok]
let
canProgress = tokStream.progressBuild(flushedTok)
buildComplete = flushedTok.isSome
atEOF = stream.builder.readChar()
# NOTE(review): `newTokBuilt` is bound here, before the
# appendBuild(flushedTok) call below has had a chance to fill
# `flushedTok`, so on that path it reads the freshly declared
# Option - it looks like it should be computed after appendBuild;
# confirm.
newTokBuilt = flushedTok.isSome
discard stream.builder.appendBuild(flushedTok)
echo flushedTok
echo "atEOF: ", atEOF, "\nnewTokBuilt: ", newTokBuilt
# canProgress & EOF reached => no more tokens to build :)
# NOTE: reachedEOF and not canProgress => more tokens unwrapping
if buildComplete:
if newTokBuilt:
# return the finished build token, and save it as the current token
tokStream.currTok = flushedTok.get()
if canProgress and not tokStream.progressChar():
tokStream.closed = true
return buildComplete
elif buildComplete:
stream.tok = flushedTok.get()
# if canProgress and atEOF:
if atEOF:
if newTokBuilt:
stream.isClosed = true
return newTokBuilt
elif newTokBuilt:
return true

7
src/noether/lib/io.nim Normal file
View file

@ -0,0 +1,7 @@
import std/streams
proc streamFile*(filename: string): Stream {.inline.} =
  ## Opens `filename` for reading and returns it as a `Stream`.
  ##
  ## Raises `IOError` when the file cannot be opened. `newFileStream`
  ## returns `nil` in that case, which would only surface later as a nil
  ## dereference on the first read.
  result = openFileStream(filename, fmRead)
proc streamString*(str: string): Stream {.inline.} =
  ## Wraps `str` in an in-memory stream.
  return newStringStream(str)