Simple (shit) working lexer via nlTokStream

Currently only tested on strings, but it's highly extensible if you modify the getTokType mapping of chars to their nlTokType
This commit is contained in:
Emile Clark-Boman 2025-06-18 01:25:20 +10:00
parent edf164df90
commit 3ce9390be4
8 changed files with 224 additions and 185 deletions

2
lang/demo/math.no Normal file
View file

@ -0,0 +1,2 @@
"abc+def"
xy+z

View file

@ -1,4 +1,5 @@
import os import os
import noether/lexer/tok
import noether/lexer/tokstream import noether/lexer/tokstream
when isMainModule: when isMainModule:
@ -7,7 +8,10 @@ when isMainModule:
if paramCount() > 0: if paramCount() > 0:
let filename = paramStr(1) let filename = paramStr(1)
var tokStream = newTokStream(filename, isFile=true) var tokStream = newTokStream(filename, isFile=true)
for tok in toks(tokStream):
var tok: nlTok
while tokStream.nextTok(tok):
echo tok echo tok
else: else:
echo "usage: nlx filename" echo "usage: nlx filename"

View file

@ -1,7 +1,7 @@
import std/streams import std/streams
import std/options import std/options
include tok import tok
type type
# Character streaming for the nlTokStream # Character streaming for the nlTokStream
@ -10,15 +10,15 @@ type
# row/column positions # row/column positions
line*: string line*: string
lineNum*: Natural lineNum*: Natural
pos: Natural pos*: Natural
proc streamFile(filename: string): FileStream = proc streamFile*(filename: string): FileStream =
result = newFileStream(filename, fmRead) result = newFileStream(filename, fmRead)
proc streamString(str: string): StringStream = proc streamString*(str: string): StringStream =
result = newStringStream(str) result = newStringStream(str)
proc newLStream(content: string, isFile: bool = false): nlLStream = proc newLStream*(content: string, isFile: bool = false): nlLStream =
result = nlLStream( result = nlLStream(
stream: if isFile: streamFile(content) else: streamString(content), stream: if isFile: streamFile(content) else: streamString(content),
line: "", line: "",
@ -26,20 +26,40 @@ proc newLStream(content: string, isFile: bool = false): nlLStream =
pos: Natural 0, pos: Natural 0,
) )
# Checks whether we've reached EOL
# NOTE: also checks if we've surpassed it (ie invalid lstream.pos)
proc atEOL*(lstream: nlLStream): bool =
result = (lstream.pos >= lstream.line.len - 1)
# Checks whether we are EXACTLY at EOL, but not surpassed
proc exactlyEOL*(lstream: nlLStream): bool =
result = (lstream.pos == lstream.line.len - 1)
# Checks whether we have surpassed EOL
proc outOfBounds*(lstream: nlLStream): bool =
result = (lstream.pos > lstream.line.len - 1)
# Progress the lex stream to the next line (if available) # Progress the lex stream to the next line (if available)
proc progLine(lstream: var nlLStream): bool = proc progLine*(lstream: var nlLStream): bool =
if lstream.stream.readLine(lstream.line): if lstream.stream.readLine(lstream.line):
inc lstream.lineNum inc lstream.lineNum
lstream.pos = Natural 0 lstream.pos = Natural 0
return true return true
return false return false
proc currChar(lstream: nlLStream): char = # Progress the lex stream to the next character in the line
result = lstream.line[lstream.pos] # forcefully (aka does NOT check if we reached EOL)
proc forceProgChar*(lstream: var nlLStream) =
# NOTE: assumes lstream.line does NOT mutate while iterating
iterator iterChars(lstream: var nlLStream): Option[char] =
while lstream.pos < lstream.line.len:
inc lstream.pos inc lstream.pos
yield some(lstream.line[lstream.pos - 1])
yield none(char) # Progress the lex stream to the next character (if available)
proc progress*(lstream: var nlLStream): bool =
if not lstream.atEOL():
lstream.forceProgChar()
result = true
else:
# attempt to progress next line past EOL
result = lstream.progLine()
proc currChar*(lstream: nlLStream): char =
result = lstream.line[lstream.pos]

View file

@ -1,43 +1,16 @@
type include toktype
# nlTokType allows primitive nlToks to be typed,
# the nlTokType enum should never be directly
# accessed. Use the interface in this file instead.
# NOTE: NONE is used as a default value
# NOTE: it is very different to NTERM!
nlTokType = enum
NONE, # Placeholder Value
EOF, # EOF
TERM, # String \0 terminator
WORD, # Alphanumeric token
SYMB, # Symbolic token
LNFD, # \r \n Line-Feed
WTSP, # ' ' \t Whitespace
LPAR, # ( Left Parenthesis
RPAR, # ) Right Parenthesis
LBRA, # { Left Brace
RBRA, # } Right Brace
LSQB, # [ Left Square Bracket
RSQB, # ] Right Square Bracket
# LANB, # < Left Angle Bracket
# RANB, # > Right Angle Bracket
SQUO, # ' Single Quotation Marking
DQUO, # " Double Quotation Marking
GRVA, # ` Grave Accent
HASH, # # Number Sign (Hashtag)
nlTok = object type
nlTok* = object
tType*: nlTokType tType*: nlTokType
lit*: string lit*: string
line*: Natural lineNum*: Natural
startPos*: Natural startPos*: Natural
endPos*: Natural endPos*: Natural
# Generates an "empty" nlTok with only a startPos, # Generates an "empty" nlTok with only a startPos,
# all other fields are expected to be filled out later. # all other fields are expected to be filled out later.
# NOTE: tType initialised to nlTokType.NUL proc emptyTok*(startPos: int): nlTok =
# NOTE: lit initialised to empty string
# NOTE: all other fields are uninitialised
proc emptyTok(startPos: int): nlTok =
result = nlTok( result = nlTok(
tType: nlTokType.NONE, tType: nlTokType.NONE,
lit: "", lit: "",
@ -45,55 +18,23 @@ proc emptyTok(startPos: int): nlTok =
) )
# Checks if an nlTok has nlTokType.NONE # Checks if an nlTok has nlTokType.NONE
proc isTokUntyped(tType: nlTokType): bool = proc isTokUntyped*(tType: nlTokType): bool =
result = (tType == nlTokType.NONE) result = (tType == nlTokType.NONE)
# Checks if an nlTok has nlTokType.TERM # Checks if an nlTok has nlTokType.EOL
proc isTokTerm(tType: nlTokType): bool = proc isTokEOL*(tok: nlTok): bool =
result = (tType == nlTokType.TERM) result = (tok.tType == nlTokType.EOL)
# This method is only used to convert null # This method is only used to convert null
# terminator nlToks into line-feed ones. # terminator nlToks into line-feed ones.
# Returns a copy of an nlTok, changing its type # Returns a copy of an nlTok, changing its type
# NOTE: this is necessary because Nim handles proc tokTermToLineFeed*(tok: nlTok): nlTok =
# NOTE: strings in a useful but annoying way
proc tokTermToLineFeed(tok: nlTok): nlTok =
result = nlTok( result = nlTok(
tType: nlTokType.LNFD, tType: nlTokType.LNFD,
lit: tok.lit, lit: tok.lit,
line: tok.line, lineNum: tok.lineNum,
startPos: tok.startPos, startPos: tok.startPos,
endPos: tok.endPos, endPos: tok.endPos,
) )
# Classifies a character to its nlTokType
proc getTokType(c: char): nlTokType =
case c:
of '\0':
result = nlTokType.TERM
of '\r', '\n':
result = nlTokType.LNFD
of ' ', '\t':
result = nlTokType.WTSP
of '(':
result = nlTokType.LPAR
of ')':
result = nlTokType.RPAR
of '{':
result = nlTokType.LBRA
of '}':
result = nlTokType.RBRA
of '[':
result = nlTokType.LSQB
of ']':
result = nlTokType.RSQB
of '\'':
result = nlTokType.SQUO
of '\"':
result = nlTokType.DQUO
of '`':
result = nlTokType.GRVA
of '#':
result = nlTokType.HASH
else:
result = nlTokType.WORD

View file

@ -0,0 +1,84 @@
include lstream
type
  # Provides a stream-like interface for lexing nlToks
  # Internally reliant on the functionality of nlLStream:
  # the lstream supplies the current line, line number and position,
  # while the build token accumulates the token currently being lexed.
  nlTokStream = object
    lstream: nlLStream # underlying character stream being lexed
    build: nlTok # the build token
# Builds the EOL token describing the nlTokStream's current state.
# The line number is taken from the lstream, and both the start and
# end positions are set to the lstream's current position.
proc EOLTok*(tokStream: nlTokStream): nlTok =
  let
    currLine = Natural(tokStream.lstream.lineNum)
    currPos = Natural(tokStream.lstream.pos)
  nlTok(
    tType: nlTokType.EOL,
    lit: "\0",
    lineNum: currLine,
    startPos: currPos,
    endPos: currPos,
  )
# Replaces the build token with an "empty" nlTok that
# starts at the lstream's current position.
proc resetBuild(tokStream: var nlTokStream) =
  let startPos = tokStream.lstream.pos
  tokStream.build = emptyTok(startPos)
# Fills in the remaining fields of a build token created by emptyTok():
# the line number and end position come from the nlLStream's current
# state, and the literal is the slice of the current line spanning
# startPos (inclusive) up to endPos (exclusive).
proc finishBuild(ts: var nlTokStream) =
  let stopPos = Natural(ts.lstream.pos)
  ts.build.lineNum = Natural(ts.lstream.lineNum)
  ts.build.endPos = stopPos
  ts.build.lit = ts.lstream.line[ts.build.startPos ..< stopPos]
# Finalises the build token, hands it back to the caller,
# and leaves a fresh "empty" build token in its place.
proc flushBuild(tokStream: var nlTokStream): nlTok =
  tokStream.finishBuild()
  result = tokStream.build
  tokStream.resetBuild()
# Reports whether the build token is still untyped (nlTokType.NONE).
# An untyped build token is expected to inherit the nlTokType
# of the nlLStream's next character.
proc isUntypedBuild(tokStream: nlTokStream): bool =
  tokStream.build.tType.isTokUntyped()
# Reports whether an nlTokType is "compatible" with the build token,
# i.e. whether it equals the build token's own type.
# NOTE: flushBuild() should be called when an incompatible token is discovered.
proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
  tokStream.build.tType == tType
# Add the lstream's current character to the nlTokStream's build token.
# If this completes a token, it is flushed into buildTok as some(token);
# if the character simply extends the build token (and the end of the
# line has not been reached), buildTok is set to none.
# Returns a boolean indicating whether the nlTokStream can progress
# (true => the caller may advance the lstream to the next character/line,
# false => the caller must call progBuild again WITHOUT advancing).
proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool =
  # the "pos > EOL" invalid state is used intentionally
  # to indicate all tokens on the line have been built, so return an EOL token
  if tokStream.lstream.outOfBounds():
    buildTok = some(EOLTok(tokStream))
    return true # can progress once more

  # classify the character under the cursor
  let tType = getTokType(tokStream.lstream.currChar())
  # untyped build tokens must inherit a type immediately
  if isUntypedBuild(tokStream):
    tokStream.build.tType = tType

  # check if EOL reached (cursor is on the last character of the line)
  if tokStream.lstream.atEOL():
    # flush old build token, the new one can be left untyped
    let compatible = isCompatibleBuild(tokStream, tType)
    result = false # DO NOT PROGRESS
    if compatible:
      # force the lstream into an invalid state by progressing beyond EOL
      # we can then detect this state on the next progBuild and return
      # an EOL token (very unsafe implementation but it works well)
      tokStream.lstream.forceProgChar()
    # when incompatible the cursor is left on the last character, so the
    # flushed token excludes it and the next progBuild call handles it
    buildTok = some(flushBuild(tokStream))
  # check character and build token compatibility
  elif not isCompatibleBuild(tokStream, tType):
    # flush old build token, the new one inherits type
    buildTok = some(flushBuild(tokStream))
    tokStream.build.tType = tType
    result = true # can progress
  else:
    # character extends the current build token, nothing to emit yet
    buildTok = none(nlTok)
    result = true # can progress

View file

@ -1,104 +1,38 @@
include lstream include tokbuilding
type
# Provides a stream-like interface for lexing nlToks
# Internally reliant on the functionality of nlLStream
nlTokStream = object
lstream: nlLStream
build: nlTok # the current token we're building
# Resets the build token to an "empty" nlTok where
# only tType, lit, and startPos are initialised.
proc resetBuild(tokStream: var nlTokStream) =
tokStream.build = emptyTok(tokStream.lstream.pos)
# Completes a token generated by emptyTok()
# based on the nlTokStream's nlLStream's
# current line and character positions
proc finishBuild(tokStream: var nlTokStream) =
# if we've reached \0 terminator then forge the start
# and end positions to point OUTSIDE the line
let endPos = if isTokTerm(tokStream.build.tType):
inc tokStream.build.startPos;
tokStream.build.startPos
else: Natural tokStream.lstream.pos
tokStream.build.line = Natural tokStream.lstream.lineNum
tokStream.build.endPos = endPos
# Returns the nlTokStream's build token and
# empties the build token's contents.
proc flushBuild(tokStream: var nlTokStream): nlTok =
finishBuild(tokStream)
result = tokStream.build
resetBuild(tokStream)
# Returns whether the build token has a set type yet.
# This indicates that the build token should inherit
# the nlTokType of the nlLStream's next character.
proc isUntypedBuild(tokStream: nlTokStream): bool =
result = isTokUntyped(tokStream.build.tType)
# Check whether an nlTokType is "compatible" with
# the build token. flushBuild() should be called
# when an incompatible token is discovered.
proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
result = (tType == tokStream.build.tType)
# Add a character to the nlTokStream's build token.
# Returns a bool indicating if a new nlTok has been built
# or not. flushBuild should then be called.
proc appendBuild(tokStream: var nlTokStream, c: char): Option[nlTok] =
let tType = getTokType(c)
# check whether build token should inherit type
if isUntypedBuild(tokStream):
tokStream.build.tType = tType
# check character and build token compatability
elif not isCompatibleBuild(tokStream, tType):
# return flushed build token, and reset
result = some(flushBuild(tokStream))
# new build token is untyped so inherit type
tokStream.build.tType = tType
# check if \0 terminator reached
elif isTokTerm(tokStream.build.tType):
# return immediately to avoid concatinating '\0'
return some(flushBuild(tokStream))
# else return none to indicate no build was completed
else:
result = none(nlTok)
# ensure character is appended to the build token
tokStream.build.lit.add(c)
# Generates and returns the next token in the stream,
# result.tType == nlTokType.NTERM implies line ended
proc nextTok(tokStream: var nlTokStream): nlTok =
# try progress to next char, receives none option on failure
for optchar in iterChars(tokStream.lstream):
# unpack the Option[char], none => '\0'
let c = if optchar.isSome: optchar.get
else: '\0'
let opttok = appendBuild(tokStream, c)
if opttok.isSome:
return opttok.get
# NOTE: REACHING HERE SHOULD NEVER OCCUR
# Initialises a new nlTokStream on a string or file # Initialises a new nlTokStream on a string or file
proc newTokStream*(content: string, isFile: bool = false): nlTokStream = proc newTokStream*(content: string, isFile: bool = false): nlTokStream =
result = nlTokStream( result = nlTokStream(
lstream: newLStream(content, isFile=isFile), lstream: newLStream(content, isFile=isFile),
) )
resetBuild(result) # 1. initialise an empty build token
# 2. progress to the first line
result.resetBuild()
discard result.lstream.progLine()
# Allow the nlTokStream to be iterated # Reimplements nlLStream.progress() for nlTokStream
iterator toks*(tokStream: var nlTokStream): nlTok = # to account for additional structure (ie the build token)
var tok: nlTok proc progChar(tokStream: var nlTokStream): bool =
while progLine(tokStream.lstream): if not tokStream.lstream.atEOL():
tokStream.lstream.forceProgChar()
result = true
else:
# attempt to progress to next line past EOL
result = tokStream.lstream.progLine()
tokStream.resetBuild()
# Generates and sets (by reference) the next token in the stream,
# via repeatedly calling progBuild() and progChar().
# Returns a boolean indicating whether EOF has been reached.
# NOTE: progBuild adds lstream's current char to the build token
# NOTE: progChar progresses to lstream's next char
proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool =
while true: while true:
tok = nextTok(tokStream) var buildTok: Option[nlTok]
# \0 terminator means the line ended OR the file let canProgress = tokStream.progBuild(buildTok)
# has ended, so always yield a line-feed just in case # canProgress & progression failed => EOF reached
if isTokTerm(tok.tType): if canProgress and not tokStream.progChar():
yield tokTermToLineFeed(tok) return false
break elif buildTok.isSome:
yield tok tok = buildTok.get()
# we ONLY reach here on EOF return true
yield tok

View file

@ -0,0 +1,54 @@
type
  # nlTokType lets primitive nlToks carry a type.
  # The enum should never be accessed directly;
  # use the interface in this file instead.
  nlTokType* = enum
    NONE, # Placeholder Value
    EOF,  # End of File
    EOL,  # End of Line (\0 --> EOL)
    WORD, # Alphanumeric token
    SYMB, # Symbolic token
    LNFD, # \r \n Line-Feed
    WTSP, # ' ' \t Whitespace
    LPAR, # (  Left Parenthesis
    RPAR, # )  Right Parenthesis
    LBRA, # {  Left Brace
    RBRA, # }  Right Brace
    LSQB, # [  Left Square Bracket
    RSQB, # ]  Right Square Bracket
    # LANB, # <  Left Angle Bracket
    # RANB, # >  Right Angle Bracket
    SQUO, # '  Single Quotation Marking
    DQUO, # "  Double Quotation Marking
    GRVA, # `  Grave Accent
    HASH, # #  Number Sign (Hashtag)

# Maps a single character onto its nlTokType.
# NOTE: '\0', '\r' and '\n' are all reported as EOL.
# NOTE: every character without an explicit mapping (including symbols
# NOTE: such as '+') is reported as WORD; SYMB is never produced here.
proc getTokType*(c: char): nlTokType =
  case c
  of '\0', '\r', '\n': nlTokType.EOL
  of ' ', '\t': nlTokType.WTSP
  of '(': nlTokType.LPAR
  of ')': nlTokType.RPAR
  of '{': nlTokType.LBRA
  of '}': nlTokType.RBRA
  of '[': nlTokType.LSQB
  of ']': nlTokType.RSQB
  of '\'': nlTokType.SQUO
  of '"': nlTokType.DQUO
  of '`': nlTokType.GRVA
  of '#': nlTokType.HASH
  else: nlTokType.WORD