Simple (shit) working lexer via nlTokStream

Currently only tested on strings, but it's highly extensible if you modify the getTokType mapping of chars to their nlTokType
This commit is contained in:
Emile Clark-Boman 2025-06-18 01:25:20 +10:00
parent edf164df90
commit 3ce9390be4
8 changed files with 224 additions and 185 deletions

2
lang/demo/math.no Normal file
View file

@ -0,0 +1,2 @@
"abc+def"
xy+z

View file

@ -1,4 +1,5 @@
import os
import noether/lexer/tok
import noether/lexer/tokstream
when isMainModule:
@ -7,7 +8,10 @@ when isMainModule:
if paramCount() > 0:
let filename = paramStr(1)
var tokStream = newTokStream(filename, isFile=true)
for tok in toks(tokStream):
var tok: nlTok
while tokStream.nextTok(tok):
echo tok
else:
echo "usage: nlx filename"

View file

@ -1,7 +1,7 @@
import std/streams
import std/options
include tok
import tok
type
# Character streaming for the nlTokStream
@ -10,15 +10,15 @@ type
# row/column positions
line*: string
lineNum*: Natural
pos: Natural
pos*: Natural
proc streamFile(filename: string): FileStream =
proc streamFile*(filename: string): FileStream =
result = newFileStream(filename, fmRead)
proc streamString(str: string): StringStream =
proc streamString*(str: string): StringStream =
result = newStringStream(str)
proc newLStream(content: string, isFile: bool = false): nlLStream =
proc newLStream*(content: string, isFile: bool = false): nlLStream =
result = nlLStream(
stream: if isFile: streamFile(content) else: streamString(content),
line: "",
@ -26,20 +26,40 @@ proc newLStream(content: string, isFile: bool = false): nlLStream =
pos: Natural 0,
)
# Reports whether the cursor is on the last character of the current
# line, or anywhere past it.
# NOTE: an out-of-range lstream.pos also counts as "at EOL"
# NOTE: for an empty line this is always true (the last index is -1)
proc atEOL*(lstream: nlLStream): bool =
  lstream.pos >= lstream.line.high
# Reports whether the cursor is EXACTLY on the last character of the
# current line (false once the cursor has moved past it).
proc exactlyEOL*(lstream: nlLStream): bool =
  lstream.pos == lstream.line.high
# Reports whether the cursor has moved past the last character of the
# current line, i.e. lstream.pos no longer indexes into lstream.line.
# NOTE: always true for an empty line (the last index is -1)
proc outOfBounds*(lstream: nlLStream): bool =
  lstream.pos > lstream.line.high
# Progress the lex stream to the next line (if available)
proc progLine(lstream: var nlLStream): bool =
proc progLine*(lstream: var nlLStream): bool =
if lstream.stream.readLine(lstream.line):
inc lstream.lineNum
lstream.pos = Natural 0
return true
return false
proc currChar(lstream: nlLStream): char =
result = lstream.line[lstream.pos]
# Unconditionally moves the cursor one character to the right.
# NOTE: performs NO end-of-line check, so the cursor may end up
# NOTE: pointing outside the current line
proc forceProgChar*(lstream: var nlLStream) =
  lstream.pos.inc()
# NOTE: assumes lstream.line does NOT mutate while iterating
iterator iterChars(lstream: var nlLStream): Option[char] =
while lstream.pos < lstream.line.len:
inc lstream.pos
yield some(lstream.line[lstream.pos - 1])
yield none(char)
# Advances the lex stream by one step.
# Inside a line the cursor moves to the next character and true is
# returned. At (or past) the end of the line the next line is loaded
# instead, and the result is whatever progLine() reports.
proc progress*(lstream: var nlLStream): bool =
  if lstream.atEOL():
    # nothing left on this line, try to move on to the next one
    result = lstream.progLine()
  else:
    lstream.forceProgChar()
    result = true
# Returns the character of the current line under the cursor.
# NOTE: does no bounds check of its own, lstream.pos must index
# NOTE: into lstream.line
proc currChar*(lstream: nlLStream): char =
  lstream.line[lstream.pos]

View file

@ -1,43 +1,16 @@
type
# nlTokType allows primitive nlToks to be typed,
# the nlTokType enum should never be directly
# accessed. Use the interface in this file instead.
# NOTE: NONE is used as a default value
# NOTE: it is very different to NTERM!
nlTokType = enum
NONE, # Placeholder Value
EOF, # EOF
TERM, # String \0 terminator
WORD, # Alphanumeric token
SYMB, # Symbolic token
LNFD, # \r \n Line-Feed
WTSP, # ' ' \t Whitespace
LPAR, # ( Left Parenthesis
RPAR, # ) Right Parenthesis
LBRA, # { Left Brace
RBRA, # } Right Brace
LSQB, # [ Left Square Bracket
RSQB, # ] Right Square Bracket
# LANB, # < Left Angle Bracket
# RANB, # > Right Angle Bracket
SQUO, # ' Single Quotation Marking
DQUO, # " Double Quotation Marking
GRVA, # ` Grave Accent
HASH, # # Number Sign (Hashtag)
include toktype
nlTok = object
type
nlTok* = object
tType*: nlTokType
lit*: string
line*: Natural
lineNum*: Natural
startPos*: Natural
endPos*: Natural
# Generates an "empty" nlTok with only a startPos,
# all other fields are expected to be filled out later.
# NOTE: tType initialised to nlTokType.NUL
# NOTE: lit initialised to empty string
# NOTE: all other fields are uninitialised
proc emptyTok(startPos: int): nlTok =
proc emptyTok*(startPos: int): nlTok =
result = nlTok(
tType: nlTokType.NONE,
lit: "",
@ -45,55 +18,23 @@ proc emptyTok(startPos: int): nlTok =
)
# Checks if an nlTok has nlTokType.NONE
proc isTokUntyped(tType: nlTokType): bool =
proc isTokUntyped*(tType: nlTokType): bool =
result = (tType == nlTokType.NONE)
# Checks if an nlTok has nlTokType.TERM
proc isTokTerm(tType: nlTokType): bool =
result = (tType == nlTokType.TERM)
# Reports whether the given token is an end-of-line token,
# i.e. its tType is nlTokType.EOL
proc isTokEOL*(tok: nlTok): bool =
  tok.tType == nlTokType.EOL
# This method is only used to convert null
# terminator nlToks into line-feed ones.
# Returns a copy of an nlTok, changing its type
# NOTE: this is necessary because Nim handles
# NOTE: strings in a useful but annoying way
proc tokTermToLineFeed(tok: nlTok): nlTok =
proc tokTermToLineFeed*(tok: nlTok): nlTok =
result = nlTok(
tType: nlTokType.LNFD,
lit: tok.lit,
line: tok.line,
lineNum: tok.lineNum,
startPos: tok.startPos,
endPos: tok.endPos,
)
# Classifies a character to its nlTokType
proc getTokType(c: char): nlTokType =
case c:
of '\0':
result = nlTokType.TERM
of '\r', '\n':
result = nlTokType.LNFD
of ' ', '\t':
result = nlTokType.WTSP
of '(':
result = nlTokType.LPAR
of ')':
result = nlTokType.RPAR
of '{':
result = nlTokType.LBRA
of '}':
result = nlTokType.RBRA
of '[':
result = nlTokType.LSQB
of ']':
result = nlTokType.RSQB
of '\'':
result = nlTokType.SQUO
of '\"':
result = nlTokType.DQUO
of '`':
result = nlTokType.GRVA
of '#':
result = nlTokType.HASH
else:
result = nlTokType.WORD

View file

@ -0,0 +1,84 @@
include lstream
type
  # Provides a stream-like interface for lexing nlToks.
  # Internally reliant on the functionality of nlLStream.
  nlTokStream = object
    lstream: nlLStream # character source holding the line/column cursor
    build: nlTok # the build token (the token currently being assembled)
# Creates an end-of-line token describing the nlTokStream's current
# position: typed nlTokType.EOL, literal "\0", and with both its
# start and end position set to the lstream's cursor.
proc EOLTok*(tokStream: nlTokStream): nlTok =
  let
    lineNum = Natural tokStream.lstream.lineNum
    cursor = Natural tokStream.lstream.pos
  nlTok(
    tType: nlTokType.EOL,
    lit: "\0",
    lineNum: lineNum,
    startPos: cursor,
    endPos: cursor,
  )
# Replaces the build token with a fresh emptyTok() that starts
# at the lstream's current cursor position.
proc resetBuild(tokStream: var nlTokStream) =
  let startPos = tokStream.lstream.pos
  tokStream.build = emptyTok(startPos)
# Fills in the remaining fields of the build token from the lstream:
# the line number, the end position (the current cursor), and the
# literal, which is the slice of the current line running from the
# token's start position up to (but excluding) the end position.
proc finishBuild(ts: var nlTokStream) =
  let stopPos = Natural ts.lstream.pos
  ts.build.lineNum = Natural ts.lstream.lineNum
  ts.build.endPos = stopPos
  ts.build.lit = ts.lstream.line[ts.build.startPos ..< stopPos]
# Finalises the build token, hands it back to the caller, and leaves
# a fresh empty build token behind in the nlTokStream.
proc flushBuild(tokStream: var nlTokStream): nlTok =
  tokStream.finishBuild()
  # copy the finished token out BEFORE the reset overwrites it
  result = tokStream.build
  tokStream.resetBuild()
# Reports whether the build token is still untyped (see isTokUntyped).
# An untyped build token is expected to inherit the nlTokType
# of the nlLStream's next character.
proc isUntypedBuild(tokStream: nlTokStream): bool =
  tokStream.build.tType.isTokUntyped()
# Reports whether an nlTokType is "compatible" with the build token,
# which currently means it is equal to the build token's own type.
# NOTE: flushBuild() should be called when an incompatible type is found.
proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
  tokStream.build.tType == tType
# Feeds the lstream's current character into the build token.
# buildTok is set to some(token) whenever a token has been completed
# and flushed by this call; in the one path that completes nothing it
# is set to none.
# Returns whether the caller may progress the nlTokStream afterwards.
proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool =
  # the "pos > EOL" invalid state is produced on purpose (see below) to
  # signal that every token of the line was built: emit the EOL token
  if tokStream.lstream.outOfBounds():
    buildTok = some(EOLTok(tokStream))
    return true # can progress once more

  let charType = getTokType(tokStream.lstream.currChar())
  # an untyped build token inherits the character's type immediately
  if tokStream.isUntypedBuild():
    tokStream.build.tType = charType
  let fitsBuild = tokStream.isCompatibleBuild(charType)

  if tokStream.lstream.atEOL():
    if fitsBuild:
      # the last character belongs to the build token: step beyond EOL
      # so that the flush includes it, and so that the next progBuild
      # call detects the out-of-bounds state and returns the EOL token
      tokStream.lstream.forceProgChar()
    # otherwise only the old token is flushed; the last character is
    # picked up by the fresh (untyped) build token on the next call
    buildTok = some(tokStream.flushBuild())
    return false # DO NOT PROGRESS

  if fitsBuild:
    # character extends the build token, nothing has been completed yet
    buildTok = none(nlTok)
  else:
    # flush the old build token, the new one inherits the character's type
    buildTok = some(tokStream.flushBuild())
    tokStream.build.tType = charType
  result = true # can progress

View file

@ -1,104 +1,38 @@
include lstream
type
# Provides a stream-like interface for lexing nlToks
# Internally reliant on the functionality of nlLStream
nlTokStream = object
lstream: nlLStream
build: nlTok # the current token we're building
# Resets the build token to an "empty" nlTok where
# only tType, lit, and startPos are initialised.
proc resetBuild(tokStream: var nlTokStream) =
tokStream.build = emptyTok(tokStream.lstream.pos)
# Completes a token generated by emptyTok()
# based on the nlTokStream's nlLStream's
# current line and character positions
proc finishBuild(tokStream: var nlTokStream) =
# if we've reached \0 terminator then forge the start
# and end positions to point OUTSIDE the line
let endPos = if isTokTerm(tokStream.build.tType):
inc tokStream.build.startPos;
tokStream.build.startPos
else: Natural tokStream.lstream.pos
tokStream.build.line = Natural tokStream.lstream.lineNum
tokStream.build.endPos = endPos
# Returns the nlTokStream's build token and
# empties the build token's contents.
proc flushBuild(tokStream: var nlTokStream): nlTok =
finishBuild(tokStream)
result = tokStream.build
resetBuild(tokStream)
# Returns whether the build token has a set type yet.
# This indicates that the build token should inherit
# the nlTokType of the nlLStream's next character.
proc isUntypedBuild(tokStream: nlTokStream): bool =
result = isTokUntyped(tokStream.build.tType)
# Check whether an nlTokType is "compatible" with
# the build token. flushBuild() should be called
# when an incompatible token is discovered.
proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
result = (tType == tokStream.build.tType)
# Add a character to the nlTokStream's build token.
# Returns a bool indicating if a new nlTok has been built
# or not. flushBuild should then be called.
proc appendBuild(tokStream: var nlTokStream, c: char): Option[nlTok] =
let tType = getTokType(c)
# check whether build token should inherit type
if isUntypedBuild(tokStream):
tokStream.build.tType = tType
# check character and build token compatability
elif not isCompatibleBuild(tokStream, tType):
# return flushed build token, and reset
result = some(flushBuild(tokStream))
# new build token is untyped so inherit type
tokStream.build.tType = tType
# check if \0 terminator reached
elif isTokTerm(tokStream.build.tType):
# return immediately to avoid concatinating '\0'
return some(flushBuild(tokStream))
# else return none to indicate no build was completed
else:
result = none(nlTok)
# ensure character is appended to the build token
tokStream.build.lit.add(c)
# Generates and returns the next token in the stream,
# result.tType == nlTokType.NTERM implies line ended
proc nextTok(tokStream: var nlTokStream): nlTok =
# try progress to next char, receives none option on failure
for optchar in iterChars(tokStream.lstream):
# unpack the Option[char], none => '\0'
let c = if optchar.isSome: optchar.get
else: '\0'
let opttok = appendBuild(tokStream, c)
if opttok.isSome:
return opttok.get
# NOTE: REACHING HERE SHOULD NEVER OCCUR
include tokbuilding
# Initialises a new nlTokStream on a string or file
proc newTokStream*(content: string, isFile: bool = false): nlTokStream =
result = nlTokStream(
lstream: newLStream(content, isFile=isFile),
)
resetBuild(result)
# 1. initialise an empty build token
# 2. progress to the first line
result.resetBuild()
discard result.lstream.progLine()
# Allow the nlTokStream to be iterated
iterator toks*(tokStream: var nlTokStream): nlTok =
var tok: nlTok
while progLine(tokStream.lstream):
while true:
tok = nextTok(tokStream)
# \0 terminator means the line ended OR the file
# has ended, so always yield a line-feed just in case
if isTokTerm(tok.tType):
yield tokTermToLineFeed(tok)
break
yield tok
# we ONLY reach here on EOF
yield tok
# nlTokStream counterpart of nlLStream.progress() that additionally
# maintains the build token.
# Inside a line the cursor moves one character on and true is returned.
# At (or past) EOL the next line is loaded, the build token is reset,
# and the result is whatever progLine() reported.
proc progChar(tokStream: var nlTokStream): bool =
  if tokStream.lstream.atEOL():
    # attempt to progress to the next line past EOL
    result = tokStream.lstream.progLine()
    tokStream.resetBuild()
  else:
    tokStream.lstream.forceProgChar()
    result = true
# Produces the next token of the stream and stores it in tok,
# by alternating progBuild() and progChar() until a token is complete.
# Returns true when tok was set, false once EOF has been reached.
# NOTE: progBuild adds lstream's current char to the build token
# NOTE: progChar progresses to lstream's next char
# NOTE: when progChar fails, false is returned straight away and a
# NOTE: token completed in that same step is NOT stored in tok
proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool =
  var built: Option[nlTok]
  while true:
    built = none(nlTok)
    let mayAdvance = progBuild(tokStream, built)
    # allowed to advance but unable to do so => EOF reached
    if mayAdvance:
      if not progChar(tokStream):
        return false
    if built.isSome:
      tok = built.get()
      return true

View file

@ -0,0 +1,54 @@
type
  # nlTokType allows primitive nlToks to be typed,
  # the nlTokType enum should never be directly
  # accessed. Use the interface in this file instead.
  # NOTE: getTokType below only ever produces a subset of these
  # NOTE: values (never NONE, EOF, SYMB or LNFD)
  nlTokType* = enum
    NONE, # Placeholder Value
    EOF,  # End of File
    EOL,  # End of Line (\0 --> EOL)
    WORD, # Alphanumeric token
    SYMB, # Symbolic token
    LNFD, # \r \n Line-Feed (getTokType maps these chars to EOL)
    WTSP, # ' ' \t Whitespace
    LPAR, # (    Left Parenthesis
    RPAR, # )    Right Parenthesis
    LBRA, # {    Left Brace
    RBRA, # }    Right Brace
    LSQB, # [    Left Square Bracket
    RSQB, # ]    Right Square Bracket
    # LANB, # <    Left Angle Bracket
    # RANB, # >    Right Angle Bracket
    SQUO, # '    Single Quotation Marking
    DQUO, # "    Double Quotation Marking
    GRVA, # `    Grave Accent
    HASH, # #    Number Sign (Hashtag)

# Maps a single character onto the nlTokType it belongs to.
# Every character without an explicit entry is classified as WORD.
proc getTokType*(c: char): nlTokType =
  case c
  of '\0', '\r', '\n': nlTokType.EOL
  of ' ', '\t': nlTokType.WTSP
  of '(': nlTokType.LPAR
  of ')': nlTokType.RPAR
  of '{': nlTokType.LBRA
  of '}': nlTokType.RBRA
  of '[': nlTokType.LSQB
  of ']': nlTokType.RSQB
  of '\'': nlTokType.SQUO
  of '\"': nlTokType.DQUO
  of '`': nlTokType.GRVA
  of '#': nlTokType.HASH
  else: nlTokType.WORD