Compare commits

...

2 commits

Author SHA1 Message Date
90ca138904 Fixed build tokens not unwrapping when both EOL and EOF occur 2025-06-18 02:35:51 +10:00
3ce9390be4 Simple (shit) working lexer via nlTokStream
Currently only tested on strings but its highly extendable if you modify the getTokType mapping of chars to their nlTokType
2025-06-18 01:25:20 +10:00
9 changed files with 255 additions and 192 deletions

2
lang/demo/math.no Normal file
View file

@ -0,0 +1,2 @@
"abc+def"
xy+z

View file

@ -1,4 +1,5 @@
import os
import noether/lexer/tok
import noether/lexer/tokstream
when isMainModule:
@ -7,7 +8,11 @@ when isMainModule:
if paramCount() > 0:
let filename = paramStr(1)
var tokStream = newTokStream(filename, isFile=true)
for tok in toks(tokStream):
# DumpTok
var tok: nlTok
while tokStream.nextTok(tok):
echo tok
else:
echo "usage: nlx filename"

View file

@ -1,7 +1,7 @@
import std/streams
import std/options
include tok
import tok
type
# Character streaming for the nlTokStream
@ -10,15 +10,15 @@ type
# row/column positions
line*: string
lineNum*: Natural
pos: Natural
pos*: Natural
# Opens `filename` for reading and wraps it in a FileStream.
# Raises IOError when the file cannot be opened.
# NOTE: newFileStream() signals failure by returning nil rather than
# NOTE: raising; failing here gives a clear error at the call site
# NOTE: instead of a nil access on the first readLine().
proc streamFile*(filename: string): FileStream =
  result = newFileStream(filename, fmRead)
  if result.isNil:
    raise newException(IOError, "cannot open file for reading: " & filename)
# Wraps an in-memory string in a StringStream so that it can be read
# through the same stream interface as a file.
proc streamString*(str: string): StringStream =
  newStringStream(str)
proc newLStream(content: string, isFile: bool = false): nlLStream =
proc newLStream*(content: string, isFile: bool = false): nlLStream =
result = nlLStream(
stream: if isFile: streamFile(content) else: streamString(content),
line: "",
@ -26,20 +26,40 @@ proc newLStream(content: string, isFile: bool = false): nlLStream =
pos: Natural 0,
)
# Reports whether the cursor is on the last character of the current
# line OR anywhere past it.
# NOTE: an out-of-range lstream.pos therefore also counts as "at EOL"
proc atEOL*(lstream: nlLStream): bool =
  let lastIdx = lstream.line.len - 1
  lstream.pos >= lastIdx
# Reports whether the cursor is precisely on the last character of the
# current line (false once the cursor has moved past it).
proc exactlyEOL*(lstream: nlLStream): bool =
  let lastIdx = lstream.line.len - 1
  lstream.pos == lastIdx
# Reports whether the cursor has moved past the last character of the
# current line, i.e. lstream.pos no longer indexes into lstream.line.
# NOTE: an empty line is out of bounds straight away (pos 0, len 0)
proc outOfBounds*(lstream: nlLStream): bool =
  lstream.pos >= lstream.line.len
# Reads the next line of the underlying stream into lstream.line.
# On success the line counter is bumped and the cursor is rewound to
# column 0; returns whether a line was actually read.
proc progLine*(lstream: var nlLStream): bool =
  result = lstream.stream.readLine(lstream.line)
  if result:
    inc lstream.lineNum
    lstream.pos = Natural 0
proc currChar(lstream: nlLStream): char =
result = lstream.line[lstream.pos]
# Unconditionally moves the cursor one character to the right.
# NOTE: performs NO end-of-line check, so the cursor may end up
# NOTE: pointing past the last character of the line
proc forceProgChar*(lstream: var nlLStream) =
  lstream.pos.inc()
# NOTE: assumes lstream.line does NOT mutate while iterating
iterator iterChars(lstream: var nlLStream): Option[char] =
while lstream.pos < lstream.line.len:
inc lstream.pos
yield some(lstream.line[lstream.pos - 1])
yield none(char)
# Moves the cursor to the next character, rolling over to the next
# line once the end of the current one has been reached.
# Returns false only when no further line could be read.
proc progress*(lstream: var nlLStream): bool =
  if lstream.atEOL():
    # end of this line: the outcome depends on whether another exists
    return lstream.progLine()
  lstream.forceProgChar()
  result = true
# Returns the character under the cursor.
# NOTE: indexes lstream.line directly, so the cursor must be in bounds
proc currChar*(lstream: nlLStream): char =
  lstream.line[lstream.pos]

View file

@ -1,43 +1,16 @@
type
# nlTokType allows primitive nlToks to be typed,
# the nlTokType enum should never be directly
# accessed. Use the interface in this file instead.
# NOTE: NONE is used as a default value
# NOTE: it is very different to NTERM!
nlTokType = enum
NONE, # Placeholder Value
EOF, # EOF
TERM, # String \0 terminator
WORD, # Alphanumeric token
SYMB, # Symbolic token
LNFD, # \r \n Line-Feed
WTSP, # ' ' \t Whitespace
LPAR, # ( Left Parenthesis
RPAR, # ) Right Parenthesis
LBRA, # { Left Brace
RBRA, # } Right Brace
LSQB, # [ Left Square Bracket
RSQB, # ] Right Square Bracket
# LANB, # < Left Angle Bracket
# RANB, # > Right Angle Bracket
SQUO, # ' Single Quotation Marking
DQUO, # " Double Quotation Marking
GRVA, # ` Grave Accent
HASH, # # Number Sign (Hashtag)
nlTok = object
include toktype
type
  # A single lexed token: its classification, literal text and the
  # position it was lexed from.
  nlTok* = object
    tType*: nlTokType # classification of the token
    lit*: string # literal text of the token
    # NOTE(review): both `line` and `lineNum` show up in this listing;
    # NOTE(review): the token-building procs only fill in `lineNum`,
    # NOTE(review): confirm whether `line` is still a live field
    line*: Natural
    lineNum*: Natural # line counter of the lstream when the token was built
    startPos*: Natural # column of the token's first character
    endPos*: Natural # column just past the token's last character
# Generates an "empty" nlTok with only a startPos,
# all other fields are expected to be filled out later.
# NOTE: tType initialised to nlTokType.NONE
# NOTE: lit initialised to empty string
# NOTE: all other fields are left at their zero defaults
proc emptyTok(startPos: int): nlTok =
proc emptyTok*(startPos: int): nlTok =
result = nlTok(
tType: nlTokType.NONE,
lit: "",
@ -45,55 +18,23 @@ proc emptyTok(startPos: int): nlTok =
)
# Reports whether a token type is still the nlTokType.NONE placeholder,
# i.e. no real type has been assigned yet.
proc isTokUntyped*(tType: nlTokType): bool =
  tType == nlTokType.NONE
# Reports whether a token is an end-of-line token (nlTokType.EOL).
proc isTokEOL*(tok: nlTok): bool =
  tok.tType == nlTokType.EOL
# Checks if an nlTok has nlTokType.TERM
proc isTokTerm(tType: nlTokType): bool =
result = (tType == nlTokType.TERM)
# Builds a copy of `tok` whose type is forced to nlTokType.LNFD.
# The literal, line number and start/end positions are carried over
# unchanged; the token passed in is not modified.
proc tokTermToLineFeed*(tok: nlTok): nlTok =
  nlTok(
    endPos: tok.endPos,
    startPos: tok.startPos,
    lineNum: tok.lineNum,
    lit: tok.lit,
    tType: nlTokType.LNFD,
  )
# Classifies a character to its nlTokType
proc getTokType(c: char): nlTokType =
case c:
of '\0':
result = nlTokType.TERM
of '\r', '\n':
result = nlTokType.LNFD
of ' ', '\t':
result = nlTokType.WTSP
of '(':
result = nlTokType.LPAR
of ')':
result = nlTokType.RPAR
of '{':
result = nlTokType.LBRA
of '}':
result = nlTokType.RBRA
of '[':
result = nlTokType.LSQB
of ']':
result = nlTokType.RSQB
of '\'':
result = nlTokType.SQUO
of '\"':
result = nlTokType.DQUO
of '`':
result = nlTokType.GRVA
of '#':
result = nlTokType.HASH
else:
result = nlTokType.WORD

View file

@ -0,0 +1,85 @@
include lstream
type
  # Provides a stream-like interface for lexing nlToks
  # Internally reliant on the functionality of nlLStream
  nlTokStream = object
    lstream: nlLStream # line/character cursor the tokens are read from
    build: nlTok # the build token (the token currently being assembled)
    closed: bool # EOF + all tokens built; nextTok() yields nothing more
# Produces an end-of-line token stamped with the stream's current line
# number and cursor column (used as both start and end position).
# NOTE: the literal is always "\0"
proc EOLTok*(tokStream: nlTokStream): nlTok =
  let
    row = Natural tokStream.lstream.lineNum
    col = Natural tokStream.lstream.pos
  nlTok(
    tType: nlTokType.EOL,
    lit: "\0",
    lineNum: row,
    startPos: col,
    endPos: col,
  )
# Discards the build token and starts a fresh, empty one whose start
# position is the lstream's current cursor column.
proc resetBuild(tokStream: var nlTokStream) =
  let startCol = tokStream.lstream.pos
  tokStream.build = emptyTok(startCol)
# Fills in the remaining fields of the build token from the lstream:
# the line number, the end position (current cursor column) and the
# literal, sliced out of the current line as startPos ..< endPos.
proc finishBuild(ts: var nlTokStream) =
  let
    first = ts.build.startPos
    last = Natural ts.lstream.pos
  ts.build.lineNum = Natural ts.lstream.lineNum
  ts.build.endPos = last
  ts.build.lit = ts.lstream.line[first ..< last]
# Finalises the build token, hands it back to the caller and leaves
# a fresh empty build token in its place.
proc flushBuild(tokStream: var nlTokStream): nlTok =
  tokStream.finishBuild()
  result = tokStream.build
  tokStream.resetBuild()
# Reports whether the build token is still untyped (nlTokType.NONE).
# An untyped build token is expected to take on the nlTokType of the
# next character read from the nlLStream.
proc isUntypedBuild(tokStream: nlTokStream): bool =
  tokStream.build.tType.isTokUntyped()
# Reports whether `tType` may extend the build token, which is the
# case exactly when it equals the build token's own type.
# NOTE: callers should flushBuild() upon meeting an incompatible type.
proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
  tokStream.build.tType == tType
# Add the lstream's current character to the nlTokStream's build token.
# buildTok is set to some(token) when a token was completed by this
# call (the flushed build token, or an EOL token) and to none otherwise.
# Returns a boolean indicating whether the nlTokStream can progress to
# the next character; false means the caller must call progBuild again
# WITHOUT progressing.
# NOTE(review): block nesting below is reconstructed from the control
# NOTE(review): flow because the listing lost its indentation - confirm
proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool =
  # the "pos > EOL" invalid state is used intentionally
  # to indicate all tokens have been built, and return EOL Token
  if tokStream.lstream.outOfBounds():
    buildTok = some(EOLTok(tokStream))
    return true # can progress once more
  let tType = getTokType(tokStream.lstream.currChar())
  # an untyped build token must inherit the character's type immediately
  if isUntypedBuild(tokStream):
    tokStream.build.tType = tType
  # check if EOL reached
  if tokStream.lstream.atEOL():
    # flush old build token, the new one can be left untyped
    let compatible = isCompatibleBuild(tokStream, tType)
    result = false # DO NOT PROGRESS
    if compatible:
      # force the lstream into an invalid state by progressing beyond EOL
      # we can then detect this state on the next progBuild and return
      # an EOL character (very unsafe implementation but it works well)
      tokStream.lstream.forceProgChar()
    # flushed in both cases: when incompatible, the last character is
    # left for the next call to pick up with a fresh build token
    buildTok = some(flushBuild(tokStream))
  # check character and build token compatibility
  elif not isCompatibleBuild(tokStream, tType):
    # flush old build token, the new one inherits type
    buildTok = some(flushBuild(tokStream))
    tokStream.build.tType = tType
    result = true # can progress
  else:
    # character extends the build token, nothing completed yet
    buildTok = none(nlTok)
    result = true # can progress

View file

@ -1,104 +1,47 @@
include lstream
type
# Provides a stream-like interface for lexing nlToks
# Internally reliant on the functionality of nlLStream
nlTokStream = object
lstream: nlLStream
build: nlTok # the current token we're building
# Resets the build token to an "empty" nlTok where
# only tType, lit, and startPos are initialised.
proc resetBuild(tokStream: var nlTokStream) =
tokStream.build = emptyTok(tokStream.lstream.pos)
# Completes a token generated by emptyTok()
# based on the nlTokStream's nlLStream's
# current line and character positions
proc finishBuild(tokStream: var nlTokStream) =
# if we've reached \0 terminator then forge the start
# and end positions to point OUTSIDE the line
let endPos = if isTokTerm(tokStream.build.tType):
inc tokStream.build.startPos;
tokStream.build.startPos
else: Natural tokStream.lstream.pos
tokStream.build.line = Natural tokStream.lstream.lineNum
tokStream.build.endPos = endPos
# Returns the nlTokStream's build token and
# empties the build token's contents.
proc flushBuild(tokStream: var nlTokStream): nlTok =
finishBuild(tokStream)
result = tokStream.build
resetBuild(tokStream)
# Returns whether the build token has a set type yet.
# This indicates that the build token should inherit
# the nlTokType of the nlLStream's next character.
proc isUntypedBuild(tokStream: nlTokStream): bool =
result = isTokUntyped(tokStream.build.tType)
# Check whether an nlTokType is "compatible" with
# the build token. flushBuild() should be called
# when an incompatible token is discovered.
proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
result = (tType == tokStream.build.tType)
# Add a character to the nlTokStream's build token.
# Returns a bool indicating if a new nlTok has been built
# or not. flushBuild should then be called.
proc appendBuild(tokStream: var nlTokStream, c: char): Option[nlTok] =
let tType = getTokType(c)
# check whether build token should inherit type
if isUntypedBuild(tokStream):
tokStream.build.tType = tType
# check character and build token compatability
elif not isCompatibleBuild(tokStream, tType):
# return flushed build token, and reset
result = some(flushBuild(tokStream))
# new build token is untyped so inherit type
tokStream.build.tType = tType
# check if \0 terminator reached
elif isTokTerm(tokStream.build.tType):
# return immediately to avoid concatinating '\0'
return some(flushBuild(tokStream))
# else return none to indicate no build was completed
else:
result = none(nlTok)
# ensure character is appended to the build token
tokStream.build.lit.add(c)
# Generates and returns the next token in the stream,
# result.tType == nlTokType.NTERM implies line ended
proc nextTok(tokStream: var nlTokStream): nlTok =
# try progress to next char, receives none option on failure
for optchar in iterChars(tokStream.lstream):
# unpack the Option[char], none => '\0'
let c = if optchar.isSome: optchar.get
else: '\0'
let opttok = appendBuild(tokStream, c)
if opttok.isSome:
return opttok.get
# NOTE: REACHING HERE SHOULD NEVER OCCUR
include tokbuilding
# Creates an nlTokStream over `content`, which is either the text to
# lex or (when isFile is true) the name of a file to lex.
# The stream starts open, with an empty build token and the cursor on
# the first line.
proc newTokStream*(content: string, isFile: bool = false): nlTokStream =
  result = nlTokStream(
    closed: false,
    lstream: newLStream(content, isFile=isFile),
  )
  # start from an empty build token
  resetBuild(result)
  # load the first line; whether one exists is deliberately ignored
  discard progLine(result.lstream)
# Allow the nlTokStream to be iterated
iterator toks*(tokStream: var nlTokStream): nlTok =
var tok: nlTok
while progLine(tokStream.lstream):
while true:
tok = nextTok(tokStream)
# \0 terminator means the line ended OR the file
# has ended, so always yield a line-feed just in case
if isTokTerm(tok.tType):
yield tokTermToLineFeed(tok)
break
yield tok
# we ONLY reach here on EOF
yield tok
# Token-stream counterpart of nlLStream.progress(): steps to the next
# character, or to the next line once the current one is exhausted.
# Moving to a new line also resets the build token.
# Returns false only when there was no further line to move to.
proc progChar(tokStream: var nlTokStream): bool =
  if tokStream.lstream.atEOL():
    # line exhausted: try the next one and restart the build token
    result = tokStream.lstream.progLine()
    tokStream.resetBuild()
  else:
    tokStream.lstream.forceProgChar()
    result = true
# Produces the next token of the stream and stores it in `tok`,
# by alternating progBuild() and progChar() until a token completes.
# Returns true when `tok` was set to a freshly built token and false
# when the stream has nothing more to give (`tok` is then untouched).
# NOTE: progBuild adds lstream's current char to the build token
# NOTE: progChar progresses to lstream's next char
proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool =
  # a closed stream never yields anything
  if tokStream.closed:
    return false
  while true:
    var built: Option[nlTok]
    let mayAdvance = tokStream.progBuild(built)
    if built.isSome:
      tok = built.get()
    # allowed to advance but nothing left to advance to => close the
    # stream, still reporting a token completed in this very step
    if mayAdvance and not tokStream.progChar():
      tokStream.closed = true
      return built.isSome
    if built.isSome:
      return true

View file

@ -0,0 +1,54 @@
type
  # nlTokType allows primitive nlToks to be typed,
  # the nlTokType enum should never be directly
  # accessed. Use the interface in this file instead.
  # NOTE: NONE comes first, making it the zero default of the enum
  nlTokType* = enum
    NONE, # Placeholder Value
    EOF, # End of File
    EOL, # End of Line (\0 --> EOL)
    WORD, # Alphanumeric token
    SYMB, # Symbolic token
    LNFD, # \r \n Line-Feed
    WTSP, # ' ' \t Whitespace
    LPAR, # ( Left Parenthesis
    RPAR, # ) Right Parenthesis
    LBRA, # { Left Brace
    RBRA, # } Right Brace
    LSQB, # [ Left Square Bracket
    RSQB, # ] Right Square Bracket
    # LANB, # < Left Angle Bracket
    # RANB, # > Right Angle Bracket
    SQUO, # ' Single Quotation Marking
    DQUO, # " Double Quotation Marking
    GRVA, # ` Grave Accent
    HASH, # # Number Sign (Hashtag)
# Maps a single character onto the nlTokType it belongs to.
# NOTE: '\0', '\r' and '\n' are all treated as end-of-line
# NOTE: every character without a rule of its own is a WORD
proc getTokType*(c: char): nlTokType =
  case c
  of '\0', '\r', '\n': nlTokType.EOL
  of ' ', '\t': nlTokType.WTSP
  of '(': nlTokType.LPAR
  of ')': nlTokType.RPAR
  of '{': nlTokType.LBRA
  of '}': nlTokType.RBRA
  of '[': nlTokType.LSQB
  of ']': nlTokType.RSQB
  of '\'': nlTokType.SQUO
  of '\"': nlTokType.DQUO
  of '`': nlTokType.GRVA
  of '#': nlTokType.HASH
  else: nlTokType.WORD

View file

@ -1,7 +0,0 @@
# Attempt to form an nlAST from a nlTokStream
proc arborise(tokStream: nlTokStream): nlNode =
for tok in toks(tokStream):
case tok.tokType:
of nlTokType.DQUO:
# Attempt to parse string literal
parse_strl()

View file

@ -0,0 +1,20 @@
import ../lexer/tokstream
# Greed will consume anything except a punishment
proc greed(tokStream: nlTokStream, toks: var seq[nlTok], punish: str) =
proc parse_strl(tokStream: nlTokStream): nlNode =
# Attempt to form an nlAST from a nlTokStream.
# Pulls tokens one at a time via nextTok() until it reports that no
# further token was produced, dispatching on each token's tType.
# NOTE: tokStream is taken as `var` because nextTok() mutates the stream
# NOTE: the nlNode result is never assigned here, so its default value
# NOTE: is what gets returned for now
proc parse(tokStream: var nlTokStream): nlNode =
  var tok: nlTok
  # fetch BEFORE inspecting, so an unfetched token is never dispatched on
  while tokStream.nextTok(tok):
    case tok.tType
    of nlTokType.DQUO:
      # Attempt to parse string literal
      # TODO: attach the resulting node to the AST once nodes exist
      discard parse_strl(tokStream)
    else:
      # token types without a parse rule yet are skipped
      discard