1 gazillion changes (mostly documenting my insanity optimizing + naming)

This commit is contained in:
Emile Clark-Boman 2025-06-19 02:09:43 +10:00
parent ebef458186
commit f8697bd662
9 changed files with 206 additions and 183 deletions

View file

@ -3,6 +3,7 @@ import noether/lexer/tok
import noether/lexer/tokstream import noether/lexer/tokstream
import noether/parser/parser import noether/parser/parser
{.hint: "Don't forget to drink more water (^_^)".}
when isMainModule: when isMainModule:
echo "Noether Lang Extras v0.1.0 - nlx" echo "Noether Lang Extras v0.1.0 - nlx"
@ -11,9 +12,8 @@ when isMainModule:
var tokStream = newTokStream(filename, isFile=true) var tokStream = newTokStream(filename, isFile=true)
# # DumpTok # # DumpTok
# var tok: nlTok # while tokStream.progress():
# while tokStream.nextTok(tok): # echo tokStream.currTok
# echo tok
# DumpTree # DumpTree
discard parse(tokStream) discard parse(tokStream)

View file

@ -41,7 +41,7 @@ proc outOfBounds*(lstream: nlLStream): bool =
result = (lstream.pos > lstream.line.len - 1) result = (lstream.pos > lstream.line.len - 1)
# Progress the lex stream to the next line (if available) # Progress the lex stream to the next line (if available)
proc progLine*(lstream: var nlLStream): bool = proc progressLine*(lstream: var nlLStream): bool =
if lstream.stream.readLine(lstream.line): if lstream.stream.readLine(lstream.line):
inc lstream.lineNum inc lstream.lineNum
lstream.pos = Natural 0 lstream.pos = Natural 0
@ -50,17 +50,17 @@ proc progLine*(lstream: var nlLStream): bool =
# Progress the lex stream to the next character in the line # Progress the lex stream to the next character in the line
# forcefully (aka does NOT check if we reached EOL) # forcefully (aka does NOT check if we reached EOL)
proc forceProgChar*(lstream: var nlLStream) = proc forceProgressChar*(lstream: var nlLStream) =
inc lstream.pos inc lstream.pos
# Progress the lex stream to the next character (if available) # # Progress the lex stream to the next character (if available)
proc progress*(lstream: var nlLStream): bool = # proc progressChar*(lstream: var nlLStream): bool =
if not lstream.atEOL(): # if not lstream.atEOL():
lstream.forceProgChar() # lstream.forceProgressChar()
result = true # result = true
else: # else:
# attempt to progress next line past EOL # # attempt to progress next line past EOL
result = lstream.progLine() # result = lstream.progressLine()
proc currChar*(lstream: nlLStream): char = proc currChar*(lstream: nlLStream): char =
result = lstream.line[lstream.pos] result = lstream.line[lstream.pos]

View file

@ -2,7 +2,7 @@ include toktype
type type
nlTok* = object nlTok* = object
tType*: nlTokType tKind*: nlTokKind
lit*: string lit*: string
lineNum*: Natural lineNum*: Natural
startPos*: Natural startPos*: Natural
@ -12,29 +12,11 @@ type
# all other fields are expected to be filled out later. # all other fields are expected to be filled out later.
proc emptyTok*(startPos: int): nlTok = proc emptyTok*(startPos: int): nlTok =
result = nlTok( result = nlTok(
tType: nlTokType.NONE, tKind: tkNONE,
lit: "", lit: "",
startPos: Natural startPos, startPos: Natural startPos,
) )
# Checks if an nlTok has nlTokType.NONE # Checks if an nlTok has tkNONE
proc isTokUntyped*(tType: nlTokType): bool = proc isUntyped*(tKind: nlTokKind): bool =
result = (tType == nlTokType.NONE) result = (tKind == tkNONE)
# Checks if an nlTok has nlTokType.EOL
proc isTokEOL*(tok: nlTok): bool =
result = (tok.tType == nlTokType.EOL)
# This method is only used to convert null
# terminator nlToks into line-feed ones.
# Returns a copy of an nlTok, changing its type
proc tokTermToLineFeed*(tok: nlTok): nlTok =
result = nlTok(
tType: nlTokType.LNFD,
lit: tok.lit,
lineNum: tok.lineNum,
startPos: tok.startPos,
endPos: tok.endPos,
)

View file

@ -12,7 +12,7 @@ type
# Generates an EOL token for the nlTokStream's state # Generates an EOL token for the nlTokStream's state
proc EOLTok(tokStream: nlTokStream): nlTok = proc EOLTok(tokStream: nlTokStream): nlTok =
result = nlTok( result = nlTok(
tType: nlTokType.EOL, tKind: tkEOL,
lit: "\0", lit: "\0",
lineNum: Natural tokStream.lstream.lineNum, lineNum: Natural tokStream.lstream.lineNum,
startPos: Natural tokStream.lstream.pos, startPos: Natural tokStream.lstream.pos,
@ -40,46 +40,46 @@ proc flushBuild(tokStream: var nlTokStream): nlTok =
# Returns whether the build token has a set type yet. # Returns whether the build token has a set type yet.
# This indicates that the build token should inherit # This indicates that the build token should inherit
# the nlTokType of the nlLStream's next character. # the nlTokKind of the nlLStream's next character.
proc isUntypedBuild(tokStream: nlTokStream): bool = proc isUntypedBuild(tokStream: nlTokStream): bool =
result = isTokUntyped(tokStream.build.tType) result = tokStream.build.tKind.isUntyped()
# Check whether an nlTokType is "compatible" with the build token. # Check whether an nlTokKind is "compatible" with the build token.
# NOTE: flushBuild() should be called when an incompatible token is discovered. # NOTE: flushBuild() should be called when an incompatible token is discovered.
proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool = proc isCompatibleBuild(tokStream: nlTokStream, tKind: nlTokKind): bool =
result = (tType == tokStream.build.tType) result = (tKind == tokStream.build.tKind)
# Add a character to the nlTokStream's build token. # Add a character to the nlTokStream's build token.
# Flushes and returns the build token if "fully built", # Flushes and returns the build token if "fully built",
# and a boolean indicating whether the nlTokStream can progress. # and a boolean indicating whether the nlTokStream can progress.
proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = proc progressBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool =
# the "pos > EOL" invalid state is used intentionally # the "pos > EOL" invalid state is used intentionally
# to indicate all tokens have been built, and return EOL Token # to indicate all tokens have been built, and return EOL Token
if tokStream.lstream.outOfBounds(): if tokStream.lstream.outOfBounds():
buildTok = some(EOLTok(tokStream)) buildTok = some(EOLTok(tokStream))
return true # can progress once more return true # can progress once more
let tType = getTokType(tokStream.lstream.currChar()) let tKind = getTokType(tokStream.lstream.currChar())
# untyped build tokens must inherited immediately # untyped build tokens must inherited immediately
if isUntypedBuild(tokStream): if isUntypedBuild(tokStream):
tokStream.build.tType = tType tokStream.build.tKind = tKind
# check if EOL reached # check if EOL reached
if tokStream.lstream.atEOL(): if tokStream.lstream.atEOL():
# flush old build token, the new one can be left untyped # flush old build token, the new one can be left untyped
let compatible = isCompatibleBuild(tokStream, tType) let compatible = isCompatibleBuild(tokStream, tKind)
result = false # DO NOT PROGRESS result = false # DO NOT PROGRESS
if compatible: if compatible:
# force the lstream into an invalid state by progressing beyond EOL # force the lstream into an invalid state by progressing beyond EOL
# we can then detect this state on the next progBuild and return # we can then detect this state on the next progressBuild and return
# an EOL character (very unsafe implementation but it works well) # an EOL character (very unsafe implementation but it works well)
tokStream.lstream.forceProgChar() tokStream.lstream.forceProgressChar()
buildTok = some(flushBuild(tokStream)) buildTok = some(flushBuild(tokStream))
# check character and build token compatability # check character and build token compatability
elif not isCompatibleBuild(tokStream, tType): elif not isCompatibleBuild(tokStream, tKind):
# flush old build token, the new one inherits type # flush old build token, the new one inherits type
buildTok = some(flushBuild(tokStream)) buildTok = some(flushBuild(tokStream))
tokStream.build.tType = tType tokStream.build.tKind = tKind
result = true # can progress result = true # can progress
else: else:
buildTok = none(nlTok) buildTok = none(nlTok)

View file

@ -9,44 +9,43 @@ proc newTokStream*(content: string, isFile: bool = false): nlTokStream =
# 1. initialise an empty build token # 1. initialise an empty build token
# 2. progress to the first line # 2. progress to the first line
result.resetBuild() result.resetBuild()
discard result.lstream.progLine() discard result.lstream.progressLine()
# Defines a short-hand notation for getting the current line # Defines a short-hand notation for getting the current line
proc currLine*(tokStream: nlTokStream): string = proc line*(tokStream: nlTokStream): string =
result = tokStream.lstream.line result = tokStream.lstream.line
# Reimplements nlLStream.progress() for nlTokStream # Reimplements nlLStream.progressChar for nlTokStream
# to account for additional structure (ie the build token) # to account for additional structure (ie the build token)
proc progChar(tokStream: var nlTokStream): bool = # NOTE: progressChar progresses to lstream's next char
proc progressChar(tokStream: var nlTokStream): bool =
if not tokStream.lstream.atEOL(): if not tokStream.lstream.atEOL():
tokStream.lstream.forceProgChar() tokStream.lstream.forceProgressChar()
result = true result = true
else: else:
# attempt to progress to next line past EOL # attempt to progress to next line past EOL
result = tokStream.lstream.progLine() result = tokStream.lstream.progressLine()
tokStream.resetBuild() tokStream.resetBuild()
# Generates and sets (by reference) the next token in the stream, # Generates and progress the next token in the nlTokStream.
# via repeatedly calling progBuild() and progChar(). # via repeatedly calling progressBuild() and progressChar().
# Returns a boolean indicating whether EOF has been reached. # Returns a boolean indicating whether EOF has been reached.
# NOTE: progBuild adds lstream's current char to the build token # NOTE: access the new token via `tokStream.tok`
# NOTE: progChar progresses to lstream's next char proc progress*(tokStream: var nlTokStream): bool =
proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool =
# Return prematurely if already closed # Return prematurely if already closed
if tokStream.closed: if tokStream.closed:
return false return false
while true: while true:
var flushedTok: Option[nlTok] var flushedTok: Option[nlTok]
let let
canProgress = tokStream.progBuild(flushedTok) canProgress = tokStream.progressBuild(flushedTok)
buildComplete = flushedTok.isSome buildComplete = flushedTok.isSome
# canProgress & EOF reached => no more tokens to build :) # canProgress & EOF reached => no more tokens to build :)
# NOTE: reachedEOF and not canProgress => more tokens unwrapping # NOTE: reachedEOF and not canProgress => more tokens unwrapping
if buildComplete: if buildComplete:
# return the finished build token, and save it as the current token # return the finished build token, and save it as the current token
tok = flushedTok.get() tokStream.currTok = flushedTok.get()
tokStream.currTok = tok if canProgress and not tokStream.progressChar():
if canProgress and not tokStream.progChar():
tokStream.closed = true tokStream.closed = true
return buildComplete return buildComplete
elif buildComplete: elif buildComplete:

View file

@ -1,54 +1,59 @@
type type
# nlTokType allows primitive nlToks to be typed, # nlTokKind allows primitive nlToks to be typed,
# the nlTokType enum should never be directly # the nlTokKind enum should never be directly
# accessed. Use the interface in this file instead. # accessed. Use the interface in this file instead.
nlTokType* = enum nlTokKind* = enum
NONE, # Placeholder Value tkNONE, # Placeholder Value
EOF, # End of File
EOL, # End of Line (\0 --> EOL)
WORD, # Alphanumeric token
SYMB, # Symbolic token
LNFD, # \r \n Line-Feed
WTSP, # ' ' \t Whitespace
LPAR, # ( Left Parenthesis
RPAR, # ) Right Parenthesis
LBRA, # { Left Brace
RBRA, # } Right Brace
LSQB, # [ Left Square Bracket
RSQB, # ] Right Square Bracket
# LANB, # < Left Angle Bracket
# RANB, # > Right Angle Bracket
SQUO, # ' Single Quotation Marking
DQUO, # " Double Quotation Marking
GRVA, # ` Grave Accent
HASH, # # Number Sign (Hashtag)
# Classifies a character to its nlTokType tkEOF, # End of File
proc getTokType*(c: char): nlTokType = tkEOL, # End of Line (\0 --> EOL)
tkWORD, # Alphanumeric token
tkSYMB, # Symbolic token
tkLNFD, # \r \n Line-Feed
tkWTSP, # ' ' \t Whitespace
# RESERVED SYMBOLS
tkLPAR, # ( Left Parenthesis
tkRPAR, # ) Right Parenthesis
tkLBRA, # { Left Brace
tkRBRA, # } Right Brace
tkLSQB, # [ Left Square Bracket
tkRSQB, # ] Right Square Bracket
# tkLANB, # < Left Angle Bracket
# tkRANB, # > Right Angle Bracket
tkSQUO, # ' Single Quotation Marking
tkDQUO, # " Double Quotation Marking
tkGRVA, # ` Grave Accent
tkHASH, # # Number Sign (Hashtag)
# Classifies a character to its nlTokKind
proc getTokType*(c: char): nlTokKind =
case c: case c:
of '\0', '\r', '\n': of '\0', '\r', '\n':
result = nlTokType.EOL result = tkEOL
of ' ', '\t': of ' ', '\t':
result = nlTokType.WTSP result = tkWTSP
of '(': of '(':
result = nlTokType.LPAR result = tkLPAR
of ')': of ')':
result = nlTokType.RPAR result = tkRPAR
of '{': of '{':
result = nlTokType.LBRA result = tkLBRA
of '}': of '}':
result = nlTokType.RBRA result = tkRBRA
of '[': of '[':
result = nlTokType.LSQB result = tkLSQB
of ']': of ']':
result = nlTokType.RSQB result = tkRSQB
of '\'': of '\'':
result = nlTokType.SQUO result = tkSQUO
of '\"': of '\"':
result = nlTokType.DQUO result = tkDQUO
of '`': of '`':
result = nlTokType.GRVA result = tkGRVA
of '#': of '#':
result = nlTokType.HASH result = tkHASH
else: else:
result = nlTokType.WORD result = tkWORD

View file

@ -1,19 +1,48 @@
import std/options
from ../lexer/tok import nlTok from ../lexer/tok import nlTok
# from ../lexer/tokstream import
type type
# NOTE: by the end of parsing NO nodes should # NOTE: by the end of parsing NO nodes should have nkNone
# NOTE: have nlNodeType.NONE nlNodeKind* = enum
nlNodeType* = enum nkNone, # Placeholder Value
NONE, # Placeholder Value
TERM, # Indicates the tree has terminated nkStrLit, # String Literal
STRL, # String Literal nkChrLit, # Character Literal
CHRL, # Character Literal
# NOTE: always check parent != nil when traversing the tree
nlNode* {.acyclic.} = ref object of RootObj nlNode* {.acyclic.} = ref object of RootObj
nType*: nlNodeType nKind*: nlNodeKind
toks*: seq[nlTok] # nodes store the tokens that build them toks*: seq[nlTok] # nodes (may) store the tokens that build them
# left, right: nlNode parent*: nlNode
# Purely abstract type that all nlNode objects
# with children are expected to inherit from.
nlBranchNode* {.acyclic.} = ref object of nlNode
child: UncheckedArray[nlNode]
nlBiNode* {.acyclic.} = ref object of nlBranchNode
proc childCount*(node: nlNode): int {.inline.} = 0
proc childCount*(node: nlBiNode): int {.inline.} = 2
proc getChild*(node: nlNode, i: int): Option[nlNode] {.inline.} =
result = none(nlNode)
proc getChild*(node: nlBranchNode, i: int): Option[nlNode] {.inline.} =
result = some(node.child[i])
proc newNode*(nKind: nlNodeKind): nlNode =
result = nlNode(
nKind: nKind,
)
proc newBiNode*(nKind: nlNodeKind): nlNode =
result = nlBiNode(
nKind: nKind,
)
# Short-hand way of appending a token to a node's token sequence # Short-hand way of appending a token to a node's token sequence
proc addTok*(node: nlNode, tok: nlTok) = proc addTok*(node: nlNode, tok: nlTok) =
echo "AM I HERE?"
echo node[]
echo node.toks
node.toks.add(tok) node.toks.add(tok)

View file

@ -3,61 +3,55 @@ include parseutil
# NOTE: Matching between two tokens will fill `node` with everything # NOTE: Matching between two tokens will fill `node` with everything
# NOTE: between those two tokens EXCLUDING the two tokens themselves. # NOTE: between those two tokens EXCLUDING the two tokens themselves.
proc parseMatch(tokStream: var nlTokStream, proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat =
node: var nlNode,
matchType: nlTokType): nlParseStat =
result = greed( result = greed(
tokStream, parser,
node.toks,
satisfyMatch(matchType), satisfyMatch(matchType),
) )
proc parseMatchLine(tokStream: var nlTokStream, proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat =
node: var nlNode, result = greedLine(
matchType: nlTokType): nlParseStat = parser,
result = greed( satisfyMatch(matchType),
tokStream,
node.toks,
satisfyMatchEOL(matchType),
) )
proc parseStrL(tokStream: var nlTokStream, node: var nlNode): nlParseStat = proc parseStrLit(parser: var nlParser): nlParseStat =
node = nlNode( result = parser.parseMatch(tkDQUO)
nType: nlNodeType.STRL
)
node.addTok(tokStream.currTok)
result = nlParseStat.UNCLOSED * not greedEOL(tokStream, node.toks, nlTokType.DQUO)
proc parseChrL(tokStream: var nlTokStream, node: var nlNode): bool = proc parseChrLit(parser: var nlParser): nlParseStat =
node = nlNode( result = parser.parseMatch(tkSQUO)
nType: nlNodeType.CHRL
)
node.addTok(tokStream.currTok)
# TWO ERRORS ARE POSSIBLE, 1: content too big, 2: never closed
result = greedEOL(tokStream, node.toks, nlTokType.SQUO)
# Attempt to form an nlAST from a nlTokStream proc parseStmt(parser: var nlParser): nlParseStat =
proc parse*(tokStream: var nlTokStream): nlNode = # initialise build node as none just for the hell of it
var tok: nlTok
var node: nlNode while parser.stream.progress():
while tokStream.nextTok(tok): echo parser.stream.currTok
case tok.tType: case parser.stream.currTok.tKind
of nlTokType.DQUO: of tkDQUO:
# Attempt to parse string literal # Attempt to parse string literal
if not parseStrL(tokStream, node): if parser.parseStrLit() != nlParseStat.OK:
echo "Unmatched Double Quotation! Malformed String Literal" echo "Unmatched Double Quotation! Malformed String Literal"
echo tokStream.currLine() echo parser.stream.line
echo repeat(" ", tok.startPos), '^' echo repeat(" ", parser.stream.currTok.startPos), '^'
else: else:
echo "Parsed String Literal" echo "Parsed String Literal"
echo node[] echo parser.bnode[]
of nlTokType.SQUO: of tkSQUO:
# Attempt to parse string literal # Attempt to parse string literal
if not parseChrL(tokStream, node): if parser.parseChrLit() != nlParseStat.OK:
echo "Unmatched Single Quotation! Malformed Character Literal" echo "Unmatched Single Quotation! Malformed Character Literal"
echo tokStream.currLine() echo parser.stream.line
echo repeat(" ", tok.startPos), '^' echo repeat(" ", parser.stream.currTok.startPos), '^'
else: else:
echo "Parsed String Literal" echo "Parsed Character Literal"
echo node[] echo parser.bnode[]
else: else:
echo "blah blah unhandled case" echo "blah blah unhandled case"
result = nlParseStat.OK
# Attempt to parse nlAST from nlTokStream
proc parse*(tokStream: var nlTokStream): nlAST =
var parser = newParser(tokStream)
echo ' '
discard parser.parseStmt()
result = parser.ast

View file

@ -2,19 +2,39 @@ import nodes
import ../lexer/tokstream import ../lexer/tokstream
type type
# NOTE: Values above __FAIL__ indicate a failed state # NOTE1: Values above MARKER_FAIL indicate a failed state
nlParseStat* = enum # NOTE2: nlParseStat is marked pure out of habit that's all
nlParseStat* {.pure.} = enum
OK, OK,
__FAIL__, MARKER_FAIL,
MIDAS, # Greedy search was never satisfied
UNMATCHED, UNMATCHED,
TOOBIG, TOOBIG,
nlAST* = object
root: nlNode
nlParser* = object
stream: nlTokStream
ast: nlAST
# the "build node" is a reference to the AST node
# the parser is currently modifying/building from
# NOTE: bnode changes frequently, it is NOT the root
bnode: nlNode
proc `*`(stat: nlParseStat, b: bool): nlParseStat = proc `*`(stat: nlParseStat, b: bool): nlParseStat =
result = if b: stat else: nlParseStat.OK result = if b: stat else: nlParseStat.OK
proc isFail*(stat: nlParseStat): bool = proc isFail*(stat: nlParseStat): bool =
result = (stat >= nlParseStat.__FAIL__) result = (stat >= nlParseStat.MARKER_FAIL)
proc newParser*(tokStream: var nlTokStream): nlParser =
let rootNode = newNode(nkNone)
result = nlParser(
stream: tokStream,
ast: rootNode,
bnode: rootNode,
)
#[ "Greed" refers to something I mentioned in my discussion on #[ "Greed" refers to something I mentioned in my discussion on
@ -25,34 +45,28 @@ proc isFail*(stat: nlParseStat): bool =
# Greed will consume anything until a condition is satisfied # Greed will consume anything until a condition is satisfied
# Returns false if the greed was never satisfied (OMG!!) # Returns false if the greed was never satisfied (OMG!!)
proc greed(tokStream: var nlTokStream, proc greed(parser: var nlParser,
toks: var seq[nlTok], satisfy: proc(tok: nlTok): bool): nlParseStat =
satisfy: proc(tok: nlTok): bool, while parser.stream.progress():
): nlParseStat = echo "im definitely here!"
var tok: nlTok parser.bnode.addTok(parser.stream.currTok)
while tokStream.nextTok(tok): if satisfy(parser.stream.currTok):
toks.add(tok)
if satisfy(tok):
return nlParseStat.OK return nlParseStat.OK
result = nlParseStat.UNMATCHED result = nlParseStat.UNMATCHED
proc greedLine(tokStream: var nlTokStream, proc greedLine(parser: var nlParser,
toks: var seq[nlTok],
satisfy: proc(tok: nlTok): bool): nlParseStat = satisfy: proc(tok: nlTok): bool): nlParseStat =
var tok: nlTok while parser.stream.progress():
while tokStream.nextTok(tok): parser.bnode.addTok(parser.stream.currTok)
toks.add(tok) if satisfy(parser.stream.currTok):
if satisfy(tok): return nlParseStat.OK
return true elif parser.stream.currTok.tKind == tkEOL:
result = return nlParseStat.UNMATCHED
result = nlParseStat.UNMATCHED
#[ Templates for generating greed satisfying conditions. #[ Templates for generating greed satisfying conditions.
]# ]#
# Satisfied if it finds nlTok of type matchType # Satisfied if it finds nlTok of type matchType
template satisfyMatch(matchType: nlTokType) = template satisfyMatch(matchType: nlTokKind): untyped =
proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType) (proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType))
# Satisfied if it finds nlTok of type matchType or EOL reached
template satisfyMatchEOL(matchType: nlTokType) =
proc(tok: nlTok): bool {.inline.} = (tok.tType == matchType or tok.tType == nlTokType.EOL)