Restructure attempt #087 :(

This commit is contained in:
Emile Clark-Boman 2025-06-19 12:51:03 +10:00
parent f25e66e9ef
commit 1181ea9743
7 changed files with 227 additions and 215 deletions

View file

@ -1,19 +1,29 @@
import os import os
import noether/lib/io import noether/lib/io
import noether/lexer/lex import noether/lexer/lex
# import noether/parser/parser import noether/parser/parse
{.hint: "Don't forget to drink more water (^_^)".} {.hint: "Don't forget to drink more water (^_^)".}
when isMainModule: when isMainModule:
echo "Noether Lang Extras v0.1.0 - nlx" echo "Noether Lang Extras v0.1.0 - nlx"
var stream = if paramCount() > 0: streamFile(paramStr 1) # really lazy argparse implementation (temporary)
let
  paramC = paramCount()
  # `nlx <cmd> <file>` yields paramCount() == 2, so a command word is present
  # as soon as more than one argument was given (the last argument is always
  # taken as the input file). With a single argument the command defaults
  # to "tok".
  cmd = (if paramC > 1: paramStr(1) else: "tok")
var stream = if paramC > 0: streamFile(paramStr paramC)
else: streamString(readAll stdin) else: streamString(readAll stdin)
var lexer = newLexer(stream) var lexer = newLexer(stream)
# # DumpTok if cmd == "tok":
# DumpTok
while lexer.progress(): while lexer.progress():
echo lexer.tok echo lexer.tok
elif cmd == "tree":
discard
# DumpTree # DumpTree
# discard parse(tokStream) # discard parse(tokStream)
else:
echo "Usage: nlx [tok|tree] <demo>\n demo files are accessible at lang/demo"

View file

@ -11,15 +11,16 @@ type
nlLexer* = object nlLexer* = object
stream: Stream stream: Stream
done*: bool done*: bool
tok*: nlTok # new finished token # store current token and upcoming (build) token
tok*: nlTok # current token
btok: nlTok # the build token btok: nlTok # the build token
# save char and pos and its token type
char: char
cTKind: nlTokKind
# track line number, line content, etc # track line number, line content, etc
line: string line: string
lineNum: int lineNum: int
pos: int pos: int
# save char and pos and its token type
char: char
cTKind: nlTokKind
proc atEOL(lexer: nlLexer): bool {.inline.} = proc atEOL(lexer: nlLexer): bool {.inline.} =
result = (lexer.char == '\n') result = (lexer.char == '\n')
@ -37,8 +38,41 @@ proc newLexer*(stream: var Stream): nlLexer =
lineNum: 1, lineNum: 1,
pos: -1, # after initial readChar this -> 0 pos: -1, # after initial readChar this -> 0
char: '\0', # use \0 as initial invalid char char: '\0', # use \0 as initial invalid char
cTKind: tkNONE,
) )
# Classifies the current character to its nlTokKind
proc classifyTok*(lexer: nlLexer): nlTokKind {.inline.} =
  ## Looks only at `lexer.char` and maps it onto a token kind:
  ## NUL is end of file, CR/LF are end of line, space/tab are whitespace,
  ## each reserved symbol has its own kind, and every other character
  ## is treated as part of a word.
  case lexer.char
  of '\0': tkEOF
  of '\r', '\n': tkEOL
  of ' ', '\t': tkWTSP
  of '(': tkLPAR
  of ')': tkRPAR
  of '{': tkLBRA
  of '}': tkRBRA
  of '[': tkLSQB
  of ']': tkRSQB
  of '\'': tkSQUO
  of '"': tkDQUO
  of '`': tkGRVA
  of '#': tkHASH
  else: tkWORD
#[ ====================================================== ] #[ ====================================================== ]
| nlLexer Internal Interface for Token Construction ] | nlLexer Internal Interface for Token Construction ]
@ -96,7 +130,7 @@ proc readChar(lexer: var nlLexer): bool =
inc lexer.lineNum inc lexer.lineNum
# sets lexer.char to '\0' if EOF # sets lexer.char to '\0' if EOF
lexer.char = lexer.stream.readChar() lexer.char = lexer.stream.readChar()
lexer.cTKind = getTokKind(lexer.char) lexer.cTKind = lexer.classifyTok()
lexer.line.add(lexer.char) lexer.line.add(lexer.char)
inc lexer.pos inc lexer.pos
result = lexer.atEOF() result = lexer.atEOF()

View file

@ -1,4 +1,32 @@
include tokkind type
# nlTokKind allows primitive nlToks to be typed,
# the nlTokKind enum should never be directly
# accessed. Use the interface in this file instead.
# NOTE(review): the lexer's character classifier never yields tkNONE,
# tkSYMB or tkLNFD; confirm whether those members are still needed.
nlTokKind* = enum
tkNONE, # Placeholder value (used as the lexer's initial cTKind)
tkEOF, # End of File (classifier result for '\0')
tkEOL, # End of Line (classifier result for '\r' and '\n')
tkWORD, # Word token (classifier fallback for any unreserved character)
tkSYMB, # Symbolic token
tkLNFD, # \r \n Line-Feed
tkWTSP, # ' ' \t Whitespace
# RESERVED SYMBOLS
tkLPAR, # ( Left Parenthesis
tkRPAR, # ) Right Parenthesis
tkLBRA, # { Left Brace
tkRBRA, # } Right Brace
tkLSQB, # [ Left Square Bracket
tkRSQB, # ] Right Square Bracket
# tkLANB, # < Left Angle Bracket
# tkRANB, # > Right Angle Bracket
tkSQUO, # ' Single Quotation Mark
tkDQUO, # " Double Quotation Mark
tkGRVA, # ` Grave Accent
tkHASH, # # Number Sign (Hashtag)
type type
nlTok* = tuple nlTok* = tuple

View file

@ -1,61 +1 @@
type
  # nlTokKind allows primitive nlToks to be typed,
  # the nlTokKind enum should never be directly
  # accessed. Use the interface in this file instead.
  nlTokKind* = enum
    tkNONE, ## placeholder value
    tkEOF,  ## end of file
    tkEOL,  ## end of line
    tkWORD, ## alphanumeric token
    tkSYMB, ## symbolic token
    tkLNFD, ## line feed (CR / LF)
    tkWTSP, ## whitespace (space / tab)
    # RESERVED SYMBOLS
    tkLPAR, ## left parenthesis
    tkRPAR, ## right parenthesis
    tkLBRA, ## left brace
    tkRBRA, ## right brace
    tkLSQB, ## left square bracket
    tkRSQB, ## right square bracket
    # tkLANB, # < Left Angle Bracket
    # tkRANB, # > Right Angle Bracket
    tkSQUO, ## single quotation mark
    tkDQUO, ## double quotation mark
    tkGRVA, ## grave accent
    tkHASH  ## number sign (hashtag)

# Classifies a character to its nlTokKind
proc getTokKind*(c: char): nlTokKind =
  ## Maps one character onto its token kind. NUL is end of file, CR and LF
  ## are end of line, space and tab are whitespace, each reserved symbol has
  ## a dedicated kind, and anything else falls back to `tkWORD`.
  case c
  of '\0': tkEOF
  of '\r', '\n': tkEOL
  of ' ', '\t': tkWTSP
  of '(': tkLPAR
  of ')': tkRPAR
  of '{': tkLBRA
  of '}': tkRBRA
  of '[': tkLSQB
  of ']': tkRSQB
  of '\'': tkSQUO
  of '"': tkDQUO
  of '`': tkGRVA
  of '#': tkHASH
  else: tkWORD

View file

@ -0,0 +1,58 @@
import strutils
include parser
# NOTE: Matching between two tokens will fill `node` with everything
# NOTE: between those two tokens EXCLUDING the two tokens themselves.
proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat =
  ## Runs `greed` with a predicate that accepts the first token whose kind
  ## equals `matchType`, and hands back the status `greed` reports.
  parser.greed(satisfyMatch(matchType))
proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat =
  ## Same as `parseMatch`, but goes through `greedLine`, so the search for a
  ## token of kind `matchType` gives up at the end of the line.
  parser.greedLine(satisfyMatch(matchType))
proc parseStrLit(parser: var nlParser): nlParseStat =
  ## Looks for the closing double quote (`tkDQUO`) via `parseMatchLine`.
  parseMatchLine(parser, tkDQUO)
proc parseChrLit(parser: var nlParser): nlParseStat =
  ## Looks for the closing single quote (`tkSQUO`) via `parseMatchLine`.
  parseMatchLine(parser, tkSQUO)
proc parseStmt(parser: var nlParser): nlParseStat =
  ## Walks the token stream until `progressStream` reports no more progress,
  ## echoing every token and trying to parse a string or character literal
  ## whenever an opening quote is met. Diagnostics are echoed to stdout and
  ## the proc always returns `nlParseStat.OK`.

  # Shared reporting for both literal kinds. `stat` is evaluated exactly
  # once (in the condition), i.e. the literal is parsed before anything
  # is printed.
  template report(stat: nlParseStat, failMsg, okMsg: string) =
    if stat == nlParseStat.OK:
      echo okMsg
      echo parser.bnode[], '\n'
    else:
      echo failMsg
      echo parser.line
      # caret under the column of the parser's current token
      echo repeat(" ", parser.currTok.startPos), '^', '\n'

  while parser.progressStream():
    echo "----- Current Token: ", parser.currTok
    case parser.currTok.tKind
    of tkDQUO:
      # Attempt to parse string literal
      report(parser.parseStrLit(),
             "Unmatched Double Quotation! Malformed String Literal",
             "Parsed String Literal")
    of tkSQUO:
      # Attempt to parse character literal
      report(parser.parseChrLit(),
             "Unmatched Single Quotation! Malformed Character Literal",
             "Parsed Character Literal")
    of tkEOL:
      # TODO: handle this case, don't just discard
      discard
    else:
      echo "blah blah unhandled case\n"
  result = nlParseStat.OK
# Attempt to parse nlAST from nlTokStream
proc parse*(tokStream: var nlTokStream): nlAST =
  ## Builds a parser over `tokStream`, prints a single blank character line,
  ## runs `parseStmt` (its status is ignored) and returns the parser's AST.
  var p = newParser(tokStream)
  echo ' '
  discard parseStmt(p)
  p.ast

View file

@ -1,58 +1,90 @@
import strutils import nodes
include parseutil import ../lexer/lex
# NOTE: Matching between two tokens will fill `node` with everything type
# NOTE: between those two tokens EXCLUDING the two tokens themselves. # NOTE1: Values above MARKER_FAIL indicate a failed state
proc parseMatch(parser: var nlParser, matchType: nlTokKind): nlParseStat = # NOTE2: nlParseStat is marked pure out of habit that's all
result = greed( nlParseStat* {.pure.} = enum
parser, OK,
satisfyMatch(matchType), MARKER_FAIL,
) UNMATCHED,
proc parseMatchLine(parser: var nlParser, matchType: nlTokKind): nlParseStat = TOOBIG,
result = greedLine(
parser, nlAST* = object
satisfyMatch(matchType), root: nlNode
nlParser* = object
stream: nlTokStream
ast: nlAST
# the "build node" is a reference to the AST node
# the parser is currently modifying/building from
# NOTE: bnode changes frequently, it is NOT the root
bnode: nlNode
# flag indicating whether the parser is at
# the start of a new line (aka checking indentation)
inIndent: bool
proc `*`(stat: nlParseStat, b: bool): nlParseStat =
result = if b: stat else: nlParseStat.OK
proc isFail*(stat: nlParseStat): bool =
result = (stat >= nlParseStat.MARKER_FAIL)
proc newParser*(tokStream: var nlTokStream): nlParser =
let rootNode = newNode(nkNone)
result = nlParser(
stream: tokStream,
ast: nlAST(
root: rootNode
),
bnode: rootNode,
) )
proc parseStrLit(parser: var nlParser): nlParseStat = # Exposes a subset of the nlTokStream interface
result = parser.parseMatchLine(tkDQUO) proc currTok(parser: var nlParser): nlTok = parser.stream.currTok
proc line(parser: var nlParser): string = parser.stream.line
proc parseChrLit(parser: var nlParser): nlParseStat = # Extends upon the functionality of nlTokStream.progress()
result = parser.parseMatchLine(tkSQUO) proc progressStream*(parser: var nlParser): bool =
result = parser.stream.progress()
if result and parser.currTok.tKind == tkEOL:
parser.inIndent = true
if
proc parseStmt(parser: var nlParser): nlParseStat = proc setNewLine()
#[ "Greed" refers to something I mentioned in my discussion on
| Noether's grammar (in an EBNF-like language). Greed just
| means "everything until a condition is satisfied".
| That condition should be supplied by a Nim procedural type.
]#
# Greed will consume anything until a condition is satisfied
# Returns nlParseStat.UNMATCHED if the greed was never satisfied (OMG!!)
proc greed(parser: var nlParser,
satisfy: proc(tok: nlTok): bool): nlParseStat =
while parser.progressStream(): while parser.progressStream():
echo "----- Current Token: ", parser.currTok if satisfy(parser.currTok):
case parser.currTok.tKind return nlParseStat.OK
of tkDQUO: # NOTE: the matched token is currently excluded
# Attempt to parse string literal parser.bnode.addTok(parser.currTok)
if parser.parseStrLit() != nlParseStat.OK: result = nlParseStat.UNMATCHED
echo "Unmatched Double Quotation! Malformed String Literal"
echo parser.line
echo repeat(" ", parser.currTok.startPos), '^', '\n'
else:
echo "Parsed String Literal"
echo parser.bnode[], '\n'
of tkSQUO:
# Attempt to parse string literal
if parser.parseChrLit() != nlParseStat.OK:
echo "Unmatched Single Quotation! Malformed Character Literal"
echo parser.line
echo repeat(" ", parser.currTok.startPos), '^', '\n'
else:
echo "Parsed Character Literal"
echo parser.bnode[], '\n'
of tkEOL:
# TODO: handle this case, don't just discard
discard
else:
echo "blah blah unhandled case\n"
result = nlParseStat.OK
# Attempt to parse nlAST from nlTokStream proc greedLine(parser: var nlParser,
proc parse*(tokStream: var nlTokStream): nlAST = satisfy: proc(tok: nlTok): bool): nlParseStat =
var parser = newParser(tokStream) while parser.progressStream():
echo ' ' if satisfy(parser.currTok):
discard parser.parseStmt() return nlParseStat.OK
# NOTE: the matched token is currently excluded
parser.bnode.addTok(parser.currTok)
if parser.currTok.tKind == tkEOL:
return nlParseStat.UNMATCHED
result = nlParseStat.UNMATCHED
result = parser.ast #[ Templates for generating greed satisfying conditions.
]#
# Satisfied if it finds nlTok of type matchType
template satisfyMatch(matchType: nlTokKind): untyped =
(proc(tok {.inject.}: nlTok): bool = (tok.tKind == matchType))

View file

@ -1,90 +0,0 @@
import nodes
import ../lexer/tokstream
type
# Status codes returned by the parsing procs.
# NOTE1: isFail() treats every value >= MARKER_FAIL as a failed state,
#        so MARKER_FAIL itself counts as failed too
# NOTE2: nlParseStat is marked pure out of habit that's all
nlParseStat* {.pure.} = enum
OK,
MARKER_FAIL,
UNMATCHED,
TOOBIG,
# UNMATCHED is what greed/greedLine return when no token satisfied them.
# NOTE(review): TOOBIG is not produced anywhere in the visible code.
# The AST handle; it only stores the root node.
nlAST* = object
root: nlNode
# Parser state: the token stream being consumed plus the tree being built.
nlParser* = object
stream: nlTokStream
ast: nlAST
# the "build node" is a reference to the AST node
# the parser is currently modifying/building from
# NOTE: bnode changes frequently, it is NOT the root
bnode: nlNode
# flag indicating whether the parser is at
# the start of a new line (aka checking indentation)
inIndent: bool
proc `*`(stat: nlParseStat, b: bool): nlParseStat =
  ## Gates a status on a flag: yields `stat` when `b` is true and
  ## `nlParseStat.OK` when it is false.
  if b: stat
  else: nlParseStat.OK
proc isFail*(stat: nlParseStat): bool =
  ## True for `MARKER_FAIL` and for every status declared after it.
  nlParseStat.MARKER_FAIL <= stat
proc newParser*(tokStream: var nlTokStream): nlParser =
  ## Creates a parser over `tokStream`. A fresh `nkNone` node becomes both
  ## the root of the AST and the initial build node.
  let top = newNode(nkNone)
  result.stream = tokStream
  result.ast = nlAST(root: top)
  result.bnode = top
# Exposes a subset of the nlTokStream interface
proc currTok(parser: var nlParser): nlTok =
  ## Current token of the wrapped token stream.
  result = parser.stream.currTok
proc line(parser: var nlParser): string =
  ## Line text as reported by the wrapped token stream.
  result = parser.stream.line
# Extends upon the functionality of nlTokStream.progress()
proc progressStream*(parser: var nlParser): bool =
  ## Advances the underlying token stream by one step and returns whatever
  ## `progress()` on the stream returned.
  ##
  ## When the stream advanced and the new current token is `tkEOL`,
  ## `inIndent` is raised to mark that the parser sits at the start of a line.
  result = parser.stream.progress()
  if result and parser.currTok.tKind == tkEOL:
    parser.inIndent = true
  # TODO: the original continued with a bare `if` (no condition, no body) and
  # a bodiless forward declaration `proc setNewLine()`. Neither compiles and
  # the intended logic cannot be recovered from here, so both are left out
  # until the new-line handling is actually written.
#[ "Greed" refers to something I mentioned in my discussion on
| Noether's grammar (in an EBNF-like language). Greed just
| means "everything until a condition is satisfied".
| That condition should be supplied by a Nim procedural type.
]#
# Greed will consume anything until a condition is satisfied
# Returns nlParseStat.UNMATCHED if the greed was never satisfied (OMG!!)
proc greed(parser: var nlParser,
           satisfy: proc(tok: nlTok): bool): nlParseStat =
  ## Advances the stream token by token. The first token accepted by
  ## `satisfy` ends the scan with `OK`; every rejected token is appended to
  ## the current build node. Running out of tokens yields `UNMATCHED`.
  result = nlParseStat.UNMATCHED
  while parser.progressStream():
    let tok = parser.currTok
    if satisfy(tok):
      return nlParseStat.OK
    # NOTE: the matched token is currently excluded
    parser.bnode.addTok(tok)
proc greedLine(parser: var nlParser,
               satisfy: proc(tok: nlTok): bool): nlParseStat =
  ## Line-bounded variant of `greed`: the scan additionally stops with
  ## `UNMATCHED` once a rejected `tkEOL` token has been appended to the
  ## build node.
  result = nlParseStat.UNMATCHED
  while parser.progressStream():
    let tok = parser.currTok
    if satisfy(tok):
      return nlParseStat.OK
    # NOTE: the matched token is currently excluded
    parser.bnode.addTok(tok)
    if tok.tKind == tkEOL:
      break # end of line reached without a match
#[ Templates for generating greed satisfying conditions.
]#
# Satisfied if it finds nlTok of type matchType
template satisfyMatch(matchType: nlTokKind): untyped =
  ## Expands to an anonymous proc that returns true exactly when the token
  ## it is given has kind `matchType`.
  (proc(tok {.inject.}: nlTok): bool = tok.tKind == matchType)