Simple (shit) working lexer via nlTokStream
Currently only tested on strings, but it's highly extensible if you modify the getTokType mapping of chars to their nlTokType
This commit is contained in:
parent
edf164df90
commit
3ce9390be4
8 changed files with 224 additions and 185 deletions
2
lang/demo/math.no
Normal file
2
lang/demo/math.no
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
"abc+def"
|
||||
xy+z
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
import noether/lexer/tok
|
||||
import noether/lexer/tokstream
|
||||
|
||||
when isMainModule:
|
||||
|
|
@ -7,7 +8,10 @@ when isMainModule:
|
|||
if paramCount() > 0:
|
||||
let filename = paramStr(1)
|
||||
var tokStream = newTokStream(filename, isFile=true)
|
||||
for tok in toks(tokStream):
|
||||
|
||||
var tok: nlTok
|
||||
while tokStream.nextTok(tok):
|
||||
echo tok
|
||||
|
||||
else:
|
||||
echo "usage: nlx filename"
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import std/streams
|
||||
import std/options
|
||||
|
||||
include tok
|
||||
import tok
|
||||
|
||||
type
|
||||
# Character streaming for the nlTokStream
|
||||
|
|
@ -10,15 +10,15 @@ type
|
|||
# row/column positions
|
||||
line*: string
|
||||
lineNum*: Natural
|
||||
pos: Natural
|
||||
pos*: Natural
|
||||
|
||||
proc streamFile(filename: string): FileStream =
|
||||
proc streamFile*(filename: string): FileStream =
|
||||
result = newFileStream(filename, fmRead)
|
||||
|
||||
proc streamString(str: string): StringStream =
|
||||
proc streamString*(str: string): StringStream =
|
||||
result = newStringStream(str)
|
||||
|
||||
proc newLStream(content: string, isFile: bool = false): nlLStream =
|
||||
proc newLStream*(content: string, isFile: bool = false): nlLStream =
|
||||
result = nlLStream(
|
||||
stream: if isFile: streamFile(content) else: streamString(content),
|
||||
line: "",
|
||||
|
|
@ -26,20 +26,40 @@ proc newLStream(content: string, isFile: bool = false): nlLStream =
|
|||
pos: Natural 0,
|
||||
)
|
||||
|
||||
# Checks whether we've reached EOL
|
||||
# NOTE: also checks if we've surpassed it (ie invalid lstream.pos)
|
||||
proc atEOL*(lstream: nlLStream): bool =
|
||||
result = (lstream.pos >= lstream.line.len - 1)
|
||||
|
||||
# Checks whether we are EXACTLY at EOL, but not surpassed
|
||||
proc exactlyEOL*(lstream: nlLStream): bool =
|
||||
result = (lstream.pos == lstream.line.len - 1)
|
||||
|
||||
# Reports whether the position has moved beyond the last index
# of the current line.
proc outOfBounds*(lstream: nlLStream): bool =
  let lastIdx = lstream.line.len - 1
  lstream.pos > lastIdx
|
||||
|
||||
# Progress the lex stream to the next line (if available)
|
||||
proc progLine(lstream: var nlLStream): bool =
|
||||
proc progLine*(lstream: var nlLStream): bool =
|
||||
if lstream.stream.readLine(lstream.line):
|
||||
inc lstream.lineNum
|
||||
lstream.pos = Natural 0
|
||||
return true
|
||||
return false
|
||||
|
||||
proc currChar(lstream: nlLStream): char =
|
||||
result = lstream.line[lstream.pos]
|
||||
# Advances the position within the current line by one character.
# No EOL check is performed, so the position may end up past the
# end of the line.
proc forceProgChar*(lstream: var nlLStream) =
  lstream.pos += 1
|
||||
|
||||
# NOTE: assumes lstream.line does NOT mutate while iterating
|
||||
iterator iterChars(lstream: var nlLStream): Option[char] =
|
||||
while lstream.pos < lstream.line.len:
|
||||
inc lstream.pos
|
||||
yield some(lstream.line[lstream.pos - 1])
|
||||
yield none(char)
|
||||
# Moves the lex stream on by one character. Once the current line
# is used up (atEOL), the next line is requested instead and the
# result is whatever progLine() reports.
proc progress*(lstream: var nlLStream): bool =
  if lstream.atEOL():
    # nothing left on this line: try to pull in the next one
    return lstream.progLine()
  lstream.forceProgChar()
  return true
|
||||
|
||||
# Returns the character of the current line found at the
# stream's position.
proc currChar*(lstream: nlLStream): char =
  let idx = lstream.pos
  lstream.line[idx]
|
||||
|
|
|
|||
|
|
@ -1,43 +1,16 @@
|
|||
type
|
||||
# nlTokType allows primitive nlToks to be typed,
|
||||
# the nlTokType enum should never be directly
|
||||
# accessed. Use the interface in this file instead.
|
||||
# NOTE: NONE is used as a default value
|
||||
# NOTE: it is very different to TERM!
|
||||
nlTokType = enum
|
||||
NONE, # Placeholder Value
|
||||
EOF, # EOF
|
||||
TERM, # String \0 terminator
|
||||
WORD, # Alphanumeric token
|
||||
SYMB, # Symbolic token
|
||||
LNFD, # \r \n Line-Feed
|
||||
WTSP, # ' ' \t Whitespace
|
||||
LPAR, # ( Left Parenthesis
|
||||
RPAR, # ) Right Parenthesis
|
||||
LBRA, # { Left Brace
|
||||
RBRA, # } Right Brace
|
||||
LSQB, # [ Left Square Bracket
|
||||
RSQB, # ] Right Square Bracket
|
||||
# LANB, # < Left Angle Bracket
|
||||
# RANB, # > Right Angle Bracket
|
||||
SQUO, # ' Single Quotation Marking
|
||||
DQUO, # " Double Quotation Marking
|
||||
GRVA, # ` Grave Accent
|
||||
HASH, # # Number Sign (Hashtag)
|
||||
|
||||
nlTok = object
|
||||
include toktype
|
||||
|
||||
type
|
||||
nlTok* = object
|
||||
tType*: nlTokType
|
||||
lit*: string
|
||||
line*: Natural
|
||||
lineNum*: Natural
|
||||
startPos*: Natural
|
||||
endPos*: Natural
|
||||
|
||||
# Generates an "empty" nlTok with only a startPos,
|
||||
# all other fields are expected to be filled out later.
|
||||
# NOTE: tType initialised to nlTokType.NONE
|
||||
# NOTE: lit initialised to empty string
|
||||
# NOTE: all other fields are uninitialised
|
||||
proc emptyTok(startPos: int): nlTok =
|
||||
proc emptyTok*(startPos: int): nlTok =
|
||||
result = nlTok(
|
||||
tType: nlTokType.NONE,
|
||||
lit: "",
|
||||
|
|
@ -45,55 +18,23 @@ proc emptyTok(startPos: int): nlTok =
|
|||
)
|
||||
|
||||
# Checks if an nlTok has nlTokType.NONE
|
||||
proc isTokUntyped(tType: nlTokType): bool =
|
||||
proc isTokUntyped*(tType: nlTokType): bool =
|
||||
result = (tType == nlTokType.NONE)
|
||||
|
||||
# Reports whether the given nlTok is typed as nlTokType.EOL
proc isTokEOL*(tok: nlTok): bool =
  tok.tType == nlTokType.EOL
|
||||
|
||||
|
||||
# Checks if an nlTok has nlTokType.TERM
|
||||
proc isTokTerm(tType: nlTokType): bool =
|
||||
result = (tType == nlTokType.TERM)
|
||||
|
||||
# This method is only used to convert null
|
||||
# terminator nlToks into line-feed ones.
|
||||
# Returns a copy of an nlTok, changing its type
|
||||
# NOTE: this is necessary because Nim handles
|
||||
# NOTE: strings in a useful but annoying way
|
||||
proc tokTermToLineFeed(tok: nlTok): nlTok =
|
||||
proc tokTermToLineFeed*(tok: nlTok): nlTok =
|
||||
result = nlTok(
|
||||
tType: nlTokType.LNFD,
|
||||
lit: tok.lit,
|
||||
line: tok.line,
|
||||
lineNum: tok.lineNum,
|
||||
startPos: tok.startPos,
|
||||
endPos: tok.endPos,
|
||||
)
|
||||
|
||||
# Classifies a character to its nlTokType
|
||||
proc getTokType(c: char): nlTokType =
|
||||
case c:
|
||||
of '\0':
|
||||
result = nlTokType.TERM
|
||||
of '\r', '\n':
|
||||
result = nlTokType.LNFD
|
||||
of ' ', '\t':
|
||||
result = nlTokType.WTSP
|
||||
of '(':
|
||||
result = nlTokType.LPAR
|
||||
of ')':
|
||||
result = nlTokType.RPAR
|
||||
of '{':
|
||||
result = nlTokType.LBRA
|
||||
of '}':
|
||||
result = nlTokType.RBRA
|
||||
of '[':
|
||||
result = nlTokType.LSQB
|
||||
of ']':
|
||||
result = nlTokType.RSQB
|
||||
of '\'':
|
||||
result = nlTokType.SQUO
|
||||
of '\"':
|
||||
result = nlTokType.DQUO
|
||||
of '`':
|
||||
result = nlTokType.GRVA
|
||||
of '#':
|
||||
result = nlTokType.HASH
|
||||
else:
|
||||
result = nlTokType.WORD
|
||||
|
|
|
|||
84
src/noether/lexer/tokbuilding.nim
Normal file
84
src/noether/lexer/tokbuilding.nim
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
include lstream
|
||||
|
||||
type
  # Provides a stream-like interface for lexing nlToks
  # Internally reliant on the functionality of nlLStream
  nlTokStream = object
    lstream: nlLStream # character stream supplying line, lineNum and pos
    build: nlTok # the build token (the token currently being assembled)
|
||||
|
||||
# Builds an EOL token describing the nlTokStream's current state:
# it carries the lstream's line number, and both its start and end
# positions are the lstream's current position.
proc EOLTok*(tokStream: nlTokStream): nlTok =
  let here = Natural tokStream.lstream.pos
  nlTok(
    tType: nlTokType.EOL,
    lit: "\0",
    lineNum: Natural tokStream.lstream.lineNum,
    startPos: here,
    endPos: here,
  )
|
||||
|
||||
# Replaces the build token with a fresh "empty" nlTok that
# starts at the lstream's current position.
proc resetBuild(tokStream: var nlTokStream) =
  let startAt = tokStream.lstream.pos
  tokStream.build = emptyTok(startAt)
|
||||
|
||||
# Fills in the remaining fields of the build token from the
# lstream's current state: the line number, the end position, and
# the literal text, which is the slice of the current line from
# startPos (inclusive) up to endPos (exclusive).
proc finishBuild(ts: var nlTokStream) =
  let
    startIdx = ts.build.startPos
    stopIdx = Natural ts.lstream.pos
  ts.build.lineNum = Natural ts.lstream.lineNum
  ts.build.endPos = stopIdx
  ts.build.lit = ts.lstream.line[startIdx ..< stopIdx]
|
||||
|
||||
# Finalises the build token, hands it back to the caller, and
# leaves a freshly reset build token in its place.
proc flushBuild(tokStream: var nlTokStream): nlTok =
  tokStream.finishBuild()
  result = tokStream.build
  tokStream.resetBuild()
|
||||
|
||||
# Reports whether the build token is still untyped, in which
# case it should inherit the nlTokType of the nlLStream's next
# character.
proc isUntypedBuild(tokStream: nlTokStream): bool =
  tokStream.build.tType.isTokUntyped()
|
||||
|
||||
# Reports whether an nlTokType is "compatible" with the build
# token, i.e. equal to the build token's own type.
# NOTE: flushBuild() should be called when an incompatible token is discovered.
proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
  tokStream.build.tType == tType
|
||||
|
||||
# Feeds the lstream's current character into the build token.
# buildTok receives a token whenever one is completed (a flushed
# build token, or an EOL token once the position is out of bounds);
# it is set to none(nlTok) when the character merely extends the
# current build token.
# Returns a boolean indicating whether the nlTokStream can progress.
proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool =
  # the "pos > EOL" invalid state is used intentionally
  # to indicate all tokens have been built, and return EOL Token
  if tokStream.lstream.outOfBounds():
    buildTok = some(EOLTok(tokStream))
    return true # can progress once more

  let tType = getTokType(tokStream.lstream.currChar())
  # untyped build tokens must inherit a type immediately
  if isUntypedBuild(tokStream):
    tokStream.build.tType = tType

  # check if EOL reached
  if tokStream.lstream.atEOL():
    # flush old build token, the new one can be left untyped
    let compatible = isCompatibleBuild(tokStream, tType)
    result = false # DO NOT PROGRESS
    if compatible:
      # force the lstream into an invalid state by progressing beyond EOL
      # we can then detect this state on the next progBuild and return
      # an EOL character (very unsafe implementation but it works well)
      tokStream.lstream.forceProgChar()
    # when incompatible, the position is left on the last character, so
    # the flushed token stops short of it and a later call handles it
    buildTok = some(flushBuild(tokStream))
  # check character and build token compatibility
  elif not isCompatibleBuild(tokStream, tType):
    # flush old build token, the new one inherits type
    buildTok = some(flushBuild(tokStream))
    tokStream.build.tType = tType
    result = true # can progress
  else:
    buildTok = none(nlTok)
    result = true # can progress
|
||||
|
|
@ -1,104 +1,38 @@
|
|||
include lstream
|
||||
|
||||
type
|
||||
# Provides a stream-like interface for lexing nlToks
|
||||
# Internally reliant on the functionality of nlLStream
|
||||
nlTokStream = object
|
||||
lstream: nlLStream
|
||||
build: nlTok # the current token we're building
|
||||
|
||||
# Resets the build token to an "empty" nlTok where
|
||||
# only tType, lit, and startPos are initialised.
|
||||
proc resetBuild(tokStream: var nlTokStream) =
|
||||
tokStream.build = emptyTok(tokStream.lstream.pos)
|
||||
|
||||
# Completes a token generated by emptyTok()
|
||||
# based on the nlTokStream's nlLStream's
|
||||
# current line and character positions
|
||||
proc finishBuild(tokStream: var nlTokStream) =
|
||||
# if we've reached \0 terminator then forge the start
|
||||
# and end positions to point OUTSIDE the line
|
||||
let endPos = if isTokTerm(tokStream.build.tType):
|
||||
inc tokStream.build.startPos;
|
||||
tokStream.build.startPos
|
||||
else: Natural tokStream.lstream.pos
|
||||
tokStream.build.line = Natural tokStream.lstream.lineNum
|
||||
tokStream.build.endPos = endPos
|
||||
|
||||
# Returns the nlTokStream's build token and
|
||||
# empties the build token's contents.
|
||||
proc flushBuild(tokStream: var nlTokStream): nlTok =
|
||||
finishBuild(tokStream)
|
||||
result = tokStream.build
|
||||
resetBuild(tokStream)
|
||||
|
||||
# Returns whether the build token has a set type yet.
|
||||
# This indicates that the build token should inherit
|
||||
# the nlTokType of the nlLStream's next character.
|
||||
proc isUntypedBuild(tokStream: nlTokStream): bool =
|
||||
result = isTokUntyped(tokStream.build.tType)
|
||||
|
||||
# Check whether an nlTokType is "compatible" with
|
||||
# the build token. flushBuild() should be called
|
||||
# when an incompatible token is discovered.
|
||||
proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
|
||||
result = (tType == tokStream.build.tType)
|
||||
|
||||
# Add a character to the nlTokStream's build token.
|
||||
# Returns a bool indicating if a new nlTok has been built
|
||||
# or not. flushBuild should then be called.
|
||||
proc appendBuild(tokStream: var nlTokStream, c: char): Option[nlTok] =
|
||||
let tType = getTokType(c)
|
||||
# check whether build token should inherit type
|
||||
if isUntypedBuild(tokStream):
|
||||
tokStream.build.tType = tType
|
||||
# check character and build token compatibility
|
||||
elif not isCompatibleBuild(tokStream, tType):
|
||||
# return flushed build token, and reset
|
||||
result = some(flushBuild(tokStream))
|
||||
# new build token is untyped so inherit type
|
||||
tokStream.build.tType = tType
|
||||
# check if \0 terminator reached
|
||||
elif isTokTerm(tokStream.build.tType):
|
||||
# return immediately to avoid concatenating '\0'
|
||||
return some(flushBuild(tokStream))
|
||||
# else return none to indicate no build was completed
|
||||
else:
|
||||
result = none(nlTok)
|
||||
# ensure character is appended to the build token
|
||||
tokStream.build.lit.add(c)
|
||||
|
||||
# Generates and returns the next token in the stream,
|
||||
# result.tType == nlTokType.TERM implies line ended
|
||||
proc nextTok(tokStream: var nlTokStream): nlTok =
|
||||
# try progress to next char, receives none option on failure
|
||||
for optchar in iterChars(tokStream.lstream):
|
||||
# unpack the Option[char], none => '\0'
|
||||
let c = if optchar.isSome: optchar.get
|
||||
else: '\0'
|
||||
let opttok = appendBuild(tokStream, c)
|
||||
if opttok.isSome:
|
||||
return opttok.get
|
||||
# NOTE: REACHING HERE SHOULD NEVER OCCUR
|
||||
include tokbuilding
|
||||
|
||||
# Initialises a new nlTokStream on a string or file
|
||||
proc newTokStream*(content: string, isFile: bool = false): nlTokStream =
|
||||
result = nlTokStream(
|
||||
lstream: newLStream(content, isFile=isFile),
|
||||
)
|
||||
resetBuild(result)
|
||||
# 1. initialise an empty build token
|
||||
# 2. progress to the first line
|
||||
result.resetBuild()
|
||||
discard result.lstream.progLine()
|
||||
|
||||
# Allow the nlTokStream to be iterated
|
||||
iterator toks*(tokStream: var nlTokStream): nlTok =
|
||||
var tok: nlTok
|
||||
while progLine(tokStream.lstream):
|
||||
while true:
|
||||
tok = nextTok(tokStream)
|
||||
# \0 terminator means the line ended OR the file
|
||||
# has ended, so always yield a line-feed just in case
|
||||
if isTokTerm(tok.tType):
|
||||
yield tokTermToLineFeed(tok)
|
||||
break
|
||||
yield tok
|
||||
# we ONLY reach here on EOF
|
||||
yield tok
|
||||
# nlTokStream counterpart of nlLStream.progress() that also looks
# after the build token: within a line it steps one character
# forward; at EOL it requests the next line and resets the build
# token. Returns false only when no further line could be read.
proc progChar(tokStream: var nlTokStream): bool =
  if tokStream.lstream.atEOL():
    # line exhausted: try the next line and start a fresh build token
    result = tokStream.lstream.progLine()
    tokStream.resetBuild()
  else:
    tokStream.lstream.forceProgChar()
    result = true
|
||||
|
||||
# Produces the next token in the stream by alternating progBuild()
# and progChar() until a token is completed, storing it in tok.
# Returns true when tok has been set, and false once the stream
# cannot be progressed any further (tok is then left untouched).
# NOTE: progBuild adds lstream's current char to the build token
# NOTE: progChar progresses to lstream's next char
# NOTE(review): if progChar fails while progBuild has just handed
# back a token, that token is not returned -- confirm intended.
proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool =
  var pending: Option[nlTok]
  while true:
    pending = none(nlTok)
    # allowed to progress but progression failed => end of input
    if tokStream.progBuild(pending) and not tokStream.progChar():
      return false
    if pending.isSome:
      tok = pending.get()
      return true
|
||||
|
|
|
|||
54
src/noether/lexer/toktype.nim
Normal file
54
src/noether/lexer/toktype.nim
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
type
  # nlTokType gives primitive nlToks a type.
  # The enum should never be accessed directly;
  # use the interface in this file instead.
  nlTokType* = enum
    NONE, # Placeholder Value
    EOF,  # End of File
    EOL,  # End of Line (\0 --> EOL)
    WORD, # Alphanumeric token
    SYMB, # Symbolic token
    LNFD, # \r \n Line-Feed
    WTSP, # ' ' \t Whitespace
    LPAR, # (      Left Parenthesis
    RPAR, # )      Right Parenthesis
    LBRA, # {      Left Brace
    RBRA, # }      Right Brace
    LSQB, # [      Left Square Bracket
    RSQB, # ]      Right Square Bracket
    # LANB, # <      Left Angle Bracket
    # RANB, # >      Right Angle Bracket
    SQUO, # '      Single Quotation Marking
    DQUO, # "      Double Quotation Marking
    GRVA, # `      Grave Accent
    HASH, # #      Number Sign (Hashtag)

# Maps a single character onto its nlTokType.
# '\0', '\r' and '\n' all classify as EOL, and any character
# without an explicit mapping falls back to WORD.
# NOTE: NONE, EOF, SYMB and LNFD are never produced by this mapping.
proc getTokType*(c: char): nlTokType =
  case c
  of '\0', '\r', '\n': nlTokType.EOL
  of ' ', '\t': nlTokType.WTSP
  of '(': nlTokType.LPAR
  of ')': nlTokType.RPAR
  of '{': nlTokType.LBRA
  of '}': nlTokType.RBRA
  of '[': nlTokType.LSQB
  of ']': nlTokType.RSQB
  of '\'': nlTokType.SQUO
  of '\"': nlTokType.DQUO
  of '`': nlTokType.GRVA
  of '#': nlTokType.HASH
  else: nlTokType.WORD
|
||||
Loading…
Add table
Add a link
Reference in a new issue