diff --git a/lang/demo/math.no b/lang/demo/math.no deleted file mode 100644 index 8050f04..0000000 --- a/lang/demo/math.no +++ /dev/null @@ -1,2 +0,0 @@ -"abc+def" -xy+z diff --git a/src/nlx.nim b/src/nlx.nim index 2e590ac..1c6c446 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,5 +1,4 @@ import os -import noether/lexer/tok import noether/lexer/tokstream when isMainModule: @@ -8,11 +7,7 @@ when isMainModule: if paramCount() > 0: let filename = paramStr(1) var tokStream = newTokStream(filename, isFile=true) - - # DumpTok - var tok: nlTok - while tokStream.nextTok(tok): + for tok in toks(tokStream): echo tok - else: echo "usage: nlx filename" diff --git a/src/noether/lexer/lstream.nim b/src/noether/lexer/lstream.nim index b743c6d..862eb7b 100644 --- a/src/noether/lexer/lstream.nim +++ b/src/noether/lexer/lstream.nim @@ -1,7 +1,7 @@ import std/streams import std/options -import tok +include tok type # Character streaming for the nlTokStream @@ -10,15 +10,15 @@ type # row/column positions line*: string lineNum*: Natural - pos*: Natural + pos: Natural -proc streamFile*(filename: string): FileStream = +proc streamFile(filename: string): FileStream = result = newFileStream(filename, fmRead) -proc streamString*(str: string): StringStream = +proc streamString(str: string): StringStream = result = newStringStream(str) -proc newLStream*(content: string, isFile: bool = false): nlLStream = +proc newLStream(content: string, isFile: bool = false): nlLStream = result = nlLStream( stream: if isFile: streamFile(content) else: streamString(content), line: "", @@ -26,40 +26,20 @@ proc newLStream*(content: string, isFile: bool = false): nlLStream = pos: Natural 0, ) -# Checks whether we've reached EOL -# NOTE: also checks if we've surpassed it (ie invalid lstream.pos) -proc atEOL*(lstream: nlLStream): bool = - result = (lstream.pos >= lstream.line.len - 1) - -# Checks whether we are EXACTLY at EOL, but not surpassed -proc exactlyEOL*(lstream: nlLStream): bool = - result = (lstream.pos 
== lstream.line.len - 1) - -# Checks whether we have surpassed EOL -proc outOfBounds*(lstream: nlLStream): bool = - result = (lstream.pos > lstream.line.len - 1) - # Progress the lex stream to the next line (if available) -proc progLine*(lstream: var nlLStream): bool = +proc progLine(lstream: var nlLStream): bool = if lstream.stream.readLine(lstream.line): inc lstream.lineNum lstream.pos = Natural 0 return true return false -# Progress the lex stream to the next character in the line -# forcefully (aka does NOT check if we reached EOL) -proc forceProgChar*(lstream: var nlLStream) = - inc lstream.pos - -# Progress the lex stream to the next character (if available) -proc progress*(lstream: var nlLStream): bool = - if not lstream.atEOL(): - lstream.forceProgChar() - result = true - else: - # attempt to progress next line past EOL - result = lstream.progLine() - -proc currChar*(lstream: nlLStream): char = +proc currChar(lstream: nlLStream): char = result = lstream.line[lstream.pos] + +# NOTE: assumes lstream.line does NOT mutate while iterating +iterator iterChars(lstream: var nlLStream): Option[char] = + while lstream.pos < lstream.line.len: + inc lstream.pos + yield some(lstream.line[lstream.pos - 1]) + yield none(char) diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index fb3067c..3b2464b 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -1,16 +1,43 @@ -include toktype - -type - nlTok* = object +type + # nlTokType allows primitive nlToks to be typed, + # the nlTokType enum should never be directly + # accessed. Use the interface in this file instead. + # NOTE: NONE is used as a default value + # NOTE: it is very different to TERM! 
+ nlTokType = enum + NONE, # Placeholder Value + EOF, # EOF + TERM, # String \0 terminator + WORD, # Alphanumeric token + SYMB, # Symbolic token + LNFD, # \r \n Line-Feed + WTSP, # ' ' \t Whitespace + LPAR, # ( Left Parenthesis + RPAR, # ) Right Parenthesis + LBRA, # { Left Brace + RBRA, # } Right Brace + LSQB, # [ Left Square Bracket + RSQB, # ] Right Square Bracket + # LANB, # < Left Angle Bracket + # RANB, # > Right Angle Bracket + SQUO, # ' Single Quotation Marking + DQUO, # " Double Quotation Marking + GRVA, # ` Grave Accent + HASH, # # Number Sign (Hashtag) + + nlTok = object tType*: nlTokType lit*: string - lineNum*: Natural + line*: Natural startPos*: Natural endPos*: Natural # Generates an "empty" nlTok with only a startPos, # all other fields are expected to be filled out later. -proc emptyTok*(startPos: int): nlTok = +# NOTE: tType initialised to nlTokType.NONE +# NOTE: lit initialised to empty string +# NOTE: all other fields are uninitialised +proc emptyTok(startPos: int): nlTok = result = nlTok( tType: nlTokType.NONE, lit: "", @@ -18,23 +45,55 @@ proc emptyTok*(startPos: int): nlTok = ) # Checks if an nlTok has nlTokType.NONE -proc isTokUntyped*(tType: nlTokType): bool = +proc isTokUntyped(tType: nlTokType): bool = result = (tType == nlTokType.NONE) - -# Checks if an nlTok has nlTokType.EOL -proc isTokEOL*(tok: nlTok): bool = - result = (tok.tType == nlTokType.EOL) - +# Checks if an nlTok has nlTokType.TERM +proc isTokTerm(tType: nlTokType): bool = + result = (tType == nlTokType.TERM) # This method is only used to convert null # terminator nlToks into line-feed ones. 
# Returns a copy of an nlTok, changing its type -proc tokTermToLineFeed*(tok: nlTok): nlTok = +# NOTE: this is necessary because Nim handles +# NOTE: strings in a useful but annoying way +proc tokTermToLineFeed(tok: nlTok): nlTok = result = nlTok( tType: nlTokType.LNFD, lit: tok.lit, - lineNum: tok.lineNum, + line: tok.line, startPos: tok.startPos, endPos: tok.endPos, ) + +# Classifies a character to its nlTokType +proc getTokType(c: char): nlTokType = + case c: + of '\0': + result = nlTokType.TERM + of '\r', '\n': + result = nlTokType.LNFD + of ' ', '\t': + result = nlTokType.WTSP + of '(': + result = nlTokType.LPAR + of ')': + result = nlTokType.RPAR + of '{': + result = nlTokType.LBRA + of '}': + result = nlTokType.RBRA + of '[': + result = nlTokType.LSQB + of ']': + result = nlTokType.RSQB + of '\'': + result = nlTokType.SQUO + of '\"': + result = nlTokType.DQUO + of '`': + result = nlTokType.GRVA + of '#': + result = nlTokType.HASH + else: + result = nlTokType.WORD diff --git a/src/noether/lexer/tokbuilding.nim b/src/noether/lexer/tokbuilding.nim deleted file mode 100644 index 043ac71..0000000 --- a/src/noether/lexer/tokbuilding.nim +++ /dev/null @@ -1,85 +0,0 @@ -include lstream - -type - # Provides a stream-like interface for lexing nlToks - # Internally reliant on the functionality of nlLStream - nlTokStream = object - lstream: nlLStream - build: nlTok # the build token - closed: bool # EOF + all tokens built - -# Generates an EOL token for the nlTokStream's state -proc EOLTok*(tokStream: nlTokStream): nlTok = - result = nlTok( - tType: nlTokType.EOL, - lit: "\0", - lineNum: Natural tokStream.lstream.lineNum, - startPos: Natural tokStream.lstream.pos, - endPos: Natural tokStream.lstream.pos, - ) - -# Resets the build token to an "empty" nlTok -proc resetBuild(tokStream: var nlTokStream) = - tokStream.build = emptyTok(tokStream.lstream.pos) - -# Completes a token generated by emptyTok() -# based on the nlTokStream's nlLStream's -# current line and character 
positions -proc finishBuild(ts: var nlTokStream) = - ts.build.lineNum = Natural ts.lstream.lineNum - ts.build.endPos = Natural ts.lstream.pos - ts.build.lit = ts.lstream.line[ts.build.startPos ..< ts.build.endPos] - -# Returns the nlTokStream's build token and -# empties the build token's contents. -proc flushBuild(tokStream: var nlTokStream): nlTok = - finishBuild(tokStream) - result = tokStream.build - resetBuild(tokStream) - -# Returns whether the build token has a set type yet. -# This indicates that the build token should inherit -# the nlTokType of the nlLStream's next character. -proc isUntypedBuild(tokStream: nlTokStream): bool = - result = isTokUntyped(tokStream.build.tType) - -# Check whether an nlTokType is "compatible" with the build token. -# NOTE: flushBuild() should be called when an incompatible token is discovered. -proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool = - result = (tType == tokStream.build.tType) - -# Add a character to the nlTokStream's build token. -# Flushes and returns the build token if "fully built", -# and a boolean indicating whether the nlTokStream can progress. 
-proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = - # the "pos > EOL" invalid state is used intentionally - # to indicate all tokens have been built, and return EOL Token - if tokStream.lstream.outOfBounds(): - buildTok = some(EOLTok(tokStream)) - return true # can progress once more - - let tType = getTokType(tokStream.lstream.currChar()) - # untyped build tokens must inherited immediately - if isUntypedBuild(tokStream): - tokStream.build.tType = tType - - # check if EOL reached - if tokStream.lstream.atEOL(): - # flush old build token, the new one can be left untyped - let compatible = isCompatibleBuild(tokStream, tType) - result = false # DO NOT PROGRESS - if compatible: - # force the lstream into an invalid state by progressing beyond EOL - # we can then detect this state on the next progBuild and return - # an EOL character (very unsafe implementation but it works well) - tokStream.lstream.forceProgChar() - buildTok = some(flushBuild(tokStream)) - # check character and build token compatability - elif not isCompatibleBuild(tokStream, tType): - # flush old build token, the new one inherits type - buildTok = some(flushBuild(tokStream)) - tokStream.build.tType = tType - result = true # can progress - else: - buildTok = none(nlTok) - result = true # can progress diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim index 98f92b7..5ae2f65 100644 --- a/src/noether/lexer/tokstream.nim +++ b/src/noether/lexer/tokstream.nim @@ -1,47 +1,104 @@ -include tokbuilding +include lstream + +type + # Provides a stream-like interface for lexing nlToks + # Internally reliant on the functionality of nlLStream + nlTokStream = object + lstream: nlLStream + build: nlTok # the current token we're building + +# Resets the build token to an "empty" nlTok where +# only tType, lit, and startPos are initialised. 
+proc resetBuild(tokStream: var nlTokStream) = + tokStream.build = emptyTok(tokStream.lstream.pos) + +# Completes a token generated by emptyTok() +# based on the nlTokStream's nlLStream's +# current line and character positions +proc finishBuild(tokStream: var nlTokStream) = + # if we've reached \0 terminator then forge the start + # and end positions to point OUTSIDE the line + let endPos = if isTokTerm(tokStream.build.tType): + inc tokStream.build.startPos; + tokStream.build.startPos + else: Natural tokStream.lstream.pos + tokStream.build.line = Natural tokStream.lstream.lineNum + tokStream.build.endPos = endPos + +# Returns the nlTokStream's build token and +# empties the build token's contents. +proc flushBuild(tokStream: var nlTokStream): nlTok = + finishBuild(tokStream) + result = tokStream.build + resetBuild(tokStream) + +# Returns whether the build token has a set type yet. +# This indicates that the build token should inherit +# the nlTokType of the nlLStream's next character. +proc isUntypedBuild(tokStream: nlTokStream): bool = + result = isTokUntyped(tokStream.build.tType) + +# Check whether an nlTokType is "compatible" with +# the build token. flushBuild() should be called +# when an incompatible token is discovered. +proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool = + result = (tType == tokStream.build.tType) + +# Add a character to the nlTokStream's build token. +# Returns some(nlTok) when a build token was completed and +# flushed, otherwise none(nlTok). 
+proc appendBuild(tokStream: var nlTokStream, c: char): Option[nlTok] = + let tType = getTokType(c) + # check whether build token should inherit type + if isUntypedBuild(tokStream): + tokStream.build.tType = tType + # check character and build token compatibility + elif not isCompatibleBuild(tokStream, tType): + # return flushed build token, and reset + result = some(flushBuild(tokStream)) + # new build token is untyped so inherit type + tokStream.build.tType = tType + # check if \0 terminator reached + elif isTokTerm(tokStream.build.tType): + # return immediately to avoid concatenating '\0' + return some(flushBuild(tokStream)) + # else return none to indicate no build was completed + else: + result = none(nlTok) + # ensure character is appended to the build token + tokStream.build.lit.add(c) + +# Generates and returns the next token in the stream, +# result.tType == nlTokType.TERM implies line ended +proc nextTok(tokStream: var nlTokStream): nlTok = + # try progress to next char, receives none option on failure + for optchar in iterChars(tokStream.lstream): + # unpack the Option[char], none => '\0' + let c = if optchar.isSome: optchar.get + else: '\0' + let opttok = appendBuild(tokStream, c) + if opttok.isSome: + return opttok.get + # NOTE: REACHING HERE SHOULD NEVER OCCUR # Initialises a new nlTokStream on a string or file proc newTokStream*(content: string, isFile: bool = false): nlTokStream = result = nlTokStream( lstream: newLStream(content, isFile=isFile), - closed: false, ) - # 1. initialise an empty build token - # 2. 
progress to the first line - result.resetBuild() - discard result.lstream.progLine() + resetBuild(result) -# Reimplements nlLStream.progress() for nlTokStream -# to account for additional structure (ie the build token) -proc progChar(tokStream: var nlTokStream): bool = - if not tokStream.lstream.atEOL(): - tokStream.lstream.forceProgChar() - result = true - else: - # attempt to progress to next line past EOL - result = tokStream.lstream.progLine() - tokStream.resetBuild() - -# Generates and sets (by reference) the next token in the stream, -# via repeatedly calling progBuild() and progChar(). -# Returns a boolean indicating whether EOF has been reached. -# NOTE: progBuild adds lstream's current char to the build token -# NOTE: progChar progresses to lstream's next char -proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool = - # Return prematurely if already closed - if tokStream.closed: - return false - while true: - var buildTok: Option[nlTok] - let - canProgress = tokStream.progBuild(buildTok) - tokBuilt = buildTok.isSome - # canProgress & EOF reached => no more tokens to build :) - # NOTE: reachedEOF and not canProgress => more tokens unwrapping - if tokBuilt: - tok = buildTok.get() - if canProgress and not tokStream.progChar(): - tokStream.closed = true - return tokBuilt - elif tokBuilt: - return true +# Allow the nlTokStream to be iterated +iterator toks*(tokStream: var nlTokStream): nlTok = + var tok: nlTok + while progLine(tokStream.lstream): + while true: + tok = nextTok(tokStream) + # \0 terminator means the line ended OR the file + # has ended, so always yield a line-feed just in case + if isTokTerm(tok.tType): + yield tokTermToLineFeed(tok) + break + yield tok + # we ONLY reach here on EOF + yield tok diff --git a/src/noether/lexer/toktype.nim b/src/noether/lexer/toktype.nim deleted file mode 100644 index 0f40023..0000000 --- a/src/noether/lexer/toktype.nim +++ /dev/null @@ -1,54 +0,0 @@ -type - # nlTokType allows primitive nlToks to be typed, 
- # the nlTokType enum should never be directly - # accessed. Use the interface in this file instead. - nlTokType* = enum - NONE, # Placeholder Value - EOF, # End of File - EOL, # End of Line (\0 --> EOL) - WORD, # Alphanumeric token - SYMB, # Symbolic token - LNFD, # \r \n Line-Feed - WTSP, # ' ' \t Whitespace - LPAR, # ( Left Parenthesis - RPAR, # ) Right Parenthesis - LBRA, # { Left Brace - RBRA, # } Right Brace - LSQB, # [ Left Square Bracket - RSQB, # ] Right Square Bracket - # LANB, # < Left Angle Bracket - # RANB, # > Right Angle Bracket - SQUO, # ' Single Quotation Marking - DQUO, # " Double Quotation Marking - GRVA, # ` Grave Accent - HASH, # # Number Sign (Hashtag) - -# Classifies a character to its nlTokType -proc getTokType*(c: char): nlTokType = - case c: - of '\0', '\r', '\n': - result = nlTokType.EOL - of ' ', '\t': - result = nlTokType.WTSP - of '(': - result = nlTokType.LPAR - of ')': - result = nlTokType.RPAR - of '{': - result = nlTokType.LBRA - of '}': - result = nlTokType.RBRA - of '[': - result = nlTokType.LSQB - of ']': - result = nlTokType.RSQB - of '\'': - result = nlTokType.SQUO - of '\"': - result = nlTokType.DQUO - of '`': - result = nlTokType.GRVA - of '#': - result = nlTokType.HASH - else: - result = nlTokType.WORD diff --git a/src/noether/parser/arborist.nim b/src/noether/parser/arborist.nim new file mode 100644 index 0000000..42888c8 --- /dev/null +++ b/src/noether/parser/arborist.nim @@ -0,0 +1,7 @@ +# Attempt to form an nlAST from a nlTokStream +proc arborise(tokStream: nlTokStream): nlNode = + for tok in toks(tokStream): + case tok.tokType: + of nlTokType.DQUO: + # Attempt to parse string literal + parse_strl() diff --git a/src/noether/parser/parser.nim b/src/noether/parser/parser.nim deleted file mode 100644 index b1937c9..0000000 --- a/src/noether/parser/parser.nim +++ /dev/null @@ -1,20 +0,0 @@ -import ../lexer/tokstream - -# Greed will consume anything except a punishment -proc greed(tokStream: nlTokStream, toks: var 
seq[nlTok], punish: str) = - - -proc parse_strl(tokStream: nlTokStream): nlNode = - - -# Attempt to form an nlAST from a nlTokStream -proc parse(tokStream: nlTokStream): nlNode = - var tok: nlTok - while true: - case tok.tokType: - of nlTokType.DQUO: - # Attempt to parse string literal - parse_strl() - - if not tokStream.nextTok(tok): - break