diff --git a/lang/demo/math.no b/lang/demo/math.no new file mode 100644 index 0000000..8050f04 --- /dev/null +++ b/lang/demo/math.no @@ -0,0 +1,2 @@ +"abc+def" +xy+z diff --git a/src/nlx.nim b/src/nlx.nim index 1c6c446..da182d0 100644 --- a/src/nlx.nim +++ b/src/nlx.nim @@ -1,4 +1,5 @@ import os +import noether/lexer/tok import noether/lexer/tokstream when isMainModule: @@ -7,7 +8,10 @@ when isMainModule: if paramCount() > 0: let filename = paramStr(1) var tokStream = newTokStream(filename, isFile=true) - for tok in toks(tokStream): + + var tok: nlTok + while tokStream.nextTok(tok): echo tok + else: echo "usage: nlx filename" diff --git a/src/noether/lexer/lstream.nim b/src/noether/lexer/lstream.nim index 862eb7b..b743c6d 100644 --- a/src/noether/lexer/lstream.nim +++ b/src/noether/lexer/lstream.nim @@ -1,7 +1,7 @@ import std/streams import std/options -include tok +import tok type # Character streaming for the nlTokStream @@ -10,15 +10,15 @@ type # row/column positions line*: string lineNum*: Natural - pos: Natural + pos*: Natural -proc streamFile(filename: string): FileStream = +proc streamFile*(filename: string): FileStream = result = newFileStream(filename, fmRead) -proc streamString(str: string): StringStream = +proc streamString*(str: string): StringStream = result = newStringStream(str) -proc newLStream(content: string, isFile: bool = false): nlLStream = +proc newLStream*(content: string, isFile: bool = false): nlLStream = result = nlLStream( stream: if isFile: streamFile(content) else: streamString(content), line: "", @@ -26,20 +26,40 @@ proc newLStream(content: string, isFile: bool = false): nlLStream = pos: Natural 0, ) +# Checks whether we've reached EOL +# NOTE: also checks if we've surpassed it (ie invalid lstream.pos) +proc atEOL*(lstream: nlLStream): bool = + result = (lstream.pos >= lstream.line.len - 1) + +# Checks whether we are EXACTLY at EOL, but not surpassed +proc exactlyEOL*(lstream: nlLStream): bool = + result = (lstream.pos == lstream.line.len - 1) + +# Checks whether we have surpassed EOL +proc outOfBounds*(lstream: nlLStream): bool = + result = (lstream.pos > lstream.line.len - 1) + # Progress the lex stream to the next line (if available) -proc progLine(lstream: var nlLStream): bool = +proc progLine*(lstream: var nlLStream): bool = if lstream.stream.readLine(lstream.line): inc lstream.lineNum lstream.pos = Natural 0 return true return false -proc currChar(lstream: nlLStream): char = - result = lstream.line[lstream.pos] +# Progress the lex stream to the next character in the line +# forcefully (aka does NOT check if we reached EOL) +proc forceProgChar*(lstream: var nlLStream) = + inc lstream.pos -# NOTE: assumes lstream.line does NOT mutate while iterating -iterator iterChars(lstream: var nlLStream): Option[char] = - while lstream.pos < lstream.line.len: - inc lstream.pos - yield some(lstream.line[lstream.pos - 1]) - yield none(char) +# Progress the lex stream to the next character (if available) +proc progress*(lstream: var nlLStream): bool = + if not lstream.atEOL(): + lstream.forceProgChar() + result = true + else: + # attempt to progress next line past EOL + result = lstream.progLine() + +proc currChar*(lstream: nlLStream): char = + result = lstream.line[lstream.pos] diff --git a/src/noether/lexer/tok.nim b/src/noether/lexer/tok.nim index 3b2464b..fb3067c 100644 --- a/src/noether/lexer/tok.nim +++ b/src/noether/lexer/tok.nim @@ -1,43 +1,16 @@ -type - # nlTokType allows primitive nlToks to be typed, - # the nlTokType enum should never be directly - # accessed. Use the interface in this file instead. - # NOTE: NONE is used as a default value - # NOTE: it is very different to NTERM! - nlTokType = enum - NONE, # Placeholder Value - EOF, # EOF - TERM, # String \0 terminator - WORD, # Alphanumeric token - SYMB, # Symbolic token - LNFD, # \r \n Line-Feed - WTSP, # ' ' \t Whitespace - LPAR, # ( Left Parenthesis - RPAR, # ) Right Parenthesis - LBRA, # { Left Brace - RBRA, # } Right Brace - LSQB, # [ Left Square Bracket - RSQB, # ] Right Square Bracket - # LANB, # < Left Angle Bracket - # RANB, # > Right Angle Bracket - SQUO, # ' Single Quotation Marking - DQUO, # " Double Quotation Marking - GRVA, # ` Grave Accent - HASH, # # Number Sign (Hashtag) - - nlTok = object +include toktype + +type + nlTok* = object tType*: nlTokType lit*: string - line*: Natural + lineNum*: Natural startPos*: Natural endPos*: Natural # Generates an "empty" nlTok with only a startPos, # all other fields are expected to be filled out later. -# NOTE: tType initialised to nlTokType.NUL -# NOTE: lit initialised to empty string -# NOTE: all other fields are uninitialised -proc emptyTok(startPos: int): nlTok = +proc emptyTok*(startPos: int): nlTok = result = nlTok( tType: nlTokType.NONE, lit: "", @@ -45,55 +18,23 @@ proc emptyTok(startPos: int): nlTok = ) # Checks if an nlTok has nlTokType.NONE -proc isTokUntyped(tType: nlTokType): bool = +proc isTokUntyped*(tType: nlTokType): bool = result = (tType == nlTokType.NONE) + +# Checks if an nlTok has nlTokType.EOL +proc isTokEOL*(tok: nlTok): bool = + result = (tok.tType == nlTokType.EOL) + -# Checks if an nlTok has nlTokType.TERM -proc isTokTerm(tType: nlTokType): bool = - result = (tType == nlTokType.TERM) # This method is only used to convert null # terminator nlToks into line-feed ones. # Returns a copy of an nlTok, changing its type -# NOTE: this is necessary because Nim handles -# NOTE: strings in a useful but annoying way -proc tokTermToLineFeed(tok: nlTok): nlTok = +proc tokTermToLineFeed*(tok: nlTok): nlTok = result = nlTok( tType: nlTokType.LNFD, lit: tok.lit, - line: tok.line, + lineNum: tok.lineNum, startPos: tok.startPos, endPos: tok.endPos, ) - -# Classifies a character to its nlTokType -proc getTokType(c: char): nlTokType = - case c: - of '\0': - result = nlTokType.TERM - of '\r', '\n': - result = nlTokType.LNFD - of ' ', '\t': - result = nlTokType.WTSP - of '(': - result = nlTokType.LPAR - of ')': - result = nlTokType.RPAR - of '{': - result = nlTokType.LBRA - of '}': - result = nlTokType.RBRA - of '[': - result = nlTokType.LSQB - of ']': - result = nlTokType.RSQB - of '\'': - result = nlTokType.SQUO - of '\"': - result = nlTokType.DQUO - of '`': - result = nlTokType.GRVA - of '#': - result = nlTokType.HASH - else: - result = nlTokType.WORD diff --git a/src/noether/lexer/tokbuilding.nim b/src/noether/lexer/tokbuilding.nim new file mode 100644 index 0000000..c77ae6f --- /dev/null +++ b/src/noether/lexer/tokbuilding.nim @@ -0,0 +1,84 @@ +include lstream + +type + # Provides a stream-like interface for lexing nlToks + # Internally reliant on the functionality of nlLStream + nlTokStream = object + lstream: nlLStream + build: nlTok # the build token + +# Generates an EOL token for the nlTokStream's state +proc EOLTok*(tokStream: nlTokStream): nlTok = + result = nlTok( + tType: nlTokType.EOL, + lit: "\0", + lineNum: Natural tokStream.lstream.lineNum, + startPos: Natural tokStream.lstream.pos, + endPos: Natural tokStream.lstream.pos, + ) + +# Resets the build token to an "empty" nlTok +proc resetBuild(tokStream: var nlTokStream) = + tokStream.build = emptyTok(tokStream.lstream.pos) + +# Completes a token generated by emptyTok() +# based on the nlTokStream's nlLStream's +# current line and character positions +proc finishBuild(ts: var nlTokStream) = + ts.build.lineNum = Natural ts.lstream.lineNum + ts.build.endPos = Natural ts.lstream.pos + ts.build.lit = ts.lstream.line[ts.build.startPos ..< ts.build.endPos] + +# Returns the nlTokStream's build token and +# empties the build token's contents. +proc flushBuild(tokStream: var nlTokStream): nlTok = + finishBuild(tokStream) + result = tokStream.build + resetBuild(tokStream) + +# Returns whether the build token has a set type yet. +# This indicates that the build token should inherit +# the nlTokType of the nlLStream's next character. +proc isUntypedBuild(tokStream: nlTokStream): bool = + result = isTokUntyped(tokStream.build.tType) + +# Check whether an nlTokType is "compatible" with the build token. +# NOTE: flushBuild() should be called when an incompatible token is discovered. +proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool = + result = (tType == tokStream.build.tType) + +# Add a character to the nlTokStream's build token. +# Flushes and returns the build token if "fully built", +# and a boolean indicating whether the nlTokStream can progress. +proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool = + # the "pos > EOL" invalid state is used intentionally + # to indicate all tokens have been built, and return EOL Token + if tokStream.lstream.outOfBounds(): + buildTok = some(EOLTok(tokStream)) + return true # can progress once more + + let tType = getTokType(tokStream.lstream.currChar()) + # untyped build tokens must inherited immediately + if isUntypedBuild(tokStream): + tokStream.build.tType = tType + + # check if EOL reached + if tokStream.lstream.atEOL(): + # flush old build token, the new one can be left untyped + let compatible = isCompatibleBuild(tokStream, tType) + result = false # DO NOT PROGRESS + if compatible: + # force the lstream into an invalid state by progressing beyond EOL + # we can then detect this state on the next progBuild and return + # an EOL character (very unsafe implementation but it works well) + tokStream.lstream.forceProgChar() + buildTok = some(flushBuild(tokStream)) + # check character and build token compatability + elif not isCompatibleBuild(tokStream, tType): + # flush old build token, the new one inherits type + buildTok = some(flushBuild(tokStream)) + tokStream.build.tType = tType + result = true # can progress + else: + buildTok = none(nlTok) + result = true # can progress diff --git a/src/noether/lexer/tokstream.nim b/src/noether/lexer/tokstream.nim index 5ae2f65..6358d1d 100644 --- a/src/noether/lexer/tokstream.nim +++ b/src/noether/lexer/tokstream.nim @@ -1,104 +1,38 @@ -include lstream - -type - # Provides a stream-like interface for lexing nlToks - # Internally reliant on the functionality of nlLStream - nlTokStream = object - lstream: nlLStream - build: nlTok # the current token we're building - -# Resets the build token to an "empty" nlTok where -# only tType, lit, and startPos are initialised. -proc resetBuild(tokStream: var nlTokStream) = - tokStream.build = emptyTok(tokStream.lstream.pos) - -# Completes a token generated by emptyTok() -# based on the nlTokStream's nlLStream's -# current line and character positions -proc finishBuild(tokStream: var nlTokStream) = - # if we've reached \0 terminator then forge the start - # and end positions to point OUTSIDE the line - let endPos = if isTokTerm(tokStream.build.tType): - inc tokStream.build.startPos; - tokStream.build.startPos - else: Natural tokStream.lstream.pos - tokStream.build.line = Natural tokStream.lstream.lineNum - tokStream.build.endPos = endPos - -# Returns the nlTokStream's build token and -# empties the build token's contents. -proc flushBuild(tokStream: var nlTokStream): nlTok = - finishBuild(tokStream) - result = tokStream.build - resetBuild(tokStream) - -# Returns whether the build token has a set type yet. -# This indicates that the build token should inherit -# the nlTokType of the nlLStream's next character. -proc isUntypedBuild(tokStream: nlTokStream): bool = - result = isTokUntyped(tokStream.build.tType) - -# Check whether an nlTokType is "compatible" with -# the build token. flushBuild() should be called -# when an incompatible token is discovered. -proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool = - result = (tType == tokStream.build.tType) - -# Add a character to the nlTokStream's build token. -# Returns a bool indicating if a new nlTok has been built -# or not. flushBuild should then be called. -proc appendBuild(tokStream: var nlTokStream, c: char): Option[nlTok] = - let tType = getTokType(c) - # check whether build token should inherit type - if isUntypedBuild(tokStream): - tokStream.build.tType = tType - # check character and build token compatability - elif not isCompatibleBuild(tokStream, tType): - # return flushed build token, and reset - result = some(flushBuild(tokStream)) - # new build token is untyped so inherit type - tokStream.build.tType = tType - # check if \0 terminator reached - elif isTokTerm(tokStream.build.tType): - # return immediately to avoid concatinating '\0' - return some(flushBuild(tokStream)) - # else return none to indicate no build was completed - else: - result = none(nlTok) - # ensure character is appended to the build token - tokStream.build.lit.add(c) - -# Generates and returns the next token in the stream, -# result.tType == nlTokType.NTERM implies line ended -proc nextTok(tokStream: var nlTokStream): nlTok = - # try progress to next char, receives none option on failure - for optchar in iterChars(tokStream.lstream): - # unpack the Option[char], none => '\0' - let c = if optchar.isSome: optchar.get - else: '\0' - let opttok = appendBuild(tokStream, c) - if opttok.isSome: - return opttok.get - # NOTE: REACHING HERE SHOULD NEVER OCCUR +include tokbuilding # Initialises a new nlTokStream on a string or file proc newTokStream*(content: string, isFile: bool = false): nlTokStream = result = nlTokStream( lstream: newLStream(content, isFile=isFile), ) - resetBuild(result) + # 1. initialise an empty build token + # 2. progress to the first line + result.resetBuild() + discard result.lstream.progLine() -# Allow the nlTokStream to be iterated -iterator toks*(tokStream: var nlTokStream): nlTok = - var tok: nlTok - while progLine(tokStream.lstream): - while true: - tok = nextTok(tokStream) - # \0 terminator means the line ended OR the file - # has ended, so always yield a line-feed just in case - if isTokTerm(tok.tType): - yield tokTermToLineFeed(tok) - break - yield tok - # we ONLY reach here on EOF - yield tok +# Reimplements nlLStream.progress() for nlTokStream +# to account for additional structure (ie the build token) +proc progChar(tokStream: var nlTokStream): bool = + if not tokStream.lstream.atEOL(): + tokStream.lstream.forceProgChar() + result = true + else: + # attempt to progress to next line past EOL + result = tokStream.lstream.progLine() + tokStream.resetBuild() + +# Generates and sets (by reference) the next token in the stream, +# via repeatedly calling progBuild() and progChar(). +# Returns a boolean indicating whether EOF has been reached. +# NOTE: progBuild adds lstream's current char to the build token +# NOTE: progChar progresses to lstream's next char +proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool = + while true: + var buildTok: Option[nlTok] + let canProgress = tokStream.progBuild(buildTok) + # canProgress & progression failed => EOF reached + if canProgress and not tokStream.progChar(): + return false + elif buildTok.isSome: + tok = buildTok.get() + return true diff --git a/src/noether/lexer/toktype.nim b/src/noether/lexer/toktype.nim new file mode 100644 index 0000000..0f40023 --- /dev/null +++ b/src/noether/lexer/toktype.nim @@ -0,0 +1,54 @@ +type + # nlTokType allows primitive nlToks to be typed, + # the nlTokType enum should never be directly + # accessed. Use the interface in this file instead. + nlTokType* = enum + NONE, # Placeholder Value + EOF, # End of File + EOL, # End of Line (\0 --> EOL) + WORD, # Alphanumeric token + SYMB, # Symbolic token + LNFD, # \r \n Line-Feed + WTSP, # ' ' \t Whitespace + LPAR, # ( Left Parenthesis + RPAR, # ) Right Parenthesis + LBRA, # { Left Brace + RBRA, # } Right Brace + LSQB, # [ Left Square Bracket + RSQB, # ] Right Square Bracket + # LANB, # < Left Angle Bracket + # RANB, # > Right Angle Bracket + SQUO, # ' Single Quotation Marking + DQUO, # " Double Quotation Marking + GRVA, # ` Grave Accent + HASH, # # Number Sign (Hashtag) + +# Classifies a character to its nlTokType +proc getTokType*(c: char): nlTokType = + case c: + of '\0', '\r', '\n': + result = nlTokType.EOL + of ' ', '\t': + result = nlTokType.WTSP + of '(': + result = nlTokType.LPAR + of ')': + result = nlTokType.RPAR + of '{': + result = nlTokType.LBRA + of '}': + result = nlTokType.RBRA + of '[': + result = nlTokType.LSQB + of ']': + result = nlTokType.RSQB + of '\'': + result = nlTokType.SQUO + of '\"': + result = nlTokType.DQUO + of '`': + result = nlTokType.GRVA + of '#': + result = nlTokType.HASH + else: + result = nlTokType.WORD diff --git a/src/noether/parser/arborist.nim b/src/noether/parser/parser.nim similarity index 100% rename from src/noether/parser/arborist.nim rename to src/noether/parser/parser.nim