Simple (shit) working lexer via nlTokStream

Currently only tested on strings, but it's highly extensible if you modify the getTokType mapping of chars to their nlTokType.
2025-06-18 01:25:20 +10:00 · 2025-06-18 01:25:20 +10:00 · 3ce9390be4
commit 3ce9390be4
parent edf164df90
8 changed files with 224 additions and 185 deletions
--- a/lang/demo/math.no
+++ b/lang/demo/math.no
@ -0,0 +1,2 @@
 "abc+def"
 xy+z
--- a/src/nlx.nim
+++ b/src/nlx.nim
@ -1,4 +1,5 @@
 import os
 import noether/lexer/tok
 import noether/lexer/tokstream
 when isMainModule:
@ -7,7 +8,10 @@ when isMainModule:
  if paramCount() > 0:
    let filename = paramStr(1)
    var tokStream = newTokStream(filename, isFile=true)
-    for tok in toks(tokStream):
+
    var tok: nlTok
    while tokStream.nextTok(tok):
      echo tok
  else:
    echo "usage: nlx filename"
--- a/src/noether/lexer/lstream.nim
+++ b/src/noether/lexer/lstream.nim
@ -1,7 +1,7 @@
 import std/streams
 import std/options
-include tok
+import tok
 type
  # Character streaming for the nlTokStream
@ -10,15 +10,15 @@ type
    # row/column positions
    line*: string 
    lineNum*: Natural
-    pos: Natural
+    pos*: Natural
-proc streamFile(filename: string): FileStream =
+proc streamFile*(filename: string): FileStream =
  result = newFileStream(filename, fmRead)
-proc streamString(str: string): StringStream =
+proc streamString*(str: string): StringStream =
  result = newStringStream(str)
-proc newLStream(content: string, isFile: bool = false): nlLStream =
+proc newLStream*(content: string, isFile: bool = false): nlLStream =
  result = nlLStream(
    stream: if isFile: streamFile(content) else: streamString(content),
    line: "",
@ -26,20 +26,40 @@ proc newLStream(content: string, isFile: bool = false): nlLStream =
    pos: Natural 0,
  )
 # True once the cursor sits on the final character of the current line.
 # NOTE: also true when pos has moved beyond it (an invalid lstream.pos),
 # and for an empty line, where line.len - 1 is -1.
 proc atEOL*(lstream: nlLStream): bool =
  let lastIdx = lstream.line.len - 1
  lastIdx <= lstream.pos
 # True only while the cursor is exactly on the last character of the
 # current line (pos == line.len - 1); false once it has moved past it.
 proc exactlyEOL*(lstream: nlLStream): bool =
  let lastIdx = lstream.line.len - 1
  lstream.pos == lastIdx
 # True when the cursor has moved beyond the last character of the
 # current line, i.e. pos is no longer a valid index into line.
 proc outOfBounds*(lstream: nlLStream): bool =
  lstream.line.len - 1 < lstream.pos
 # Progress the lex stream to the next line (if available)
-proc progLine(lstream: var nlLStream): bool = 
+proc progLine*(lstream: var nlLStream): bool = 
  if lstream.stream.readLine(lstream.line):
    inc lstream.lineNum
    lstream.pos = Natural 0
    return true
  return false
-proc currChar(lstream: nlLStream): char = 
+# Progress the lex stream to the next character in the line
-  result = lstream.line[lstream.pos]
+# forcefully (aka does NOT check if we reached EOL)
-
+proc forceProgChar*(lstream: var nlLStream) = 
 # NOTE: assumes lstream.line does NOT mutate while iterating
 iterator iterChars(lstream: var nlLStream): Option[char] =
  while lstream.pos < lstream.line.len:
  inc lstream.pos
-    yield some(lstream.line[lstream.pos - 1])
+
-  yield none(char)
+# Progress the lex stream to the next character (if available)
 proc progress*(lstream: var nlLStream): bool =
  # At (or past) the end of the line the only way forward is the next
  # line; progLine() reports whether one could be read.
  if lstream.atEOL():
    return lstream.progLine()
  # Otherwise move forward within the current line.
  lstream.forceProgChar()
  result = true
 # Returns the character of the current line under the cursor.
 # NOTE: indexes line[pos] directly with no bounds check of its own, so
 # callers should rule out outOfBounds() first.
 proc currChar*(lstream: nlLStream): char =
  lstream.line[lstream.pos]
--- a/src/noether/lexer/tok.nim
+++ b/src/noether/lexer/tok.nim
@ -1,43 +1,16 @@
-type
+include toktype
  # nlTokType allows primitive nlToks to be typed,
  # the nlTokType enum should never be directly
  # accessed. Use the interface in this file instead.
  # NOTE: NONE is used as a default value
  # NOTE: it is very different to NTERM!
  nlTokType = enum
    NONE, # Placeholder Value
    EOF,  # EOF
    TERM, # String \0 terminator
    WORD, # Alphanumeric token
    SYMB, # Symbolic token
    LNFD, # \r \n Line-Feed
    WTSP, # ' ' \t Whitespace
    LPAR, # ( Left Parenthesis
    RPAR, # ) Right Parenthesis
    LBRA, # { Left Brace
    RBRA, # } Right Brace
    LSQB, # [ Left Square Bracket
    RSQB, # ] Right Square Bracket
    # LANB, # < Left Angle Bracket
    # RANB, # > Right Angle Bracket
    SQUO, # ' Single Quotation Marking
    DQUO, # " Double Quotation Marking
    GRVA, # ` Grave Accent
    HASH, # # Number Sign (Hashtag)
-  nlTok = object
+type 
  nlTok* = object
    tType*: nlTokType
    lit*: string
-    line*: Natural
+    lineNum*: Natural
    startPos*: Natural
    endPos*: Natural
 # Generates an "empty" nlTok with only a startPos,
 # all other fields are expected to be filled out later.
-# NOTE: tType initialised to nlTokType.NUL
+proc emptyTok*(startPos: int): nlTok =
 # NOTE: lit initialised to empty string
 # NOTE: all other fields are uninitialised
 proc emptyTok(startPos: int): nlTok =
  result = nlTok(
    tType: nlTokType.NONE,
    lit: "",
@ -45,55 +18,23 @@ proc emptyTok(startPos: int): nlTok =
  )
 # Checks if an nlTok has nlTokType.NONE
-proc isTokUntyped(tType: nlTokType): bool =
+proc isTokUntyped*(tType: nlTokType): bool =
  result = (tType == nlTokType.NONE)
-# Checks if an nlTok has nlTokType.TERM
+# Checks if an nlTok has nlTokType.EOL
-proc isTokTerm(tType: nlTokType): bool =
+proc isTokEOL*(tok: nlTok): bool =
-  result = (tType == nlTokType.TERM)
+  result = (tok.tType == nlTokType.EOL)
 # This method is only used to convert null
 # terminator nlToks into line-feed ones.
 # Returns a copy of an nlTok, changing its type
-# NOTE: this is necessary because Nim handles
+proc tokTermToLineFeed*(tok: nlTok): nlTok =
 # NOTE: strings in a useful but annoying way
 proc tokTermToLineFeed(tok: nlTok): nlTok =
  result = nlTok(
    tType: nlTokType.LNFD,
    lit: tok.lit,
-    line: tok.line,
+    lineNum: tok.lineNum,
    startPos: tok.startPos,
    endPos: tok.endPos,
  )
 # Classifies a character to its nlTokType
 proc getTokType(c: char): nlTokType =
  case c:
  of '\0':
    result = nlTokType.TERM
  of '\r', '\n':
    result = nlTokType.LNFD
  of ' ', '\t':
    result = nlTokType.WTSP
  of '(':
    result = nlTokType.LPAR
  of ')':
    result = nlTokType.RPAR
  of '{':
    result = nlTokType.LBRA
  of '}':
    result = nlTokType.RBRA
  of '[':
    result = nlTokType.LSQB
  of ']':
    result = nlTokType.RSQB
  of '\'':
    result = nlTokType.SQUO
  of '\"':
    result = nlTokType.DQUO
  of '`':
    result = nlTokType.GRVA
  of '#':
    result = nlTokType.HASH
  else:
    result = nlTokType.WORD
--- a/src/noether/lexer/tokbuilding.nim
+++ b/src/noether/lexer/tokbuilding.nim
@ -0,0 +1,84 @@
 include lstream
 type
  # Provides a stream-like interface for lexing nlToks.
  # Internally reliant on the functionality of nlLStream.
  nlTokStream = object
    lstream: nlLStream # character source; supplies line, lineNum and pos
    build: nlTok # the build token: the nlTok currently being assembled
 # Builds the end-of-line nlTok for the stream's current state: typed EOL,
 # literal "\0", stamped with the lstream's line number, and with startPos
 # and endPos both equal to the lstream's current pos.
 proc EOLTok*(tokStream: nlTokStream): nlTok =
  let
    here = Natural tokStream.lstream.pos
    row = Natural tokStream.lstream.lineNum
  nlTok(
    tType: nlTokType.EOL,
    lit: "\0",
    lineNum: row,
    startPos: here,
    endPos: here,
  )
 # Replaces the build token with a fresh emptyTok() that starts at the
 # lstream's current pos.
 proc resetBuild(tokStream: var nlTokStream) =
  let startAt = tokStream.lstream.pos
  tokStream.build = emptyTok(startAt)
 # Fills in the remaining fields of the build token from the lstream:
 # endPos and lineNum come from the stream's current position, and lit
 # is cut out of the current line.
 proc finishBuild(ts: var nlTokStream) =
  let stopAt = Natural ts.lstream.pos
  ts.build.endPos = stopAt
  ts.build.lineNum = Natural ts.lstream.lineNum
  # half-open slice: the character at endPos is not part of the literal
  ts.build.lit = ts.lstream.line[ts.build.startPos ..< stopAt]
 # Finalises the build token, hands it back to the caller, and leaves a
 # freshly reset build token in its place.
 proc flushBuild(tokStream: var nlTokStream): nlTok =
  tokStream.finishBuild()
  # copy the finished token out before the reset overwrites it
  result = tokStream.build
  tokStream.resetBuild()
 # True while the build token has no type yet (see isTokUntyped).
 # progBuild() uses this to decide that the build token should take on
 # the nlTokType of the character it is currently looking at.
 proc isUntypedBuild(tokStream: nlTokStream): bool =
  tokStream.build.tType.isTokUntyped
 # An nlTokType is "compatible" with the build token when it is the same
 # type the build token already has.
 # NOTE: flushBuild() should be called when an incompatible type is found.
 proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
  tokStream.build.tType == tType
 # Feeds the lstream's current character into the build token.
 # buildTok receives some(token) whenever a token is handed back (the
 # flushed build token, or an EOL token); it is set to none when the
 # character merely extends the build token.
 # The bool result tells the caller whether the nlTokStream may progress.
 proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool =
  # "pos beyond EOL" is the deliberate marker left behind by the
  # end-of-line handling below: all tokens of the line have been built,
  # so answer with the EOL token
  if tokStream.lstream.outOfBounds():
    buildTok = some(EOLTok(tokStream))
    return true # can progress once more

  let charType = getTokType(tokStream.lstream.currChar())
  # an untyped build token must inherit a type immediately
  if isUntypedBuild(tokStream):
    tokStream.build.tType = charType
  let compatible = isCompatibleBuild(tokStream, charType)

  if not tokStream.lstream.atEOL():
    if compatible:
      # same type as the build token: nothing is finished yet
      buildTok = none(nlTok)
    else:
      # type changed: flush the old build token, the new one inherits type
      buildTok = some(flushBuild(tokStream))
      tokStream.build.tType = charType
    return true # can progress

  # EOL reached
  if compatible:
    # force the lstream into the invalid "beyond EOL" state so the next
    # progBuild() call detects it and returns the EOL token
    # (unsafe by design: relies on forceProgChar() not checking for EOL)
    tokStream.lstream.forceProgChar()
  # flush the old build token; the new one can be left untyped
  buildTok = some(flushBuild(tokStream))
  result = false # DO NOT PROGRESS
--- a/src/noether/lexer/tokstream.nim
+++ b/src/noether/lexer/tokstream.nim
@ -1,104 +1,38 @@
-include lstream
+include tokbuilding
 type
  # Provides a stream-like interface for lexing nlToks
  # Internally reliant on the functionality of nlLStream
  nlTokStream = object
    lstream: nlLStream
    build: nlTok # the current token we're building
 # Resets the build token to an "empty" nlTok where
 # only tType, lit, and startPos are initialised.
 proc resetBuild(tokStream: var nlTokStream) =
  tokStream.build = emptyTok(tokStream.lstream.pos)
 # Completes a token generated by emptyTok()
 # based on the nlTokStream's nlLStream's
 # current line and character positions
 proc finishBuild(tokStream: var nlTokStream) =
  # if we've reached \0 terminator then forge the start
  # and end positions to point OUTSIDE the line
  let endPos = if isTokTerm(tokStream.build.tType): 
                   inc tokStream.build.startPos; 
                   tokStream.build.startPos 
               else: Natural tokStream.lstream.pos 
  tokStream.build.line = Natural tokStream.lstream.lineNum
  tokStream.build.endPos = endPos
 # Returns the nlTokStream's build token and
 # empties the build token's contents.
 proc flushBuild(tokStream: var nlTokStream): nlTok = 
  finishBuild(tokStream)
  result = tokStream.build
  resetBuild(tokStream)
 # Returns whether the build token has a set type yet.
 # This indicates that the build token should inherit
 # the nlTokType of the nlLStream's next character.
 proc isUntypedBuild(tokStream: nlTokStream): bool =
  result = isTokUntyped(tokStream.build.tType)
 # Check whether an nlTokType is "compatible" with
 # the build token. flushBuild() should be called
 # when an incompatible token is discovered.
 proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
  result = (tType == tokStream.build.tType)
 # Add a character to the nlTokStream's build token.
 # Returns a bool indicating if a new nlTok has been built
 # or not. flushBuild should then be called.
 proc appendBuild(tokStream: var nlTokStream, c: char): Option[nlTok] =
  let tType = getTokType(c)
  # check whether build token should inherit type
  if isUntypedBuild(tokStream):
    tokStream.build.tType = tType
  # check character and build token compatability
  elif not isCompatibleBuild(tokStream, tType):
      # return flushed build token, and reset 
      result = some(flushBuild(tokStream))
      # new build token is untyped so inherit type
      tokStream.build.tType = tType
  # check if \0 terminator reached
  elif isTokTerm(tokStream.build.tType):
      # return immediately to avoid concatinating '\0'
      return some(flushBuild(tokStream))
  # else return none to indicate no build was completed
  else:
    result = none(nlTok)
  # ensure character is appended to the build token
  tokStream.build.lit.add(c)
 # Generates and returns the next token in the stream,
 # result.tType == nlTokType.NTERM implies line ended
 proc nextTok(tokStream: var nlTokStream): nlTok =
  # try progress to next char, receives none option on failure
  for optchar in iterChars(tokStream.lstream):
    # unpack the Option[char], none => '\0'
    let c = if optchar.isSome: optchar.get
            else: '\0'
    let opttok = appendBuild(tokStream, c)
    if opttok.isSome:
      return opttok.get
  # NOTE: REACHING HERE SHOULD NEVER OCCUR
 # Initialises a new nlTokStream on a string or file
 proc newTokStream*(content: string, isFile: bool = false): nlTokStream =
  result = nlTokStream(
    lstream: newLStream(content, isFile=isFile),
  )
-  resetBuild(result)
+  # 1. initialise an empty build token 
  # 2. progress to the first line
  result.resetBuild()
  discard result.lstream.progLine()
-# Allow the nlTokStream to be iterated
+# Reimplements nlLStream.progress() for nlTokStream
-iterator toks*(tokStream: var nlTokStream): nlTok = 
+# to account for additional structure (ie the build token)
-  var tok: nlTok
+proc progChar(tokStream: var nlTokStream): bool =
-  while progLine(tokStream.lstream):
+  if not tokStream.lstream.atEOL():
    tokStream.lstream.forceProgChar()
    result = true
  else:
    # attempt to progress to next line past EOL
    result = tokStream.lstream.progLine()
    tokStream.resetBuild()  
 # Generates and sets (by reference) the next token in the stream,
 # via repeatedly calling progBuild() and progChar().
 # Returns a boolean indicating whether EOF has been reached.
 # NOTE: progBuild adds lstream's current char to the build token
 # NOTE: progChar progresses to lstream's next char
 proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool =
  while true:
-      tok = nextTok(tokStream)
+    var buildTok: Option[nlTok]
-      # \0 terminator means the line ended OR the file
+    let canProgress = tokStream.progBuild(buildTok)
-      # has ended, so always yield a line-feed just in case
+    # canProgress & progression failed => EOF reached
-      if isTokTerm(tok.tType):
+    if canProgress and not tokStream.progChar():
-        yield tokTermToLineFeed(tok)
+      return false
-        break
+    elif buildTok.isSome:
-      yield tok
+      tok = buildTok.get()
-  # we ONLY reach here on EOF
+      return true
  yield tok
--- a/src/noether/lexer/toktype.nim
+++ b/src/noether/lexer/toktype.nim
@ -0,0 +1,54 @@
 type
  # nlTokType is the set of categories a primitive nlTok can be typed as.
  # NOTE: the enum should never be accessed directly; use the interface
  # in this file instead (getTokType classifies a char to its nlTokType).
  nlTokType* = enum
    NONE, # Placeholder Value
    EOF,  # End of File
    EOL,  # End of Line (\0 --> EOL)
    WORD, # Alphanumeric token (also getTokType's fallback for unlisted chars)
    SYMB, # Symbolic token
    LNFD, # \r \n Line-Feed (NOTE: getTokType currently returns EOL for these)
    WTSP, # ' ' \t Whitespace
    LPAR, # ( Left Parenthesis
    RPAR, # ) Right Parenthesis
    LBRA, # { Left Brace
    RBRA, # } Right Brace
    LSQB, # [ Left Square Bracket
    RSQB, # ] Right Square Bracket
    # LANB, # < Left Angle Bracket
    # RANB, # > Right Angle Bracket
    SQUO, # ' Single Quotation Marking
    DQUO, # " Double Quotation Marking
    GRVA, # ` Grave Accent
    HASH, # # Number Sign (Hashtag)
 # Maps a single character onto its nlTokType.
 # Any character without an explicit branch falls back to WORD.
 proc getTokType*(c: char): nlTokType =
  case c
  of '\0', '\r', '\n': nlTokType.EOL
  of ' ', '\t': nlTokType.WTSP
  of '(': nlTokType.LPAR
  of ')': nlTokType.RPAR
  of '{': nlTokType.LBRA
  of '}': nlTokType.RBRA
  of '[': nlTokType.LSQB
  of ']': nlTokType.RSQB
  of '\'': nlTokType.SQUO
  of '"': nlTokType.DQUO
  of '`': nlTokType.GRVA
  of '#': nlTokType.HASH
  else: nlTokType.WORD
--- a/src/noether/parser/arborist.nim
+++ b/src/noether/parser/arborist.nim