Simple (shit) working lexer via nlTokStream

Currently only tested on strings, but it's highly extensible if you modify the getTokType mapping of chars to their nlTokType
2025-06-18 01:25:20 +10:00 · 2025-06-18 01:25:20 +10:00 · 3ce9390be4
commit 3ce9390be4
parent edf164df90
8 changed files with 224 additions and 185 deletions
--- a/lang/demo/math.no
+++ b/lang/demo/math.no
@ -0,0 +1,2 @@
+"abc+def"
+xy+z
--- a/src/nlx.nim
+++ b/src/nlx.nim
@ -1,4 +1,5 @@
 import os
+import noether/lexer/tok
 import noether/lexer/tokstream

 when isMainModule:
@ -7,7 +8,10 @@ when isMainModule:
  if paramCount() > 0:
    let filename = paramStr(1)
    var tokStream = newTokStream(filename, isFile=true)
-    for tok in toks(tokStream):
+
+    var tok: nlTok
+    while tokStream.nextTok(tok):
      echo tok
+    
  else:
    echo "usage: nlx filename"
--- a/src/noether/lexer/lstream.nim
+++ b/src/noether/lexer/lstream.nim
@ -1,7 +1,7 @@
 import std/streams
 import std/options

-include tok
+import tok

 type
  # Character streaming for the nlTokStream
@ -10,15 +10,15 @@ type
    # row/column positions
    line*: string 
    lineNum*: Natural
-    pos: Natural
+    pos*: Natural

-proc streamFile(filename: string): FileStream =
+proc streamFile*(filename: string): FileStream =
  result = newFileStream(filename, fmRead)

-proc streamString(str: string): StringStream =
+proc streamString*(str: string): StringStream =
  result = newStringStream(str)

-proc newLStream(content: string, isFile: bool = false): nlLStream =
+proc newLStream*(content: string, isFile: bool = false): nlLStream =
  result = nlLStream(
    stream: if isFile: streamFile(content) else: streamString(content),
    line: "",
@ -26,20 +26,40 @@ proc newLStream(content: string, isFile: bool = false): nlLStream =
    pos: Natural 0,
  )

+# Checks whether we've reached EOL
+# NOTE: also checks if we've surpassed it (ie invalid lstream.pos)
+proc atEOL*(lstream: nlLStream): bool = 
+  result = (lstream.pos >= lstream.line.len - 1)
+
+# Checks whether we are EXACTLY at EOL, but not surpassed
+proc exactlyEOL*(lstream: nlLStream): bool =
+  result = (lstream.pos == lstream.line.len - 1)
+
+# Checks whether we have surpassed EOL
+proc outOfBounds*(lstream: nlLStream): bool = 
+  result = (lstream.pos > lstream.line.len - 1)
+
 # Progress the lex stream to the next line (if available)
-proc progLine(lstream: var nlLStream): bool = 
+proc progLine*(lstream: var nlLStream): bool = 
  if lstream.stream.readLine(lstream.line):
    inc lstream.lineNum
    lstream.pos = Natural 0
    return true
  return false

-proc currChar(lstream: nlLStream): char = 
-  result = lstream.line[lstream.pos]
+# Progress the lex stream to the next character in the line
+# forcefully (aka does NOT check if we reached EOL)
+proc forceProgChar*(lstream: var nlLStream) = 
+  inc lstream.pos

-# NOTE: assumes lstream.line does NOT mutate while iterating
-iterator iterChars(lstream: var nlLStream): Option[char] =
-  while lstream.pos < lstream.line.len:
-    inc lstream.pos
-    yield some(lstream.line[lstream.pos - 1])
-  yield none(char)
+# Progress the lex stream to the next character (if available)
+proc progress*(lstream: var nlLStream): bool =
+  if not lstream.atEOL():
+    lstream.forceProgChar()
+    result = true
+  else:
+    # attempt to progress next line past EOL
+    result = lstream.progLine()
+
+proc currChar*(lstream: nlLStream): char = 
+  result = lstream.line[lstream.pos]
--- a/src/noether/lexer/tok.nim
+++ b/src/noether/lexer/tok.nim
@ -1,43 +1,16 @@
-type
-  # nlTokType allows primitive nlToks to be typed,
-  # the nlTokType enum should never be directly
-  # accessed. Use the interface in this file instead.
-  # NOTE: NONE is used as a default value
-  # NOTE: it is very different to NTERM!
-  nlTokType = enum
-    NONE, # Placeholder Value
-    EOF,  # EOF
-    TERM, # String \0 terminator
-    WORD, # Alphanumeric token
-    SYMB, # Symbolic token
-    LNFD, # \r \n Line-Feed
-    WTSP, # ' ' \t Whitespace
-    LPAR, # ( Left Parenthesis
-    RPAR, # ) Right Parenthesis
-    LBRA, # { Left Brace
-    RBRA, # } Right Brace
-    LSQB, # [ Left Square Bracket
-    RSQB, # ] Right Square Bracket
-    # LANB, # < Left Angle Bracket
-    # RANB, # > Right Angle Bracket
-    SQUO, # ' Single Quotation Marking
-    DQUO, # " Double Quotation Marking
-    GRVA, # ` Grave Accent
-    HASH, # # Number Sign (Hashtag)
-        
-  nlTok = object
+include toktype
+
+type 
+  nlTok* = object
    tType*: nlTokType
    lit*: string
-    line*: Natural
+    lineNum*: Natural
    startPos*: Natural
    endPos*: Natural

 # Generates an "empty" nlTok with only a startPos,
 # all other fields are expected to be filled out later.
-# NOTE: tType initialised to nlTokType.NUL
-# NOTE: lit initialised to empty string
-# NOTE: all other fields are uninitialised
-proc emptyTok(startPos: int): nlTok =
+proc emptyTok*(startPos: int): nlTok =
  result = nlTok(
    tType: nlTokType.NONE,
    lit: "",
@ -45,55 +18,23 @@ proc emptyTok(startPos: int): nlTok =
  )

 # Checks if an nlTok has nlTokType.NONE
-proc isTokUntyped(tType: nlTokType): bool =
+proc isTokUntyped*(tType: nlTokType): bool =
  result = (tType == nlTokType.NONE)
+  
+# Checks if an nlTok has nlTokType.EOL
+proc isTokEOL*(tok: nlTok): bool =
+  result = (tok.tType == nlTokType.EOL)
+

-# Checks if an nlTok has nlTokType.TERM
-proc isTokTerm(tType: nlTokType): bool =
-  result = (tType == nlTokType.TERM)

 # This method is only used to convert null
 # terminator nlToks into line-feed ones.
 # Returns a copy of an nlTok, changing its type
-# NOTE: this is necessary because Nim handles
-# NOTE: strings in a useful but annoying way
-proc tokTermToLineFeed(tok: nlTok): nlTok =
+proc tokTermToLineFeed*(tok: nlTok): nlTok =
  result = nlTok(
    tType: nlTokType.LNFD,
    lit: tok.lit,
-    line: tok.line,
+    lineNum: tok.lineNum,
    startPos: tok.startPos,
    endPos: tok.endPos,
  )
-
-# Classifies a character to its nlTokType
-proc getTokType(c: char): nlTokType =
-  case c:
-  of '\0':
-    result = nlTokType.TERM
-  of '\r', '\n':
-    result = nlTokType.LNFD
-  of ' ', '\t':
-    result = nlTokType.WTSP
-  of '(':
-    result = nlTokType.LPAR
-  of ')':
-    result = nlTokType.RPAR
-  of '{':
-    result = nlTokType.LBRA
-  of '}':
-    result = nlTokType.RBRA
-  of '[':
-    result = nlTokType.LSQB
-  of ']':
-    result = nlTokType.RSQB
-  of '\'':
-    result = nlTokType.SQUO
-  of '\"':
-    result = nlTokType.DQUO
-  of '`':
-    result = nlTokType.GRVA
-  of '#':
-    result = nlTokType.HASH
-  else:
-    result = nlTokType.WORD
--- a/src/noether/lexer/tokbuilding.nim
+++ b/src/noether/lexer/tokbuilding.nim
@ -0,0 +1,84 @@
+include lstream
+
+type
+  # Provides a stream-like interface for lexing nlToks
+  # Internally reliant on the functionality of nlLStream
+  nlTokStream = object
+    lstream: nlLStream
+    build: nlTok # the build token
+
+# Generates an EOL token for the nlTokStream's state
+proc EOLTok*(tokStream: nlTokStream): nlTok = 
+  result = nlTok(
+    tType: nlTokType.EOL,
+    lit: "\0",
+    lineNum: Natural tokStream.lstream.lineNum,
+    startPos: Natural tokStream.lstream.pos,
+    endPos: Natural tokStream.lstream.pos,
+  )
+
+# Resets the build token to an "empty" nlTok
+proc resetBuild(tokStream: var nlTokStream) =
+  tokStream.build = emptyTok(tokStream.lstream.pos)
+
+# Completes a token generated by emptyTok()
+# based on the nlTokStream's nlLStream's
+# current line and character positions
+proc finishBuild(ts: var nlTokStream) =
+  ts.build.lineNum = Natural ts.lstream.lineNum
+  ts.build.endPos = Natural ts.lstream.pos
+  ts.build.lit = ts.lstream.line[ts.build.startPos ..< ts.build.endPos]
+
+# Returns the nlTokStream's build token and
+# empties the build token's contents.
+proc flushBuild(tokStream: var nlTokStream): nlTok = 
+  finishBuild(tokStream)
+  result = tokStream.build
+  resetBuild(tokStream)
+
+# Returns whether the build token is still untyped (NONE).
+# If so, the build token should inherit the
+# nlTokType of the nlLStream's current character.
+proc isUntypedBuild(tokStream: nlTokStream): bool =
+  result = isTokUntyped(tokStream.build.tType)
+
+# Check whether an nlTokType is "compatible" with the build token. 
+# NOTE: flushBuild() should be called when an incompatible token is discovered.
+proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
+  result = (tType == tokStream.build.tType)
+
+# Add a character to the nlTokStream's build token.
+# Stores a finished token in buildTok (none if still building), and
+# returns a boolean indicating whether the nlTokStream can progress.
+proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool =
+  # the "pos > EOL" invalid state is used intentionally
+  # to indicate all tokens have been built, and return EOL Token
+  if tokStream.lstream.outOfBounds():
+    buildTok = some(EOLTok(tokStream))
+    return true # can progress once more
+    
+  let tType = getTokType(tokStream.lstream.currChar())
+  # untyped build tokens must inherit a type immediately
+  if isUntypedBuild(tokStream):
+    tokStream.build.tType = tType
+  
+  # check if EOL reached
+  if tokStream.lstream.atEOL():
+      # flush old build token, the new one can be left untyped
+      let compatible = isCompatibleBuild(tokStream, tType)
+      result = false # DO NOT PROGRESS
+      if compatible:
+        # force the lstream into an invalid state by progressing beyond EOL
+        # we can then detect this state on the next progBuild and return
+        # an EOL character (very unsafe implementation but it works well)
+        tokStream.lstream.forceProgChar()
+      buildTok = some(flushBuild(tokStream))
+  # check character and build token compatibility
+  elif not isCompatibleBuild(tokStream, tType):
+      # flush old build token, the new one inherits type
+      buildTok = some(flushBuild(tokStream))
+      tokStream.build.tType = tType
+      result = true # can progress
+  else:
+    buildTok = none(nlTok)
+    result = true # can progress
--- a/src/noether/lexer/tokstream.nim
+++ b/src/noether/lexer/tokstream.nim
@ -1,104 +1,38 @@
-include lstream
-
-type
-  # Provides a stream-like interface for lexing nlToks
-  # Internally reliant on the functionality of nlLStream
-  nlTokStream = object
-    lstream: nlLStream
-    build: nlTok # the current token we're building
-
-# Resets the build token to an "empty" nlTok where
-# only tType, lit, and startPos are initialised.
-proc resetBuild(tokStream: var nlTokStream) =
-  tokStream.build = emptyTok(tokStream.lstream.pos)
-
-# Completes a token generated by emptyTok()
-# based on the nlTokStream's nlLStream's
-# current line and character positions
-proc finishBuild(tokStream: var nlTokStream) =
-  # if we've reached \0 terminator then forge the start
-  # and end positions to point OUTSIDE the line
-  let endPos = if isTokTerm(tokStream.build.tType): 
-                   inc tokStream.build.startPos; 
-                   tokStream.build.startPos 
-               else: Natural tokStream.lstream.pos 
-  tokStream.build.line = Natural tokStream.lstream.lineNum
-  tokStream.build.endPos = endPos
-
-# Returns the nlTokStream's build token and
-# empties the build token's contents.
-proc flushBuild(tokStream: var nlTokStream): nlTok = 
-  finishBuild(tokStream)
-  result = tokStream.build
-  resetBuild(tokStream)
-
-# Returns whether the build token has a set type yet.
-# This indicates that the build token should inherit
-# the nlTokType of the nlLStream's next character.
-proc isUntypedBuild(tokStream: nlTokStream): bool =
-  result = isTokUntyped(tokStream.build.tType)
-
-# Check whether an nlTokType is "compatible" with
-# the build token. flushBuild() should be called
-# when an incompatible token is discovered.
-proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
-  result = (tType == tokStream.build.tType)
-
-# Add a character to the nlTokStream's build token.
-# Returns a bool indicating if a new nlTok has been built
-# or not. flushBuild should then be called.
-proc appendBuild(tokStream: var nlTokStream, c: char): Option[nlTok] =
-  let tType = getTokType(c)
-  # check whether build token should inherit type
-  if isUntypedBuild(tokStream):
-    tokStream.build.tType = tType
-  # check character and build token compatability
-  elif not isCompatibleBuild(tokStream, tType):
-      # return flushed build token, and reset 
-      result = some(flushBuild(tokStream))
-      # new build token is untyped so inherit type
-      tokStream.build.tType = tType
-  # check if \0 terminator reached
-  elif isTokTerm(tokStream.build.tType):
-      # return immediately to avoid concatinating '\0'
-      return some(flushBuild(tokStream))
-  # else return none to indicate no build was completed
-  else:
-    result = none(nlTok)
-  # ensure character is appended to the build token
-  tokStream.build.lit.add(c)
-
-# Generates and returns the next token in the stream,
-# result.tType == nlTokType.NTERM implies line ended
-proc nextTok(tokStream: var nlTokStream): nlTok =
-  # try progress to next char, receives none option on failure
-  for optchar in iterChars(tokStream.lstream):
-    # unpack the Option[char], none => '\0'
-    let c = if optchar.isSome: optchar.get
-            else: '\0'
-    let opttok = appendBuild(tokStream, c)
-    if opttok.isSome:
-      return opttok.get
-  # NOTE: REACHING HERE SHOULD NEVER OCCUR
+include tokbuilding

 # Initialises a new nlTokStream on a string or file
 proc newTokStream*(content: string, isFile: bool = false): nlTokStream =
  result = nlTokStream(
    lstream: newLStream(content, isFile=isFile),
  )
-  resetBuild(result)
+  # 1. initialise an empty build token 
+  # 2. progress to the first line
+  result.resetBuild()
+  discard result.lstream.progLine()

-# Allow the nlTokStream to be iterated
-iterator toks*(tokStream: var nlTokStream): nlTok = 
-  var tok: nlTok
-  while progLine(tokStream.lstream):
-    while true:
-      tok = nextTok(tokStream)
-      # \0 terminator means the line ended OR the file
-      # has ended, so always yield a line-feed just in case
-      if isTokTerm(tok.tType):
-        yield tokTermToLineFeed(tok)
-        break
-      yield tok
-  # we ONLY reach here on EOF
-  yield tok
+# Reimplements nlLStream.progress() for nlTokStream
+# to account for additional structure (ie the build token)
+proc progChar(tokStream: var nlTokStream): bool =
+  if not tokStream.lstream.atEOL():
+    tokStream.lstream.forceProgChar()
+    result = true
+  else:
+    # attempt to progress to next line past EOL
+    result = tokStream.lstream.progLine()
+    tokStream.resetBuild()  
+  
+# Generates and sets (by reference) the next token in the stream,
+# via repeatedly calling progBuild() and progChar().
+# Returns true if a token was produced, false once EOF has been reached.
+# NOTE: progBuild adds lstream's current char to the build token
+# NOTE: progChar progresses to lstream's next char
+proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool =
+  while true:
+    var buildTok: Option[nlTok]
+    let canProgress = tokStream.progBuild(buildTok)
+    # canProgress & progression failed => EOF reached
+    if canProgress and not tokStream.progChar():
+      return false
+    elif buildTok.isSome:
+      tok = buildTok.get()
+      return true
--- a/src/noether/lexer/toktype.nim
+++ b/src/noether/lexer/toktype.nim
@ -0,0 +1,54 @@
+type
+  # nlTokType allows primitive nlToks to be typed,
+  # the nlTokType enum should never be directly
+  # accessed. Use the interface in this file instead.
+  nlTokType* = enum
+    NONE, # Placeholder Value
+    EOF,  # End of File
+    EOL,  # End of Line (\0 --> EOL)
+    WORD, # Alphanumeric token
+    SYMB, # Symbolic token
+    LNFD, # \r \n Line-Feed
+    WTSP, # ' ' \t Whitespace
+    LPAR, # ( Left Parenthesis
+    RPAR, # ) Right Parenthesis
+    LBRA, # { Left Brace
+    RBRA, # } Right Brace
+    LSQB, # [ Left Square Bracket
+    RSQB, # ] Right Square Bracket
+    # LANB, # < Left Angle Bracket
+    # RANB, # > Right Angle Bracket
+    SQUO, # ' Single Quotation Marking
+    DQUO, # " Double Quotation Marking
+    GRVA, # ` Grave Accent
+    HASH, # # Number Sign (Hashtag)
+        
+# Classifies a character to its nlTokType
+proc getTokType*(c: char): nlTokType =
+  case c:
+  of '\0', '\r', '\n':
+    result = nlTokType.EOL
+  of ' ', '\t':
+    result = nlTokType.WTSP
+  of '(':
+    result = nlTokType.LPAR
+  of ')':
+    result = nlTokType.RPAR
+  of '{':
+    result = nlTokType.LBRA
+  of '}':
+    result = nlTokType.RBRA
+  of '[':
+    result = nlTokType.LSQB
+  of ']':
+    result = nlTokType.RSQB
+  of '\'':
+    result = nlTokType.SQUO
+  of '\"':
+    result = nlTokType.DQUO
+  of '`':
+    result = nlTokType.GRVA
+  of '#':
+    result = nlTokType.HASH
+  else:
+    result = nlTokType.WORD
--- a/src/noether/parser/arborist.nim
+++ b/src/noether/parser/arborist.nim