Fixed build tokens not unwrapping when both EOL and EOF occur

Simple (shit) working lexer via nlTokStream
Currently only tested on strings but its highly extendable if you modify the getTokType mapping of chars to their nlTokType
2025-06-18 02:35:51 +10:00 · 2025-06-18 01:25:20 +10:00
9 changed files with 255 additions and 192 deletions
--- a/lang/demo/math.no
+++ b/lang/demo/math.no
@ -0,0 +1,2 @@
+"abc+def"
+xy+z
--- a/src/nlx.nim
+++ b/src/nlx.nim
@ -1,4 +1,5 @@
 import os
+import noether/lexer/tok
 import noether/lexer/tokstream

 when isMainModule:
@ -7,7 +8,11 @@ when isMainModule:
  if paramCount() > 0:
    let filename = paramStr(1)
    var tokStream = newTokStream(filename, isFile=true)
-    for tok in toks(tokStream):
+    
+    # DumpTok
+    var tok: nlTok
+    while tokStream.nextTok(tok):
      echo tok
+    
  else:
    echo "usage: nlx filename"
--- a/src/noether/lexer/lstream.nim
+++ b/src/noether/lexer/lstream.nim
@ -1,7 +1,7 @@
 import std/streams
 import std/options

-include tok
+import tok

 type
  # Character streaming for the nlTokStream
@ -10,15 +10,15 @@ type
    # row/column positions
    line*: string 
    lineNum*: Natural
-    pos: Natural
+    pos*: Natural

-proc streamFile(filename: string): FileStream =
+proc streamFile*(filename: string): FileStream =
  result = newFileStream(filename, fmRead)

-proc streamString(str: string): StringStream =
+proc streamString*(str: string): StringStream =
  result = newStringStream(str)

-proc newLStream(content: string, isFile: bool = false): nlLStream =
+proc newLStream*(content: string, isFile: bool = false): nlLStream =
  result = nlLStream(
    stream: if isFile: streamFile(content) else: streamString(content),
    line: "",
@ -26,20 +26,40 @@ proc newLStream(content: string, isFile: bool = false): nlLStream =
    pos: Natural 0,
  )

+# True once the cursor sits on the last character of the current line.
+# NOTE: also true when the cursor has run past it (ie invalid lstream.pos)
+proc atEOL*(lstream: nlLStream): bool =
+  lstream.line.high <= lstream.pos
+
+# True only when the cursor sits exactly on the last character of the line
+proc exactlyEOL*(lstream: nlLStream): bool =
+  lstream.line.high == lstream.pos
+
+# True when the cursor has moved beyond the last character of the line
+proc outOfBounds*(lstream: nlLStream): bool =
+  lstream.line.len <= lstream.pos
+
 # Progress the lex stream to the next line (if available)
-proc progLine(lstream: var nlLStream): bool = 
+proc progLine*(lstream: var nlLStream): bool = 
  if lstream.stream.readLine(lstream.line):
    inc lstream.lineNum
    lstream.pos = Natural 0
    return true
  return false

-proc currChar(lstream: nlLStream): char = 
-  result = lstream.line[lstream.pos]
+# Unconditionally move the cursor one character to the right.
+# No EOL check is made, so the cursor may end up past the end of the line.
+proc forceProgChar*(lstream: var nlLStream) =
+  lstream.pos.inc()

-# NOTE: assumes lstream.line does NOT mutate while iterating
-iterator iterChars(lstream: var nlLStream): Option[char] =
-  while lstream.pos < lstream.line.len:
-    inc lstream.pos
-    yield some(lstream.line[lstream.pos - 1])
-  yield none(char)
+# Step the lex stream forward by one character, rolling over to the
+# next line when the current one is used up. Returns false only when
+# the line is used up and no further line could be read.
+proc progress*(lstream: var nlLStream): bool =
+  if lstream.atEOL():
+    # nothing left on this line, so try to move on to the next one
+    return lstream.progLine()
+  lstream.forceProgChar()
+  return true
+
+# Character under the cursor (lstream.pos must index into the current line)
+proc currChar*(lstream: nlLStream): char =
+  lstream.line[lstream.pos]
--- a/src/noether/lexer/tok.nim
+++ b/src/noether/lexer/tok.nim
@ -1,43 +1,16 @@
-type
-  # nlTokType allows primitive nlToks to be typed,
-  # the nlTokType enum should never be directly
-  # accessed. Use the interface in this file instead.
-  # NOTE: NONE is used as a default value
-  # NOTE: it is very different to NTERM!
-  nlTokType = enum
-    NONE, # Placeholder Value
-    EOF,  # EOF
-    TERM, # String \0 terminator
-    WORD, # Alphanumeric token
-    SYMB, # Symbolic token
-    LNFD, # \r \n Line-Feed
-    WTSP, # ' ' \t Whitespace
-    LPAR, # ( Left Parenthesis
-    RPAR, # ) Right Parenthesis
-    LBRA, # { Left Brace
-    RBRA, # } Right Brace
-    LSQB, # [ Left Square Bracket
-    RSQB, # ] Right Square Bracket
-    # LANB, # < Left Angle Bracket
-    # RANB, # > Right Angle Bracket
-    SQUO, # ' Single Quotation Marking
-    DQUO, # " Double Quotation Marking
-    GRVA, # ` Grave Accent
-    HASH, # # Number Sign (Hashtag)
-        
-  nlTok = object
+include toktype
+
+type 
+  nlTok* = object
    tType*: nlTokType
    lit*: string
-    line*: Natural
+    lineNum*: Natural
    startPos*: Natural
    endPos*: Natural

 # Generates an "empty" nlTok with only a startPos,
 # all other fields are expected to be filled out later.
-# NOTE: tType initialised to nlTokType.NUL
-# NOTE: lit initialised to empty string
-# NOTE: all other fields are uninitialised
-proc emptyTok(startPos: int): nlTok =
+proc emptyTok*(startPos: int): nlTok =
  result = nlTok(
    tType: nlTokType.NONE,
    lit: "",
@ -45,55 +18,23 @@ proc emptyTok(startPos: int): nlTok =
  )

 # Checks if an nlTok has nlTokType.NONE
-proc isTokUntyped(tType: nlTokType): bool =
+proc isTokUntyped*(tType: nlTokType): bool =
  result = (tType == nlTokType.NONE)
+  
+# True when the given nlTok is typed as nlTokType.EOL
+proc isTokEOL*(tok: nlTok): bool =
+  nlTokType.EOL == tok.tType
+

-# Checks if an nlTok has nlTokType.TERM
-proc isTokTerm(tType: nlTokType): bool =
-  result = (tType == nlTokType.TERM)

 # This method is only used to convert null
 # terminator nlToks into line-feed ones.
 # Returns a copy of an nlTok, changing its type
-# NOTE: this is necessary because Nim handles
-# NOTE: strings in a useful but annoying way
-proc tokTermToLineFeed(tok: nlTok): nlTok =
+proc tokTermToLineFeed*(tok: nlTok): nlTok =
  result = nlTok(
    tType: nlTokType.LNFD,
    lit: tok.lit,
-    line: tok.line,
+    lineNum: tok.lineNum,
    startPos: tok.startPos,
    endPos: tok.endPos,
  )
-
-# Classifies a character to its nlTokType
-proc getTokType(c: char): nlTokType =
-  case c:
-  of '\0':
-    result = nlTokType.TERM
-  of '\r', '\n':
-    result = nlTokType.LNFD
-  of ' ', '\t':
-    result = nlTokType.WTSP
-  of '(':
-    result = nlTokType.LPAR
-  of ')':
-    result = nlTokType.RPAR
-  of '{':
-    result = nlTokType.LBRA
-  of '}':
-    result = nlTokType.RBRA
-  of '[':
-    result = nlTokType.LSQB
-  of ']':
-    result = nlTokType.RSQB
-  of '\'':
-    result = nlTokType.SQUO
-  of '\"':
-    result = nlTokType.DQUO
-  of '`':
-    result = nlTokType.GRVA
-  of '#':
-    result = nlTokType.HASH
-  else:
-    result = nlTokType.WORD
--- a/src/noether/lexer/tokbuilding.nim
+++ b/src/noether/lexer/tokbuilding.nim
@ -0,0 +1,85 @@
+include lstream
+
+type
+  # Token stream that lexes nlToks from an underlying nlLStream,
+  # working on one "build" token at a time until it is ready to hand out
+  nlTokStream = object
+    lstream: nlLStream # character stream supplying the current line/position
+    build: nlTok # the build token (the token currently being assembled)
+    closed: bool # EOF + all tokens built; set by nextTok when it cannot progress
+
+# Builds an EOL token stamped with the stream's current line number,
+# starting and ending at the stream's current position
+proc EOLTok*(tokStream: nlTokStream): nlTok =
+  let here = tokStream.lstream.pos
+  result = nlTok(
+    tType: nlTokType.EOL,
+    lit: "\0",
+    lineNum: tokStream.lstream.lineNum,
+    startPos: here,
+    endPos: here,
+  )
+
+# Replaces the build token with an "empty" nlTok starting at the cursor
+proc resetBuild(tokStream: var nlTokStream) =
+  tokStream.build = tokStream.lstream.pos.emptyTok()
+
+# Fills in the remaining fields of the build token from the lstream:
+# the line number, the end position (the cursor) and the literal,
+# which is the slice of the current line from startPos up to the cursor
+proc finishBuild(ts: var nlTokStream) =
+  let stopAt = Natural ts.lstream.pos
+  ts.build.endPos = stopAt
+  ts.build.lineNum = Natural ts.lstream.lineNum
+  ts.build.lit = ts.lstream.line[ts.build.startPos ..< stopAt]
+
+# Finalises the build token, hands it back to the caller and
+# leaves a fresh empty build token in its place.
+proc flushBuild(tokStream: var nlTokStream): nlTok =
+  tokStream.finishBuild()
+  result = tokStream.build
+  tokStream.resetBuild()
+
+# True while the build token still has no type (nlTokType.NONE).
+# An untyped build token is expected to take on the nlTokType
+# of the next character read from the nlLStream.
+proc isUntypedBuild(tokStream: nlTokStream): bool =
+  tokStream.build.tType.isTokUntyped()
+
+# An nlTokType is "compatible" when it equals the build token's type.
+# NOTE: flushBuild() should be called when an incompatible token is discovered.
+proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
+  tokStream.build.tType == tType
+
+# Feed the character under the lstream cursor into the build token.
+# buildTok receives the flushed build token (or an EOL token) when one is
+# ready, and none(nlTok) while the character still belongs to the build.
+# Returns whether the nlTokStream may progress to the next character.
+proc progBuild(tokStream: var nlTokStream, buildTok: var Option[nlTok]): bool =
+  # A cursor beyond EOL is an intentional "invalid" state: it marks that
+  # every token of the line has been built, so an EOL token is emitted.
+  if tokStream.lstream.outOfBounds():
+    buildTok = some(EOLTok(tokStream))
+    return true # can progress once more
+
+  let charType = getTokType(tokStream.lstream.currChar())
+  # an untyped build token must inherit a type immediately
+  if isUntypedBuild(tokStream):
+    tokStream.build.tType = charType
+  let fits = isCompatibleBuild(tokStream, charType)
+
+  # last character of the line: always flush, never progress
+  if tokStream.lstream.atEOL():
+    if fits:
+      # Push the lstream beyond EOL on purpose so that the last character
+      # lands inside the flushed token; the next progBuild call detects
+      # the out-of-bounds cursor and returns the EOL token.
+      tokStream.lstream.forceProgChar()
+    # the replacement build token can be left untyped
+    buildTok = some(flushBuild(tokStream))
+    return false # DO NOT PROGRESS
+
+  if fits:
+    buildTok = none(nlTok)
+  else:
+    # flush the old build token, the new one inherits the character's type
+    buildTok = some(flushBuild(tokStream))
+    tokStream.build.tType = charType
+  return true # can progress
--- a/src/noether/lexer/tokstream.nim
+++ b/src/noether/lexer/tokstream.nim
@ -1,104 +1,47 @@
-include lstream
-
-type
-  # Provides a stream-like interface for lexing nlToks
-  # Internally reliant on the functionality of nlLStream
-  nlTokStream = object
-    lstream: nlLStream
-    build: nlTok # the current token we're building
-
-# Resets the build token to an "empty" nlTok where
-# only tType, lit, and startPos are initialised.
-proc resetBuild(tokStream: var nlTokStream) =
-  tokStream.build = emptyTok(tokStream.lstream.pos)
-
-# Completes a token generated by emptyTok()
-# based on the nlTokStream's nlLStream's
-# current line and character positions
-proc finishBuild(tokStream: var nlTokStream) =
-  # if we've reached \0 terminator then forge the start
-  # and end positions to point OUTSIDE the line
-  let endPos = if isTokTerm(tokStream.build.tType): 
-                   inc tokStream.build.startPos; 
-                   tokStream.build.startPos 
-               else: Natural tokStream.lstream.pos 
-  tokStream.build.line = Natural tokStream.lstream.lineNum
-  tokStream.build.endPos = endPos
-
-# Returns the nlTokStream's build token and
-# empties the build token's contents.
-proc flushBuild(tokStream: var nlTokStream): nlTok = 
-  finishBuild(tokStream)
-  result = tokStream.build
-  resetBuild(tokStream)
-
-# Returns whether the build token has a set type yet.
-# This indicates that the build token should inherit
-# the nlTokType of the nlLStream's next character.
-proc isUntypedBuild(tokStream: nlTokStream): bool =
-  result = isTokUntyped(tokStream.build.tType)
-
-# Check whether an nlTokType is "compatible" with
-# the build token. flushBuild() should be called
-# when an incompatible token is discovered.
-proc isCompatibleBuild(tokStream: nlTokStream, tType: nlTokType): bool =
-  result = (tType == tokStream.build.tType)
-
-# Add a character to the nlTokStream's build token.
-# Returns a bool indicating if a new nlTok has been built
-# or not. flushBuild should then be called.
-proc appendBuild(tokStream: var nlTokStream, c: char): Option[nlTok] =
-  let tType = getTokType(c)
-  # check whether build token should inherit type
-  if isUntypedBuild(tokStream):
-    tokStream.build.tType = tType
-  # check character and build token compatability
-  elif not isCompatibleBuild(tokStream, tType):
-      # return flushed build token, and reset 
-      result = some(flushBuild(tokStream))
-      # new build token is untyped so inherit type
-      tokStream.build.tType = tType
-  # check if \0 terminator reached
-  elif isTokTerm(tokStream.build.tType):
-      # return immediately to avoid concatinating '\0'
-      return some(flushBuild(tokStream))
-  # else return none to indicate no build was completed
-  else:
-    result = none(nlTok)
-  # ensure character is appended to the build token
-  tokStream.build.lit.add(c)
-
-# Generates and returns the next token in the stream,
-# result.tType == nlTokType.NTERM implies line ended
-proc nextTok(tokStream: var nlTokStream): nlTok =
-  # try progress to next char, receives none option on failure
-  for optchar in iterChars(tokStream.lstream):
-    # unpack the Option[char], none => '\0'
-    let c = if optchar.isSome: optchar.get
-            else: '\0'
-    let opttok = appendBuild(tokStream, c)
-    if opttok.isSome:
-      return opttok.get
-  # NOTE: REACHING HERE SHOULD NEVER OCCUR
+include tokbuilding

 # Initialises a new nlTokStream on a string or file
 proc newTokStream*(content: string, isFile: bool = false): nlTokStream =
  result = nlTokStream(
    lstream: newLStream(content, isFile=isFile),
+    closed: false,
  )
-  resetBuild(result)
+  # 1. initialise an empty build token 
+  # 2. progress to the first line
+  result.resetBuild()
+  discard result.lstream.progLine()

-# Allow the nlTokStream to be iterated
-iterator toks*(tokStream: var nlTokStream): nlTok = 
-  var tok: nlTok
-  while progLine(tokStream.lstream):
-    while true:
-      tok = nextTok(tokStream)
-      # \0 terminator means the line ended OR the file
-      # has ended, so always yield a line-feed just in case
-      if isTokTerm(tok.tType):
-        yield tokTermToLineFeed(tok)
-        break
-      yield tok
-  # we ONLY reach here on EOF
-  yield tok
+# nlTokStream counterpart of nlLStream.progress(): besides moving the
+# cursor it also resets the build token whenever a line change is attempted
+proc progChar(tokStream: var nlTokStream): bool =
+  if tokStream.lstream.atEOL():
+    # line is used up: try the next one and start from a fresh build token
+    result = tokStream.lstream.progLine()
+    tokStream.resetBuild()
+  else:
+    tokStream.lstream.forceProgChar()
+    result = true
+  
+# Produces the next token of the stream and stores it in tok (by reference),
+# by repeatedly calling progBuild() and progChar().
+# Returns true when a token was written to tok, and false when the stream
+# was already closed or closed without yielding another token.
+# NOTE: progBuild processes lstream's current char for the build token
+# NOTE: progChar progresses to lstream's next char
+proc nextTok*(tokStream: var nlTokStream, tok: var nlTok): bool =
+  # a closed stream has nothing more to give
+  if tokStream.closed:
+    return false
+  while true:
+    var built: Option[nlTok]
+    let mayAdvance = tokStream.progBuild(built)
+    if built.isSome:
+      tok = built.get()
+    # allowed to advance but unable to => EOF, no more tokens to build
+    # NOTE: when not allowed to advance, more tokens are still unwrapping
+    if mayAdvance and not tokStream.progChar():
+      tokStream.closed = true
+      return built.isSome
+    if built.isSome:
+      return true
--- a/src/noether/lexer/toktype.nim
+++ b/src/noether/lexer/toktype.nim
@ -0,0 +1,54 @@
+type
+  # nlTokType allows primitive nlToks to be typed.
+  # Per the author's note, the enum should never be accessed directly;
+  # use the interface in this file (e.g. getTokType) instead.
+  nlTokType* = enum
+    NONE, # Placeholder Value
+    EOF,  # End of File
+    EOL,  # End of Line (getTokType classifies \0, \r and \n as EOL)
+    WORD, # Alphanumeric token (also getTokType's fallback for any other char)
+    SYMB, # Symbolic token (NOTE(review): getTokType never returns this yet)
+    LNFD, # \r \n Line-Feed (NOTE(review): getTokType maps these to EOL instead)
+    WTSP, # ' ' \t Whitespace
+    LPAR, # ( Left Parenthesis
+    RPAR, # ) Right Parenthesis
+    LBRA, # { Left Brace
+    RBRA, # } Right Brace
+    LSQB, # [ Left Square Bracket
+    RSQB, # ] Right Square Bracket
+    # LANB, # < Left Angle Bracket
+    # RANB, # > Right Angle Bracket
+    SQUO, # ' Single Quotation Marking
+    DQUO, # " Double Quotation Marking
+    GRVA, # ` Grave Accent
+    HASH, # # Number Sign (Hashtag)
+        
+# Maps a single character onto the nlTokType it belongs to.
+# Any character without a dedicated branch is classified as WORD.
+proc getTokType*(c: char): nlTokType =
+  case c
+  of '\0', '\r', '\n': nlTokType.EOL
+  of ' ', '\t': nlTokType.WTSP
+  of '(': nlTokType.LPAR
+  of ')': nlTokType.RPAR
+  of '{': nlTokType.LBRA
+  of '}': nlTokType.RBRA
+  of '[': nlTokType.LSQB
+  of ']': nlTokType.RSQB
+  of '\'': nlTokType.SQUO
+  of '\"': nlTokType.DQUO
+  of '`': nlTokType.GRVA
+  of '#': nlTokType.HASH
+  else: nlTokType.WORD
--- a/src/noether/parser/arborist.nim
+++ b/src/noether/parser/arborist.nim
@ -1,7 +0,0 @@
-# Attempt to form an nlAST from a nlTokStream
-proc arborise(tokStream: nlTokStream): nlNode = 
-  for tok in toks(tokStream):
-    case tok.tokType:
-    of nlTokType.DQUO:
-      # Attempt to parse string literal
-      parse_strl()
--- a/src/noether/parser/parser.nim
+++ b/src/noether/parser/parser.nim
@ -0,0 +1,20 @@
+import ../lexer/tokstream
+
+# Greed will consume anything except a punishment
+# NOTE(review): still a stub; presumably meant to collect tokens into `toks`
+# until the `punish` literal is met -- confirm once implemented.
+proc greed(tokStream: nlTokStream, toks: var seq[nlTok], punish: string) =
+  discard # not implemented yet; `discard` gives the proc a valid body
+
+# Placeholder for string literal parsing (called from parse on a DQUO token)
+proc parse_strl(tokStream: nlTokStream): nlNode =
+  discard # not implemented yet, so the default-initialised result is returned
+
+# Attempt to form an nlAST from a nlTokStream
+# NOTE: tokStream is taken as `var` because nextTok mutates the stream
+# NOTE(review): nothing is assigned to the result yet, so the
+# default-initialised nlNode is returned.
+proc parse(tokStream: var nlTokStream): nlNode =
+  var tok: nlTok
+  # fetch before inspecting, so the default-initialised tok is never examined
+  while tokStream.nextTok(tok):
+    case tok.tType
+    of nlTokType.DQUO:
+      # Attempt to parse string literal
+      # NOTE(review): the parsed node is dropped until there is an AST to hold it
+      discard parse_strl(tokStream)
+    else:
+      discard # no other token types are handled yet
Author	SHA1	Message	Date
Emile Clark-Boman	90ca138904	Fixed build tokens not unwrapping when both EOL and EOF occur	2025-06-18 02:35:51 +10:00
Emile Clark-Boman	3ce9390be4	Simple (shit) working lexer via nlTokStream Currently only tested on strings but its highly extendable if you modify the getTokType mapping of chars to their nlTokType	2025-06-18 01:25:20 +10:00