"""
== Requirements of a nice tokenizer

* Accurately and automatically report for each token
	which kind, text, line number, column number, character index and length
* Allow methods to intercept when a token is encountered in order to
	* modify it
	* skip it
	* replace it by a list of tokens
* Unit tests
"""

use System.Text.RegularExpressions
use System.Reflection


interface IToken
	"""
	The contract for tokens produced by a Tokenizer: what kind of token it is (`which`), its
	text and value, and where in the source it was found.
	"""

	pro which as String

	pro text as String

	get length as int

	pro value as Object?
		"""
		The value the token represents.
		This is normally the same as the text unless there is specific reason to give another value,
		such as the actual integer value that an integer token represents.
		"""

	pro info as dynamic?
		"""
		An arbitrary object that the tokenizer can set to pass additional information to the parser.
		"""

	get isEmpty as bool

	pro isKeyword as bool

	get fileName as String

	get fullPathName as String

	get lineNum as int  # CC: ensure result > 0

	get colNum as int  # CC: ensure result > 0

	get charNum as int  # CC: ensure result > 0

	def copy as IToken

	def copy(which as String) as IToken
		"""
		Returns a copy of the token, but with the `which` changed.
		"""

	def copy(which as String, text as String) as IToken
		"""
		Returns a copy of the token, but with the `which` and `text` changed.
		"""

	pro nextToken as IToken?
		"""
		Use this to "insert" extra tokens into the token stream from an .onWHICH method.
		"""

	def shortLocationString as String
		"""
		Returns filename, sans path, with line number and col number.
		"""

	def toTechString as String

	def incLineNum
		"""
		Increments the line number by one.
		Created for end-of-init assertions that class variables are not nil when their types are not nilable.
		"""


class Token implements IToken
	"""
	The default IToken implementation created by Tokenizer for every match.
	"""

	shared

		var _empty as Token?

		get empty as Token  # CC: as same
			# created lazily on first access and then reused
			if _empty is nil
				_empty = Token('(empty)', 1, 1, 1, '(EMPTY)', '', nil, true)
			return _empty to !

	var _isEmpty as bool
	var _fileName as String
	var _lineNum as int
	var _colNum as int
	var _charNum as int
	var _which as String
	var _text as String
	var _value as dynamic?
	var _info as dynamic?
	var _isKeyword as bool
	var _nextToken as IToken?

	cue init(fileName as String, lineNum as int, colNum as int, charNum as int, which as String, text as String, value as Object?)
		require
			lineNum > 0
			colNum > 0
			charNum > 0
			which.length
		body
			.init(fileName, lineNum, colNum, charNum, which, text, value, false)

	cue init(fileName as String, lineNum as int, colNum as int, charNum as int, which as String, text as String, value as Object?, isEmpty as bool)
		require
			lineNum > 0
			colNum > 0
			charNum > 0
			which.length
		body
			base.init
			_fileName = fileName
			_lineNum = lineNum
			_colNum = colNum
			_charNum = charNum
			_which = which
			_text = text
			_value = value
			_isEmpty = isEmpty

	pro which from var

	pro text from var

	get length as int
		return _text.length

	pro value from var

	pro info from var

	get isEmpty from var

	pro isKeyword from var

	get fileName from var

	get fullPathName as String
		# a relative file name is resolved against the current directory
		return if(Path.isPathRooted(.fileName), .fileName, Path.combine(Environment.currentDirectory, .fileName))

	get lineNum from var

	get colNum from var

	get charNum from var

	pro nextToken from var

	def copy as IToken
		"""
		Returns a new Token with the same file name, location, which, text and value.
		Any chained .nextToken is copied as well.
		"""
		ensure
			result.which == .which
			result.text == .text
			result.value == .value
			.nextToken implies result.nextToken
		test
			t = Token('(noname)', 1, 1, 1, 'ID', 'foo', nil)
			u = t.copy
			assert t is not u
			assert t.which == 'ID'
			assert u.which == 'ID'
		body
			# TODO: should this:
			t = Token(_fileName, _lineNum, _colNum, _charNum, _which, _text, _value)
			# be this:
			# t = .getType()(_fileName, _lineNum, _colNum, _charNum, _which, _text, _value)
			# and if so, how does performance change?
			if _nextToken
				t.nextToken = _nextToken.copy
			return t

	def copy(which as String) as IToken
		"""
		Returns a copy of the token, but with the `which` changed.
		"""
		ensure
			result.which == which
			result.text == .text
			result.value == .value
			.nextToken implies result.nextToken
		test
			t = Token('(noname)', 1, 1, 1, 'ID', 'foo', nil)
			u = t.copy('FOO')
			assert t is not u
			assert t.which == 'ID'
			assert u.which == 'FOO'
		body
			t = .copy to Token
			t._which = which
			return t

	def copy(which as String, text as String) as IToken
		"""
		Returns a copy of the token, but with the `which` and `text` changed.
		"""
		ensure
			result.which == which
			result.text == text
			result.value == .value
			.nextToken implies result.nextToken
		test
			t = Token('(noname)', 1, 1, 1, 'ID', 'foo', nil)
			u = t.copy('FOO', 'bar')
			assert t is not u
			assert t.which == 'ID'
			assert u.which == 'FOO'
			assert t.text == 'foo'
			assert u.text == 'bar'
		body
			t = .copy(which) to Token
			t._text = text
			return t

	def shortLocationString as String
		"""
		Returns filename, sans path, with line number and col number.
		"""
		test
			t = Token('Foo.cobra', 13, 6, 120, 'DEF', 'def', 'def')
			assert t.shortLocationString == 'Foo.cobra:13:6'
		body
			return '[Path.getFileName(.fileName)]:[.lineNum]:[.colNum]'

	def toString as String is override
		"""
		Returns the text in double quotes, followed by the `which` in parens when it differs from the text.
		An exception raised while building the string is reported in the returned string instead of propagating.
		"""
		test
			t = Token('(noname)', 1, 1, 1, 'ID', 'foo', nil)
			assert t.toString=='"foo" (ID)'
		body
			try
				return _toString()
			catch exc as Exception
				return '(Token.toString Exception: [exc.toString])'

	def _toString as String
		# render tab, CR and LF as visible escape sequences
		sb = StringBuilder()
		for c in _text
			branch c
				on c'\t', sb.append('\\t')
				on c'\r', sb.append('\\r')
				on c'\n', sb.append('\\n')
				else, sb.append(c)
		s = sb.toString
		if s.length
			if s.toLower == _which.toLower  # if keyword...
				return '"[s]"'
			else
				return '"[s]" ([_which])'
		else
			return '"[_which]"'  # INDENT, DEDENT, etc.

	def toTechString as String
		if .isEmpty
			return .getType.name + '.empty'
		else
			return '[.getType.name]([.which], [CobraCore.toTechString(.text)], [CobraCore.toTechString(.value)], ln [.lineNum], col [.colNum], [.fileName])'

	def incLineNum
		_lineNum += 1


class TokenFix inherits Token
	"""
	This hack is due to certain C# circumstances where referring to "Token.empty" in Cobra does
	not translate well to C# which wants to interpret "Token" as a property instead of the class.
	"""

	cue init(fileName as String, lineNum as int, colNum as int, charNum as int, which as String, text as String, value as Object?)
		base.init(fileName, lineNum, colNum, charNum, which, text, value)


class TokenizerError inherits SystemException
	"""
	Raised by .nextToken when there are errors in the source trying to be tokenized.
	"""

	var _tokenizer as Tokenizer
	var _token as IToken?

	cue init(tokenizer as Tokenizer, msg as String)
		require msg.length
		base.init(msg)
		_tokenizer = tokenizer
		# prefer the token being processed; fall back to the last token returned
		if tokenizer.curToken
			_token = tokenizer.curToken
		else
			_token = tokenizer.lastToken

	get token from var

	get tokenizer from var


class Tokenizer
	"""
	Subclasses often override .orderedTokenSpecs, .unorderedTokenSpecs and .keywords.
	"""

	var _verbosity as int
	var _willAlwaysEndWithNewLine = true
	var _didReset = false
	var _didStart = false
	var _errors as List<of TokenizerError>

	var _fileName as String?
	var _stream as TextReader?

	var _lastToken as IToken?
	var _curToken as IToken?

	var _tokenDefsStack = Stack<of TokenDefSet>()
	var _tokenQueue as Queue<of IToken>  # needed when token methods return lists of tokens
	var _keywordToWhichToken as Dictionary<of String, String>
	var _onTokenTypeCache as Dictionary<of String, MethodInfo?>

	var _tokenDefs as List<of TokenDef>?
	var _tokenDefsByFirstChar as Dictionary<of char, List<of TokenDef>>?
	var _tokenDefsByWhich as Dictionary<of String, TokenDef>?
	var _noWarningLines as Cobra.Lang.ISet<of String>
	var _curTokenDef as TokenDef?
	var _lastTokenDef as TokenDef?
	var _avgCount = 0
	var _avgCountNum = 0

	# Source line and location
	var _sourceLine as String?
	var _originalSourceLine as String?  # keep this around since _sourceLine gets chopped down
	var _sourceLineIndex as int
	var _lineNum as int
	var _colNum as int
	var _charNum as int

	cue init
		# before adding code here, consider if it should go in _reset or _reuse instead
		base.init
		_didReset = false
		_reset()

	def toString as String is override
		sb = StringBuilder()
		sb.append('[.getType.name](')
		.addInfo(sb)
		sb.append(')')
		return sb.toString

	def addInfo(sb as StringBuilder)
		sb.append('_sourceLineIndex=[_sourceLineIndex], ')
		sb.append('_lineNum=[_lineNum], ')
		sb.append('_colNum=[_colNum], ')
		sb.append('_charNum=[_charNum], ')


	## Subclasses often override

	get orderedTokenSpecs as List<of String>
		return List<of String>()

	get unorderedTokenSpecs as List<of String>
		return List<of String>()

	get keywords as IList<of String>
		"""
		Returns a list of all keywords. Subclasses often override this to specify keywords.
		"""
		return List<of String>()


	## Common properties

	get curTokenDef from var

	get curToken from var
		"""
		Returns the current token being processed. Often examined when catching a TokenizerError.
		"""

	get lastToken from var

	get didStart from var

	get fileName from var

	get errors from var

	get tokenDefsInOrder from _tokenDefs

	get tokenDefsByWhich from var

	get noWarningLines from var

	get linesCompiled from _lineNum as int


	## Other

	def _reset
		_reuse
		_fileName = nil
		_stream = nil
		_keywordToWhichToken = Dictionary<of String, String>()
		_onTokenTypeCache = Dictionary<of String, MethodInfo?>()
		_didReset = true
		_avgCount = _avgCountNum = 0

	def _reuse
		"""
		Resets class vars unrelated to token defs, file name or stream.
		"""
		_errors = List<of TokenizerError>()
		_noWarningLines = Set<of String>()
		_curToken = nil
		_lastToken = nil
		_sourceLine = nil
		_originalSourceLine = nil
		_lineNum = 0
		_colNum = 1
		_charNum = 1
		_tokenQueue = Queue<of IToken>()

	def startFileNamed(fileName as String) as Tokenizer  # TODO: as this
		"""
		Starts tokenizing the text of the named file. Returns this tokenizer.
		"""
		_fileName = fileName
		_stream = File.openText(fileName)
		.afterStart
		return this

	def startSource(source as String) as Tokenizer
		return .startSource('(no file name)', source)

	def startSource(fileName as String, source as String) as Tokenizer
		"""
		Starts tokenizing the given source text, reporting `fileName` as its file name.
		Returns this tokenizer.
		"""
		if false
			print '**********************************************************************'
			print source
			print '**********************************************************************'
		_fileName = fileName
		_stream = StringReader(source)
		.afterStart
		return this

	get nextToken as IToken?
		"""
		Consumes a token and returns it, making it the .lastToken.
		Returns nil when there are no tokens left.
		"""
		ensure .lastToken == result
		#assert .readLine, 'Not started.'
		_didStart = true
		_lastToken = _nextToken
		if _lastToken is nil and _avgCountNum > 0
			# avgCount = _avgCount / _avgCountNum
			_avgCountNum = 0
			# print '*** avg toks to look through: [avgCount:N2] for [_fileName]'
		if _verbosity >=1
			print '<> .nextToken returning [_lastToken]'
		return _lastToken

	def allTokens as List<of IToken>
		"""
		Returns all remaining tokens as a list.
		"""
		tokens = List<of IToken>()
		# start = DateTime.now
		while true
			t = .nextToken
			if t, tokens.add(t)
			else, break
		# determine timings for various Tokenizer lexing changes
		/#
		duration = DateTime.now.subtract(start)
		if tokens.count > 100 and duration > TimeSpan(0)
			ticksPerToken = (duration.ticks to int) / tokens.count
			ticksPerTokenMS = ticksPerToken / 10_000
			fileName = _fileName
			if fileName.startsWith('('), fileName = '(None)'
			else, fileName = Path.getFileName(fileName)
			print '[fileName.padRight(20)]\t[duration]\t[tokens.count]\t' stop
			print ' [ticksPerTokenMS:N3] ms/Token av ([ticksPerTokenMS/1000:N3]sec)'
		#/
		return tokens

	def restart
		"""
		After calling this, you can use the tokenizer anew even with different token defs.
		"""
		if _stream
			_stream.close
			_stream = nil
		_tokenDefs = nil  # avoid caching the token defs. now .afterStart will recreate them
		_reset()

	def keywordOrWhich(tok as IToken) as IToken
		return .keywordOrWhich(tok, 'ID')

	def keywordOrWhich(tok as IToken, which as String) as IToken
		"""
		Changes the token to a keyword if it is one, otherwise sets its which.
		Returns the token.
		This self utility method is typically called by a subclass from the method for the "identifier" token.
		"""
		ensure result is tok
		if _keywordToWhichToken.containsKey(tok.text)
			tok.which = _keywordToWhichToken[tok.text]
			tok.isKeyword = true
		else
			# previously `which` was ignored, so a non-keyword kept whatever which it arrived with
			tok.which = which
		return tok

	def addTokenSpec(spec as String)
		_tokenDefs.add(.tokenDefForSpec(spec))

	def tokenDefForSpec(spec as String) as TokenDef
		"""
		Returns a TokenDef object for a spec such as r'ID [A-Za-z_][A-Za-z0-9_]*'
		The spec is the token's which, then whitespace, then the definition.
		"""
		spec = spec.trim.replace('\t', ' ')
		# CC: should be something like:
		# which, re = spec.split(array(c' '), 2)
		# instead of:
		partNum = 0
		for part in spec.split(@[c' '], 2)
			if partNum==0
				which = part
			else
				re = part
			partNum += 1
		assert partNum==2, 'Got [partNum] part(s) for spec "[spec]" instead of 2'
		re = re.trim
		assert which.length
		assert re.length
		return .generateTokenDef(which, re)

	def generateTokenDef(which as String, definition as String) as TokenDef
		"""
		Generate a TokenDef for the given Token type/which and definition.
		Normally, the definition is regular expression source code.
		But first a method call 'makeWHICH' is checked for and invoked if present.
		Subclasses can override this method to provide a different approach for some or all token defs.
		"""
		methInfo = .getType.getMethod('Make'+which)
		if methInfo
			tokenDef = methInfo.invoke(this, @[definition]) to TokenDef?
			# a nil result from the make method falls through to the regex default
			if tokenDef
				return tokenDef
		return TokenRegexDef(which, definition)

	def afterStart
		"""
		Called by the .start* methods once the file name and stream are set.
		On first use, builds the token defs from .orderedTokenSpecs followed by .unorderedTokenSpecs
		(longest first), records the keywords, and pushes the defs.
		When token defs already exist, only per-run state is reset.
		"""
		ensure
			_tokenDefs
		body
			assert _didReset, 'Have not reset. Probably the subclass overrides _reset but forgets to invoke base.'
			if _tokenDefs and _tokenDefs.count > 0
				_reuse
				return

			# create a single list of all token defs in the correct order
			_tokenDefs = List<of TokenDef>()
			if .orderedTokenSpecs.count
				for spec in .orderedTokenSpecs
					.addTokenSpec(spec)
			others = List<of TokenDef>()
			if .unorderedTokenSpecs.count
				for spec in .unorderedTokenSpecs
					others.add(.tokenDefForSpec(spec))
			keywords = .keywords
			if keywords.count
				for word in keywords
					_keywordToWhichToken[word] = word.toUpper

			# longest tokens need to be matched first
			others.sort(ref .compareTokenDefLengthDesc)
			# CC: others.sort(def(a, b)=b.length.compareTo(a.length))
			_tokenDefs.addRange(others)
			.pushTokenDefs(_tokenDefs to !)

	def compareTokenDefLengthDesc(a as TokenDef, b as TokenDef) as int
		return b.length.compareTo(a.length)

	def _obtainSource as bool
		"""
		Reads the next line from the stream into _sourceLine and advances the line number.
		Returns false at the end of the source.
		"""
		ensure
			result implies _sourceLine
			result implies _lineNum == old _lineNum + 1
		body
			_sourceLine = _originalSourceLine = _stream.readLine
			if _sourceLine is nil
				# end of source
				return false
			numLines = _sourceLine.count(c'\n')
			if numLines == 0 and _willAlwaysEndWithNewLine
				_sourceLine += "\n"
			#trace sourceLine
			if numLines
				if numLines==1
					assert _sourceLine.endsWith('\n')
				else
					assert false, 'Expecting readLine to return one line instead of many.'
			_sourceLineIndex = 0
			_lineNum += 1
			_colNum = 1
			return true

	var _narrowTokenDefs = true
	var _minNumTokenDefsToNarrow = 4

	get _nextToken as IToken?
		"""
		This is the core brain of the tokenizer. The primary logic for matching tokens is here.
		"""
		count, didCheckTokenDefs = 0, false
		try
			if _tokenQueue.count
				# eat up queue first
				return _tokenQueue.dequeue
			if not _sourceLine or not _sourceLine.length
				if not _obtainSource()
					return nil
			try
				assert _tokenDefs
				tokenDefs = _getCandidateTokenDefs(_sourceLine)
				didCheckTokenDefs = true
				for tokenDef in tokenDefs
					count += 1
					if _skipMatchAttempt(tokenDef, _sourceLineIndex)
						continue
					#print '<> Trying to match [tokenDef]'
					sourceLine = _sourceLine to !
					#print '_sourceLineIndex=[_sourceLineIndex]'
					match = tokenDef.match(sourceLine)
					if not match
						#print '<> No match on [tokenDef] for [_sourceLine[_sourceLineIndex]]'
						continue
					_lastTokenDef = _curTokenDef
					_curTokenDef = tokenDef  # this enables .onTOKENWHICH methods to access the current tokenDef
					text = match.text
					#print '<> Match! [CobraCore.toTechString(text)] - [tokenDef]'
					# tok = Token(_fileName, _lineNum, _colNum, _charNum, tokenDef.which, text, text) to ?  # CC
					tok as IToken? = Token(_fileName, _lineNum, _colNum, _charNum, tokenDef.which, text, text)
					len = text.length
					_curToken = tok
					_colNum += len
					_charNum += len
					_sourceLineIndex += len
					_sourceLine = _sourceLine.substring(len)

					# enable methods to customize handling of tokens
					reinvoke = false
					meth = _getTokenMethod(tok.which)
					if meth
						tok = _tokenPostProcess(meth, tok)
						# nil indicates skipped token - will re call this method
						if not tok
							reinvoke = true  # to pick up next token after skipped

					# finished with current line?
					if _sourceLine.length==0
						_sourceLine = nil
						_sourceLineIndex = -1
						_originalSourceLine = nil

					# handle token skipping
					if reinvoke
						tok = _nextToken

					# yay!
					return tok
			finally
				_curTokenDef = nil

			# no match
			if false
				trace all
				trace _lineNum, _colNum, _charNum
				trace _sourceLine[0], _sourceLine[0] to int
				trace _sourceLine
				trace _sourceLineIndex
				trace _originalSourceLine
			.throwError('Lexical error: "[_originalSourceLine[_sourceLineIndex]]" ([_originalSourceLine[_sourceLineIndex] to int])')
			# ^ no source location information is included in the error message as that should be pulled
			#   from TokenizerError.tokenizer.curToken or .lastToken
			return nil
		finally
			if didCheckTokenDefs
				_avgCount += count
				_avgCountNum += 1

	def _getCandidateTokenDefs(sourceLine as String?) as List<of TokenDef>
		"""
		Returns the token defs to try: the list for the first char of the remaining source line when
		narrowing is enabled and that char is indexed, otherwise all token defs.
		"""
		# NOTE: this reads _sourceLine directly; the sourceLine argument is not used
		if _narrowTokenDefs and _tokenDefs.count >= _minNumTokenDefsToNarrow
			assert _tokenDefsByFirstChar
			assert _sourceLine.length
			if _tokenDefsByFirstChar.containsKey(_sourceLine[0])
				# print 'Using short list for char: [_sourceLine[0]], [_sourceLine[0] to int]'
				return _tokenDefsByFirstChar[_sourceLine[0]]
		return _tokenDefs to !

	def _skipMatchAttempt(tokenDef as TokenDef, sourceLineIndex as int) as bool
		"""
		Returns true if no match should be attempted with the token def at this point.
		A non-zero .ignoreCount is decremented as a side effect.
		"""
		if tokenDef.ignoreCount
			tokenDef.ignoreCount -= 1
			return true
		if not tokenDef.isActive
			return true
		if sourceLineIndex>0 and tokenDef.requiresBOL
			return true
		if not .isActiveCall(tokenDef)
			return true
		return false

	def _getTokenMethod(which as String) as MethodInfo?
		"""
		Get method for token (On<WHICH>) using reflection.
		Caching is used to boost performance.
		"""
		if not _onTokenTypeCache.containsKey(which)
			# not already in methods cache
			methName = 'On' + which
			meth = .getType.getMethod(methName)
			_onTokenTypeCache[which] = meth  # a nil (no such method) result is cached too
		else
			meth = _onTokenTypeCache[which]
		return meth

	def _tokenPostProcess(meth as MethodInfo, tok as IToken?) as IToken?
		"""
		Invoke any tokenMethod and return the result which could be the same token, a replacement token
		or nil to skip the token.
		Any token returned could be the start of a token chain via token.nextToken.
		"""
		try
			tok = meth.invoke(this, @[tok]) to IToken?
		catch tie as TargetInvocationException
			# surface the exception raised by the token method rather than the reflection wrapper
			throw tie.innerException to !
		if not tok
			return nil  # token is to be skipped
		retTok = tok
		tok = tok.nextToken
		while tok
			# TODO: could probably make this more efficient by axing the queue and just checking for nextToken in this method
			_tokenQueue.enqueue(tok)  # store any token chain returned by method call
			tok = tok.nextToken
		return retTok

	def pushTokenDefs(defs as List<of TokenDef>)
		"""
		Makes `defs` the current token defs, numbering them in order and indexing them by which
		and (when narrowing is enabled) by first char. Undo with .popTokenDefs.
		"""
		ensure
			_tokenDefs is defs
			_tokenDefsByWhich.count == defs.count
		body
			defsByWhich = Dictionary<of String, TokenDef>()
			defsByFirstChar = Dictionary<of char, List<of TokenDef>>()
			unknownFirstCharDefs = List<of TokenDef>()
			n = 0
			for tokenDef in defs
				tokenDef.number = n
				n += 1
				assert not defsByWhich.containsKey(tokenDef.which), tokenDef
				defsByWhich[tokenDef.which] = tokenDef
				if _narrowTokenDefs
					if tokenDef.firstChars.count
						for c in tokenDef.firstChars
							if defsByFirstChar.containsKey(c)
								defsByFirstChar[c].add(tokenDef)
							else
								defsByFirstChar[c] = [tokenDef]
					else
						unknownFirstCharDefs.add(tokenDef)
			if _narrowTokenDefs
				v = false
				for key as char in defsByFirstChar.keys
					if v
						print
						print '***', key, defsByFirstChar[key]
					t = defsByFirstChar[key]
					# defs with no known first char must be candidates for every char
					t.addRange(unknownFirstCharDefs)
					# sort by number
					# CC: should specify the comparison here I think, since there is another place where they are sorted by length
					t.sort
					if v
						print '{[key]}'
						for i = 0 .. t.count
							print ' [i]. [t[i]]'
			_tokenDefsStack.push(TokenDefSet(defs, defsByWhich, defsByFirstChar))
			_tokenDefs = defs
			_tokenDefsByWhich = defsByWhich
			_tokenDefsByFirstChar = defsByFirstChar

	def popTokenDefs
		"""
		Discards the current token defs and restores the previously pushed set, or nil if none remains.
		"""
		require
			_tokenDefsStack.count > 0
		body
			_tokenDefsStack.pop
			if _tokenDefsStack.count
				tokenDefSet = _tokenDefsStack.peek
				defs = tokenDefSet.defs to ?
				defsByWhich = tokenDefSet.defsByWhich to ?
				defsByFirstChar = tokenDefSet.defsByFirstChar to ?
			else
				defs = nil
				defsByWhich = nil
				defsByFirstChar = nil
			_tokenDefs = defs
			_tokenDefsByWhich = defsByWhich
			_tokenDefsByFirstChar = defsByFirstChar

	def isActiveCall(tok as TokenDef) as bool
		return true

	def recordError(msg as String) as TokenizerError
		err = TokenizerError(this, msg)
		_errors.add(err)
		return err

	def throwError(msg as String)
		# TODO: like parser, this should probably not be recorded unless it "makes it out". see comment in CobraParser.throwError
		throw .recordError(msg)

	def addNoWarning(token as IToken)
		_noWarningLines.add('[token.fileName]:[token.lineNum]')

	def checkTokens(tokens as List<of IToken>, expected as String) is shared
		"""
		Asserts that the `which` of each token, joined by single spaces, equals `expected`.
		This is a utility method to aid with testing.
		"""
		sb = StringBuilder()
		sep = ''
		for t in tokens
			sb.append(sep)
			sb.append(t.which)
			sep = ' '
		tokensStr = sb.toString
		assert tokensStr==expected


class TokenDef implements IComparable is abstract
	"""
	The abstract base for token definitions. TokenDefs sort by the number assigned to them.
	"""

	var _number as int
	var _which as String
	var _requiresBOL as bool
	var _ignoreCount as int
	var _isActive = true
	var _isActiveCall as Object?  # @@@@??

	cue init(which as String)
		require
			which.length
		body
			base.init
			_which = which

	pro number from var

	pro which from var

	get firstChars as List<of char> is abstract

	get length as int is abstract
		"""
		Returns the textual length of the token definition.
		This is only important for the unordered token specs which are arranged longest to shortest.
		Ordered token specs are left in their specific order.
		"""

	get requiresBOL from var
		"""
		Return true if the token def will only match the very beginning of a line.
		In practice, this is false for most token def, and when true, the token def is often consuming
		the entire line or some leading whitespace.
		"""

	pro ignoreCount from var

	pro isActive from var

	def match(input as String) as TokenMatch? is abstract
		require input.length

	def toString as String is override
		return '[.getType.name]([_innerToString()])'

	def _innerToString as String
		return '[_number], [_which]'

	def compareTo(obj as Object?) as int
		"""
		Orders token defs by .number. Per the IComparable contract, any instance sorts after nil.
		Objects of other types are ordered by type name.
		"""
		if obj is nil
			return 1
		if obj inherits TokenDef
			return _number - obj.number
		else
			return .getType.name.compareTo(obj.getType.name)


class TokenRegexDef inherits TokenDef
	"""
	A token definition that matches based on a regular expression.
	"""

	shared
		var _compiledRegExes = Dictionary<of String, Regex>()

	var _regExSource as String
	var _re as Regex
	var _length as int
	var _firstChars as List<of char>

	cue init(which as String, regExSource as String)
		require
			which.length
			regExSource.length
		body
			base.init(which)
			_requiresBOL = regExSource.startsWith('^')
			_firstChars = _computeFirstChars(regExSource)
			_regExSource = regExSource
			# every def is anchored so that it can only match at the start of the remaining input
			if not _requiresBOL
				regExSource = '^' + regExSource
			_length = regExSource.length
			if _compiledRegExes.containsKey(regExSource)
				_re = _compiledRegExes[regExSource]
			else
				# Note: Making the regex compiled has almost no effect on performance on either
				# .NET or Mono. Either the overhead of creating a compiled version of the regex
				# erases the gain (most likely based on web research) or the compiled expressions
				# don't run fast enough to make a difference.
				_re = Regex(regExSource, RegexOptions.Compiled)
				_compiledRegExes[regExSource] = _re

	def _computeFirstChars(s as String) as List<of char>
		"""
		Returns the chars that a match of the regex source `s` can start with,
		or an empty list when that cannot be determined from the recognized prefixes.
		"""
		# TODO: this is actually specific to Cobra, so it should really be in CobraTokenizer.
		# Maybe that can be done via a callback/delegate.
		t = List<of char>()
		if s.startsWith(r'[A-Za-z_]')
			for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_'
				t.add(c)
		else if s.startsWith(r'\d')
			for c in '0123456789'
				t.add(c)
		else if s.startsWith(r'\t')
			t.add(c'\t')
		else if s.startsWith(r'\n')
			t.add(c'\n')
		else if s[0] == c'\\'
			# An escaped non-alphanumeric such as \. or \+ matches itself.
			# A backslash followed by a letter or digit (\s, \w, \r, \b, \1, ...) is a class, control
			# char, anchor or backreference, so the first char is unknown and the list stays empty.
			if s.length > 1 and not Char.isLetterOrDigit(s[1])
				t.add(s[1])
		else if s.startsWith(r'^[\t]')
			t.add(c'\t')
		else if s.startsWith(r'^[ ]+') or s.startsWith(r'[ ]+')
			t.add(c' ')
		else if s.startsWith(r'[ \t]+')
			t.add(c' ')
			t.add(c'\t')
		else if s == r'[ \t]*'
			t.add(c' ')
			t.add(c'\t')
		else if s[0] not in r'^\[.'
			t.add(s[0])
		# else
		#	trace s
		return t

	get firstChars from var is override

	get length from var is override

	get regExSource from var

	get re from var

	def match(input as String) as TokenMatch? is override
		# trace _regExSource
		# see ..\Developer\most-common-regexes.text
		reMatch = _re.match(input)
		if reMatch.success
			assert reMatch.index == 0
			assert reMatch.value
			return TokenRegexMatch(reMatch)
		else
			return nil

	def _innerToString as String is override
		return base._innerToString + ', [_regExSource], [_re]'


class TokenMatch
	"""
	The class for match results returned by TokenDef.match.
	"""

	var _text as String

	cue init(text as String)
		base.init
		_text = text

	get text from var


class TokenRegexMatch inherits TokenMatch
	"""
	A TokenMatch that also carries the regex Match it was created from.
	"""

	var _match as Match

	cue init(match as Match)
		base.init(match.value to !)
		_match = match

	get match from var


class TokenDefSet
	"""
	Holds a list of token defs together with its by-which and by-first-char indexes.
	"""

	var _defs as List<of TokenDef>
	var _defsByWhich as Dictionary<of String, TokenDef>
	var _defsByFirstChar as Dictionary<of char, List<of TokenDef>>

	cue init(defs as List<of TokenDef>, defsByWhich as Dictionary<of String, TokenDef>, defsByFirstChar as Dictionary<of char, List<of TokenDef>>)
		require
			defs.count
			defsByWhich.count
			defsByWhich.count == defs.count
			# defsByFirstChar.count is not required because having the defs indexed by first char is an optional optimization with no change in behaviour
		body
			base.init
			_defs = defs
			_defsByWhich = defsByWhich
			_defsByFirstChar = defsByFirstChar

	get defs from var

	get defsByWhich from var

	get defsByFirstChar from var


class TestTokenizer inherits Tokenizer
	"""
	A small tokenizer used to unit test Tokenizer.
	"""

	var _idCount as int

	get idCount from var

	get orderedTokenSpecs as List<of String> is override
		return [
			'OPEN_IF ifx\\(',
			ns'ID [A-Za-z_][A-Za-z0-9_]*',
			ns'SPACE [ ]+',
			'NEWLINE \\n',
		]

	get unorderedTokenSpecs as List<of String> is override
		return [
			'DOT \\.',
			'COLON :',
			'PLUS \\+',
			'ASSIGN =',
			'EQUALS ==',
		]

	get keywords as IList<of String> is override
		return ['assert', 'if', 'else']

	def _reset is override
		base._reset
		_idCount = 0

	def onID(tok as IToken) as IToken?
		_idCount += 1
		return tok

	def onSPACE(tok as IToken) as IToken?
		# returning nil skips the token
		return nil

	def onNEWLINE(tok as IToken) as IToken?
		return tok

	def onDOT(tok as IToken) as IToken?
		# chain a copy so that each DOT yields two tokens
		tok.nextToken = tok.copy
		return tok

	test
		# basics
		tt = TestTokenizer()
		tt.startSource('hello there')
		tokens = tt.allTokens
		.checkTokens(tokens, 'ID ID NEWLINE')
		assert tt.idCount==2

		# the tokenizer lets methods insert tokens
		tt.restart
		tt.startSource('hello.there')
		tokens = tt.allTokens
		.checkTokens(tokens, 'ID DOT DOT ID NEWLINE')

		# tokens know their line numbers and columns and lengths
		tt.restart
		tt.startSource('hello\nthere\n you\n ')
		tokens = for tok in tt.allTokens where tok.which <> 'NEWLINE'
		assert tokens.count==3
		hello = tokens[0]  # CC: hello, there, you = tokens
		there = tokens[1]
		you = tokens[2]
		assert hello.lineNum==1
		assert hello.colNum==1
		assert hello.length==5
		assert there.lineNum==2
		assert there.colNum==1
		assert there.length==5
		assert you.lineNum==3
		assert you.colNum==2
		assert you.length==3

		# keywordOrWhich turns keywords into keyword tokens and gives everything else the requested which
		tt.restart
		tt.startSource('if')
		kw = tt.keywordOrWhich(Token('(noname)', 1, 1, 1, 'ID', 'if', nil))
		assert kw.which == 'IF'
		assert kw.isKeyword
		other = tt.keywordOrWhich(Token('(noname)', 1, 1, 1, 'FOO', 'bar', nil), 'ID')
		assert other.which == 'ID'
		assert not other.isKeyword