| 1 | """ |
|---|
| 2 | |
|---|
| 3 | == Requirements of a nice tokenizer |
|---|
| 4 | |
|---|
| 5 | * Accurately and automatically report for each token |
|---|
| 6 | which kind, text, line number, column number, character index and length |
|---|
| 7 | |
|---|
| 8 | * Allow methods to intercept when a token is encountered in order to |
|---|
| 9 | * modify it |
|---|
| 10 | * skip it |
|---|
| 11 | * replace it by a list of tokens |
|---|
| 12 | |
|---|
| 13 | * Unit tests |
|---|
| 14 | |
|---|
| 15 | """ |
|---|
| 16 | |
|---|
| 17 | use System.Text.RegularExpressions |
|---|
| 18 | use System.Reflection |
|---|
| 19 | |
|---|
| 20 | |
|---|
interface IToken
	"""
	The contract for one lexical token: its kind (`which`), its text and value, where it
	came from in the source, and the operations used to copy tokens or chain extra tokens
	behind them.
	"""

	# The kind of token, e.g. 'ID'. Settable so that a token can be reclassified.
	pro which as String
	# The text of the token.
	pro text as String
	# In the Token implementation this is the length of .text.
	get length as int
	pro value as Object?
		"""
		The value the token represents. This is normally the same as the text unless there is
		specific reason to give another value, such as the actual integer value that an integer
		token represents.
		"""
	pro info as dynamic?
		"""
		An arbitrary object that the tokenizer can set to pass additional information to the
		parser.
		"""
	# In the Token implementation this is true only for the shared placeholder, Token.empty.
	get isEmpty as bool
	# NOTE(review): presumably true when the token's text is a language keyword -- nothing in
	# this file sets it, so confirm against the tokenizer subclasses.
	pro isKeyword as bool

	# The file name the token was created with; it may or may not include a path.
	get fileName as String
	# In the Token implementation: .fileName if it is rooted, otherwise .fileName combined with
	# the current directory.
	get fullPathName as String
	get lineNum as int
		# CC: ensure result > 0
	get colNum as int
		# CC: ensure result > 0
	get charNum as int
		# CC: ensure result > 0

	def copy as IToken
		"""
		Returns a copy of the token.
		"""
	def copy(which as String) as IToken
		"""
		Returns a copy of the token, but with the `which` changed.
		"""
	def copy(which as String, text as String) as IToken
		"""
		Returns a copy of the token, but with both the `which` and the `text` changed.
		"""

	pro nextToken as IToken?
		"""
		Use this to "insert" extra tokens into the token stream from an .onWHICH method.
		"""

	def shortLocationString as String
		"""
		Returns filename, sans path, with line number and col number.
		"""

	def toTechString as String
		"""
		Returns a technical, debugging-oriented description of the token.
		"""

	def incLineNum
		"""
		Increments the line number by one.
		Created for end-of-init assertions that class variables are not nil when their types are not nilable.
		"""
| 76 | |
|---|
| 77 | |
|---|
class Token implements IToken
	"""
	The stock IToken implementation: a plain record of what was matched (`which`, `text`,
	`value`) and where (`fileName`, `lineNum`, `colNum`, `charNum`), plus an optional link to a
	follow-on token.
	"""

	shared
		# Lazily created placeholder returned by .empty
		var _empty as Token?

		get empty as Token  # CC: as same
			"""
			Returns the shared placeholder token, creating it on first access.
			"""
			if _empty is nil
				_empty = Token('(empty)', 1, 1, 1, '(EMPTY)', '', nil, true)
			return _empty to !

	# Backing storage for the properties declared further down
	var _isEmpty as bool
	var _fileName as String
	var _lineNum as int
	var _colNum as int
	var _charNum as int
	var _which as String
	var _text as String
	var _value as dynamic?
	var _info as dynamic?
	var _isKeyword as bool
	var _nextToken as IToken?

	cue init(fileName as String, lineNum as int, colNum as int, charNum as int, which as String, text as String, value as Object?)
		"""
		Creates an ordinary (non-empty) token.
		"""
		require
			lineNum > 0
			colNum > 0
			charNum > 0
			which.length
		body
			.init(fileName, lineNum, colNum, charNum, which, text, value, false)

	cue init(fileName as String, lineNum as int, colNum as int, charNum as int, which as String, text as String, value as Object?, isEmpty as bool)
		"""
		Creates a token, stating explicitly whether it is the "empty" placeholder.
		"""
		require
			lineNum > 0
			colNum > 0
			charNum > 0
			which.length
		body
			base.init
			_isEmpty = isEmpty
			_which = which
			_text = text
			_value = value
			_fileName = fileName
			_lineNum = lineNum
			_colNum = colNum
			_charNum = charNum

	pro which from var

	pro text from var

	get length as int
		"""
		The number of characters in .text.
		"""
		return _text.length

	pro value from var

	pro info from var

	get isEmpty from var

	pro isKeyword from var

	get fileName from var

	get fullPathName as String
		"""
		Returns .fileName unchanged when it is rooted, otherwise resolved against the current
		directory.
		"""
		if Path.isPathRooted(.fileName)
			return .fileName
		return Path.combine(Environment.currentDirectory, .fileName)

	get lineNum from var

	get colNum from var

	get charNum from var

	pro nextToken from var

	def copy as IToken
		"""
		Returns a new Token with the same location, which, text and value. A chained
		.nextToken is copied as well.
		"""
		ensure
			result.which == .which
			result.text == .text
			result.value == .value
			.nextToken implies result.nextToken
		test
			original = Token('(noname)', 1, 1, 1, 'ID', 'foo', nil)
			duplicate = original.copy
			assert original is not duplicate
			assert original.which == 'ID'
			assert duplicate.which == 'ID'
		body
			# NOTE(review): the seven-argument initializer is used, so _info, _isKeyword and
			# _isEmpty are not carried over to the copy -- confirm that is intended.
			# TODO: should this construct via .getType() rather than Token, so that subclasses
			# copy to their own type? And if so, how does performance change?
			dup = Token(_fileName, _lineNum, _colNum, _charNum, _which, _text, _value)
			if _nextToken
				dup.nextToken = _nextToken.copy
			return dup

	def copy(which as String) as IToken
		"""
		Returns a copy of the token, but with the `which` changed.
		"""
		ensure
			result.which == which
			result.text == .text
			result.value == .value
			.nextToken implies result.nextToken
		test
			original = Token('(noname)', 1, 1, 1, 'ID', 'foo', nil)
			duplicate = original.copy('FOO')
			assert original is not duplicate
			assert original.which == 'ID'
			assert duplicate.which == 'FOO'
		body
			dup = .copy to Token
			dup._which = which
			return dup

	def copy(which as String, text as String) as IToken
		"""
		Returns a copy of the token, but with the `which` and the `text` changed.
		"""
		ensure
			result.which == which
			result.text == text
			result.value == .value
			.nextToken implies result.nextToken
		test
			original = Token('(noname)', 1, 1, 1, 'ID', 'foo', nil)
			duplicate = original.copy('FOO', 'bar')
			assert original is not duplicate
			assert original.which == 'ID'
			assert duplicate.which == 'FOO'
			assert original.text == 'foo'
			assert duplicate.text == 'bar'
		body
			dup = .copy(which) to Token
			dup._text = text
			return dup

	def shortLocationString as String
		"""
		Returns filename, sans path, with line number and col number.
		"""
		test
			tok = Token('Foo.cobra', 13, 6, 120, 'DEF', 'def', 'def')
			assert tok.shortLocationString == 'Foo.cobra:13:6'
		body
			return '[Path.getFileName(.fileName)]:[.lineNum]:[.colNum]'

	def toString as String is override
		"""
		Returns the human-readable form produced by _toString. Should building that string
		throw, a description of the exception is returned instead of letting it propagate.
		"""
		test
			tok = Token('(noname)', 1, 1, 1, 'ID', 'foo', nil)
			assert tok.toString=='"foo" (ID)'
		body
			try
				return _toString()
			catch exc as Exception
				return '(Token.toString Exception: [exc.toString])'

	def _toString as String
		"""
		Builds the display string: the text in double quotes with tab, CR and LF shown as
		escapes, followed by the which in parentheses unless the text and which are the same
		ignoring case. When the text is empty, only the quoted which is returned.
		"""
		escaped = StringBuilder()
		for ch in _text
			if ch == c'\t'
				escaped.append('\\t')
			else if ch == c'\r'
				escaped.append('\\r')
			else if ch == c'\n'
				escaped.append('\\n')
			else
				escaped.append(ch)
		s = escaped.toString
		if s.length == 0
			return '"[_which]"'  # INDENT, DEDENT, etc.
		if s.toLower == _which.toLower  # if keyword...
			return '"[s]"'
		return '"[s]" ([_which])'

	def toTechString as String
		"""
		Returns a debugging description: '<TypeName>.empty' for the empty token, otherwise the
		type name followed by which, text, value, line, column and file name.
		"""
		if .isEmpty
			return .getType.name + '.empty'
		return '[.getType.name]([.which], [CobraCore.toTechString(.text)], [CobraCore.toTechString(.value)], ln [.lineNum], col [.colNum], [.fileName])'

	def incLineNum
		"""
		Increments the line number by one.
		"""
		_lineNum += 1
| 257 | |
|---|
| 258 | |
|---|
class TokenFix inherits Token
	"""
	This hack is due to certain C# circumstances where referring to "Token.empty" in Cobra
	does not translate well to C# which wants to interpret "Token" as a property instead
	of the class.

	It adds no behavior of its own; it only re-exposes Token's seven-argument initializer
	under a different class name.
	"""

	cue init(fileName as String, lineNum as int, colNum as int, charNum as int, which as String, text as String, value as Object?)
		# Straight pass-through to Token.
		base.init(fileName, lineNum, colNum, charNum, which, text, value)
| 268 | |
|---|
| 269 | |
|---|
class TokenizerError inherits SystemException
	"""
	Raised by .nextToken when there are errors in the source trying to be tokenized.
	Carries the tokenizer that raised it and the token that was most relevant at that moment.
	"""

	var _tokenizer as Tokenizer
	var _token as IToken?

	cue init(tokenizer as Tokenizer, msg as String)
		"""
		Remembers the tokenizer and captures its current token, falling back to its last token
		when there is no current one.
		"""
		require msg.length
		base.init(msg)
		_tokenizer = tokenizer
		_token = if(tokenizer.curToken, tokenizer.curToken, tokenizer.lastToken)

	get token from var

	get tokenizer from var
| 290 | |
|---|
| 291 | |
|---|
| 292 | class Tokenizer |
|---|
| 293 | """ |
|---|
| 294 | Subclasses often override .orderedTokenSpecs, .unorderedTokenSpecs and .keywords. |
|---|
| 295 | """ |
|---|
| 296 | |
|---|
| 297 | var _verbosity as int |
|---|
| 298 | var _willAlwaysEndWithNewLine = true |
|---|
| 299 | var _didReset = false |
|---|
| 300 | var _didStart = false |
|---|
| 301 | var _errors as List<of TokenizerError> |
|---|
| 302 | |
|---|
| 303 | var _fileName as String? |
|---|
| 304 | var _stream as TextReader? |
|---|
| 305 | |
|---|
| 306 | var _lastToken as IToken? |
|---|
| 307 | var _curToken as IToken? |
|---|
| 308 | var _tokenDefsStack = Stack<of TokenDefSet>() |
|---|
| 309 | var _tokenQueue as Queue<of IToken> # needed when token methods return lists of tokens |
|---|
| 310 | |
|---|
| 311 | var _keywordToWhichToken as Dictionary<of String, String> |
|---|
| 312 | var _onTokenTypeCache as Dictionary<of String, MethodInfo?> |
|---|
| 313 | |
|---|
| 314 | var _tokenDefs as List<of TokenDef>? |
|---|
| 315 | var _tokenDefsByFirstChar as Dictionary<of char, List<of TokenDef>>? |
|---|
| 316 | var _tokenDefsByWhich as Dictionary<of String, TokenDef>? |
|---|
| 317 | var _noWarningLines as Cobra.Lang.ISet<of String> |
|---|
| 318 | var _curTokenDef as TokenDef? |
|---|
| 319 | var _lastTokenDef as TokenDef? |
|---|
| 320 | var _avgCount = 0 |
|---|
| 321 | var _avgCountNum = 0 |
|---|
| 322 | |
|---|
| 323 | # Source line and location |
|---|
| 324 | var _sourceLine as String? |
|---|
| 325 | var _originalSourceLine as String? # keep this around since _sourceLine gets chopped down |
|---|
| 326 | var _sourceLineIndex as int |
|---|
| 327 | var _lineNum as int |
|---|
| 328 | var _colNum as int |
|---|
| 329 | var _charNum as int |
|---|
| 330 | |
|---|
| 331 | cue init |
|---|
| 332 | # before adding code here, consider if it should go in _reset or _reuse instead |
|---|
| 333 | base.init |
|---|
| 334 | _didReset = false |
|---|
| 335 | _reset() |
|---|
| 336 | |
|---|
| 337 | def toString as String is override |
|---|
| 338 | sb = StringBuilder() |
|---|
| 339 | sb.append('[.getType.name](') |
|---|
| 340 | .addInfo(sb) |
|---|
| 341 | sb.append(')') |
|---|
| 342 | return sb.toString |
|---|
| 343 | |
|---|
| 344 | def addInfo(sb as StringBuilder) |
|---|
| 345 | sb.append('_sourceLineIndex=[_sourceLineIndex], ') |
|---|
| 346 | sb.append('_lineNum=[_lineNum], ') |
|---|
| 347 | sb.append('_colNum=[_colNum], ') |
|---|
| 348 | sb.append('_charNum=[_charNum], ') |
|---|
| 349 | |
|---|
| 350 | ## Subclasses often override |
|---|
| 351 | |
|---|
| 352 | get orderedTokenSpecs as List<of String> |
|---|
| 353 | return List<of String>() |
|---|
| 354 | |
|---|
| 355 | get unorderedTokenSpecs as List<of String> |
|---|
| 356 | return List<of String>() |
|---|
| 357 | |
|---|
| 358 | get keywords as IList<of String> |
|---|
| 359 | """ |
|---|
| 360 | Returns a list of all keywords. |
|---|
| 361 | Subclasses often override this to specify keywords. |
|---|
| 362 | """ |
|---|
| 363 | return List<of String>() |
|---|
| 364 | |
|---|
| 365 | |
|---|
| 366 | ## Common properties |
|---|
| 367 | |
|---|
| 368 | get curTokenDef from var |
|---|
| 369 | |
|---|
| 370 | get curToken from var |
|---|
| 371 | """ |
|---|
| 372 | Returns the current token being processed. Often examined when catching a TokenizerError. |
|---|
| 373 | """ |
|---|
| 374 | |
|---|
| 375 | get lastToken from var |
|---|
| 376 | |
|---|
| 377 | get didStart from var |
|---|
| 378 | |
|---|
| 379 | get fileName from var |
|---|
| 380 | |
|---|
| 381 | get errors from var |
|---|
| 382 | |
|---|
| 383 | get tokenDefsInOrder from _tokenDefs |
|---|
| 384 | |
|---|
| 385 | get tokenDefsByWhich from var |
|---|
| 386 | |
|---|
| 387 | get noWarningLines from var |
|---|
| 388 | |
|---|
| 389 | get linesCompiled from _lineNum as int |
|---|
| 390 | |
|---|
| 391 | ## Other |
|---|
| 392 | |
|---|
| 393 | def _reset |
|---|
| 394 | _reuse |
|---|
| 395 | _fileName = nil |
|---|
| 396 | _stream = nil |
|---|
| 397 | _keywordToWhichToken = Dictionary<of String, String>() |
|---|
| 398 | _onTokenTypeCache = Dictionary<of String, MethodInfo?>() |
|---|
| 399 | _didReset = true |
|---|
| 400 | _avgCount = _avgCountNum = 0 |
|---|
| 401 | |
|---|
| 402 | def _reuse |
|---|
| 403 | """ |
|---|
| 404 | Resets class vars unrelated to token defs, file name or stream. |
|---|
| 405 | """ |
|---|
| 406 | _errors = List<of TokenizerError>() |
|---|
| 407 | _noWarningLines = Set<of String>() |
|---|
| 408 | _curToken = nil |
|---|
| 409 | _lastToken = nil |
|---|
| 410 | |
|---|
| 411 | _sourceLine = nil |
|---|
| 412 | _originalSourceLine = nil |
|---|
| 413 | _lineNum = 0 |
|---|
| 414 | _colNum = 1 |
|---|
| 415 | _charNum = 1 |
|---|
| 416 | |
|---|
| 417 | _tokenQueue = Queue<of IToken>() |
|---|
| 418 | |
|---|
| 419 | def startFileNamed(fileName as String) as Tokenizer # TODO: as this |
|---|
| 420 | _fileName = fileName |
|---|
| 421 | _stream = File.openText(fileName) |
|---|
| 422 | .afterStart |
|---|
| 423 | return this |
|---|
| 424 | |
|---|
| 425 | def startSource(source as String) as Tokenizer |
|---|
| 426 | return .startSource('(no file name)', source) |
|---|
| 427 | |
|---|
| 428 | def startSource(fileName as String, source as String) as Tokenizer |
|---|
| 429 | if false |
|---|
| 430 | print '**********************************************************************' |
|---|
| 431 | print source |
|---|
| 432 | print '**********************************************************************' |
|---|
| 433 | _fileName = fileName |
|---|
| 434 | _stream = StringReader(source) |
|---|
| 435 | .afterStart |
|---|
| 436 | return this |
|---|
| 437 | |
|---|
| 438 | get nextToken as IToken? |
|---|
| 439 | """ |
|---|
| 440 | Consumes a token and returns it, making it the .lastToken. |
|---|
| 441 | Returns nil when there are no tokens left. |
|---|
| 442 | """ |
|---|
| 443 | ensure .lastToken == result |
|---|
| 444 | #assert .readLine, 'Not started.' |
|---|
| 445 | _didStart = true |
|---|
| 446 | _lastToken = _nextToken |
|---|
| 447 | if _lastToken is nil and _avgCountNum > 0 |
|---|
| 448 | # avgCount = _avgCount / _avgCountNum |
|---|
| 449 | _avgCountNum = 0 |
|---|
| 450 | # print '*** avg toks to look through: [avgCount:N2] for [_fileName]' |
|---|
| 451 | if _verbosity >=1 |
|---|
| 452 | print '<> .nextToken returning [_lastToken]' |
|---|
| 453 | return _lastToken |
|---|
| 454 | |
|---|
| 455 | def allTokens as List<of IToken> |
|---|
| 456 | """ |
|---|
| 457 | Returns all remaining tokens as a list. |
|---|
| 458 | """ |
|---|
| 459 | tokens = List<of IToken>() |
|---|
| 460 | # start = DateTime.now |
|---|
| 461 | while true |
|---|
| 462 | t = .nextToken |
|---|
| 463 | if t, tokens.add(t) |
|---|
| 464 | else, break |
|---|
| 465 | # determine timings for various Tokenizer lexing changes |
|---|
| 466 | /# |
|---|
| 467 | duration = DateTime.now.subtract(start) |
|---|
| 468 | if tokens.count > 100 and duration > TimeSpan(0) |
|---|
| 469 | ticksPerToken = (duration.ticks to int) / tokens.count |
|---|
| 470 | ticksPerTokenMS = ticksPerToken / 10_000 |
|---|
| 471 | fileName = _fileName |
|---|
| 472 | if fileName.startsWith('('), fileName = '(None)' |
|---|
| 473 | else, fileName = Path.getFileName(fileName) |
|---|
| 474 | print '[fileName.padRight(20)]\t[duration]\t[tokens.count]\t' stop |
|---|
| 475 | print ' [ticksPerTokenMS:N3] ms/Token av ([ticksPerTokenMS/1000:N3]sec)' |
|---|
| 476 | #/ |
|---|
| 477 | return tokens |
|---|
| 478 | |
|---|
| 479 | def restart |
|---|
| 480 | """ |
|---|
| 481 | After calling this, you can use the tokenizer anew even with different token defs. |
|---|
| 482 | """ |
|---|
| 483 | if _stream |
|---|
| 484 | _stream.close |
|---|
| 485 | _stream = nil |
|---|
| 486 | _tokenDefs = nil # avoid caching the token defs. now .afterStart will recreate them |
|---|
| 487 | _reset() |
|---|
| 488 | |
|---|
| 489 | def keywordOrWhich(tok as IToken) as IToken |
|---|
| 490 | return .keywordOrWhich(tok, 'ID') |
|---|
| 491 | |
|---|
| 492 | def keywordOrWhich(tok as IToken, which as String) as IToken |
|---|
| 493 | """ |
|---|
| 494 | Changes the token to a keyword if it is one, otherwise sets its which. |
|---|
| 495 | Returns the token. |
|---|
| 496 | This self utility method is typically called by a subclass from the method for the |
|---|
| 497 | "identifier" token. |
|---|
| 498 | """ |
|---|
| 499 | ensure result is tok |
|---|
| 500 | if _keywordToWhichToken.containsKey(tok.text) |
|---|
| 501 | tok.which = _keywordToWhichToken[tok.text] |
|---|
| 502 | return tok |
|---|
| 503 | |
|---|
| 504 | def addTokenSpec(spec as String) |
|---|
| 505 | _tokenDefs.add(.tokenDefForSpec(spec)) |
|---|
| 506 | |
|---|
| 507 | def tokenDefForSpec(spec as String) as TokenDef |
|---|
| 508 | """ |
|---|
| 509 | Returns a TokenDef object for a spec such as |
|---|
| 510 | r'ID [A-Za-z_][A-Za-z0-9_]*' |
|---|
| 511 | """ |
|---|
| 512 | spec = spec.trim.replace('\t', ' ') |
|---|
| 513 | |
|---|
| 514 | # CC: should be something like: |
|---|
| 515 | # which, re = spec.split(array(c' '), 2) |
|---|
| 516 | # instead of: |
|---|
| 517 | partNum = 0 |
|---|
| 518 | for part in spec.split(@[c' '], 2) |
|---|
| 519 | if partNum==0 |
|---|
| 520 | which = part |
|---|
| 521 | else |
|---|
| 522 | re = part |
|---|
| 523 | partNum += 1 |
|---|
| 524 | assert partNum==2, 'Got [partNum] part(s) for spec "[spec]" instead of 2' |
|---|
| 525 | |
|---|
| 526 | re = re.trim |
|---|
| 527 | assert which.length |
|---|
| 528 | assert re.length |
|---|
| 529 | |
|---|
| 530 | return .generateTokenDef(which, re) |
|---|
| 531 | |
|---|
| 532 | def generateTokenDef(which as String, definition as String) as TokenDef |
|---|
| 533 | """ |
|---|
| 534 | Generate a TokenDef for the given Token type/which and definition. |
|---|
| 535 | Normally, the definition is regular expression source code. |
|---|
| 536 | But first a method call 'makeWHICH' is checked for and invoked if present. |
|---|
| 537 | Subclasses can override this method to provide a different approach for some or all token defs. |
|---|
| 538 | """ |
|---|
| 539 | methInfo = .getType.getMethod('Make'+which) |
|---|
| 540 | if methInfo |
|---|
| 541 | tokenDef = methInfo.invoke(this, @[definition]) to TokenDef? |
|---|
| 542 | if tokenDef |
|---|
| 543 | return tokenDef |
|---|
| 544 | return TokenRegexDef(which, definition) |
|---|
| 545 | |
|---|
| 546 | def afterStart |
|---|
| 547 | """ |
|---|
| 548 | Sets up class vars |
|---|
| 549 | readLine as callable |
|---|
| 550 | tokenDefs as list |
|---|
| 551 | """ |
|---|
| 552 | ensure |
|---|
| 553 | _tokenDefs |
|---|
| 554 | body |
|---|
| 555 | assert _didReset, 'Have not reset. Probably the subclass overrides _reset but forgets to invoke base.' |
|---|
| 556 | if _tokenDefs and _tokenDefs.count > 0 |
|---|
| 557 | _reuse |
|---|
| 558 | return |
|---|
| 559 | # create a single list of all token defs in the correct order |
|---|
| 560 | _tokenDefs = List<of TokenDef>() |
|---|
| 561 | if .orderedTokenSpecs.count |
|---|
| 562 | for spec in .orderedTokenSpecs |
|---|
| 563 | .addTokenSpec(spec) |
|---|
| 564 | others = List<of TokenDef>() |
|---|
| 565 | if .unorderedTokenSpecs.count |
|---|
| 566 | for spec in .unorderedTokenSpecs |
|---|
| 567 | others.add(.tokenDefForSpec(spec)) |
|---|
| 568 | keywords = .keywords |
|---|
| 569 | if keywords.count |
|---|
| 570 | for word in keywords |
|---|
| 571 | _keywordToWhichToken[word] = word.toUpper |
|---|
| 572 | # longest tokens need to be matched first |
|---|
| 573 | others.sort(ref .compareTokenDefLengthDesc) # CC: others.sort(def(a, b)=b.length.compareTo(a.length)) |
|---|
| 574 | _tokenDefs.addRange(others) |
|---|
| 575 | .pushTokenDefs(_tokenDefs to !) |
|---|
| 576 | |
|---|
| 577 | def compareTokenDefLengthDesc(a as TokenDef, b as TokenDef) as int |
|---|
| 578 | return b.length.compareTo(a.length) |
|---|
| 579 | |
|---|
| 580 | def _obtainSource as bool |
|---|
| 581 | ensure |
|---|
| 582 | result implies _sourceLine |
|---|
| 583 | result implies _lineNum == old _lineNum + 1 |
|---|
| 584 | body |
|---|
| 585 | _sourceLine = _originalSourceLine = _stream.readLine |
|---|
| 586 | if _sourceLine is nil |
|---|
| 587 | # end of source |
|---|
| 588 | return false |
|---|
| 589 | numLines = _sourceLine.count(c'\n') |
|---|
| 590 | if numLines == 0 and _willAlwaysEndWithNewLine |
|---|
| 591 | _sourceLine += "\n" |
|---|
| 592 | #trace sourceLine |
|---|
| 593 | if numLines |
|---|
| 594 | if numLines==1 |
|---|
| 595 | assert _sourceLine.endsWith('\n') |
|---|
| 596 | else |
|---|
| 597 | assert false, 'Expecting readLine to return one line instead of many.' |
|---|
| 598 | _sourceLineIndex = 0 |
|---|
| 599 | _lineNum += 1 |
|---|
| 600 | _colNum = 1 |
|---|
| 601 | return true |
|---|
| 602 | |
|---|
| 603 | var _narrowTokenDefs = true |
|---|
| 604 | var _minNumTokenDefsToNarrow = 4 |
|---|
| 605 | |
|---|
| 606 | get _nextToken as IToken? |
|---|
| 607 | """ |
|---|
| 608 | This is the core brain of the tokenizer. |
|---|
| 609 | The primary logic for matching tokens is here. |
|---|
| 610 | """ |
|---|
| 611 | count, didCheckTokenDefs = 0, false |
|---|
| 612 | try |
|---|
| 613 | if _tokenQueue.count |
|---|
| 614 | # eat up queue first |
|---|
| 615 | return _tokenQueue.dequeue |
|---|
| 616 | if not _sourceLine or not _sourceLine.length |
|---|
| 617 | if not _obtainSource() |
|---|
| 618 | return nil |
|---|
| 619 | try |
|---|
| 620 | assert _tokenDefs |
|---|
| 621 | tokenDefs = _getCandidateTokenDefs(_sourceLine) |
|---|
| 622 | didCheckTokenDefs = true |
|---|
| 623 | for tokenDef in tokenDefs |
|---|
| 624 | count += 1 |
|---|
| 625 | if _skipMatchAttempt(tokenDef, _sourceLineIndex) |
|---|
| 626 | continue |
|---|
| 627 | #print '<> Trying to match [tokenDef]' |
|---|
| 628 | sourceLine = _sourceLine to ! |
|---|
| 629 | #print '_sourceLineIndex=[_sourceLineIndex]' |
|---|
| 630 | match = tokenDef.match(sourceLine) |
|---|
| 631 | if not match |
|---|
| 632 | #print '<> No match on [tokenDef] for [_sourceLine[_sourceLineIndex]]' |
|---|
| 633 | continue |
|---|
| 634 | _lastTokenDef = _curTokenDef |
|---|
| 635 | _curTokenDef = tokenDef # this enables .onTOKENWHICH methods to access the current tokenDef |
|---|
| 636 | text = match.text |
|---|
| 637 | #print '<> Match! [CobraCore.toTechString(text)] - [tokenDef]' |
|---|
| 638 | # tok = Token(_fileName, _lineNum, _colNum, _charNum, tokenDef.which, text, text) to ? # CC |
|---|
| 639 | tok as IToken? = Token(_fileName, _lineNum, _colNum, _charNum, tokenDef.which, text, text) |
|---|
| 640 | len = text.length |
|---|
| 641 | _curToken = tok |
|---|
| 642 | _colNum += len |
|---|
| 643 | _charNum += len |
|---|
| 644 | _sourceLineIndex += len |
|---|
| 645 | _sourceLine = _sourceLine.substring(len) |
|---|
| 646 | |
|---|
| 647 | # enable methods to customize handling of tokens |
|---|
| 648 | reinvoke = false |
|---|
| 649 | meth = _getTokenMethod(tok.which) |
|---|
| 650 | if meth |
|---|
| 651 | tok = _tokenPostProcess(meth, tok) |
|---|
| 652 | # nil indicates skipped token - will re call this method |
|---|
| 653 | if not tok |
|---|
| 654 | reinvoke = true # to pick up next token after skipped |
|---|
| 655 | |
|---|
| 656 | # finished with current line? |
|---|
| 657 | if _sourceLine.length==0 |
|---|
| 658 | _sourceLine = nil |
|---|
| 659 | _sourceLineIndex = -1 |
|---|
| 660 | _originalSourceLine = nil |
|---|
| 661 | |
|---|
| 662 | # handle token skipping |
|---|
| 663 | if reinvoke |
|---|
| 664 | tok = _nextToken |
|---|
| 665 | |
|---|
| 666 | # yay! |
|---|
| 667 | return tok |
|---|
| 668 | finally |
|---|
| 669 | _curTokenDef = nil |
|---|
| 670 | |
|---|
| 671 | # no match |
|---|
| 672 | if false |
|---|
| 673 | trace all |
|---|
| 674 | trace _lineNum, _colNum, _charNum |
|---|
| 675 | trace _sourceLine[0], _sourceLine[0] to int |
|---|
| 676 | trace _sourceLine |
|---|
| 677 | trace _sourceLineIndex |
|---|
| 678 | trace _originalSourceLine |
|---|
| 679 | .throwError('Lexical error: "[_originalSourceLine[_sourceLineIndex]]" ([_originalSourceLine[_sourceLineIndex] to int])') |
|---|
| 680 | # ^ no source location information is included in the error message as that should be pulled |
|---|
| 681 | # from TokenizerError.tokenzier.curToken or .lastToken |
|---|
| 682 | return nil |
|---|
| 683 | finally |
|---|
| 684 | if didCheckTokenDefs |
|---|
| 685 | _avgCount += count |
|---|
| 686 | _avgCountNum += 1 |
|---|
| 687 | |
|---|
| 688 | def _getCandidateTokenDefs(sourceLine as String?) as List<of TokenDef> |
|---|
| 689 | if _narrowTokenDefs and _tokenDefs.count >= _minNumTokenDefsToNarrow |
|---|
| 690 | assert _tokenDefsByFirstChar |
|---|
| 691 | assert _sourceLine.length |
|---|
| 692 | if _tokenDefsByFirstChar.containsKey(_sourceLine[0]) |
|---|
| 693 | # print 'Using short list for char: [_sourceLine[0]], [_sourceLine[0] to int]' |
|---|
| 694 | return _tokenDefsByFirstChar[_sourceLine[0]] |
|---|
| 695 | return _tokenDefs to ! |
|---|
| 696 | |
|---|
| 697 | def _skipMatchAttempt(tokenDef as TokenDef, sourceLineIndex as int) as bool |
|---|
| 698 | if tokenDef.ignoreCount |
|---|
| 699 | tokenDef.ignoreCount -= 1 |
|---|
| 700 | return true |
|---|
| 701 | if not tokenDef.isActive |
|---|
| 702 | return true |
|---|
| 703 | if sourceLineIndex>0 and tokenDef.requiresBOL |
|---|
| 704 | return true |
|---|
| 705 | if not .isActiveCall(tokenDef) |
|---|
| 706 | return true |
|---|
| 707 | return false |
|---|
| 708 | |
|---|
| 709 | def _getTokenMethod(which as String) as MethodInfo? |
|---|
| 710 | """ |
|---|
| 711 | Get method for token (On<WHICH>) using reflection. |
|---|
| 712 | Caching is used to boost performance. |
|---|
| 713 | """ |
|---|
| 714 | if not _onTokenTypeCache.containsKey(which) # not already in methods cache |
|---|
| 715 | methName = 'On' + which |
|---|
| 716 | meth = .getType.getMethod(methName) |
|---|
| 717 | _onTokenTypeCache[which] = meth |
|---|
| 718 | else |
|---|
| 719 | meth = _onTokenTypeCache[which] |
|---|
| 720 | return meth |
|---|
| 721 | |
|---|
| 722 | def _tokenPostProcess(meth as MethodInfo, tok as IToken?) as IToken? |
|---|
| 723 | """ |
|---|
| 724 | Invoke any tokenMethod and return the result which could the same token, a replacement |
|---|
| 725 | token or nil to skip the token. Any token returned could be the start of a token chain |
|---|
| 726 | via token.nextToken. |
|---|
| 727 | """ |
|---|
| 728 | try |
|---|
| 729 | tok = meth.invoke(this, @[tok]) to IToken? |
|---|
| 730 | catch tie as TargetInvocationException |
|---|
| 731 | throw tie.innerException to ! |
|---|
| 732 | if not tok |
|---|
| 733 | return nil # token is to be skipped |
|---|
| 734 | retTok = tok |
|---|
| 735 | tok = tok.nextToken |
|---|
| 736 | while tok |
|---|
| 737 | # TODO: could probably make this more efficient by axing the queue and just checking for nextToken in this method |
|---|
| 738 | _tokenQueue.enqueue(tok) # store any token chain returned by method call |
|---|
| 739 | tok = tok.nextToken |
|---|
| 740 | return retTok |
|---|
| 741 | |
|---|
| 742 | def pushTokenDefs(defs as List<of TokenDef>) |
|---|
| 743 | ensure |
|---|
| 744 | _tokenDefs is defs |
|---|
| 745 | _tokenDefsByWhich.count == defs.count |
|---|
| 746 | body |
|---|
| 747 | defsByWhich = Dictionary<of String, TokenDef>() |
|---|
| 748 | defsByFirstChar = Dictionary<of char, List<of TokenDef>>() |
|---|
| 749 | unknownFirstCharDefs = List<of TokenDef>() |
|---|
| 750 | n = 0 |
|---|
| 751 | for tokenDef in defs |
|---|
| 752 | tokenDef.number = n |
|---|
| 753 | n += 1 |
|---|
| 754 | assert not defsByWhich.containsKey(tokenDef.which), tokenDef |
|---|
| 755 | defsByWhich[tokenDef.which] = tokenDef |
|---|
| 756 | if _narrowTokenDefs |
|---|
| 757 | if tokenDef.firstChars.count |
|---|
| 758 | for c in tokenDef.firstChars |
|---|
| 759 | if defsByFirstChar.containsKey(c) |
|---|
| 760 | defsByFirstChar[c].add(tokenDef) |
|---|
| 761 | else |
|---|
| 762 | defsByFirstChar[c] = [tokenDef] |
|---|
| 763 | else |
|---|
| 764 | unknownFirstCharDefs.add(tokenDef) |
|---|
| 765 | if _narrowTokenDefs |
|---|
| 766 | v = false |
|---|
| 767 | for key as char in defsByFirstChar.keys |
|---|
| 768 | if v |
|---|
| 769 | print |
|---|
| 770 | print '***', key, defsByFirstChar[key] |
|---|
| 771 | t = defsByFirstChar[key] |
|---|
| 772 | t.addRange(unknownFirstCharDefs) |
|---|
| 773 | # sort by number |
|---|
| 774 | # CC: should specify the comparison here I think, since there is another place where they are sorted by length |
|---|
| 775 | t.sort |
|---|
| 776 | if v |
|---|
| 777 | print '{[key]}' |
|---|
| 778 | for i = 0 .. t.count |
|---|
| 779 | print ' [i]. [t[i]]' |
|---|
| 780 | _tokenDefsStack.push(TokenDefSet(defs, defsByWhich, defsByFirstChar)) |
|---|
| 781 | _tokenDefs = defs |
|---|
| 782 | _tokenDefsByWhich = defsByWhich |
|---|
| 783 | _tokenDefsByFirstChar = defsByFirstChar |
|---|
| 784 | |
|---|
def popTokenDefs
	"""
	Discards the top set of token definitions and makes the set beneath it current.
	If the stack becomes empty, the current token definitions are all set to nil.
	"""
	require
		_tokenDefsStack.count > 0
	body
		_tokenDefsStack.pop
		if _tokenDefsStack.count == 0
			_tokenDefs = nil
			_tokenDefsByWhich = nil
			_tokenDefsByFirstChar = nil
		else
			current = _tokenDefsStack.peek
			_tokenDefs = current.defs
			_tokenDefsByWhich = current.defsByWhich
			_tokenDefsByFirstChar = current.defsByFirstChar
| 802 | |
|---|
def isActiveCall(tok as TokenDef) as bool
	"""
	Always returns true; `tok` is not examined.
	"""
	return true
| 805 | |
|---|
def recordError(msg as String) as TokenizerError
	"""
	Creates a TokenizerError for this tokenizer with the given message, adds it to the
	recorded errors and returns it. Nothing is thrown.
	"""
	tokError = TokenizerError(this, msg)
	_errors.add(tokError)
	return tokError
| 810 | |
|---|
def throwError(msg as String)
	"""
	Records an error for `msg` (see recordError) and then throws it.
	"""
	# TODO: like parser, this should probably not be recorded unless it "makes it out". see comment in CobraParser.throwError
	recorded = .recordError(msg)
	throw recorded
| 814 | |
|---|
def addNoWarning(token as IToken)
	"""
	Adds the token's location, keyed as 'fileName:lineNum', to the no-warning lines.
	"""
	lineKey = '[token.fileName]:[token.lineNum]'
	_noWarningLines.add(lineKey)
| 817 | |
|---|
def checkTokens(tokens as List<of IToken>, expected as String)
	is shared
	"""
	Asserts that the `which` of each token, joined by single spaces, equals `expected`.
	Nothing is returned; a mismatch surfaces as an assertion failure.
	This is a utility method to aid with testing.
	"""
	whiches = List<of String>()
	for t in tokens
		whiches.add(t.which)
	tokensStr = String.join(' ', whiches.toArray)
	assert tokensStr==expected
| 832 | |
|---|
| 833 | |
|---|
class TokenDef implements IComparable is abstract
	"""
	Abstract base class for token definitions.

	A token def has a `which` (its name), a `number` that orders it relative to other defs
	(see compareTo), and a `match` method that subclasses implement to recognize the token at
	the start of some input.
	"""

	var _number as int
	var _which as String
	var _requiresBOL as bool
	var _ignoreCount as int
	var _isActive = true
	var _isActiveCall as Object?  # @@@@??

	cue init(which as String)
		require
			which.length
		body
			base.init
			_which = which

	pro number from var

	pro which from var

	get firstChars as List<of char> is abstract
		"""
		Returns the characters a match can begin with.
		Tokenizer.pushTokenDefs treats an empty list as "first char unknown" and tries such a
		def for every character.
		"""

	get length as int is abstract
		"""
		Returns the textual length of the token definition.
		This is only important for the unordered token specs which are arranged longest to shortest.
		Ordered token specs are left in their specific order.
		"""

	get requiresBOL from var
		"""
		Return true if the token def will only match the very beginning of a line.
		In practice, this is false for most token def, and when true, the token def is often
		consuming the entire line or some leading whitespace.
		"""

	pro ignoreCount from var

	pro isActive from var

	def match(input as String) as TokenMatch? is abstract
		require input.length

	def toString as String is override
		return '[.getType.name]([_innerToString()])'

	def _innerToString as String
		"""
		Returns the text shown between the parens in toString. Subclasses override to add detail.
		"""
		return '[_number], [_which]'

	def compareTo(obj as Object?) as int
		"""
		Orders token defs by `number`.

		Returns a positive value for nil, since the IComparable contract says any instance
		compares greater than null. Compared with a TokenDef, returns the difference of the
		two numbers. Compared with any other object, falls back to comparing type names.
		"""
		if obj is nil
			return 1
		if obj inherits TokenDef
			# Plain subtraction; Tokenizer.pushTokenDefs assigns numbers counting up from 0.
			return _number - obj.number
		else
			return .getType.name.compareTo(obj.getType.name)
| 890 | |
|---|
| 891 | |
|---|
class TokenRegexDef inherits TokenDef
	"""
	A token definition that matches based on a regular expression.

	The regex is always anchored to the start of the input: a '^' is prepended to the source
	unless it already starts with one. Regex objects are cached by anchored source and shared
	between instances.
	"""

	shared
		var _compiledRegExes = Dictionary<of String, Regex>()

	var _regExSource as String
	var _re as Regex
	var _length as int
	var _firstChars as List<of char>

	cue init(which as String, regExSource as String)
		require
			which.length
			regExSource.length
		body
			base.init(which)
			_requiresBOL = regExSource.startsWith('^')
			_firstChars = _computeFirstChars(regExSource)
			_regExSource = regExSource  # the source exactly as given, without any added '^'
			if not _requiresBOL
				regExSource = '^' + regExSource
			# NOTE(review): measured after the '^' is prepended, so sources given without a
			# leading '^' count one extra character -- confirm this is intended.
			_length = regExSource.length
			if _compiledRegExes.containsKey(regExSource)
				_re = _compiledRegExes[regExSource]
			else
				# Note: Making the regex compiled has almost no effect on performance on either
				# .NET or Mono. Either the overhead of creating a compiled version of the regex
				# erases the gain (most likely based on web research) or the compiled expressions
				# don't run fast enough to make a difference.
				_re = Regex(regExSource, RegexOptions.Compiled)
				_compiledRegExes[regExSource] = _re

	def _computeFirstChars(s as String) as List<of char>
		"""
		Returns the characters a match of the regex source `s` can begin with, or an empty list
		when that cannot be told from the handful of pattern prefixes recognized here.

		An empty list is the safe answer: Tokenizer.pushTokenDefs then tries the def for every
		character. A wrong non-empty answer would cause the def to be skipped.
		"""
		# TODO: this is actually specific to Cobra, so it should really be in CobraTokenizer.
		# Maybe that can be done via a callback/delegate.
		t = List<of char>()
		if s.startsWith(r'[A-Za-z_]')
			for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_'
				t.add(c)
		else if s.startsWith(r'\d')
			for c in '0123456789'
				t.add(c)
		else if s.startsWith(r'\t')
			t.add(c'\t')
		else if s.startsWith(r'\n')
			t.add(c'\n')
		else if s[0] == c'\\'
			# Only a backslash followed by a non-alphanumeric character is an escaped literal
			# (such as \. or \+ or \( ). A backslash followed by a letter or digit is a class,
			# anchor, control or numeric escape (\s \w \b \r \x41 ...) whose first character
			# is not that letter, so it is left as unknown. The length check covers a source
			# that is a lone backslash.
			if s.length > 1 and s[1] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
				t.add(s[1])
		else if s.startsWith(r'^[\t]')
			t.add(c'\t')
		else if s.startsWith(r'^[ ]+') or s.startsWith(r'[ ]+')
			t.add(c' ')
		else if s.startsWith(r'[ \t]+')
			t.add(c' ')
			t.add(c'\t')
		else if s == r'[ \t]*'
			t.add(c' ')
			t.add(c'\t')
		else if s[0] not in r'^\[.'
			t.add(s[0])
		# else
		#	trace s
		return t

	get firstChars from var is override

	get length from var is override

	get regExSource from var

	get re from var

	def match(input as String) as TokenMatch? is override
		"""
		Returns a TokenRegexMatch if the regex matches at the start of `input`, otherwise nil.
		"""
		# trace _regExSource  # see ..\Developer\most-common-regexes.text
		reMatch = _re.match(input)
		if reMatch.success
			assert reMatch.index == 0
			assert reMatch.value
			return TokenRegexMatch(reMatch)
		else
			return nil

	def _innerToString as String is override
		return base._innerToString + ', [_regExSource], [_re]'
| 979 | |
|---|
| 980 | |
|---|
class TokenMatch
	"""
	Result object returned by TokenDef.match. Holds the text that was matched.
	"""

	var _text as String

	cue init(text as String)
		base.init
		_text = text

	get text as String
		return _text
| 993 | |
|---|
| 994 | |
|---|
class TokenRegexMatch inherits TokenMatch
	"""
	A TokenMatch produced from a regex Match. Its text is the value of the Match, and the
	Match itself stays available through `match`.
	"""

	var _match as Match

	cue init(match as Match)
		base.init(match.value to !)
		_match = match

	get match as Match
		return _match
| 1004 | |
|---|
| 1005 | |
|---|
class TokenDefSet
	"""
	Bundles one list of token defs with its two lookup tables: by `which` and by first char.
	"""

	var _defs as List<of TokenDef>
	var _defsByWhich as Dictionary<of String, TokenDef>
	var _defsByFirstChar as Dictionary<of char, List<of TokenDef>>

	cue init(defs as List<of TokenDef>, defsByWhich as Dictionary<of String, TokenDef>, defsByFirstChar as Dictionary<of char, List<of TokenDef>>)
		require
			defs.count
			defsByWhich.count
			defsByWhich.count == defs.count
			# defsByFirstChar.count is not required because having the defs indexed by first char is an optional optimization with no change in behaviour
		body
			base.init
			_defsByFirstChar = defsByFirstChar
			_defsByWhich = defsByWhich
			_defs = defs

	get defs as List<of TokenDef>
		return _defs

	get defsByWhich as Dictionary<of String, TokenDef>
		return _defsByWhich

	get defsByFirstChar as Dictionary<of char, List<of TokenDef>>
		return _defsByFirstChar
| 1029 | |
|---|
| 1030 | |
|---|
| 1031 | class TestTokenizer inherits Tokenizer |
|---|
| 1032 | |
|---|
| 1033 | var _idCount as int |
|---|
| 1034 | |
|---|
| 1035 | get idCount from var |
|---|
| 1036 | |
|---|
| 1037 | get orderedTokenSpecs as List<of String> is override |
|---|
| 1038 | return [ |
|---|
| 1039 | 'OPEN_IF ifx\\(', |
|---|
| 1040 | ns'ID [A-Za-z_][A-Za-z0-9_]*', |
|---|
| 1041 | ns'SPACE [ ]+', |
|---|
| 1042 | 'NEWLINE \\n', |
|---|
| 1043 | ] |
|---|
| 1044 | |
|---|
| 1045 | get unorderedTokenSpecs as List<of String> is override |
|---|
| 1046 | return [ |
|---|
| 1047 | 'DOT \\.', |
|---|
| 1048 | 'COLON :', |
|---|
| 1049 | 'PLUS \\+', |
|---|
| 1050 | 'ASSIGN =', |
|---|
| 1051 | 'EQUALS ==', |
|---|
| 1052 | ] |
|---|
| 1053 | |
|---|
| 1054 | get keywords as IList<of String> is override |
|---|
| 1055 | return ['assert', 'if', 'else'] |
|---|
| 1056 | |
|---|
| 1057 | def _reset is override |
|---|
| 1058 | base._reset |
|---|
| 1059 | _idCount = 0 |
|---|
| 1060 | |
|---|
| 1061 | def onID(tok as IToken) as IToken? |
|---|
| 1062 | _idCount += 1 |
|---|
| 1063 | return tok |
|---|
| 1064 | |
|---|
| 1065 | def onSPACE(tok as IToken) as IToken? |
|---|
| 1066 | return nil |
|---|
| 1067 | |
|---|
| 1068 | def onNEWLINE(tok as IToken) as IToken? |
|---|
| 1069 | return tok |
|---|
| 1070 | |
|---|
| 1071 | def onDOT(tok as IToken) as IToken? |
|---|
| 1072 | tok.nextToken = tok.copy |
|---|
| 1073 | return tok |
|---|
| 1074 | |
|---|
| 1075 | test |
|---|
| 1076 | # basics |
|---|
| 1077 | tt = TestTokenizer() |
|---|
| 1078 | tt.startSource('hello there') |
|---|
| 1079 | tokens = tt.allTokens |
|---|
| 1080 | .checkTokens(tokens, 'ID ID NEWLINE') |
|---|
| 1081 | assert tt.idCount==2 |
|---|
| 1082 | |
|---|
| 1083 | # the tokenizer lets methods insert tokens |
|---|
| 1084 | tt.restart |
|---|
| 1085 | tt.startSource('hello.there') |
|---|
| 1086 | tokens = tt.allTokens |
|---|
| 1087 | .checkTokens(tokens, 'ID DOT DOT ID NEWLINE') |
|---|
| 1088 | |
|---|
| 1089 | # tokens know their line numbers and columns and lengths |
|---|
| 1090 | tt.restart |
|---|
| 1091 | tt.startSource('hello\nthere\n you\n ') |
|---|
| 1092 | tokens = for tok in tt.allTokens where tok.which <> 'NEWLINE' |
|---|
| 1093 | assert tokens.count==3 |
|---|
| 1094 | hello = tokens[0] # CC: hello, there, you = tokens |
|---|
| 1095 | there = tokens[1] |
|---|
| 1096 | you = tokens[2] |
|---|
| 1097 | |
|---|
| 1098 | assert hello.lineNum==1 |
|---|
| 1099 | assert hello.colNum==1 |
|---|
| 1100 | assert hello.length==5 |
|---|
| 1101 | assert there.lineNum==2 |
|---|
| 1102 | assert there.colNum==1 |
|---|
| 1103 | assert there.length==5 |
|---|
| 1104 | assert you.lineNum==3 |
|---|
| 1105 | assert you.colNum==2 |
|---|
| 1106 | assert you.length==3 |
|---|