""" Rules: * If an numeric literal contains a decimal point or the exponential symbol (E), it is a number (decimal, float64, etc.). * Else, the type is the first of int, uint, int64, uint64 that will fit the value. """ use System.Collections use System.Text.RegularExpressions class CobraTokenizer inherits Tokenizer test # There are plenty of external Cobra source code tests that will exercise the lexer. # But here are a few basic tests to make sure the tokenizer has some viability. t = CobraTokenizer() # TODO: t.startSource('foo bar') tokens = t.allTokens Tokenizer.checkTokens(tokens, 'ID ID EOL') t.restart t.startSource('class Foo\n\tdef foo()\n\t\treturn 1') # t.startSource( #'''class Foo # def foo() # return 1 #''') tokens = t.allTokens Tokenizer.checkTokens(tokens, 'CLASS ID EOL INDENT DEF OPEN_CALL RPAREN EOL INDENT RETURN INTEGER_LIT EOL DEDENT DEDENT') t.restart t.startSource('class Foo\n\tpass\n\nclass Bar\n\tpass') # t.startSource( # '''class Foo # pass # # class Bar # pass # ''') tokens = t.allTokens Tokenizer.checkTokens(tokens, 'CLASS ID EOL INDENT PASS EOL EOL DEDENT CLASS ID EOL INDENT PASS EOL DEDENT') t.restart t.startSource('class Foo\n\tdef foo()\n\t\treturn 1\n\nclass Bar\n\tpass\n') # t.startSource( #'''class Foo # def foo() # return 1 # #class Bar # pass #''') tokens = t.allTokens Tokenizer.checkTokens(tokens, 'CLASS ID EOL INDENT DEF OPEN_CALL RPAREN EOL INDENT RETURN INTEGER_LIT EOL EOL DEDENT DEDENT CLASS ID EOL INDENT PASS EOL DEDENT') var _indentCount as int var _substLBracketCount as int var _inSubstStringSingle = false var _inSubstStringDouble = false var _inDocString = false var _inCommentBlock = 0 cue init base.init cue init(verbosity as int) base.init _verbosity = verbosity def _reuse base._reuse _indentCount = 0 _substLBracketCount = 0 _inSubstStringSingle = false _inSubstStringDouble = false _inDocString = false _inCommentBlock = 0 def addInfo(sb as StringBuilder) is override base.addInfo(sb) sb.append('_indentCount=[_indentCount], ') sb.append('_substLBracketCount=[_substLBracketCount], ') sb.append('_inSubstStringSingle=[_inSubstStringSingle], ') sb.append('_inSubstStringDouble=[_inSubstStringDouble], ') sb.append('_inDocString=[_inDocString]') sb.append('_inCommentBlock=[_inCommentBlock]') pro willReturnComments from var as bool pro willReturnDirectives from var as bool # Note: The Tokenizer class handles it's input one line at a time, # and retains the \n at the end of the line. This affects # the regex design for the tokens below. 
	def orderedTokenSpecs as IList is override
		return [
			# whitespace
			WhiteSpaceLineTokenDef(r'^[\t ]+$'),
			WhiteSpaceCommentTokenDef(1, r'^[\t]+[ ]*\#.*$'),
			WhiteSpaceCommentTokenDef(2, r'^[ ]+[\t]*\#.*$'),
			SingleLineCommentBlockTokenDef(r'^[ \t]*\/\#.*\#\/[ \t]*$'),
			CommentBlockStartTokenDef(r'^[ \t]*\/\#.*$'),
			IndentMixedTokenDef(1, r'^[\t]+[ ]+(?=[^\t ])'),
			IndentMixedTokenDef(2, r'^[\t]+[ ]+'),
			r'INDENT_MIXED_ST		r ^[ ]+[\t]+',
			IndentAllTokenDef('INDENT_ALL_TABS', c'\t', r'^[\t]+'),
			IndentAllTokenDef('INDENT_ALL_SPACES', c' ', r'^[ ]+'),
			r'NO_INDENT				r ^(?=[^\t\n#\/])',
			CharTokenDef('EOL', c'\n'),
			r'INLINE_COMMENT		r \/\#.*\#/',
			r'SINGLE_LINE_COMMENT	r \#.*',
			r'AMBIGUOUS_COMMENT		r \/\#.*',
			SpaceTokenDef('SPACE', r'[ \t]+'),
			r'AT_ID					r @[A-Za-z_][A-Za-z0-9_]*',
			OpenComboTokenDef(),
			# OpenGenericTokenDef('OPEN_GENERIC', r'[A-Za-z_][A-Za-z0-9_]*<of(\s|(?=[,>]))'),
			# r'OPEN_DO				s do(',
			# r'OPEN_IF				s if(',
			# OpenCallTokenDef('OPEN_CALL', r'[A-Za-z_][A-Za-z0-9_]*\('),
			r'HEX_LIT_UNSIGN		r 0x[\dA-Fa-f][\dA-Fa-f]*(_?u)(8|16|32|64)?',
			r'HEX_LIT_EXPLICIT		r 0x[\dA-Fa-f][\dA-Fa-f]*_(8|16|32|64)?',
			r'HEX_LIT				r 0x[\dA-Fa-f][\dA-Fa-f]*',
			r'FLOAT_LIT_1			r \d[\d_]*\.\d+_?f(32|64)?',
			r'FLOAT_LIT_2			r \d[\d_]*(_?f)(32|64)?',
			r'DECIMAL_LIT			r \d[\d_]*(\.\d+)?(_?d)',
			r'NUMBER_LIT			r \d[\d_]*(\.\d+)?(_?n)',
			r'FRACTIONAL_LIT		r \d[\d_]*\.\d+',
			r'INTEGER_LIT_EXPLICIT	r \d[\d_]*_?[iu](8|16|32|64)?',
			r'INTEGER_LIT			r \d[\d_]*',
			r'INT_SIZE				r int[0-9]+(?=[^A-Za-z0-9_])',
			r'UINT_SIZE				r uint[0-9]+(?=[^A-Za-z0-9_])',
			r'FLOAT_SIZE			r float[0-9]+(?=[^A-Za-z0-9_])',
			r"CHAR_LIT_SINGLE		r c'(?:\\'|\\?[^'])'",
			r'CHAR_LIT_DOUBLE		r c"(?:\\"|\\?[^"])"',
			# doc strings
			r'DOC_STRING_START		r """[ \t]*\n',
			r'DOC_STRING_LINE		r """.*"""[ \t]*\n',
			# sharp strings
			r"SHARP_SINGLE			r sharp'(?:\\.?|[^'\n])*'",
			r'SHARP_DOUBLE			r sharp"(?:\\.?|[^"\n])*"',
			# raw strings
			r"STRING_RAW_SINGLE		r r'(?:\\.?|[^'\n])*'",
			r'STRING_RAW_DOUBLE		r r"(?:\\.?|[^"\n])*"',
			# substituted strings
			r'RBRACKET_SPECIAL		s ]',
			StringStartTokenDef('STRING_START_SINGLE', c"'"),
			StringPartTokenDef('STRING_PART_SINGLE', c"'"),
			StringStopTokenDef('STRING_STOP_SINGLE', c"'"),
			StringStartTokenDef('STRING_START_DOUBLE', c'"'),
			StringPartTokenDef('STRING_PART_DOUBLE', c'"'),
			StringStopTokenDef('STRING_STOP_DOUBLE', c'"'),
			r'STRING_PART_FORMAT	r :[^X"\n\[]*(?=])'.replace('X', "'"),
			# plain strings
			r"STRING_NOSUB_SINGLE	r ns'(?:\\.?|[^'\n])*'",
			r'STRING_NOSUB_DOUBLE	r ns"(?:\\.?|[^"\n])*"',
			StringLiteralTokenDef('STRING_SINGLE', c"'"),
			StringLiteralTokenDef('STRING_DOUBLE', c'"'),
			r'TOQ					s to?',
			IdTokenDef('ID', r'[A-Za-z_][A-Za-z0-9_]*'),
		]
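	# The plain-string entries above and below are compact specs, apparently of the form
	# 'WHICH r <regex>' or 'WHICH s <literal text>', parsed by the base Tokenizer.
	# For example, r'TOQ  s to?' matches the literal characters "to?" while
	# r'AT_ID  r @[A-Za-z_][A-Za-z0-9_]*' is a regular expression; the TokenDef
	# instances mixed into the list are hand-rolled matchers for the hot paths.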
	def unorderedTokenSpecs as IList is override
		return [
			r'SHARP_OPEN			s $sharp(',  # deprecated. $ is reserved for future language level regex support
			r"SINGLE_QUOTE			s '",
			r'DOUBLE_QUOTE			s "',
			r'DOT					s .',
			r'DOTDOT				s ..',
			r'COLON					s :',
			r'PLUS					s +',
			r'PLUSPLUS				s ++',
			r'MINUSMINUS			s --',
			r'MINUS					s -',
			r'STARSTAR				s **',
			r'STAR					s *',
			r'SLASHSLASH			s //',
			r'SLASH					s /',
			r'PERCENTPERCENT		s %%',
			r'PERCENT				s %',
			r'ASSIGN				s =',
			r'LPAREN				s (',
			r'RPAREN				s )',
			r'LBRACKET				s [',
			r'RBRACKET				s ]',
			r'LCURLY				s {',
			r'RCURLY				s }',
			r'SEMI					s ;',
			r'COMMA					s ,',
			r'DOUBLE_LT				s <<',
			r'DOUBLE_GT				s >>',
			r'QUESTION				s ?',
			r'BANG					s !',
			r'ARRAY_OPEN			s @[',
			r'AMPERSAND				s &',
			r'VERTICAL_BAR			s |',
			r'CARET					s ^',
			r'TILDE					s ~',
			r'EQ					s ==',
			r'NE					s <>',
			r'LT					s <',
			r'GT					s >',
			r'LE					s <=',
			r'GE					s >=',
			r'PLUS_EQUALS			s +=',
			r'MINUS_EQUALS			s -=',
			r'STAR_EQUALS			s *=',
			r'SLASH_EQUALS			s /=',
			r'SLASHSLASH_EQUALS		s //=',
			r'PERCENT_EQUALS		s %=',
			r'STARSTAR_EQUALS		s **=',
			r'AMPERSAND_EQUALS		s &=',
			r'VERTICAL_BAR_EQUALS	s |=',
			r'CARET_EQUALS			s ^=',
			r'DOUBLE_LT_EQUALS		s <<=',
			r'DOUBLE_GT_EQUALS		s >>=',
			r'QUESTION_EQUALS		s ?=',
			r'BANG_EQUALS			s !=',
		]

	get keywords as IList is override
		return KeywordSpecs.keywords

	pro typeProvider from var as ITypeProvider?

	def _reset is override
		base._reset
		_indentCount = 0
		_substLBracketCount = 0

	def allTokens as List<of IToken>
		tokens = base.allTokens
		if _inCommentBlock, .throwError('Unfinished multi-line comment.')
		return tokens

	## popular single chars

	def makeASSIGN(definition as String) as TokenDef?
		return CharTokenDef('ASSIGN', c'=')

	def makeDOT(definition as String) as TokenDef?
		return CharTokenDef('DOT', c'.')

	def makeCOMMA(definition as String) as TokenDef?
		return CharTokenDef('COMMA', c',')

	def makeLPAREN(definition as String) as TokenDef?
		return CharTokenDef('LPAREN', c'(')

	def makeRPAREN(definition as String) as TokenDef?
		return CharTokenDef('RPAREN', c')')

	def makeLBRACKET(definition as String) as TokenDef?
		return CharTokenDef('LBRACKET', c'[')

	def makeRBRACKET(definition as String) as TokenDef?
		return CharTokenDef('RBRACKET', c']')

	## end popular single chars

	## popular strings

	def makeDOTDOT(definition as String) as TokenDef?
		return StringTokenDef('DOTDOT', '..')

	def makeOPEN_IF(definition as String) as TokenDef?
		return StringTokenDef('OPEN_IF', 'if(')

	def makeTOQ(definition as String) as TokenDef?
		return StringTokenDef('TOQ', 'to?')

	def makeOPEN_DO(definition as String) as TokenDef?
		return StringTokenDef('OPEN_DO', 'do(')

	def makeEQ(definition as String) as TokenDef?
		return StringTokenDef('EQ', '==')

	def makeNE(definition as String) as TokenDef?
		return StringTokenDef('NE', '<>')

	## end popular strings
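	# Presumably the base Tokenizer finds these make<WHICH> factory methods by naming
	# convention and calls them with the matching spec's definition text, so that a
	# common spec is served by a cheap CharTokenDef or StringTokenDef instead of a
	# general matcher. For example:
	#   r'ASSIGN  s ='   ->  makeASSIGN  ->  CharTokenDef('ASSIGN', c'=')
	#   r'EQ      s =='  ->  makeEQ      ->  StringTokenDef('EQ', '==')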
	def afterStart is override
		base.afterStart
		# CC:
		# _tokenDefsByWhich['STRING_PART_SINGLE'].isActiveCall = def(tokenizer)=tokenizer.inSubstStringSingle
		# _tokenDefsByWhich['STRING_STOP_SINGLE'].isActiveCall = def(tokenizer)=tokenizer.inSubstStringSingle
		# recover from multiline comments
		while _tokenDefsByWhich.containsKey('COMMENT_BLOCK_LINE')
			.popTokenDefs
		_inCommentBlock = 0
		inactivate = [
			'RBRACKET_SPECIAL',
			'STRING_PART_SINGLE',
			'STRING_STOP_SINGLE',
			'STRING_PART_DOUBLE',
			'STRING_STOP_DOUBLE',
			'STRING_PART_FORMAT',
		]
		for which in inactivate
			_tokenDefsByWhich[which].isActive = false

	def isActiveCall(tok as TokenDef) as bool is override
		if tok.which == 'STRING_PART_SINGLE' or tok.which == 'STRING_STOP_SINGLE'
			return _inSubstStringSingle
		return true

	def _nextToken as IToken is override
		# overridden to deliver the final DEDENTs to close out indentation
		tok = base._nextToken
		if tok.isEOF
			colNum = 0
			while _indentCount > 0
				if colNum == 0, colNum = .lastToken.colNum + 1
				t = Token(.lastToken.fileName, .lastToken.lineNum, colNum, .lastToken.charNum, 'DEDENT', '', '')
				_tokenQueue.enqueue(t)
				_indentCount -= 1
				colNum += 1
			if _tokenQueue.count, return _nextToken
		return tok

	def onWHITESPACE_LINE(tok as IToken) as IToken?
		# Eat these.
		# Don't muck with perceived indentation level as
		# these kinds of lines are irrelevant.
		#print '<> onWHITESPACE_LINE'
		return nil

	def onWHITESPACE_COMMENT_1(tok as IToken) as IToken?
		#print '<> onWHITESPACE_COMMENT_1'
		if .checkForCommentDirective(tok)
			return .directiveToken(tok)
		else
			return .commentToken(tok)

	def onWHITESPACE_COMMENT_2(tok as IToken) as IToken?
		#print '<> onWHITESPACE_COMMENT_2'
		if .checkForCommentDirective(tok)
			return .directiveToken(tok)
		else
			return .commentToken(tok)

	##
	## Comment out block
	##

	var _commentBlockDefs as List<of TokenDef>?

	def onCOMMENT_BLOCK_START(tok as IToken) as IToken?
		#print '<> onCOMMENT_BLOCK_START', tok
		assert _inCommentBlock >= 0, tok
		# narrow the tokenizer's token defs to a new shorter set
		if _inCommentBlock == 0
			if _commentBlockDefs is nil
				defs = List<of TokenDef>()
				defs.add(.tokenDefsByWhich['SINGLE_LINE_COMMENT_BLOCK'])
				defs.add(.tokenDefsByWhich['COMMENT_BLOCK_START'])
				defs.add(TokenRegexDef('COMMENT_BLOCK_STOP', r'[^#]*\#\/.*$'))  # to-do: the .*$ portion looks wrong as in that text will never be processed
				defs.add(TokenRegexDef('COMMENT_BLOCK_LINE', '.*\n'))
				_commentBlockDefs = defs
			.pushTokenDefs(_commentBlockDefs to !)
		_inCommentBlock += 1
		return .commentToken(tok)

	def onCOMMENT_BLOCK_LINE(tok as IToken) as IToken?
		#print '<> onCOMMENT_BLOCK_LINE', tok.lineNum
		assert _inCommentBlock > 0, tok
		return .commentToken(tok)

	def onCOMMENT_BLOCK_STOP(tok as IToken) as IToken?
		#print '<> onCOMMENT_BLOCK_STOP', tok.lineNum
		assert _inCommentBlock > 0, tok
		_inCommentBlock -= 1
		if _inCommentBlock == 0, .popTokenDefs
		return .commentToken(tok)
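	# Sketch of the def-stack trick above: the first /# pushes a narrowed def set
	# (single-line blocks, block starts, block stops and raw COMMENT_BLOCK_LINEs), so
	# until the matching #/ nothing but comment tokens can match. A nested /# line bumps
	# _inCommentBlock and each #/ line drops it; only when the count returns to 0 does
	# .popTokenDefs restore the full def set. afterStart's recovery loop above relies
	# on this pairing when a source file ends inside an unfinished block.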
	def onINDENT_MIXED_TSNS(tok as IToken) as IToken?
		# expecting tabs, spaces, non-whitespace
		assert tok.text.startsWith('\t')
		assert tok.text.endsWith(' ')
		# this is okay on continued lines
		if .justDidLineContinuation
			indentLevel = tok.text.count(c'\t') + tok.text.count(c' ') // 4
			return _processNumIndentLevels(indentLevel)  # will check continuation indentation rules
		else
			return .onINDENT_MIXED_TS(tok)

	def onINDENT_MIXED_TS(tok as IToken) as IToken?
		sb = StringBuilder()
		for c in tok.text
			branch c
				on c' ', sb.append(r'[SPACE]')
				on c'\t', sb.append(r'[TAB]')
				else, sb.append(c)
		# to-do: for people using tabs a stray space is most common error here,
		# such as ...[TAB][SPACE][TAB]...
		# could probably detect this and invoke .recordError so compilation can continue.
		.throwError('Cannot mix tabs and spaces in indentation. [sb]...')
		return nil  # make compiler happy.

	def onINDENT_MIXED_ST(tok as IToken) as IToken?
		return .onINDENT_MIXED_TS(tok)

	def onINDENT_ALL_TABS(tok as IToken) as IToken?
		numTabs = tok.text.count(c'\t')
		return _processNumIndentLevels(numTabs)

	def onINDENT_ALL_SPACES(tok as IToken) as IToken?
		numSpaces = tok.text.count(c' ')
		if numSpaces % 4 and not .justDidLineContinuation
			# yes, 4. hard coded, intentionally.
			# TODO: should really just record an error and take (numSpaces/4).round as the indent
			.throwError('Space-based indentation must be a multiple of 4. This line has a remainder of [numSpaces%4].')
		return _processNumIndentLevels(numSpaces // 4)

	def onNO_INDENT(tok as IToken) as IToken?
		require tok.text == ''
		_curTokenDef.ignoreCount = 1
		t = _processNumIndentLevels(0)
		return t

	def _processNumIndentLevels(numTabs as int) as IToken?
		if .justDidLineContinuation
			if numTabs < _indentCount
				.recordError('Must indent same amount or more on a continued line.')
			return nil
		firstTok as IToken?
		lastTok as IToken?
		while numTabs > _indentCount
			_indentCount += 1
			newTok = Token(_fileName, _lineNum, 1, _charNum, 'INDENT', '', '')
			if lastTok
				lastTok.nextToken = newTok
				lastTok = newTok
			else
				firstTok = lastTok = newTok
		if firstTok
			return firstTok
		while numTabs < _indentCount
			_indentCount -= 1
			newTok = Token(_fileName, _lineNum, 1, _charNum, 'DEDENT', '', '')
			if lastTok
				lastTok.nextToken = newTok
				lastTok = newTok
			else
				firstTok = lastTok = newTok
		return firstTok

	var _didLineContinuation as bool  # only meaningful after an EOL

	get justDidLineContinuation as bool
		return .lastToken.which == 'EOL' and _didLineContinuation

	def onEOL(tok as IToken) as IToken?
		_didLineContinuation = .lastToken.text == '_' and .lastToken.which == 'ID'
		return tok

	def onSINGLE_LINE_COMMENT_BLOCK(tok as IToken) as IToken?
		return .onSINGLE_LINE_COMMENT(tok)

	def onSINGLE_LINE_COMMENT(tok as IToken) as IToken?
		return if(.checkForCommentDirective(tok), .directiveToken(tok), .commentToken(tok))

	def onINLINE_COMMENT(tok as IToken) as IToken?
		return .commentToken(tok)

	def onAMBIGUOUS_COMMENT(tok as IToken) as IToken?
		.throwError('Ambiguous comment at /#. For an end-of-line comment, put a space between / and #. For an inline comment, end it with #/. For a block comment, put /# at the beginning of a line.')
		return .commentToken(tok)

	def onSPACE(tok as IToken) as IToken?
		# eat these
		return nil

	def onAT_ID(tok as IToken) as IToken?
		tok.value = tok.text[1:]
		return tok

	def onOPEN_CALL(tok as IToken) as IToken?
		tok.value = tok.text[:-1]
		return tok

	def onOPEN_GENERIC(tok as IToken) as IToken?
		require tok.text.trim.endsWith('<of')
		return tok

	def onID(tok as IToken) as IToken?
		if tok.which <> 'ID'
			tok.isKeyword = true
		return tok
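	# The handlers below implement the literal-typing rules from the top of this file.
	# A few informal examples of how the specs and handlers combine:
	#   '3.14'    -> FRACTIONAL_LIT, typed per .typeProvider.numberType (decimal by default)
	#   '3.14f'   -> FLOAT_LIT with info=64; '3.14_f32' -> FLOAT_LIT with info=32
	#   '5d'      -> DECIMAL_LIT; '5n' -> handled like a fractional literal
	#   '42'      -> INTEGER_LIT, sized by _inferIntSize (first of int, uint, int64, uint64 that fits)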
	def onFLOAT_LIT_1(tok as IToken) as IToken?
		ensure
			result.which == 'FLOAT_LIT'
			(result.info to int) in [32, 64]  # CC: axe cast
		body
			s = tok.text.replace('_', '')
			if s.endsWith('f')
				size = 64
				s = s[:-1]
			else if s.endsWith('f32')
				size = 32
				s = s[:-3]
			else if s.endsWith('f64')
				size = 64
				s = s[:-3]
			else
				# cannot have other size specs given regex
				size = 64
			try
				tok.value = float.parse(s, Utils.cultureInfoForNumbers)
			catch FormatException
				assert false, 'not expecting to get here given regex'
			catch OverflowException
				.recordError('Range overflow for float literal "[tok.text]".')
			tok.which = 'FLOAT_LIT'
			tok.info = size
			return tok

	def onFLOAT_LIT_2(tok as IToken) as IToken?
		ensure
			result.which == 'FLOAT_LIT'
			(result.info to int) in [32, 64]  # CC: axe cast
		body
			return .onFLOAT_LIT_1(tok)

	def onDECIMAL_LIT(tok as IToken) as IToken?
		s = tok.text
		assert s.endsWith('d')
		s = s[:-1]
		s = s.replace('_', '')
		try
			tok.value = decimal.parse(s, Utils.cultureInfoForNumbers)
		catch FormatException
			assert false, 'not expecting to get here given regex'
		catch OverflowException
			.recordError('Range overflow for decimal literal "[tok.text]".')
		return tok

	def onNUMBER_LIT(tok as IToken) as IToken?
		require tok.text.endsWith('n')
		tok.which = 'FRACTIONAL_LIT'
		tok.text = tok.text[:-1]
		return .onFRACTIONAL_LIT(tok)

	def onFRACTIONAL_LIT(tok as IToken) as IToken?
		s = tok.text.replace('_', '')
		try
			assert _typeProvider
			numberType = if(.typeProvider, .typeProvider.numberType, DecimalType())
			# parse literal to same type as numberType
			if numberType inherits DecimalType
				tok.value = decimal.parse(s, Utils.cultureInfoForNumbers)
			else if numberType inherits FloatType
				tok.value = float.parse(s, Utils.cultureInfoForNumbers)
				tok.which = 'FLOAT_LIT'
				tok.info = numberType.size
			else
				throw FallThroughException(numberType)
		catch FormatException
			assert false, 'not expecting to get here given regex'
		catch OverflowException
			.recordError('[numberType.name.capitalized] range overflow for fractional literal "[tok.text]".')
		return tok

	def onINTEGER_LIT(tok as IToken) as IToken?
		try
			val = uint64.parse(tok.text.replace('_', ''), Utils.cultureInfoForNumbers)
		catch FormatException
			assert false, 'not expecting to get here given regex'
		catch OverflowException
			.recordError('Range overflow for integer literal "[tok.text]".')
			return tok
		_inferIntSize(tok, val)
		return tok

	def onINTEGER_LIT_EXPLICIT(tok as IToken) as IToken?
		require
			'i' in tok.text or 'u' in tok.text
		ensure
			tok.which == 'INTEGER_LIT'
			(tok.info to int) in [-8, 8, -16, 16, -32, 32, -64, 64]  # CC: axe cast
		body
			s = tok.text.replace('_', '')
			for c in s, if not c.isDigit, break
			# c will be 'i' or 'u'
			if s[s.length-1] == c
				size = 32
				s = s[:-1]
			else if s.endsWith('32')
				size = 32
				s = s[:-3]
			else if s.endsWith('64')
				size = 64
				s = s[:-3]
			else if s.endsWith('16')
				size = 16
				s = s[:-3]
			else if s.endsWith('8')
				size = 8
				s = s[:-2]
			else
				# cannot have other size specs given regex
				size = 32
			try
				tok.value = uint64.parse(s, Utils.cultureInfoForNumbers)
			catch FormatException
				assert false, 'not expecting to get here given regex'
			catch OverflowException
				.recordError('Range overflow for integer literal "[tok.text]".')
			tok.which = 'INTEGER_LIT'
			tok.info = if(c == c'i', -1, +1) * size
			return tok

	def onHEX_LIT_UNSIGN(tok as IToken) as IToken?
		require
			'u' in tok.text
		body
			return .onHEX_LIT_EXPLICIT(tok)
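	# Note the sign convention for tok.info in these integer handlers: the magnitude is
	# the bit width and the sign marks signedness, negative meaning signed. For example,
	# '123_i16' -> INTEGER_LIT with info=-16 and '123u32' -> INTEGER_LIT with info=32,
	# while unsuffixed hex literals get -32/32/-64/64 from _inferIntSize below.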
	def onHEX_LIT_EXPLICIT(tok as IToken) as IToken?
		ensure
			tok.which == 'INTEGER_LIT'
			(tok.info to int) in [8, 16, 32, 64]  # CC: axe cast
		body
			size = 32
			h = tok.text
			s = tok.text
			if s.endsWith('32')
				size = 32
				s = s[:-2]
			else if s.endsWith('64')
				size = 64
				s = s[:-2]
			else if s.endsWith('16')
				size = 16
				s = s[:-2]
			else if s.endsWith('8')
				size = 8
				s = s[:-1]
			if s.endsWith('u')
				s = s[:-1]
			s = s.replace('_', '')
			tok.text = s
			tok = .onHEX_LIT(tok) to !
			tok.info = size  # unsigned
			tok.text = h
			return tok

	def onHEX_LIT(tok as IToken) as IToken?
		ensure
			tok.which == 'INTEGER_LIT'
			(tok.info to int) in [-32, 32, -64, 64]  # CC: axe cast
		body
			try
				val = uint64.parse(tok.text[2:], System.Globalization.NumberStyles.HexNumber)
			catch FormatException
				assert false, 'not expecting to get here given regex'
			catch OverflowException
				.recordError('Range overflow for hex literal "[tok.text]".')
			tok.which = 'INTEGER_LIT'
			_inferIntSize(tok, val)
			return tok

	def _inferIntSize(tok as IToken, val as uint64)
		if val <= int.maxValue
			tok.value = val to int
			tok.info = -32
		else if val <= uint.maxValue
			tok.value = val to uint
			tok.info = 32
		else if val <= int64.maxValue
			tok.value = val to int64
			tok.info = -64
		else
			tok.value = val
			tok.info = 64

	def onINT_SIZE(tok as IToken) as IToken?
		size = int.parse(tok.text[3:])
		tok.value = size
		return tok

	def onUINT_SIZE(tok as IToken) as IToken?
		size = int.parse(tok.text[4:])
		tok.value = size
		return tok

	def onFLOAT_SIZE(tok as IToken) as IToken?
		size = int.parse(tok.text[5:])
		tok.value = size
		return tok

	def onCHAR_LIT_SINGLE(tok as IToken) as IToken?
		return _onCharLit(tok)

	def onCHAR_LIT_DOUBLE(tok as IToken) as IToken?
		return _onCharLit(tok)

	def _onCharLit(tok as IToken) as IToken?
		require tok.text.startsWith('c')
		s = tok.text[2:-1]
		assert s.length == 1 or s.length == 2
		tok.value = s
		return tok

	##
	## String substitution handling
	##
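	# Rough picture of how a substituted string such as 'abc [x] def [y] ghi' tokenizes:
	#   STRING_START_SINGLE  "'abc ["
	#   ...expression tokens for x...
	#   STRING_PART_SINGLE   "] def ["
	#   ...expression tokens for y...
	#   STRING_STOP_SINGLE   "] ghi'"
	# The PART/STOP defs are only active while inside a substituted string; see the
	# onSTRING_START_* methods below and the AbstractStringThingTokenDef classes at the end.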
	def tokValueForString(s as String) as String
		"""
		Utility method for onSTRING_START|PART|STOP_SINGLE|DOUBLE.
		"""
		require
			s.length >= 2
			# CC:
			# s[0] in [c'"', c"'"]
			# s[s.length-1] in [c'"', c"'"]
		body
			s = s.substring(1, s.length-2)
			chars = StringBuilder(s.length)
			last = c'\0'
			next as char?
			for c in s
				next = nil
				if last == c'\\'
					branch c
						on c'a', next = c'\a'
						on c'b', next = c'\b'
						on c'f', next = c'\f'
						on c'n', next = c'\n'
						on c'r', next = c'\r'
						on c't', next = c'\t'
						on c'v', next = c'\v'
						on c"'", next = c"'"
						on c'"', next = c'"'
						on c'?', next = c'?'
						on c'0', next = c'\0'
						on c'\\'
							chars.append(c'\\')
							# cannot have `last` being a backslash anymore--it's considered consumed now
							last = c'\0'
							continue
						else, next = c  # TODO: should probably be error: Invalid escape sequence
				else if c <> c'\\'
					next = c
				if next is not nil
					chars.append(next)
				last = c
			return chars.toString

	def onSTRING_START_SINGLE(tok as IToken) as IToken
		require not _inSubstStringSingle
		_inSubstStringSingle = true
		tok.value = .tokValueForString(tok.text)
		_tokenDefsByWhich['STRING_PART_SINGLE'].isActive = true
		_tokenDefsByWhich['STRING_STOP_SINGLE'].isActive = true
		_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = true
		return tok

	def onSTRING_PART_SINGLE(tok as IToken) as IToken
		require _inSubstStringSingle
		tok.value = .tokValueForString(tok.text)
		return tok

	def onSTRING_STOP_SINGLE(tok as IToken) as IToken
		require _inSubstStringSingle
		_inSubstStringSingle = false
		tok.value = .tokValueForString(tok.text)
		_tokenDefsByWhich['STRING_PART_SINGLE'].isActive = false
		_tokenDefsByWhich['STRING_STOP_SINGLE'].isActive = false
		_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = false
		return tok

	def onSTRING_START_DOUBLE(tok as IToken) as IToken
		require not _inSubstStringDouble
		_inSubstStringDouble = true
		tok.value = .tokValueForString(tok.text)
		_tokenDefsByWhich['STRING_PART_DOUBLE'].isActive = true
		_tokenDefsByWhich['STRING_STOP_DOUBLE'].isActive = true
		_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = true
		return tok

	def onSTRING_PART_DOUBLE(tok as IToken) as IToken
		require _inSubstStringDouble
		tok.value = .tokValueForString(tok.text)
		return tok

	def onSTRING_STOP_DOUBLE(tok as IToken) as IToken
		require _inSubstStringDouble
		_inSubstStringDouble = false
		tok.value = .tokValueForString(tok.text)
		_tokenDefsByWhich['STRING_PART_DOUBLE'].isActive = false
		_tokenDefsByWhich['STRING_STOP_DOUBLE'].isActive = false
		_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = false
		return tok

	def onLBRACKET(tok as IToken) as IToken
		if _inSubstStringSingle or _inSubstStringDouble
			_substLBracketCount += 1
			if _substLBracketCount == 1
				_tokenDefsByWhich['RBRACKET_SPECIAL'].isActive = true
				assert _tokenDefsByWhich['STRING_PART_FORMAT'].isActive
				_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = false
		return tok

	def onRBRACKET_SPECIAL(tok as IToken) as IToken
		require
			_inSubstStringSingle or _inSubstStringDouble
			_substLBracketCount
		body
			_substLBracketCount -= 1
			if _substLBracketCount == 0
				_tokenDefsByWhich['RBRACKET_SPECIAL'].isActive = false
				assert not _tokenDefsByWhich['STRING_PART_FORMAT'].isActive
				_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = true
			tok.which = 'RBRACKET'  # tricky, tricky. the parser never sees an RBRACKET_SPECIAL
			return tok
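	# Example of the bracket counting above, for the source string 'x is [a[0]]!':
	# STRING_START_SINGLE consumes "'x is [", then the [ of a[0] raises
	# _substLBracketCount to 1 and activates RBRACKET_SPECIAL, so the matching ] comes
	# back as a plain RBRACKET and drops the count to 0 (deactivating the special def);
	# the final ] then matches STRING_STOP_SINGLE as "]!'" and closes the string.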
	##
	## Doc Strings
	##

	def onDOC_STRING_START(tok as IToken) as IToken
		assert not _inDocString
		# narrow the tokenizer's token defs to a new shorter set
		# TODO: cache the tokens below
		t = List<of TokenDef>()
		t.add(TokenRegexDef('DOC_STRING_STOP', r'[ \t]*"""[ \t]*\n'))
		t.add(TokenRegexDef('DOC_STRING_BAD_STOP', r'[ \t]*"""(.+)\n'))
		t.add(TokenRegexDef('DOC_STRING_BODY_TEXT', '.*\n'))
		.pushTokenDefs(t)
		_inDocString = true
		return tok

	def onDOC_STRING_STOP(tok as IToken) as IToken
		assert _inDocString, tok
		_inDocString = false
		.popTokenDefs
		return tok

	def onDOC_STRING_BAD_STOP(tok as IToken) as IToken
		assert _inDocString, tok
		_inDocString = false
		.popTokenDefs
		.recordError('Incorrectly finished multi-line comment.')
		tok.which = 'DOC_STRING_STOP'
		return tok

	def onDOC_STRING_BODY_TEXT(tok as IToken) as IToken
		assert _inDocString, tok
		return tok

	def onDOC_STRING_LINE(tok as IToken) as IToken
		tok.value = tok.text.trim[3:-3].trim
		return tok

	##
	## Simple string literals
	##

	def onSTRING_RAW_SINGLE(tok as IToken) as IToken
		require tok.text.startsWith('r')
		tok.value = tok.text.substring(2, tok.text.length-3)
		tok.which = 'STRING_SINGLE'
		return tok

	def onSTRING_RAW_DOUBLE(tok as IToken) as IToken
		require tok.text.startsWith('r')
		tok.value = tok.text.substring(2, tok.text.length-3)
		tok.which = 'STRING_DOUBLE'
		return tok

	def onSTRING_NOSUB_SINGLE(tok as IToken) as IToken
		require tok.text.startsWith('ns')
		tok.value = .tokValueForString(tok.text.substring(2))
		tok.which = 'STRING_SINGLE'
		return tok

	def onSTRING_NOSUB_DOUBLE(tok as IToken) as IToken
		require tok.text.startsWith('ns')
		tok.value = .tokValueForString(tok.text.substring(2))
		tok.which = 'STRING_DOUBLE'
		return tok

	def onSTRING_SINGLE(tok as IToken) as IToken
		tok.value = .tokValueForString(tok.text)
		return tok

	def onSTRING_DOUBLE(tok as IToken) as IToken
		tok.value = .tokValueForString(tok.text)
		return tok

	##
	## Self util
	##

	var _directiveRE = Regex(r'#\s?\.([\w\-]+)\.($|\s)', RegexOptions.Compiled)

	def checkForCommentDirective(tok as IToken) as bool
		# check for .no-warnings.
		reMatch = _directiveRE.match(tok.text)
		if reMatch.success
			tok.which = 'DIRECTIVE'
			name = reMatch.groups[1].value
			branch name
				on 'no-warnings', .addNoWarning(tok)
				# for testify
				on 'args', pass  # TODO: actually this could be worth implementing outside of testify
				on 'compile-only', pass
				on 'error', pass
				on 'multi', pass
				on 'multipart', pass
				on 'require', pass
				on 'skip', pass
				on 'warning', pass
				on 'warning-lax', pass
				else, .throwError('Unrecognized compiler directive "[name]".')
			return true
		return false

	def commentToken(tok as IToken) as IToken?
		if .willReturnComments
			tok.which = 'COMMENT'
			return tok
		else
			return nil

	def directiveToken(tok as IToken) as IToken?
		return if(.willReturnDirectives, tok, nil)
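	# For reference, a compiler directive is just a comment in dotted form, such as
	#   # .no-warnings.
	# _directiveRE requires that shape, checkForCommentDirective rewrites the token's
	# which to DIRECTIVE, unrecognized names are an error, and most of the recognized
	# names exist for testify, Cobra's test harness.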

###
### Custom Token Defs
###

class CommentBlockStartTokenDef inherits TokenDef
	"""
	^[ \t]*\/\#.*$
	"""

	test
		tokDef = CommentBlockStartTokenDef('')
		cases = [
			'/# ...',
			'/#',
			'/##',
			'/## /',
		]
		for case in cases
			assert tokDef.match(case)
			assert tokDef.match(' ' + case)
			assert tokDef.match(case + ' ')
			assert tokDef.match(case + '\n')
			assert tokDef.match(case + ' \n')
		cases = [
			'/ # ...',
		]
		for case in cases
			assert tokDef.match(case) is nil

	cue init(definition as String)
		base.init('COMMENT_BLOCK_START')
		_requiresBOL = true

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		return [c'\t', c' ', c'/']

	def _match(input as String) as TokenMatch? is override
		state = 0
		for i in input.length
			c = input[i]
			branch state
				on 0  # [ \t]*
					if c == c' ' or c == c'\t', pass
					else if c == c'/', state = 1
					else, return nil
				on 1  # \#.*$
					if c == c'#', state = 2
					else, return nil
				on 2  # .*$
					if c == c'\n', return TokenMatch(input.substring(0, i))
		if state == 2, return TokenMatch(input)
		else, return nil


class IdTokenDef inherits TokenDef
	"""
	[A-Za-z_][A-Za-z0-9_]*
	"""

	test
		tokDef = IdTokenDef('OPEN_CALL', '')
		assert tokDef.match('x').text == 'x'
		assert tokDef.match('foo').text == 'foo'
		assert tokDef.match('Foo').text == 'Foo'
		assert tokDef.match('Foo ').text == 'Foo'
		assert tokDef.match('Foo aoeu').text == 'Foo'
		assert tokDef.match('1234') is nil
		assert tokDef.match('(') is nil

	cue init(which as String, definition as String)
		base.init(which)

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		t = List<of char>()
		for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_', t.add(c)
		return t

	def _match(input as String) as TokenMatch? is override
		state = 0
		for i in input.length
			c = input[i]
			branch state
				on 0  # [A-Za-z_]
					if (c >= c'A' and c <= c'Z') or (c >= c'a' and c <= c'z') or (c == c'_'), state = 1
					else, return nil
				on 1  # [A-Za-z0-9_]*
					if (c >= c'A' and c <= c'Z') or (c >= c'a' and c <= c'z') or (c == c'_') _
						or (c >= c'0' and c <= c'9'), pass
					else, return TokenMatch(input.substring(0, i))
		return TokenMatch(input.substring(0, i))


class IndentAllTokenDef inherits TokenDef
	"""
	r'INDENT_ALL_TABS		^[\t]+',
	r'INDENT_ALL_SPACES		^[ ]+',
	"""

	test
		tokDef = IndentAllTokenDef('INDENT_ALL_SPACES', c' ', '')
		cases = [' ', '  ', '    ', ' x', '    foo']
		for case in cases
			match = tokDef.match(case)
			assert match
			assert match.text.trim == ''
		assert not tokDef.match('x ')

	var _ch as char

	cue init(which as String, ch as char, definition as String)
		base.init(which)
		_ch, _requiresBOL = ch, true

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		return [_ch]

	def _match(input as String) as TokenMatch? is override
		ch = _ch
		for i in input.length
			if input[i] == ch, pass
			else, break
		if i > 0, return TokenMatch(.which, input.substring(0, i))
		else, return nil


class IndentMixedTokenDef inherits TokenDef
	"""
	1	r'INDENT_MIXED_TSNS		^[\t]+[ ]+(?=[^\t ])',
	2	r'INDENT_MIXED_TS		^[\t]+[ ]+',
	"""

	test
		tokDef = IndentMixedTokenDef(1, '')
		cases = [
			'\t x',
			'\t\t foo',
			'\t bar',
		]
		for case in cases
			match = tokDef.match(case)
			assert match, case
			assert match.which == 'INDENT_MIXED_TSNS'
			assert match.text.trim == ''
		tokDef = IndentMixedTokenDef(2, '')
		cases = [
			'\t x',
			'\t\t foo',
			'\t bar',
		]
		for case in cases
			match = tokDef.match(case)
			assert match
			assert match.which == 'INDENT_MIXED_TS'
			assert match.text.trim == ''

	var _num as int

	cue init(num as int, definition as String)
		require num in [1, 2]
		base.init('INDENT_MIXED_' + if(num==1, 'TSNS', 'TS'))
		_num, _requiresBOL = num, true

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		return [c'\t']
	def _match(input as String) as TokenMatch? is override
		state = 0
		for i in input.length
			c = input[i]
			branch state
				on 0  # [\t]+
					if c <> c'\t', return nil
					else, state = 2
				on 2  # [\t]+[ ]+
					if c == c'\t', pass
					else if c == c' ', state = 3
					else, return nil
				on 3  # [ ]+
					if c == c' '
						pass
					else if _num == 1
						# (?=[^\t ])
						if c <> c'\t' and c <> c' '
							return TokenMatch(.which, input.substring(0, i))
						else
							return nil
					else
						return TokenMatch(.which, input.substring(0, i))
		return nil


class OpenComboTokenDef inherits TokenDef
	"""
	Combines these which appear sequentially in the token list:
		OpenGenericTokenDef('OPEN_GENERIC', r'[A-Za-z_][A-Za-z0-9_]*<of(\s|(?=[,>]))'),
		r'OPEN_DO				s do(',
		r'OPEN_IF				s if(',
		OpenCallTokenDef('OPEN_CALL', r'[A-Za-z_][A-Za-z0-9_]*\('),
	"""

	test
		tokDef = OpenComboTokenDef()
		# OPEN_GENERIC
		cases = ['Foo<of>', 'Foo<of >']
		for case in cases
			match = tokDef.match(case)
			assert match and match.which == 'OPEN_GENERIC' and match.text.trim.endsWith('<of')

	cue init
		base.init('OPEN_COMBO')

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		t = List<of char>()
		for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_', t.add(c)
		return t

	shared
		var _spaces = @[c' ', c'\n', c'\r', c'\t']

	def _match(input as String) as TokenMatch? is override
		# time savings here included:
		# 1. eliminating use of StringBuilder() and instead returning input.substring(0, i+1)
		# 2. switch from: c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_'
		#    to: (c >= c'A' and c <= c'Z') or (c >= c'a' and c <= c'z') or (c == c'_')
		state = 0
		for i in input.length
			c = input[i]
			branch state
				on 0  # [A-Za-z_]
					if (c >= c'A' and c <= c'Z') or (c >= c'a' and c <= c'z') or (c == c'_'), state = 1
					else, return nil
				on 1  # [A-Za-z0-9_]*
					if (c >= c'A' and c <= c'Z') or (c >= c'a' and c <= c'z') or (c == c'_') _
						or (c >= c'0' and c <= c'9'), pass
					else if c == c'<', state = 2
					else if c == c'('
						text = input.substring(0, i+1)
						if text == 'do(', return TokenMatch('OPEN_DO', text)
						else if text == 'if(', return TokenMatch('OPEN_IF', text)
						else, return TokenMatch('OPEN_CALL', text)
					else, return nil
				on 2  # o - second char of '<of'
					if c == c'o', state = 3
					else, return nil
				on 3  # f - last char of '<of'
					if c == c'f', state = 4
					else, return nil
				on 4  # (\s|(?=[,>]))
					if c in _spaces
						return TokenMatch('OPEN_GENERIC', input.substring(0, i+1))
					else if c == c',' or c == c'>' or i == input.length-1
						# positive lookahead - (?=[,>])
						# so don't add to sb
						return TokenMatch('OPEN_GENERIC', input.substring(0, i))
					else
						return nil
		return nil


class SingleLineCommentBlockTokenDef inherits TokenDef
	"""
	^[ \t]*\/\#.*\#\/[ \t]*$
	"""

	test
		tokDef = SingleLineCommentBlockTokenDef('')
		cases = [
			'/# ... #/',
			'/##/',
			'/######/',
		]
		for case in cases
			assert tokDef.match(case)
			assert tokDef.match(' ' + case)
			assert tokDef.match(case + ' ')
			assert tokDef.match(case + '\n')
			assert tokDef.match(case + ' \n')
		cases = [
			'/# ...',
			'/ # ... ',
			'/# ... # /',
			'/# ...\n...#/',
		]
		for case in cases
			assert tokDef.match(case) is nil

	cue init(definition as String)
		base.init('SINGLE_LINE_COMMENT_BLOCK')
		_requiresBOL = true

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		return [c'\t', c' ', c'/']
	def _match(input as String) as TokenMatch? is override
		state, lenM1 = 0, input.length - 1
		for i in input.length
			c = input[i]
			branch state
				on 0  # [ \t]*
					if c == c' ' or c == c'\t', pass
					else if c == c'/', state = 1
					else, return nil
				on 1  # \#.*\#\/[ \t]*$
					if c == c'#', state = 2
					else, return nil
				on 2  # .*\#\/[ \t]*$
					if c == c'\n', return nil
					else if c == c'#' and i < lenM1 and input[i+1] == c'/', state = 3
				on 3  # \/[ \t]*$
					assert c == c'/'
					state = 4
				on 4
					if c == c' ' or c == c'\t', pass
					else if c == c'\n', return TokenMatch(input.substring(0, i+1))
					else, return nil
		if state == 2, return nil
		assert state == 3 or state == 4
		return TokenMatch(input)


class SpaceTokenDef inherits TokenDef

	test
		# .timeIt
		pass

	def timeIt is shared
		input1, input2 = '\t\tx = 5', '# foo'
		reps = 10_000_000
		re = Regex(r'[ \t]+', RegexOptions.Compiled)
		sw = System.Diagnostics.Stopwatch()
		sw.start
		for i in reps
			re.match(input1)
			re.match(input2)
		sw.stop
		timeRE = sw.elapsedMilliseconds
		td = SpaceTokenDef('SPACE', r'[ \t]+')
		sw = System.Diagnostics.Stopwatch()
		sw.start
		for i in reps
			td.match(input1)
			td.match(input2)
		sw.stop
		timeTD = sw.elapsedMilliseconds
		ratio = timeRE / timeTD
		trace timeRE, timeTD, ratio
		# trace: timeRE=6875 (Int64); timeTD=462 (Int64); ratio=14.88 (Decimal);
		# so at least for SpaceTokenDef, its .match is more than 14 X faster than a compiled regex!

	cue init(which as String, definition as String)
		base.init(which)
		assert definition == r'[ \t]+'

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		return [c' ', c'\t']

	def _match(input as String) as TokenMatch? is override
		for i in input.length
			c = input[i]
			if c <> c' ' and c <> c'\t', break
		return if(i==0, nil, TokenMatch(input.substring(0, i)))


class StringLiteralTokenDef inherits TokenDef
	"""
	STRING_SINGLE	'(?:\\.?|[^'\n])*'
	STRING_DOUBLE	"(?:\\.?|[^"\n])*"
	"""

	test
		cases = [
			r'foo',
			r'',
			r'foo\n',
			r'foo\nbar',
			r'...',
		]
		for delimiter in ['"', "'"]
			tokDef = StringLiteralTokenDef('STRING_FOO', delimiter[0])
			for case in cases
				if case.length, assert tokDef.match(case) is nil  # no delimiters have been added to the case yet
				case = delimiter + case + delimiter
				match = tokDef.match(case)
				assert match, {'case': case, 'tokDef': tokDef}
				assert match.text.startsWith(delimiter) and match.text.endsWith(delimiter)
				assert match.text.length == case.length

	cue init(which as String, delimiter as char)
		require delimiter in [c'"', c"'"]
		base.init(which)
		_delimiter = delimiter

	get delimiter from var as char

	def _innerToString as String is override
		return base._innerToString + ', [.delimiter.toTechString]'

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		return [_delimiter]

	def _match(input as String) as TokenMatch? is override
		state = 0
		for i in input.length
			c = input[i]
			branch state
				on 0
					if c == _delimiter, state = 1
					else, return nil
				on 1  # inside string
					if c == _delimiter, return TokenMatch(input.substring(0, i+1))
					else if c == c'\\', state = 2
					else if c <> c'\n' and c <> _delimiter, pass
					else, return nil
				on 2  # \x
					state = 1
		return nil
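
# A note on the hand-written token defs in this section: each def reports its possible
# first characters via firstChars, which presumably lets the tokenizer bucket defs by
# the leading character of the remaining input instead of trying every def in turn.
# Combined with the state-machine _match implementations, that is the design the
# SpaceTokenDef.timeIt experiment above is measuring against compiled regexes.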
""" var _quote as char cue init(which as String, quote as char) base.init(which) _quote = quote get length as int is override throw Exception('ordered token') def _match(input as String) as TokenMatch? is override s = _match2(input) if s, return TokenMatch(s) else, return nil def _match2(input as String) as String? is abstract def _matchBetween(input as String, startch as char, stopch as char, breakch as char) as String? """ Matches between a range of characters. """ if input[0] <> startch, return nil sb = StringBuilder(input[0].toString) isEscaped = false for i in 1 : input.length c = input[i] if c == breakch and not isEscaped, return nil if c == c'\n', return nil sb.append(c) if isEscaped isEscaped = false else if c == stopch, return sb.toString if c == c'\\', isEscaped = true return nil def toString as String is override return '[.getType.name]([CobraCore.toTechString(_quote)])' class StringStartTokenDef inherits AbstractStringThingTokenDef cue init(which as String, quote as char) base.init(which, quote) get firstChars as List is override return [_quote] def _match2(input as String) as String? is override test x = StringStartTokenDef('foo', c"\'") # normal: assert x._match2('aoeu') is nil assert x._match2(r"'foo [bar]") == r"'foo [" # odd: assert x._match2(r"'foo[") == r"'foo[" # escaped: assert x._match2(r"'foo\[ foo[") == r"'foo\[ foo[" assert x._match2(r"'foo\\[foo]") == r"'foo\\[" assert x._match2(r"'foo\\\[ foo[") == r"'foo\\\[ foo[" # not this token def: assert x._match2(r"]foo [bar]") is nil assert x._match2(r"]foo[") is nil assert x._match2(r"]foo[") is nil assert x._match2(r"]foo' + ") is nil assert x._match2(r"]foo ' bah blah") is nil assert x._match2(r"]foo\[ foo'") is nil body return _matchBetween(input, _quote, c'[', _quote) class StringPartTokenDef inherits AbstractStringThingTokenDef cue init(which as String, quote as char) base.init(which, quote) get firstChars as List is override return [c']'] def _match2(input as String) as String? is override test x = StringPartTokenDef('foo', c"\'") # normal: assert x._match2('aoeu') is nil assert x._match2(r"]foo [bar]") == r"]foo [" # odd: assert x._match2(r"]foo[") == r"]foo[" # escaped: assert x._match2(r"]foo\[ foo[") == r"]foo\[ foo[" assert x._match2(r"]foo\\[foo]") == r"]foo\\[" assert x._match2(r"]foo\\\[ foo[") == r"]foo\\\[ foo[" # not this token def: assert x._match2(r"'foo [bar]") is nil assert x._match2(r"'foo[") is nil assert x._match2(r"'foo\[ foo[") is nil assert x._match2(r"]foo' + ") is nil assert x._match2(r"]foo ' bah blah") is nil assert x._match2(r"]foo\[ foo'") is nil body return _matchBetween(input, c']', c'[', _quote) class StringStopTokenDef inherits AbstractStringThingTokenDef cue init(which as String, quote as char) base.init(which, quote) get firstChars as List is override return [c']'] def _match2(input as String) as String? 
	def _match2(input as String) as String? is override
		test
			x = StringStopTokenDef('foo', c"\'")
			# normal:
			assert x._match2('aoeu') is nil
			assert x._match2(r"]foo' + ") == r"]foo'"
			assert x._match2(r"] '") == r"] '"
			# odd:
			assert x._match2(r"]foo ' bah blah") == r"]foo '"
			# escaped:
			assert x._match2(r"]foo\[ foo'") == r"]foo\[ foo'"
			assert x._match2(r"]foo\\\[ foo'") == r"]foo\\\[ foo'"
			# not this token def:
			assert x._match2(r"'foo [bar]") is nil
			assert x._match2(r"'foo[") is nil
			assert x._match2(r"'foo\[ foo[") is nil
			assert x._match2(r"]foo[") is nil
			assert x._match2(r"]foo\\[foo]") is nil
		body
			return _matchBetween(input, c']', _quote, c'[')


class WhiteSpaceCommentTokenDef inherits TokenDef
	"""
	r'WHITESPACE_COMMENT_1	^[\t]+[ ]*\#.*$',
	r'WHITESPACE_COMMENT_2	^[ ]+[\t]*\#.*$',

	to-do: collapse this into one tokendef rather than a '1' and a '2'. set the TokenMatch .which
	"""

	test
		tokDefs = [WhiteSpaceCommentTokenDef(1, ''), WhiteSpaceCommentTokenDef(2, '')]
		cases = [
			[1, '\t\t#foo'],
			[1, '\t\t #foo'],
			[2, ' #foo'],
			[2, ' \t\t#foo'],
		]
		for num, case in cases
			assert tokDefs[num-1].match(case)
		assert tokDefs[0].match('\t\tfoo') is nil
		assert tokDefs[0].match('\t\t \tfoo') is nil

	var _num as int

	cue init(num as int, definition as String)
		require num in [1, 2]
		base.init('WHITESPACE_COMMENT_' + num.toString)
		_num, _requiresBOL = num, true

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		return [c'\t', c' ']

	def _match(input as String) as TokenMatch? is override
		state = if(_num==1, 1, 4)
		for i in input.length
			c = input[i]
			branch state
				# _num == 1
				on 1  # [\t]+
					if c <> c'\t', return nil
					else, state = 2
				on 2  # [\t]+\#
					if c == c'\t', pass
					else if c == c'#', state = 8
					else if c == c' ', state = 3
					else, return nil
				on 3  # [ ]*\#
					if c == c' ', pass
					else if c == c'#', state = 8
					else, return nil
				# _num == 2
				on 4  # [ ]+
					if c <> c' ', return nil
					else, state = 5
				on 5  # [ ]+\#
					if c == c' ', pass
					else if c == c'#', state = 8
					else if c == c'\t', state = 6
					else, return nil
				on 6  # [\t]*\#
					if c == c'\t', pass
					else if c == c'#', state = 8
					else, return nil
				# either num
				on 8  # .*$
					if c == c'\n', return TokenMatch(input.substring(0, i))
		if state == 8, return TokenMatch(input)
		else, return nil


class WhiteSpaceLineTokenDef inherits TokenDef
	"""
	^[\t ]+$
	"""

	test
		tokDef = WhiteSpaceLineTokenDef('')
		cases = [
			' ',
			'  ',
			'\t ',
			'\t',
			'\t\t',
			' \t\t',
		]
		for case in cases
			assert tokDef.match(case)
			assert tokDef.match(case + '\n')
		assert tokDef.match(' foo') is nil
		assert tokDef.match('foo') is nil

	cue init(definition as String)
		base.init('WHITESPACE_LINE')
		_requiresBOL = true

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List<of char> is override
		return [c'\t', c' ']

	def _match(input as String) as TokenMatch? is override
		for i in input.length
			c = input[i]
			# [\t ]+
			if c == c'\t' or c == c' '
				pass
			else if c == c'\n'
				return TokenMatch(input.substring(0, i))
			else
				return nil
		return TokenMatch(input.substring(0, i))