# Cobra tokenizer: the CobraTokenizer class plus hand-written token defs
# (SpaceTokenDef and the StringTokenDef family) used for tokens that are
# matched by code instead of by a regex.
#
# NOTE(review): this copy of the file was recovered from a whitespace-collapsed
# dump. Line breaks and indentation below were re-derived from the syntax, and
# text between '<' and '>' appears to have been stripped in places (generic type
# arguments such as the element type of List/IList, part of the OPEN_GENERIC
# regex, part of onOPEN_GENERIC). Those spots are marked individually; compare
# against version control before relying on them.

use System.Text.RegularExpressions


class CobraTokenizer
	inherits Tokenizer

	test
		# There are plenty of external Cobra source code tests that will exercise the lexer.
		# But here are a few basic tests to make sure the tokenizer has some viability.
		t = CobraTokenizer()
		# TODO:
		t.startSource('foo bar')
		tokens = t.allTokens
		Tokenizer.checkTokens(tokens, 'ID ID EOL')

		t.restart
		t.startSource('class Foo\n\tdef foo()\n\t\treturn 1')
		# t.startSource(
		#'''class Foo
		#	def foo()
		#		return 1
		#''')
		tokens = t.allTokens
		Tokenizer.checkTokens(tokens, 'CLASS ID EOL INDENT DEF OPEN_CALL RPAREN EOL INDENT RETURN INTEGER_LIT EOL DEDENT DEDENT')

		t.restart
		t.startSource('class Foo\n\tpass\n\nclass Bar\n\tpass')
		# t.startSource(
		# '''class Foo
		# 	pass
		#
		# class Bar
		# 	pass
		# ''')
		tokens = t.allTokens
		Tokenizer.checkTokens(tokens, 'CLASS ID EOL INDENT PASS EOL EOL DEDENT CLASS ID EOL INDENT PASS EOL DEDENT')

		t.restart
		t.startSource('class Foo\n\tdef foo()\n\t\treturn 1\n\nclass Bar\n\tpass\n')
		# t.startSource(
		#'''class Foo
		#	def foo()
		#		return 1
		#
		#class Bar
		#	pass
		#''')
		tokens = t.allTokens
		Tokenizer.checkTokens(tokens, 'CLASS ID EOL INDENT DEF OPEN_CALL RPAREN EOL INDENT RETURN INTEGER_LIT EOL EOL DEDENT DEDENT CLASS ID EOL INDENT PASS EOL DEDENT')

	# Current indentation depth, counted in INDENT tokens issued and not yet closed by a DEDENT.
	var _indentCount as int
	# Number of '[' seen (and not yet closed) while inside a substituted string.
	var _substLBracketCount as int
	# True between STRING_START_SINGLE and STRING_STOP_SINGLE.
	var _inSubstStringSingle = false
	# True between STRING_START_DOUBLE and STRING_STOP_DOUBLE.
	var _inSubstStringDouble = false
	# True between DOC_STRING_START and DOC_STRING_STOP.
	var _inDocString = false
	# Nesting depth of /# ... #/ block comments; 0 means not inside one.
	var _inCommentBlock = 0

	cue init
		base.init

	cue init(verbosity as int)
		base.init
		_verbosity = verbosity

	# Appends this class's state fields to the debug description built by the base class.
	# NOTE(review): the '_inDocString' piece has no trailing ', ' so it runs straight
	# into '_inCommentBlock=' in the output -- looks like a missing separator.
	def addInfo(sb as StringBuilder) is override
		base.addInfo(sb)
		sb.append('_indentCount=[_indentCount], ')
		sb.append('_substLBracketCount=[_substLBracketCount], ')
		sb.append('_inSubstStringSingle=[_inSubstStringSingle], ')
		sb.append('_inSubstStringDouble=[_inSubstStringDouble], ')
		sb.append('_inDocString=[_inDocString]')
		sb.append('_inCommentBlock=[_inCommentBlock]')

	# When true, comment tokens are returned (as 'COMMENT') instead of being dropped; see commentToken.
	pro willReturnComments from var as bool

	# When true, directive tokens are returned instead of being dropped; see directiveToken.
	pro willReturnDirectives from var as bool

	# Note: The Tokenizer class handles its input one line at a time,
	# and retains the \n at the end of the line. This affects
	# the regex design for the tokens below.

	# Token specs whose relative order matters. Each entry is 'NAME regex'.
	# The quote-only definitions in the "substituted strings" group are placeholders:
	# the makeSTRING_* methods below replace them with hand-written token defs.
	get orderedTokenSpecs as List is override
		return [
			# whitespace
			r'WHITESPACE_LINE ^[\t ]+$',
			r'WHITESPACE_COMMENT_1 ^[\t]+[ ]*\#.*$',
			r'WHITESPACE_COMMENT_2 ^[ ]+[\t]*\#.*$',
			r'COMMENT_BLOCK_START ^[ \t]*\/\#.*$',
			r'INDENT_MIXED_TSNS ^[\t]+[ ]+(?=[^\t ])',
			r'INDENT_MIXED_TS ^[\t]+[ ]+',
			r'INDENT_MIXED_ST ^[ ]+[\t]+',
			r'INDENT_ALL_TABS ^[\t]+',
			r'INDENT_ALL_SPACES ^[ ]+',
			r'NO_INDENT ^(?=[^\t\n#\/])',
			r'EOL \n',
			r'INLINE_COMMENT \/\#.*\#/',
			r'SINGLE_LINE_COMMENT \#.*',
			r'AMBIGUOUS_COMMENT \/\#.*',
			r'SPACE [ \t]+',
			r'AT_ID @[A-Za-z_][A-Za-z0-9_]*',
			# NOTE(review): the OPEN_GENERIC pattern below has unbalanced ')' and looks
			# damaged; text starting at a '<' appears to have been stripped from this copy.
			r'OPEN_GENERIC [A-Za-z_][A-Za-z0-9_]*]))',
			r'OPEN_DO do\(',
			r'OPEN_IF if\(',
			r'OPEN_CALL [A-Za-z_][A-Za-z0-9_]*\(',
			r'HEX_LIT_UNSIGN 0x[\dA-Fa-f][\dA-Fa-f]*(_?u)(8|16|32|64)?',
			r'HEX_LIT_EXPLICIT 0x[\dA-Fa-f][\dA-Fa-f]*_(8|16|32|64)?',
			r'HEX_LIT 0x[\dA-Fa-f][\dA-Fa-f]*',
			r'FLOAT_LIT_1 \d[\d_]*\.\d+_?f(32|64)?',
			r'FLOAT_LIT_2 \d[\d_]*(_?f)(32|64)?',
			r'DECIMAL_LIT \d[\d_]*(\.\d+)?(_?d)',
			r'NUMBER_LIT \d[\d_]*(\.\d+)?(_?n)',
			r'FRACTIONAL_LIT \d[\d_]*\.\d+',
			r'INTEGER_LIT_EXPLICIT \d[\d_]*_?[iu](8|16|32|64)?',
			r'INTEGER_LIT \d[\d_]*',
			r'INT_SIZE int[0-9]+(?=[^A-Za-z0-9_])',
			r'UINT_SIZE uint[0-9]+(?=[^A-Za-z0-9_])',
			r'FLOAT_SIZE float[0-9]+(?=[^A-Za-z0-9_])',
			r"CHAR_LIT_SINGLE c'(?:\\'|\\?[^'])'",
			r'CHAR_LIT_DOUBLE c"(?:\\"|\\?[^"])"',
			# doc strings
			r'DOC_STRING_START """[ \t]*\n',
			r'DOC_STRING_LINE """.*"""[ \t]*\n',
			# sharp strings
			r"SHARP_SINGLE sharp'(?:\\.?|[^'\n])*'",
			r'SHARP_DOUBLE sharp"(?:\\.?|[^"\n])*"',
			# raw strings
			r"STRING_RAW_SINGLE r'(?:\\.?|[^'\n])*'",
			r'STRING_RAW_DOUBLE r"(?:\\.?|[^"\n])*"',
			# substituted strings
			r'RBRACKET_SPECIAL ]',
			r"STRING_START_SINGLE '",  # see "def makeSTRING_FOO_BAR"
			r"STRING_PART_SINGLE '",
			r"STRING_STOP_SINGLE '",
			r'STRING_START_DOUBLE "',
			r'STRING_PART_DOUBLE "',
			r'STRING_STOP_DOUBLE "',
			r'STRING_PART_FORMAT :[^X"\n\[]*(?=])'.replace('X', "'"),
			# plain strings
			r"STRING_NOSUB_SINGLE ns'(?:\\.?|[^'\n])*'",
			r'STRING_NOSUB_DOUBLE ns"(?:\\.?|[^"\n])*"',
			r"STRING_SINGLE '(?:\\.?|[^'\n])*'",
			r'STRING_DOUBLE "(?:\\.?|[^"\n])*"',
			r'TOQ to\?',
			r'ID [A-Za-z_][A-Za-z0-9_]*',
		]

	# Token specs (operators and punctuation) whose order is not significant to this class.
	# NOTE(review): how the base class orders these (e.g. longest first) is not visible here.
	get unorderedTokenSpecs as List is override
		return [
			r'SHARP_OPEN \$sharp\(',  # deprecated. $ is reserved for future language level regex support
			r"SINGLE_QUOTE '",
			r'DOUBLE_QUOTE "',
			r'DOT \.',
			r'DOTDOT \.\.',
			r'COLON :',
			r'PLUS \+',
			r'PLUSPLUS \+\+',
			r'MINUSMINUS \-\-',
			r'MINUS -',
			r'STARSTAR \*\*',
			r'STAR \*',
			r'SLASHSLASH //',
			r'SLASH /',
			r'PERCENTPERCENT %%',
			r'PERCENT %',
			r'ASSIGN =',
			r'LPAREN \(',
			r'RPAREN \)',
			r'LBRACKET \[',
			r'RBRACKET \]',
			r'LCURLY \{',
			r'RCURLY \}',
			r'SEMI ;',
			r'COMMA ,',
			r'DOUBLE_LT <<',
			r'DOUBLE_GT >>',
			r'DICT_OPEN {',
			r'DICT_CLOSE }',
			r'QUESTION \?',
			r'BANG \!',
			r'ARRAY_OPEN \@\[',
			r'AMPERSAND \&',
			r'VERTICAL_BAR \|',
			r'CARET \^',
			r'TILDE \~',
			r'EQ ==',
			r'NE <>',
			r'LT <',
			r'GT >',
			r'LE <=',
			r'GE >=',
			r'PLUS_EQUALS \+=',
			r'MINUS_EQUALS \-=',
			r'STAR_EQUALS \*=',
			r'SLASH_EQUALS \/=',
			r'PERCENT_EQUALS %=',
			r'STARSTAR_EQUALS \*\*=',
			r'AMPERSAND_EQUALS \&=',
			r'VERTICAL_BAR_EQUALS \|=',
			r'CARET_EQUALS \^=',
			r'DOUBLE_LT_EQUALS <<=',
			r'DOUBLE_GT_EQUALS >>=',
			r'QUESTION_EQUALS \?=',
			r'BANG_EQUALS \!=',
		]

	# The keyword list comes from KeywordSpecs rather than being spelled out here.
	get keywords as IList is override
		return KeywordSpecs.keywords

	# Optional provider consulted by onFRACTIONAL_LIT to decide how fractional literals are parsed.
	pro typeProvider from var as ITypeProvider?

	# Resets the indentation and substitution-bracket counters in addition to the base state.
	def _reset is override
		base._reset
		_indentCount = 0
		_substLBracketCount = 0

	# The make<WHICH> methods below return hand-written token defs for specific token names.
	# NOTE(review): presumably the base Tokenizer looks these up by name when building its
	# token defs -- the lookup itself is not visible in this file.

	def makeSPACE(definition as String) as TokenDef?
		return SpaceTokenDef('SPACE', definition)

	# TODO
	# def makeOPEN_GENERIC(definition as String) as TokenDef?
	# 	return OpenGenericTokenDef('OPEN_GENERIC', definition)

	# For the STRING_* defs the spec's "regex" is just the quote character; definition[0] passes it on.
	def makeSTRING_START_SINGLE(definition as String) as TokenDef?
		return StringStartTokenDef('STRING_START_SINGLE', definition[0])

	def makeSTRING_START_DOUBLE(definition as String) as TokenDef?
		return StringStartTokenDef('STRING_START_DOUBLE', definition[0])

	def makeSTRING_PART_SINGLE(definition as String) as TokenDef?
		return StringPartTokenDef('STRING_PART_SINGLE', definition[0])

	def makeSTRING_PART_DOUBLE(definition as String) as TokenDef?
		return StringPartTokenDef('STRING_PART_DOUBLE', definition[0])

	def makeSTRING_STOP_SINGLE(definition as String) as TokenDef?
		return StringStopTokenDef('STRING_STOP_SINGLE', definition[0])

	def makeSTRING_STOP_DOUBLE(definition as String) as TokenDef?
		return StringStopTokenDef('STRING_STOP_DOUBLE', definition[0])

	# Puts the tokenizer into its initial mode: pops any token-def sets left over from an
	# unterminated block comment and deactivates the defs that only apply inside a
	# substituted string.
	def afterStart is override
		base.afterStart
		# CC:
		# _tokenDefsByWhich['STRING_PART_SINGLE'].isActiveCall = def(tokenizer)=tokenizer.inSubstStringSingle
		# _tokenDefsByWhich['STRING_STOP_SINGLE'].isActiveCall = def(tokenizer)=tokenizer.inSubstStringSingle

		# recover from multiline comments
		# (COMMENT_BLOCK_LINE only exists in the set pushed by onCOMMENT_BLOCK_START)
		while _tokenDefsByWhich.containsKey('COMMENT_BLOCK_LINE')
			.popTokenDefs
		_inCommentBlock = 0

		inactivate = [
			'RBRACKET_SPECIAL' ,
			'STRING_PART_SINGLE',
			'STRING_STOP_SINGLE',
			'STRING_PART_DOUBLE',
			'STRING_STOP_DOUBLE',
			'STRING_PART_FORMAT',
		]
		for which in inactivate
			_tokenDefsByWhich[which].isActive = false

	# Returns _inSubstStringSingle for the single-quote PART/STOP defs and true for everything else.
	# NOTE(review): the *_DOUBLE defs are not special-cased here; their isActive flags are
	# toggled by afterStart and the onSTRING_*_DOUBLE handlers instead. Confirm the asymmetry is intended.
	def isActiveCall(tok as TokenDef) as bool is override
		if tok.which=='STRING_PART_SINGLE' or tok.which=='STRING_STOP_SINGLE'
			return _inSubstStringSingle
		return true

	get _nextToken as IToken? is override
		# overridden to deliver the final DEDENTS to close out indentation
		tok = base._nextToken
		if tok is nil
			# Base tokenizer is exhausted: queue one empty-text DEDENT per open indent level,
			# positioned on the last token's line with increasing column numbers.
			colNum = 0
			while _indentCount > 0
				if colNum == 0, colNum = .lastToken.colNum + 1
				t = Token(.lastToken.fileName, .lastToken.lineNum, colNum, .lastToken.charNum, 'DEDENT', '', '')
				_tokenQueue.enqueue(t)
				_indentCount -= 1
				colNum += 1
			# Re-enter so the queued DEDENTs are delivered.
			# NOTE(review): assumes base._nextToken drains _tokenQueue first -- not visible here.
			if _tokenQueue.count
				return _nextToken
			else
				return nil
		else
			return tok

	# The on<WHICH> methods below post-process a matched token of that kind. Returning nil
	# drops the token.
	# NOTE(review): dispatch by name is presumably done by the base Tokenizer.

	def onWHITESPACE_LINE(tok as IToken) as IToken?
		# Eat these.
		# Don't muck with perceived indentation level as
		# these kinds of lines are irrelevant.
		#print '<> onWHITESPACE_LINE'
		return nil

	def onWHITESPACE_COMMENT_1(tok as IToken) as IToken?
		#print '<> onWHITESPACE_COMMENT_1'
		if .checkForCommentDirective(tok)
			return .directiveToken(tok)
		else
			return .commentToken(tok)

	# NOTE(review): both branches of the if/else return, so the trailing 'return nil' cannot be reached.
	def onWHITESPACE_COMMENT_2(tok as IToken) as IToken?
		#print '<> onWHITESPACE_COMMENT_2'
		if .checkForCommentDirective(tok)
			return .directiveToken(tok)
		else
			return .commentToken(tok)
		return nil

	##
	## Comment out block
	##

	# Lazily built token-def set used while inside a /# ... #/ block comment.
	var _commentBlockDefs as List?

	# Enters (or nests one level deeper into) a block comment. On the outermost entry the
	# tokenizer's defs are narrowed to START / STOP / LINE so nothing else is recognized.
	def onCOMMENT_BLOCK_START(tok as IToken) as IToken?
		#print '<> onCOMMENT_BLOCK_START', tok
		assert _inCommentBlock >= 0, tok
		# narrow the tokenizer's token defs to a new shorter set
		if _inCommentBlock == 0
			if _commentBlockDefs is nil
				# CC: _commentBlockDefs = List[_commentBlockStop, _commentBlockLine]
				# instead of the next 5 lines
				defs = List()
				defs.add(.tokenDefsByWhich['COMMENT_BLOCK_START'])
				defs.add(TokenRegexDef('COMMENT_BLOCK_STOP', r'[^#]*\#\/.*$'))
				defs.add(TokenRegexDef('COMMENT_BLOCK_LINE', '.*\n'))
				_commentBlockDefs = defs
			.pushTokenDefs(_commentBlockDefs to !)
		_inCommentBlock += 1
		return .commentToken(tok)

	def onCOMMENT_BLOCK_LINE(tok as IToken) as IToken?
		#print '<> onCOMMENT_BLOCK_LINE', tok.lineNum
		assert _inCommentBlock > 0, tok
		return .commentToken(tok)

	# Leaves one nesting level; the normal token defs are restored when the depth reaches 0.
	def onCOMMENT_BLOCK_STOP(tok as IToken) as IToken?
		#print '<> onCOMMENT_BLOCK_STOP', tok.lineNum
		assert _inCommentBlock > 0, tok
		_inCommentBlock -= 1
		if _inCommentBlock == 0
			.popTokenDefs
		return .commentToken(tok)

	def onINDENT_MIXED_TSNS(tok as IToken) as IToken?
		# expecting tabs, spaces, non-whitespace
		assert tok.text.startsWith('\t')
		assert tok.text.endsWith(' ')
		# this is okay on continued lines
		if .justDidLineContinuation
			# NOTE(review): presumably evaluates as tabs + (spaces // 4) -- confirm operator precedence.
			indentLevel = tok.text.count(c'\t') + tok.text.count(c' ') // 4
			return _processNumIndentLevels(indentLevel)  # will check continuation indentation rules
		else
			return .onINDENT_MIXED_TS(tok)

	# Always raises an error; the message spells out the offending whitespace as [TAB]/[SPACE].
	def onINDENT_MIXED_TS(tok as IToken) as IToken?
		sb = StringBuilder()
		for c in tok.text
			branch c
				on c' ', sb.append(r'[SPACE]')
				on c'\t', sb.append(r'[TAB]')
				else, sb.append(c)
		# to-do: for people using tabs a stray space is most common error here,
		# such as ...[TAB][SPACE][TAB]...
		# could probably detect this and invoke .recordError so compilation can continue.
		.throwError('Cannot mix tabs and spaces in indentation. [sb]...')
		return nil  # make compiler happy.

	def onINDENT_MIXED_ST(tok as IToken) as IToken?
		return .onINDENT_MIXED_TS(tok)

	# One tab is one indent level.
	def onINDENT_ALL_TABS(tok as IToken) as IToken?
		numTabs = tok.text.count(c'\t')
		return _processNumIndentLevels(numTabs)

	# Four spaces are one indent level; a remainder is an error unless the line is a continuation.
	def onINDENT_ALL_SPACES(tok as IToken) as IToken?
		numSpaces = tok.text.count(c' ')
		if numSpaces % 4 and not .justDidLineContinuation  # yes, 4. hard coded, intentionally.
			# TODO: should really just record an error and take (numSpaces/4).round as the indent
			.throwError('Space-based indentation must be a multiple of 4. This line has a remainder of [numSpaces%4].')
		return _processNumIndentLevels(numSpaces // 4)

	# NO_INDENT is a zero-width match (its regex is a lookahead only), hence the empty-text contract.
	# NOTE(review): ignoreCount = 1 presumably stops the same zero-width def from matching
	# again at this position -- its semantics are defined outside this file.
	def onNO_INDENT(tok as IToken) as IToken?
		require tok.text==''
		_curTokenDef.ignoreCount = 1
		t = _processNumIndentLevels(0)
		return t

	# Converts a new indentation depth into a chain of INDENT or DEDENT tokens (linked via
	# .nextToken) and updates _indentCount. Returns the first token of the chain, or nil
	# when the depth is unchanged. On a continued line no tokens are produced; indenting
	# less than the current depth only records an error.
	def _processNumIndentLevels(numTabs as int) as IToken?
		if .justDidLineContinuation
			if numTabs < _indentCount
				.recordError('Must indent same amount or more on a continued line.')
			return nil
		firstTok as IToken?
		lastTok as IToken?
		while numTabs > _indentCount
			_indentCount += 1
			newTok = Token(_fileName, _lineNum, 1, _charNum, 'INDENT', '', '')
			if lastTok
				lastTok.nextToken = newTok
				lastTok = newTok
			else
				firstTok = lastTok = newTok
		if firstTok
			return firstTok
		while numTabs < _indentCount
			_indentCount -= 1
			newTok = Token(_fileName, _lineNum, 1, _charNum, 'DEDENT', '', '')
			if lastTok
				lastTok.nextToken = newTok
				lastTok = newTok
			else
				firstTok = lastTok = newTok
		return firstTok

	var _didLineContinuation as bool  # only meaningful after an EOL

	# True when the previous token was an EOL that ended a line-continued line.
	get justDidLineContinuation as bool
		return .lastToken and .lastToken.which == 'EOL' and _didLineContinuation

	# A line is "continued" when the token just before its EOL is the identifier '_'.
	def onEOL(tok as IToken) as IToken?
		_didLineContinuation = .lastToken and .lastToken.text == '_' and .lastToken.which == 'ID'
		return tok

	def onSINGLE_LINE_COMMENT(tok as IToken) as IToken?
		if .checkForCommentDirective(tok)
			return .directiveToken(tok)
		else
			return .commentToken(tok)

	def onINLINE_COMMENT(tok as IToken) as IToken?
		return .commentToken(tok)

	def onAMBIGUOUS_COMMENT(tok as IToken) as IToken?
		.throwError('Ambiguous comment at /#. For an end-of-line comment, put a space between / and #. For an inline comment, end it with #/. For a block comment, put /# at the beginning of a line.')
		return .commentToken(tok)

	def onSPACE(tok as IToken) as IToken?
		# eat these
		return nil

	# The value is the identifier without its leading '@'.
	def onAT_ID(tok as IToken) as IToken?
		tok.value = tok.text[1:]
		return tok

	# The value is the name without its trailing '('.
	def onOPEN_CALL(tok as IToken) as IToken?
		tok.value = tok.text[:-1]
		return tok

	# NOTE(review): this method is damaged in this copy of the file. The 'require' expression is
	# cut off mid-literal and runs into what looks like the tail of a different statement or
	# method (text between a '<' and a '>' appears to have been stripped). It is reproduced
	# as found and will not compile until restored from version control.
	def onOPEN_GENERIC(tok as IToken) as IToken?
		require tok.text.trim.endsWith(''ID'
		tok.isKeyword = true
		return tok

	# Parses a float literal that has an 'f', 'f32' or 'f64' suffix. A bare 'f' means 64.
	# The token becomes a FLOAT_LIT with .info set to the bit size.
	def onFLOAT_LIT_1(tok as IToken) as IToken?
		ensure
			result.which == 'FLOAT_LIT'
			(result.info to int) in [32, 64]  # CC: axe cast
		body
			s = tok.text.replace('_', '')
			if s.endsWith('f')
				size = 64
				s = s[:-1]
			else if s.endsWith('f32')
				size = 32
				s = s[:-3]
			else if s.endsWith('f64')
				size = 64
				s = s[:-3]
			else
				# cannot have other size specs given regex
				size = 64
			try
				tok.value = float.parse(s, Utils.cultureInfoForNumbers)
			catch FormatException
				assert false, 'not expecting to get here given regex'
			catch OverflowException
				.recordError('Range overflow for float literal "[tok.text]".')
			tok.which = 'FLOAT_LIT'
			tok.info = size
			return tok

	# Same handling as FLOAT_LIT_1; only the matching regex differs (no fractional part required).
	def onFLOAT_LIT_2(tok as IToken) as IToken?
		ensure
			result.which == 'FLOAT_LIT'
			(result.info to int) in [32, 64]  # CC: axe cast
		body
			return .onFLOAT_LIT_1(tok)

	# Parses a literal with a 'd' suffix as a decimal; underscores are removed first.
	def onDECIMAL_LIT(tok as IToken) as IToken?
		s = tok.text
		assert s.endsWith('d')
		s = s[:-1]
		s = s.replace('_', '')
		try
			tok.value = decimal.parse(s, Utils.cultureInfoForNumbers)
		catch FormatException
			assert false, 'not expecting to get here given regex'
		catch OverflowException
			.recordError('Range overflow for decimal literal "[tok.text]".')
		return tok

	# Strips the 'n' suffix (this rewrites tok.text) and then treats the token as a FRACTIONAL_LIT.
	def onNUMBER_LIT(tok as IToken) as IToken?
		require tok.text.endsWith('n')
		tok.which = 'FRACTIONAL_LIT'
		tok.text = tok.text[:-1]
		return .onFRACTIONAL_LIT(tok)

	# Parses an unsuffixed fractional literal as whatever the type provider's number type is:
	# decimal, or float (in which case the token is relabelled FLOAT_LIT with its size in .info).
	# NOTE(review): 'assert _typeProvider' makes the DecimalType() fallback on the next line
	# reachable only when assertions are disabled.
	def onFRACTIONAL_LIT(tok as IToken) as IToken?
		s = tok.text.replace('_', '')
		try
			assert _typeProvider
			numberType = if(.typeProvider, .typeProvider.numberType, DecimalType())
			# parse literal to same type as numberType
			if numberType inherits DecimalType
				tok.value = decimal.parse(s, Utils.cultureInfoForNumbers)
			else if numberType inherits FloatType
				tok.value = float.parse(s, Utils.cultureInfoForNumbers)
				tok.which = 'FLOAT_LIT'
				tok.info = numberType.size
			else
				throw FallThroughException(numberType)
		catch FormatException
			assert false, 'not expecting to get here given regex'
		catch OverflowException
			.recordError('[numberType.name.capitalized] range overflow for fractional literal "[tok.text]".')
		return tok

	def onINTEGER_LIT(tok as IToken) as IToken?
		try
			tok.value = int.parse(tok.text.replace('_', ''), Utils.cultureInfoForNumbers)
		catch FormatException
			assert false, 'not expecting to get here given regex'
		catch OverflowException
			.recordError('Range overflow for integer literal "[tok.text]".')
		return tok

	# Parses an integer literal with an 'i' (signed) or 'u' (unsigned) suffix and optional
	# bit size. A bare suffix means 32. The token becomes an INTEGER_LIT whose .info is the
	# size, negated for signed literals.
	def onINTEGER_LIT_EXPLICIT(tok as IToken) as IToken?
		require 'i' in tok.text or 'u' in tok.text
		ensure
			tok.which == 'INTEGER_LIT'
			(tok.info to int) in [-8, 8, -16, 16, -32, 32, -64, 64]  # CC: axe cast
		body
			s = tok.text.replace('_', '')
			# find the first non-digit character; it is still in c after the loop
			for c in s, if not c.isDigit, break
			# c will be 'i' or 'u'
			if s[s.length-1] == c
				size = 32
				s = s[:-1]
			else if s.endsWith('32')
				size = 32
				s = s[:-3]
			else if s.endsWith('64')
				size = 64
				s = s[:-3]
			else if s.endsWith('16')
				size = 16
				s = s[:-3]
			else if s.endsWith('8')
				size = 8
				s = s[:-2]
			else
				# cannot have other size specs given regex
				size = 32
			try
				# TODO: The use of int.parse, as opposed to say int64.parse, uint32.parse, etc. probably
				# means that legit int lits outside the signed 32 bit range will not work in Cobra
				tok.value = int.parse(s, Utils.cultureInfoForNumbers)
			catch FormatException
				assert false, 'not expecting to get here given regex'
			catch OverflowException
				.recordError('Range overflow for integer literal "[tok.text]".')
			tok.which = 'INTEGER_LIT'
			tok.info = if(c == c'i', -1, +1) * size
			return tok

	def onHEX_LIT_UNSIGN(tok as IToken) as IToken?
		require 'u' in tok.text
		body
			return .onHEX_LIT_EXPLICIT(tok)

	# Strips the optional size and 'u' suffixes and any underscores, parses the remainder via
	# onHEX_LIT, then restores the original text and stores the explicit size in .info.
	def onHEX_LIT_EXPLICIT(tok as IToken) as IToken?
		ensure
			tok.which == 'INTEGER_LIT'
			(tok.info to int) in [8, 16, 32, 64]  # CC: axe cast
		body
			size = 32
			h = tok.text
			s = tok.text
			if s.endsWith('32')
				size = 32
				s = s[:-2]
			else if s.endsWith('64')
				size = 64
				s = s[:-2]
			else if s.endsWith('16')
				size = 16
				s = s[:-2]
			else if s.endsWith('8')
				size = 8
				s = s[:-1]
			if s.endsWith('u')
				s = s[:-1]
			s = s.replace('_', '')
			tok.text = s
			tok = .onHEX_LIT(tok) to !
			tok.info = size  # unsigned
			tok.text = h
			return tok

	# Parses the digits after '0x' as a hexadecimal int; the result is an INTEGER_LIT with .info 32.
	def onHEX_LIT(tok as IToken) as IToken?
		ensure
			tok.which == 'INTEGER_LIT'
			tok.info == 32
		body
			try
				tok.value = int.parse(tok.text[2:], System.Globalization.NumberStyles.HexNumber)
			catch FormatException
				assert false, 'not expecting to get here given regex'
			catch OverflowException
				.recordError('Range overflow for hex literal "[tok.text]".')
			tok.which = 'INTEGER_LIT'
			tok.info = 32  # unsigned
			return tok

	# intNN / uintNN / floatNN: the value is the numeric size that follows the type name.
	def onINT_SIZE(tok as IToken) as IToken?
		size = int.parse(tok.text[3:])
		tok.value = size
		return tok

	def onUINT_SIZE(tok as IToken) as IToken?
		size = int.parse(tok.text[4:])
		tok.value = size
		return tok

	def onFLOAT_SIZE(tok as IToken) as IToken?
		size = int.parse(tok.text[5:])
		tok.value = size
		return tok

	def onCHAR_LIT_SINGLE(tok as IToken) as IToken?
		return _onCharLit(tok)

	def onCHAR_LIT_DOUBLE(tok as IToken) as IToken?
		return _onCharLit(tok)

	# The value is the text between the quotes: one character, or a backslash plus one
	# character. The escape is not decoded here.
	def _onCharLit(tok as IToken) as IToken?
		require tok.text.startsWith('c')
		s = tok.text[2:-1]
		assert s.length==1 or s.length==2
		tok.value = s
		return tok

	##
	## String substitution handling
	##

	def tokValueForString(s as String) as String
		"""
		Utility class for onSTRING_START|PART|STOP_SINGLE|DOUBLE.
		"""
		# Drops the first and last character of s (the delimiters) and decodes backslash
		# escapes in what remains. An unrecognized escape yields the escaped character itself.
		require
			s.length >= 2
			# CC:
			#s[0] in [c'"', c"'"]
			#s[s.length-1] in [c'"', c"'"]
		body
			s = s.substring(1, s.length-2)
			chars = StringBuilder(s.length)
			last = c'\0'
			next as char?
			for c in s
				next = nil
				if last == c'\\'
					branch c
						on c'a', next = c'\a'
						on c'b', next = c'\b'
						on c'f', next = c'\f'
						on c'n', next = c'\n'
						on c'r', next = c'\r'
						on c't', next = c'\t'
						on c'v', next = c'\v'
						on c"'", next = c"'"
						on c'"', next = c'"'
						on c'?', next = c'?'
						on c'0', next = c'\0'
						on c'\\'
							chars.append(c'\\')
							# cannot have `last` being a backslash anymore--it's considered consumed now
							last = c'\0'
							continue
						else, next = c  # TODO: should probably be error: Invalid escape sequence
				# NOTE(review): the literal below has no c prefix, unlike the other char literals
				# in this method -- confirm against version control.
				else if c <> '\\'
					next = c
				if next is not nil
					chars.append(next)
				last = c
			return chars.toString

	# Entering a single-quoted substituted string: activate the defs that can continue or end it.
	def onSTRING_START_SINGLE(tok as IToken) as IToken
		require not _inSubstStringSingle
		_inSubstStringSingle = true
		tok.value = .tokValueForString(tok.text)
		_tokenDefsByWhich['STRING_PART_SINGLE'].isActive = true
		_tokenDefsByWhich['STRING_STOP_SINGLE'].isActive = true
		_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = true
		return tok

	def onSTRING_PART_SINGLE(tok as IToken) as IToken
		require _inSubstStringSingle
		tok.value = .tokValueForString(tok.text)
		return tok

	# Leaving a single-quoted substituted string: deactivate what onSTRING_START_SINGLE activated.
	def onSTRING_STOP_SINGLE(tok as IToken) as IToken
		require _inSubstStringSingle
		_inSubstStringSingle = false
		tok.value = .tokValueForString(tok.text)
		_tokenDefsByWhich['STRING_PART_SINGLE'].isActive = false
		_tokenDefsByWhich['STRING_STOP_SINGLE'].isActive = false
		_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = false
		return tok

	# Double-quoted counterparts of the three methods above.
	def onSTRING_START_DOUBLE(tok as IToken) as IToken
		require not _inSubstStringDouble
		_inSubstStringDouble = true
		tok.value = .tokValueForString(tok.text)
		_tokenDefsByWhich['STRING_PART_DOUBLE'].isActive = true
		_tokenDefsByWhich['STRING_STOP_DOUBLE'].isActive = true
		_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = true
		return tok

	def onSTRING_PART_DOUBLE(tok as IToken) as IToken
		require _inSubstStringDouble
		tok.value = .tokValueForString(tok.text)
		return tok

	def onSTRING_STOP_DOUBLE(tok as IToken) as IToken
		require _inSubstStringDouble
		_inSubstStringDouble = false
		tok.value = .tokValueForString(tok.text)
		_tokenDefsByWhich['STRING_PART_DOUBLE'].isActive = false
		_tokenDefsByWhich['STRING_STOP_DOUBLE'].isActive = false
		_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = false
		return tok

	# Tracks '[' nesting inside a substituted expression. On the first '[' the special ']'
	# def is switched on and the format-part def is switched off.
	def onLBRACKET(tok as IToken) as IToken
		if _inSubstStringSingle or _inSubstStringDouble
			_substLBracketCount += 1
			if _substLBracketCount==1
				_tokenDefsByWhich['RBRACKET_SPECIAL'].isActive = true
				assert _tokenDefsByWhich['STRING_PART_FORMAT'].isActive
				_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = false
		return tok

	# Mirror of onLBRACKET: on the ']' that brings the count back to 0 the switches are reversed.
	def onRBRACKET_SPECIAL(tok as IToken) as IToken
		require
			_inSubstStringSingle or _inSubstStringDouble
			_substLBracketCount
		body
			_substLBracketCount -= 1
			if _substLBracketCount == 0
				_tokenDefsByWhich['RBRACKET_SPECIAL'].isActive = false
				assert not _tokenDefsByWhich['STRING_PART_FORMAT'].isActive
				_tokenDefsByWhich['STRING_PART_FORMAT'].isActive = true
			tok.which = 'RBRACKET'  # tricky, tricky. the parser never sees an RBRACKET_SPECIAL
			return tok

	##
	## Doc Strings
	##

	# Entering a multi-line doc string: only the STOP and BODY_TEXT defs are recognized until it ends.
	def onDOC_STRING_START(tok as IToken) as IToken
		assert not _inDocString
		# narrow the tokenizer's token defs to a new shorter set
		# TODO: cache the tokens below
		t = List()
		t.add(TokenRegexDef('DOC_STRING_STOP', r'[ \t]*"""[ \t]*\n'))
		t.add(TokenRegexDef('DOC_STRING_BODY_TEXT', '.*\n'))
		.pushTokenDefs(t)
		_inDocString = true
		return tok

	def onDOC_STRING_STOP(tok as IToken) as IToken
		assert _inDocString, tok
		_inDocString = false
		.popTokenDefs
		return tok

	def onDOC_STRING_BODY_TEXT(tok as IToken) as IToken
		assert _inDocString, tok
		return tok

	# One-line doc string: the value is the trimmed text between the two triple quotes.
	def onDOC_STRING_LINE(tok as IToken) as IToken
		tok.value = tok.text.trim[3:-3].trim
		return tok

	##
	## Simple string literals
	##

	# Raw strings: the value is the text without the 'r' prefix and the two quotes
	# (start index 2, length = total - 3); no escape processing. Relabelled as a plain string.
	def onSTRING_RAW_SINGLE(tok as IToken) as IToken
		require tok.text.startsWith('r')
		tok.value = tok.text.substring(2, tok.text.length-3)
		tok.which = 'STRING_SINGLE'
		return tok

	def onSTRING_RAW_DOUBLE(tok as IToken) as IToken
		require tok.text.startsWith('r')
		tok.value = tok.text.substring(2, tok.text.length-3)
		tok.which = 'STRING_DOUBLE'
		return tok

	# 'ns' (no-substitution) strings: drop the prefix, decode escapes, relabel as a plain string.
	def onSTRING_NOSUB_SINGLE(tok as IToken) as IToken
		require tok.text.startsWith('ns')
		tok.value = .tokValueForString(tok.text.substring(2))
		tok.which = 'STRING_SINGLE'
		return tok

	def onSTRING_NOSUB_DOUBLE(tok as IToken) as IToken
		require tok.text.startsWith('ns')
		tok.value = .tokValueForString(tok.text.substring(2))
		tok.which = 'STRING_DOUBLE'
		return tok

	def onSTRING_SINGLE(tok as IToken) as IToken
		tok.value = .tokValueForString(tok.text)
		return tok

	def onSTRING_DOUBLE(tok as IToken) as IToken
		tok.value = .tokValueForString(tok.text)
		return tok

	##
	## Self util
	##

	# Matches a comment of the form '#.name.' or '# .name.' followed by whitespace or end of input.
	var _directiveRE = Regex(r'#\s?\.([\w\-]+)\.($|\s)', RegexOptions.Compiled)

	# Returns true, and relabels the token as 'DIRECTIVE', when the comment text contains a
	# compiler directive. Only 'no-warnings' has an effect here; the other known names are
	# accepted and ignored, and an unknown name raises an error.
	def checkForCommentDirective(tok as IToken) as bool
		# check for .no-warnings.
		reMatch = _directiveRE.match(tok.text)
		if reMatch.success
			tok.which = 'DIRECTIVE'
			name = reMatch.groups[1].value
			branch name
				on 'no-warnings', .addNoWarning(tok)
				# for testify
				on 'args', pass  # TODO: actually this could be worth implementing outside of testify
				on 'compile-only', pass
				on 'error', pass
				on 'multi', pass
				on 'multipart', pass
				on 'require', pass
				on 'skip', pass
				on 'warning', pass
				on 'warning-lax', pass
				else, .throwError('Unrecognized compiler directive "[name]".')
			return true
		return false

	# Returns the token relabelled 'COMMENT' when comments are wanted, otherwise nil (dropped).
	def commentToken(tok as IToken) as IToken?
		if .willReturnComments
			tok.which = 'COMMENT'
			return tok
		else
			return nil

	# Returns the token when directives are wanted, otherwise nil (dropped).
	def directiveToken(tok as IToken) as IToken?
		return if(.willReturnDirectives, tok, nil)


class SpaceTokenDef
	inherits TokenDef
	"""
	This is just an idea for speeding up the tokenizer... implement some of the token defs
	by hand in the hope that they will be faster than the regex.
	"""

	test
		# .timeIt
		pass

	# Manual benchmark (not run by the test above): compiled regex vs. this class's .match.
	def timeIt is shared
		input1, input2 = '\t\tx = 5', '# foo'
		reps = 10_000_000
		re = Regex(r'[ \t]+', RegexOptions.Compiled)
		sw = System.Diagnostics.Stopwatch()
		sw.start
		for i in reps
			re.match(input1)
			re.match(input2)
		sw.stop
		timeRE = sw.elapsedMilliseconds
		td = SpaceTokenDef('SPACE', r'[ \t]+')
		sw = System.Diagnostics.Stopwatch()
		sw.start
		for i in reps
			td.match(input1)
			td.match(input2)
		sw.stop
		timeTD = sw.elapsedMilliseconds
		ratio = timeRE / timeTD
		trace timeRE, timeTD, ratio
		# trace: timeRE=6875 (Int64); timeTD=462 (Int64); ratio=14.88 (Decimal);
		# so at least for SpaceTokenDef, its .match is more than 14 X faster than a compiled regex!

	# The definition is only checked, not used: this class hard-codes the behaviour of '[ \t]+'.
	cue init(which as String, definition as String)
		base.init(which)
		assert definition == r'[ \t]+'

	get length as int is override
		throw Exception('ordered token')

	get firstChars as List is override
		return [c' ', c'\t']

	# Returns a match for the leading run of spaces/tabs in input, or nil if there is none.
	# NOTE(review): relies on i keeping its value after the loop; when input consists only of
	# spaces/tabs the loop ends without 'break', so the final value of i (and therefore the
	# matched length) depends on Cobra's for-loop semantics -- worth a unit test.
	def match(input as String) as TokenMatch? is override
		for i in input.length
			c = input[i]
			if c <> c' ' and c <> c'\t', break
		return if(i==0, nil, TokenMatch(input.substring(0, i)))


/#
TODO
NOTE(review): the sketch below is truncated in this copy of the file (the end of the doc
string and the start of the firstChars property are missing).

class OpenGenericTokenDef
	inherits TokenDef
	"""
	[A-Za-z_][A-Za-z0-9_]*
	is override
		t = List()
		for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_'
			t.add(c)
		return t

	def match(input as String) as TokenMatch? is override
		state = 0
		for i in input.length
			...
#/


class StringTokenDef
	inherits TokenDef
	is abstract
	"""
	Regexes were:
		r"STRING_START_SINGLE '[^'\n\[]*\[",
		r"STRING_PART_SINGLE \][^'\n\[]*\[",
		r"STRING_STOP_SINGLE \][^'\n\[]*'",
		r'STRING_START_DOUBLE "[^"\n\[]*\[',
		r'STRING_PART_DOUBLE \][^"\n\[]*\[',
		r'STRING_STOP_DOUBLE \][^"\n\[]*"',
	But there is also the need to escape left brackets which turned out difficult to express in
	regexes. I did manage to do it, but then the regexes became *extremely* slow.
	"""

	# The quote character (single or double) that delimits the string this def belongs to.
	var _quote as char

	cue init(which as String, quote as char)
		base.init(which)
		_quote = quote

	get length as int is override
		throw Exception('ordered token')

	# Wraps the subclass's _match result in a TokenMatch; nil means no match.
	def match(input as String) as TokenMatch? is override
		s = _match(input)
		if s, return TokenMatch(s)
		else, return nil

	# Returns the matched prefix of input, or nil when this def does not match there.
	def _match(input as String) as String? is abstract

	def _matchBetween(input as String, startch as char, stopch as char, breakch as char) as String?
		"""
		Matches between a range of characters.
		"""
		# Returns the prefix of input that starts with startch and ends at the first unescaped
		# stopch (inclusive). Returns nil when input does not start with startch, when an
		# unescaped breakch or a newline comes first, or when input ends before stopch is found.
		# A backslash escapes the character that follows it.
		if input[0] <> startch, return nil
		sb = StringBuilder(input[0].toString)
		isEscaped = false
		for i in 1 : input.length
			c = input[i]
			if c == breakch and not isEscaped, return nil
			if c == c'\n', return nil
			sb.append(c)
			if isEscaped
				# the escaped character is consumed; it can neither stop the match nor start an escape
				isEscaped = false
			else
				if c == stopch, return sb.toString
				if c == c'\\', isEscaped = true
		return nil

	def toString as String is override
		return '[.getType.name]([CobraCore.toTechString(_quote)])'


# Matches the opening piece of a substituted string: from the quote up to the first unescaped '['.
class StringStartTokenDef
	inherits StringTokenDef

	cue init(which as String, quote as char)
		base.init(which, quote)

	get firstChars as List is override
		return [_quote]

	def _match(input as String) as String? is override
		test
			x = StringStartTokenDef('foo', c"\'")
			# normal:
			assert x._match('aoeu') is nil
			assert x._match(r"'foo [bar]") == r"'foo ["
			# odd:
			assert x._match(r"'foo[") == r"'foo["
			# escaped:
			assert x._match(r"'foo\[ foo[") == r"'foo\[ foo["
			assert x._match(r"'foo\\[foo]") == r"'foo\\["
			assert x._match(r"'foo\\\[ foo[") == r"'foo\\\[ foo["
			# not this token def:
			assert x._match(r"]foo [bar]") is nil
			assert x._match(r"]foo[") is nil
			assert x._match(r"]foo[") is nil
			assert x._match(r"]foo' + ") is nil
			assert x._match(r"]foo ' bah blah") is nil
			assert x._match(r"]foo\[ foo'") is nil
		body
			return _matchBetween(input, _quote, c'[', _quote)


# Matches a middle piece of a substituted string: from ']' up to the next unescaped '['.
class StringPartTokenDef
	inherits StringTokenDef

	cue init(which as String, quote as char)
		base.init(which, quote)

	get firstChars as List is override
		return [c']']

	def _match(input as String) as String? is override
		test
			x = StringPartTokenDef('foo', c"\'")
			# normal:
			assert x._match('aoeu') is nil
			assert x._match(r"]foo [bar]") == r"]foo ["
			# odd:
			assert x._match(r"]foo[") == r"]foo["
			# escaped:
			assert x._match(r"]foo\[ foo[") == r"]foo\[ foo["
			assert x._match(r"]foo\\[foo]") == r"]foo\\["
			assert x._match(r"]foo\\\[ foo[") == r"]foo\\\[ foo["
			# not this token def:
			assert x._match(r"'foo [bar]") is nil
			assert x._match(r"'foo[") is nil
			assert x._match(r"'foo\[ foo[") is nil
			assert x._match(r"]foo' + ") is nil
			assert x._match(r"]foo ' bah blah") is nil
			assert x._match(r"]foo\[ foo'") is nil
		body
			return _matchBetween(input, c']', c'[', _quote)


# Matches the closing piece of a substituted string: from ']' up to the closing quote.
class StringStopTokenDef
	inherits StringTokenDef

	cue init(which as String, quote as char)
		base.init(which, quote)

	get firstChars as List is override
		return [c']']

	def _match(input as String) as String? is override
		test
			x = StringStopTokenDef('foo', c"\'")
			# normal:
			assert x._match('aoeu') is nil
			assert x._match(r"]foo' + ") == r"]foo'"
			assert x._match(r"] '") == r"] '"
			# odd:
			assert x._match(r"]foo ' bah blah") == r"]foo '"
			# escaped:
			assert x._match(r"]foo\[ foo'") == r"]foo\[ foo'"
			assert x._match(r"]foo\\\[ foo'") == r"]foo\\\[ foo'"
			# not this token def:
			assert x._match(r"'foo [bar]") is nil
			assert x._match(r"'foo[") is nil
			assert x._match(r"'foo\[ foo[") is nil
			assert x._match(r"]foo[") is nil
			assert x._match(r"'foo [bar]") is nil
			assert x._match(r"'foo[") is nil
			assert x._match(r"]foo\\[foo]") is nil
		body
			return _matchBetween(input, c']', _quote, c'[' )