Wiki

Ticket #370: Spam.cobra

File Spam.cobra, 2.0 KB (added by torial, 10 years ago)

File that seems to cause an internal compiler error

Line 
1use System.Net
2use System.Web
3use System.IO
4use System.Text
5
6use CherryCobbler.Common
7
8#Cobra Port of: https://github.com/rrenaud/Gibberish-Detector
9class GibberishTrainer
10    var _accepted_chars = "abcdefghijklmnopqrstuvwxyz"
11
12    var _charIndex as Dictionary<of char,int>
13
14    cue init
15        _charIndex = Dictionary<of char,int>()
16        i =0
17        for c in _accepted_chars
18            _charIndex[c] = i
19            i +=1
20
21    def normalize(line as String) as String
22        """Return the subset of acceptable characters.  Ignores punctuation, infrequent symbols, etc..."""
23        sb = StringBuilder()
24        for c in line.toLower
25            if _charIndex.containsKey(c)
26                sb.append(c)
27        return sb.toString
28   
29    def ngram(n as int, toParse as String) as String*
30        """Returns all n grams from toParse after normalizing"""
31        filtered = .normalize(toParse)
32        for start in filtered.length - n +1
33            yield String.join(filtered[start:start+n]) to !
34
35    def getFile as String
36        url = "http://norvig.com/big.txt"
37        try
38            wc = WebClient()
39            return wc.downloadString(url) to !
40        catch e as Exception
41            print "getFile error: [e.toString]"
42            return ""
43   
44    def train
45        charsetSize = _accepted_chars.length
46
47        #Assume we have seen 10 of each character pair.  This is a smoothing, to
48        #allow for unseen sequences to not automatically be 0 probability.
49        counts = for i in charsetSize get for k in charsetSize get 10.0
50
51        #count transition from big text file, taken from http://norvig.com/spell-correct.html
52        text = .getFile
53        sr = StringReader(text)
54        line = sr.readLine
55        post while line is not nil
56            for gram in .ngram(2,line to !)
57                counts[_charIndex[gram[0]]][_charIndex[gram[1]]] +=1
58           
59            line = sr.readLine
60       
61        # Normalize the counts so they become log probabilities. 
62        # We use log probabilities rather than straight probabilities to avoid numeric underflow issues with long texts.
63        # This contains a justification:
64        # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
65        for r in counts
66            sum = 0.0
67            for c in r
68                sum+=c
69