1 | use System.Net |
---|
2 | use System.Web |
---|
3 | use System.IO |
---|
4 | use System.Text |
---|
5 | |
---|
6 | use CherryCobbler.Common |
---|
7 | |
---|
8 | #Cobra Port of: https://github.com/rrenaud/Gibberish-Detector |
---|
9 | class GibberishTrainer |
---|
10 | var _accepted_chars = "abcdefghijklmnopqrstuvwxyz" |
---|
11 | |
---|
12 | var _charIndex as Dictionary<of char,int> |
---|
13 | |
---|
cue init
    """
    Builds the character lookup table: every character of _accepted_chars is
    mapped to its zero-based position within that string.
    """
    # Chain to the base initializer before touching instance state.
    base.init
    _charIndex = Dictionary<of char, int>()
    for position in _accepted_chars.length
        _charIndex[_accepted_chars[position]] = position
---|
20 | |
---|
def normalize(line as String) as String
    """
    Returns `line` lower-cased and reduced to the characters that are keys of
    _charIndex. Every other character (punctuation, infrequent symbols, etc.)
    is dropped; the relative order of the kept characters is preserved.
    """
    kept = StringBuilder()
    for c in line
        # Lower-case with the invariant culture so the result does not depend
        # on the current thread culture (under tr-TR, for instance, "I" lowers
        # to a dotless "ı" and would be silently discarded).
        lowered = Char.toLowerInvariant(c)
        if _charIndex.containsKey(lowered)
            kept.append(lowered)
    return kept.toString
---|
28 | |
---|
def ngram(n as int, toParse as String) as String*
    """
    Yields every run of `n` consecutive characters taken from the normalized
    form of `toParse`, ordered by starting position. Nothing is yielded when
    the normalized text is shorter than `n`. `n` is expected to be positive.
    """
    filtered = .normalize(toParse)
    for start in filtered.length - n + 1
        # The normalized text is already a String, so the n-gram is simply the
        # substring at `start`; no join over the slice is needed.
        yield filtered.substring(start, n)
---|
34 | |
---|
def getFile as String
    """
    Downloads the text at http://norvig.com/big.txt and returns it.
    If anything goes wrong the exception is printed and an empty string is
    returned instead, so callers never see the failure as an exception.
    """
    url = "http://norvig.com/big.txt"
    try
        # WebClient is disposable; `using` releases it on every exit path,
        # including when the download throws.
        using wc = WebClient()
            return wc.downloadString(url) to !
    catch e as Exception
        print "getFile error: [e.toString]"
    return ""
---|
43 | |
---|
44 | def train |
---|
45 | charsetSize = _accepted_chars.length |
---|
46 | |
---|
47 | #Assume we have seen 10 of each character pair. This is a smoothing, to |
---|
48 | #allow for unseen sequences to not automatically be 0 probability. |
---|
49 | counts = for i in charsetSize get for k in charsetSize get 10.0 |
---|
50 | |
---|
51 | #count transition from big text file, taken from http://norvig.com/spell-correct.html |
---|
52 | text = .getFile |
---|
53 | sr = StringReader(text) |
---|
54 | line = sr.readLine |
---|
55 | post while line is not nil |
---|
56 | for gram in .ngram(2,line to !) |
---|
57 | counts[_charIndex[gram[0]]][_charIndex[gram[1]]] +=1 |
---|
58 | |
---|
59 | line = sr.readLine |
---|
60 | |
---|
61 | # Normalize the counts so they become log probabilities. |
---|
62 | # We use log probabilities rather than straight probabilities to avoid numeric underflow issues with long texts. |
---|
63 | # This contains a justification: |
---|
64 | # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/ |
---|
65 | for r in counts |
---|
66 | sum = 0.0 |
---|
67 | for c in r |
---|
68 | sum+=c |
---|
69 | |
---|