Ticket #370: Spam.cobra

File Spam.cobra, 2.0 KB (added by torial, 10 years ago)
File that seems to cause internal error

Line
1	use System.Net
2	use System.Web
3	use System.IO
4	use System.Text
5
6	use CherryCobbler.Common
7
8	#Cobra Port of: https://github.com/rrenaud/Gibberish-Detector
class GibberishTrainer
    """
    Builds character-pair (bigram) transition counts over the lowercase
    letters a-z from a downloaded text corpus.
    """

    # The only characters kept by .normalize; everything else is dropped.
    var _accepted_chars = "abcdefghijklmnopqrstuvwxyz"

    # Maps each accepted character to its position in _accepted_chars.
    var _charIndex as Dictionary<of char,int>

    cue init
        """Builds the character -> index lookup used to address the count matrix."""
        base.init
        _charIndex = Dictionary<of char,int>()
        i = 0
        for c in _accepted_chars
            _charIndex[c] = i
            i += 1

    def normalize(line as String) as String
        """
        Returns `line` lowercased with every character that is not in
        _accepted_chars removed (punctuation, digits, whitespace, ...).
        """
        sb = StringBuilder()
        for c in line.toLower
            if _charIndex.containsKey(c)
                sb.append(c)
        return sb.toString

    def ngram(n as int, toParse as String) as String*
        """
        Yields every run of `n` consecutive characters of the normalized form
        of `toParse`, in order. Yields nothing when the normalized text is
        shorter than `n`.
        """
        filtered = .normalize(toParse)
        for start in filtered.length - n + 1
            # A string slice is already the n-gram. The previous
            # String.join(slice) call passed the slice as the *separator* of an
            # empty value list and therefore always produced ''.
            yield filtered[start:start+n]

    def getFile as String
        """
        Downloads the training corpus and returns it as one string.
        On any failure the error is printed and '' is returned.
        """
        url = "http://norvig.com/big.txt"
        try
            # WebClient is disposable; release it as soon as the download ends.
            using wc = WebClient()
                return wc.downloadString(url) to !
        catch e as Exception
            print "getFile error: [e.toString]"
        return ""

    def train
        """
        Counts letter-pair transitions in the text returned by .getFile,
        starting every pair at a smoothing count of 10.
        """
        charsetSize = _accepted_chars.length

        # Assume we have seen 10 of each character pair. This is a smoothing, to
        # allow for unseen sequences to not automatically be 0 probability.
        counts = for i in charsetSize get for k in charsetSize get 10.0

        # Count transitions from the big text file, taken from
        # http://norvig.com/spell-correct.html
        text = .getFile
        sr = StringReader(text)
        line = sr.readLine
        # Test before the first iteration: when the download failed the text is
        # empty and the very first readLine already returns nil.
        while line is not nil
            for gram in .ngram(2, line to !)
                counts[_charIndex[gram[0]]][_charIndex[gram[1]]] += 1
            line = sr.readLine

        # Normalize the counts so they become log probabilities.
        # We use log probabilities rather than straight probabilities to avoid
        # numeric underflow issues with long texts.
        # This contains a justification:
        # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
        # NOTE(review): only the per-row total is computed here; it is never
        # applied to the row and the counts are not stored or returned. The
        # normalization step looks unfinished - confirm against the full source.
        for r in counts
            sum = 0.0
            for c in r
                sum += c
69

Download in other formats:

Original Format