| 1 | """ |
|---|
| 2 | SimpleEnglishParser.cobra |
|---|
| 3 | |
|---|
| 4 | Parses a limited set of English sentences using a recursive descent parser. |
|---|
| 5 | |
|---|
| 6 | To run: |
|---|
| 7 | |
|---|
| 8 | cobra SimpleEnglishParser.cobra |
|---|
| 9 | |
|---|
| 10 | Some test sentences are included. Example output: |
|---|
| 11 | |
|---|
| 12 | >> John took the orange. |
|---|
| 13 | sentence |
|---|
| 14 | noun-phrase |
|---|
| 15 | name/"John" |
|---|
| 16 | verb-phrase |
|---|
| 17 | verb/"took" |
|---|
| 18 | noun-phrase |
|---|
| 19 | determiner/"the" |
|---|
| 20 | noun/"orange" |
|---|
| 21 | |
|---|
| 22 | |
|---|
| 23 | == Grammar |
|---|
| 24 | |
|---|
| 25 | Sentence -> NounPhrase VerbPhrase |
|---|
| 26 | NounPhrase -> Name | Pronoun | Determiner Adjective* Noun | NounPhrase PrepositionalPhrase |
|---|
| 27 | VerbPhrase -> Verb | Verb NounPhrase | VerbPhrase PrepositionalPhrase |
|---|
| 28 | PrepositionalPhrase -> Preposition NounPhrase |
|---|
| 29 | |
|---|
| 30 | See the Lexer class for the vocabulary of words that are recognized. |
|---|
| 31 | |
|---|
| 32 | |
|---|
| 33 | == Limitations |
|---|
| 34 | |
|---|
| 35 | * Technically, this is a parser for a formal language, but English is a natural language and there |
|---|
| 36 | are many, many English sentences that wouldn't fit the grammatical structure covered here. |
|---|
| 37 | * Does not intelligently handle groups of words like "Los Angeles" or "May 5th". |
|---|
| 38 | * Does not handle ambiguities. |
|---|
| 39 | * The vocabularly is hard coded in the source code instead of taken from an external source such as |
|---|
| 40 | text files or a WordNet database. |
|---|
| 41 | * The location of a token is not recorded. |
|---|
| 42 | * The location of a parse error is not reported. |
|---|
| 43 | |
|---|
| 44 | |
|---|
| 45 | == To Do |
|---|
| 46 | |
|---|
| 47 | * Even though this is a small sample application, it might still be nice to include the |
|---|
| 48 | 100 most-used words in the base vocabulary. |
|---|
| 49 | See http://s.wsj.net/public/resources/documents/info-numbguy-sort.html |
|---|
| 50 | |
|---|
| 51 | |
|---|
| 52 | == See Also |
|---|
| 53 | |
|---|
| 54 | * http://en.wikipedia.org/wiki/Recursive_descent_parser |
|---|
| 55 | * This English grammar is an example taken from "Paradigms of AI Programming" 1992 by Norvig |
|---|
| 56 | |
|---|
| 57 | """ |
|---|
| 58 | |
|---|
| 59 | |
|---|
| 60 | class SimpleEnglishParser |
|---|
| 61 | |
|---|
| 62 | def main |
|---|
| 63 | verbose = true |
|---|
| 64 | cases = [ |
|---|
| 65 | ['John saw Mary.', 'sentence(noun-phrase(name/"John"), verb-phrase(verb/"saw", noun-phrase(name/"Mary")))'], |
|---|
| 66 | ['John took the orange.', 'sentence(noun-phrase(name/"John"), verb-phrase(verb/"took", noun-phrase(determiner/"the", noun/"orange")))'], |
|---|
| 67 | ['The green ball hit John.', 'sentence(noun-phrase(determiner/"The", adjective/"green", noun/"ball"), verb-phrase(verb/"hit", noun-phrase(name/"John")))'], |
|---|
| 68 | ['The green ball hit John in the face.', 'sentence(noun-phrase(determiner/"The", adjective/"green", noun/"ball"), verb-phrase(verb/"hit", noun-phrase(noun-phrase(name/"John"), prepositional-phrase(preposition/"in", noun-phrase(determiner/"the", noun/"face")))))'], |
|---|
| 69 | ['The woman hit the man with a little green ball.', 'sentence(noun-phrase(determiner/"The", noun/"woman"), verb-phrase(verb/"hit", noun-phrase(noun-phrase(determiner/"the", noun/"man"), prepositional-phrase(preposition/"with", noun-phrase(determiner/"a", adjective/"little", adjective/"green", noun/"ball")))))'], |
|---|
| 70 | ['The woman hit the man with a little green ball on the table.', 'sentence(noun-phrase(determiner/"The", noun/"woman"), verb-phrase(verb/"hit", noun-phrase(noun-phrase(determiner/"the", noun/"man"), prepositional-phrase(preposition/"with", noun-phrase(noun-phrase(determiner/"a", adjective/"little", adjective/"green", noun/"ball"), prepositional-phrase(preposition/"on", noun-phrase(determiner/"the", noun/"table")))))))'], |
|---|
| 71 | ] |
|---|
| 72 | for input, expected in cases |
|---|
| 73 | if verbose |
|---|
| 74 | print |
|---|
| 75 | print '>>', input |
|---|
| 76 | s = .parseOne(input) |
|---|
| 77 | s.print |
|---|
| 78 | if verbose and s.toString <> expected |
|---|
| 79 | print 'expected:', expected |
|---|
| 80 | print 'actual: ', s |
|---|
| 81 | assert s.toString == expected, input |
|---|
| 82 | if verbose |
|---|
| 83 | print |
|---|
| 84 | print 'done.' |
|---|
| 85 | |
|---|
| 86 | get tokens from var as TokenStream? |
|---|
| 87 | |
|---|
| 88 | ## Parse |
|---|
| 89 | |
|---|
| 90 | # to-do: def parseMany for multiple sentences |
|---|
| 91 | |
|---|
| 92 | def parseOne(sentence as String) as Node |
|---|
| 93 | return .parseOne(sentence, Lexer()) |
|---|
| 94 | |
|---|
| 95 | def parseOne(sentence as String, lexer as Lexer) as Node |
|---|
| 96 | _tokens = lexer.lexOne(sentence) |
|---|
| 97 | node = .sentence |
|---|
| 98 | if .tokens.peek |
|---|
| 99 | throw ParseException('Did not consume all output. Next token: [.tokens.peek]') |
|---|
| 100 | return node |
|---|
| 101 | |
|---|
| 102 | ## Non-terminals |
|---|
| 103 | |
|---|
| 104 | def sentence as Node |
|---|
| 105 | return Node('sentence', .nounPhrase, .verbPhrase) |
|---|
| 106 | |
|---|
| 107 | def nounPhrase as Node |
|---|
| 108 | """ |
|---|
| 109 | NounPhrase -> Name | Pronoun | Determiner Adjective* Noun | NounPhrase PrepositionalPhrase |
|---|
| 110 | """ |
|---|
| 111 | node = _nounPhrase |
|---|
| 112 | peek = .tokens.peek |
|---|
| 113 | if peek and peek.category == 'preposition' |
|---|
| 114 | node = Node('noun-phrase', node, .prepositionalPhrase) |
|---|
| 115 | return node |
|---|
| 116 | |
|---|
| 117 | def _nounPhrase as Node |
|---|
| 118 | # -> .name |
|---|
| 119 | token = .name |
|---|
| 120 | if token, return Node('noun-phrase', token) |
|---|
| 121 | |
|---|
| 122 | # -> .pronoun |
|---|
| 123 | token = .pronoun |
|---|
| 124 | if token, return Node('noun-phrase', token) |
|---|
| 125 | |
|---|
| 126 | # -> .determiner .adjective* .noun |
|---|
| 127 | token = .determiner |
|---|
| 128 | if token |
|---|
| 129 | subnodes = List<of Part>() |
|---|
| 130 | subnodes.add(token) |
|---|
| 131 | while true |
|---|
| 132 | adj = .adjective |
|---|
| 133 | if adj, subnodes.add(adj) |
|---|
| 134 | else, break |
|---|
| 135 | token = .noun |
|---|
| 136 | if token, subnodes.add(token) |
|---|
| 137 | else, throw ParseException('Expecting a noun after adjectives.') |
|---|
| 138 | return Node('noun-phrase', subnodes) |
|---|
| 139 | |
|---|
| 140 | throw ParseException('Expecting noun phrase.') |
|---|
| 141 | |
|---|
| 142 | def verbPhrase as Node |
|---|
| 143 | """ VerbPhrase -> Verb | Verb NounPhrase | VerbPhrase PrepositionalPhrase """ |
|---|
| 144 | verb = .verb |
|---|
| 145 | if verb |
|---|
| 146 | # check for noun phrase |
|---|
| 147 | peek = .tokens.peek |
|---|
| 148 | if peek and peek.category in {'determiner', 'name', 'pronoun'} |
|---|
| 149 | node = Node('verb-phrase', verb, .nounPhrase) |
|---|
| 150 | else |
|---|
| 151 | node = Node('verb-phrase', verb) |
|---|
| 152 | # check for prepositional phrase |
|---|
| 153 | peek = .tokens.peek |
|---|
| 154 | if peek and peek.category == 'preposition' |
|---|
| 155 | node = Node('verb-phrase', node, .prepositionalPhrase) |
|---|
| 156 | return node |
|---|
| 157 | throw ParseException('Expecting verb phrase.') |
|---|
| 158 | |
|---|
| 159 | def prepositionalPhrase as Node |
|---|
| 160 | """ PrepositionalPhrase -> Preposition NounPhrase """ |
|---|
| 161 | return Node('prepositional-phrase', .preposition, .nounPhrase) |
|---|
| 162 | |
|---|
| 163 | ## Terminals |
|---|
| 164 | |
|---|
| 165 | def adjective as Token? |
|---|
| 166 | return .tokens.optional('adjective') |
|---|
| 167 | |
|---|
| 168 | def determiner as Token? |
|---|
| 169 | return .tokens.optional('determiner') |
|---|
| 170 | |
|---|
| 171 | def name as Token? |
|---|
| 172 | return .tokens.optional('name') |
|---|
| 173 | |
|---|
| 174 | def noun as Token? |
|---|
| 175 | return .tokens.optional('noun') |
|---|
| 176 | |
|---|
| 177 | def preposition as Token? |
|---|
| 178 | return .tokens.optional('preposition') |
|---|
| 179 | |
|---|
| 180 | def pronoun as Token? |
|---|
| 181 | return .tokens.optional('pronoun') |
|---|
| 182 | |
|---|
| 183 | def verb as Token? |
|---|
| 184 | return .tokens.optional('verb') |
|---|
| 185 | |
|---|
| 186 | |
|---|
| 187 | class ParseException inherits Exception |
|---|
| 188 | |
|---|
| 189 | cue init(message as String) |
|---|
| 190 | .init(message, nil) |
|---|
| 191 | |
|---|
| 192 | cue init(message as String, innerExc as Exception?) |
|---|
| 193 | base.init(message, innerExc) |
|---|
| 194 | |
|---|
| 195 | |
|---|
| 196 | class Lexer |
|---|
| 197 | |
|---|
| 198 | test |
|---|
| 199 | lx = Lexer() |
|---|
| 200 | ts = lx.lexOne('you') |
|---|
| 201 | assert ts.next.doesMatch('pronoun', 'you', 'you') |
|---|
| 202 | assert ts.peek is nil |
|---|
| 203 | |
|---|
| 204 | ts = lx.lexOne(' You saw ') |
|---|
| 205 | assert ts.next.doesMatch('pronoun', 'You', 'you') |
|---|
| 206 | assert ts.next.doesMatch('verb', 'saw', 'saw') |
|---|
| 207 | assert ts.peek is nil |
|---|
| 208 | |
|---|
| 209 | var _categories = [ |
|---|
| 210 | 'pronoun = i you he she it me him her', |
|---|
| 211 | 'name = john mary', |
|---|
| 212 | 'adjective = big little old young blue green orange perspicuous', |
|---|
| 213 | 'determiner = the a an', |
|---|
| 214 | 'noun = ball face hall man noun orange saw table verb woman', |
|---|
| 215 | 'preposition = with for at on by of in', |
|---|
| 216 | 'verb = hit took liked saw saws walks', |
|---|
| 217 | ] |
|---|
| 218 | |
|---|
| 219 | var _wordToCategory = Dictionary<of String, String>() |
|---|
| 220 | |
|---|
| 221 | cue init |
|---|
| 222 | base.init |
|---|
| 223 | _unpack |
|---|
| 224 | |
|---|
| 225 | def _unpack |
|---|
| 226 | for line in _categories |
|---|
| 227 | parts = line.toLower.split(c'=') |
|---|
| 228 | cat = parts[0].trim |
|---|
| 229 | for word in parts[1].split |
|---|
| 230 | word = word.trim |
|---|
| 231 | if word <> '', _wordToCategory[word] = cat |
|---|
| 232 | assert _wordToCategory.count > 0 |
|---|
| 233 | # trace _wordToCategory |
|---|
| 234 | |
|---|
| 235 | def lexOne(sentence as String) as TokenStream |
|---|
| 236 | tokens = List<of Token>() |
|---|
| 237 | if sentence.endsWith('.'), sentence = sentence[:-1] |
|---|
| 238 | for word in sentence.split |
|---|
| 239 | word = word.trim |
|---|
| 240 | if word == '', continue |
|---|
| 241 | normalized = word.toLower |
|---|
| 242 | cat = _wordToCategory.get(normalized, '_unknown') |
|---|
| 243 | # trace cat, word, normalized |
|---|
| 244 | tokens.add(Token(cat, word, normalized)) |
|---|
| 245 | return TokenStream(tokens) |
|---|
| 246 | |
|---|
| 247 | |
|---|
| 248 | class Part is abstract |
|---|
| 249 | """ |
|---|
| 250 | The parser yields parts which are either Nodes or Tokens. |
|---|
| 251 | All parts have a .category. |
|---|
| 252 | """ |
|---|
| 253 | |
|---|
| 254 | cue init(category as String) |
|---|
| 255 | base.init |
|---|
| 256 | _category = category |
|---|
| 257 | |
|---|
| 258 | get category from var as String |
|---|
| 259 | |
|---|
| 260 | def print |
|---|
| 261 | .print(0) |
|---|
| 262 | |
|---|
| 263 | def print(indent as int) is abstract |
|---|
| 264 | |
|---|
| 265 | |
|---|
| 266 | class Node inherits Part |
|---|
| 267 | """ |
|---|
| 268 | Used to represent non-terminals in the grammar as recognized by the parser. |
|---|
| 269 | In other words, nodes represent structure through their subnodes. |
|---|
| 270 | """ |
|---|
| 271 | |
|---|
| 272 | cue init(category as String, subnodes as vari Part) |
|---|
| 273 | .init(category, subnodes to Part*) |
|---|
| 274 | |
|---|
| 275 | cue init(category as String, subnodes as Part*) |
|---|
| 276 | base.init(category) |
|---|
| 277 | _subnodes = subnodes |
|---|
| 278 | |
|---|
| 279 | get subnodes from var as Part* |
|---|
| 280 | |
|---|
| 281 | def print(indent as int) is override |
|---|
| 282 | print String(c' ', 4*indent) stop |
|---|
| 283 | print .category |
|---|
| 284 | indent += 1 |
|---|
| 285 | for subnode in .subnodes |
|---|
| 286 | subnode.print(indent) |
|---|
| 287 | |
|---|
| 288 | def toString as String is override |
|---|
| 289 | sb = StringBuilder('[.category](') |
|---|
| 290 | sep = '' |
|---|
| 291 | for node in .subnodes |
|---|
| 292 | sb.append('[sep][node]') |
|---|
| 293 | sep = ', ' |
|---|
| 294 | sb.append(')') |
|---|
| 295 | return sb.toString |
|---|
| 296 | |
|---|
| 297 | |
|---|
| 298 | class TokenStream |
|---|
| 299 | """ |
|---|
| 300 | A token stream represents a linear stream of tokens that can be easily traversed and examined |
|---|
| 301 | with .next, .optional and .peek. By setting .verbose to true, you can observe the consumption of |
|---|
| 302 | tokens by the parser as it happpens. |
|---|
| 303 | """ |
|---|
| 304 | |
|---|
| 305 | test |
|---|
| 306 | ts = TokenStream([Token('a', 'a', 'a'), Token('b', 'b', 'b'), Token('c', 'c', 'c')]) |
|---|
| 307 | assert ts.peek.category == 'a' |
|---|
| 308 | assert ts.next.category == 'a' |
|---|
| 309 | assert ts.peek.category == 'b' |
|---|
| 310 | assert ts.next.category == 'b' |
|---|
| 311 | assert ts.peek.category == 'c' |
|---|
| 312 | assert ts.next.category == 'c' |
|---|
| 313 | assert ts.peek is nil |
|---|
| 314 | |
|---|
| 315 | var _tokens as List<of Token> |
|---|
| 316 | var _index = -1 |
|---|
| 317 | |
|---|
| 318 | cue init(tokens as Token*) |
|---|
| 319 | base.init |
|---|
| 320 | _tokens = List<of Token>(tokens) |
|---|
| 321 | |
|---|
| 322 | pro verbose from var = false |
|---|
| 323 | |
|---|
| 324 | def next as Token? |
|---|
| 325 | """ Returns the next token, or nil if there are none left. """ |
|---|
| 326 | if _index < _tokens.count, _index += 1 |
|---|
| 327 | if _index < _tokens.count |
|---|
| 328 | result = _tokens[_index] |
|---|
| 329 | if .verbose, print '<> TokenStream.next =', result |
|---|
| 330 | return result |
|---|
| 331 | return nil |
|---|
| 332 | |
|---|
| 333 | def optional(category as String) as Token? |
|---|
| 334 | if .peek and .peek.category == category |
|---|
| 335 | result = .next |
|---|
| 336 | if .verbose, print '<> TokenStream.optional =', result |
|---|
| 337 | return result |
|---|
| 338 | return nil |
|---|
| 339 | |
|---|
| 340 | def peek as Token? |
|---|
| 341 | if _index + 1 < _tokens.count, return _tokens[_index+1] |
|---|
| 342 | return nil |
|---|
| 343 | |
|---|
| 344 | |
|---|
| 345 | class Token inherits Part |
|---|
| 346 | |
|---|
| 347 | cue init(category as String, originalText as String, normalizedText as String) |
|---|
| 348 | base.init(category) |
|---|
| 349 | _originalText, _normalizedText = originalText, normalizedText |
|---|
| 350 | |
|---|
| 351 | get originalText from var as String |
|---|
| 352 | |
|---|
| 353 | get normalizedText from var as String |
|---|
| 354 | |
|---|
| 355 | def doesMatch(category as String, originalText as String, normalizedText as String) as bool |
|---|
| 356 | return .category == category and .originalText == originalText and _ |
|---|
| 357 | .normalizedText == normalizedText |
|---|
| 358 | |
|---|
| 359 | def print(indent as int) is override |
|---|
| 360 | print String(c' ', 4*indent) stop |
|---|
| 361 | print this |
|---|
| 362 | |
|---|
| 363 | def toString as String is override |
|---|
| 364 | return '[.category]/"[.originalText]"' |
|---|