""" SimpleEnglishParser.cobra Parses a limited set of English sentences using a recursive descent parser. To run: cobra SimpleEnglishParser.cobra Some test sentences are included. Example output: >> John took the orange. sentence noun-phrase name/"John" verb-phrase verb/"took" noun-phrase determiner/"the" noun/"orange" == Grammar Sentence -> NounPhrase VerbPhrase NounPhrase -> Name | Pronoun | Determiner Adjective* Noun | NounPhrase PrepositionalPhrase VerbPhrase -> Verb | Verb NounPhrase | VerbPhrase PrepositionalPhrase PrepositionalPhrase -> Preposition NounPhrase See the Lexer class for the vocabulary of words that are recognized. == Limitations * Technically, this is a parser for a formal language, but English is a natural language and there are many, many English sentences that wouldn't fit the grammatical structure covered here. * Does not intelligently handle groups of words like "Los Angeles" or "May 5th". * Does not handle ambiguities. * The vocabularly is hard coded in the source code instead of taken from an external source such as text files or a WordNet database. * The location of a token is not recorded. * The location of a parse error is not reported. == To Do * Even though this is a small sample application, it might still be nice to include the 100 most-used words in the base vocabulary. See http://s.wsj.net/public/resources/documents/info-numbguy-sort.html == See Also * http://en.wikipedia.org/wiki/Recursive_descent_parser * This English grammar is an example taken from "Paradigms of AI Programming" 1992 by Norvig """ class SimpleEnglishParser def main verbose = true cases = [ ['John saw Mary.', 'sentence(noun-phrase(name/"John"), verb-phrase(verb/"saw", noun-phrase(name/"Mary")))'], ['John took the orange.', 'sentence(noun-phrase(name/"John"), verb-phrase(verb/"took", noun-phrase(determiner/"the", noun/"orange")))'], ['The green ball hit John.', 'sentence(noun-phrase(determiner/"The", adjective/"green", noun/"ball"), verb-phrase(verb/"hit", noun-phrase(name/"John")))'], ['The green ball hit John in the face.', 'sentence(noun-phrase(determiner/"The", adjective/"green", noun/"ball"), verb-phrase(verb/"hit", noun-phrase(noun-phrase(name/"John"), prepositional-phrase(preposition/"in", noun-phrase(determiner/"the", noun/"face")))))'], ['The woman hit the man with a little green ball.', 'sentence(noun-phrase(determiner/"The", noun/"woman"), verb-phrase(verb/"hit", noun-phrase(noun-phrase(determiner/"the", noun/"man"), prepositional-phrase(preposition/"with", noun-phrase(determiner/"a", adjective/"little", adjective/"green", noun/"ball")))))'], ['The woman hit the man with a little green ball on the table.', 'sentence(noun-phrase(determiner/"The", noun/"woman"), verb-phrase(verb/"hit", noun-phrase(noun-phrase(determiner/"the", noun/"man"), prepositional-phrase(preposition/"with", noun-phrase(noun-phrase(determiner/"a", adjective/"little", adjective/"green", noun/"ball"), prepositional-phrase(preposition/"on", noun-phrase(determiner/"the", noun/"table")))))))'], ] for input, expected in cases if verbose print print '>>', input s = .parseOne(input) s.print if verbose and s.toString <> expected print 'expected:', expected print 'actual: ', s assert s.toString == expected, input if verbose print print 'done.' get tokens from var as TokenStream? 
    ## Parse

    # to-do: def parseMany for multiple sentences

    def parseOne(sentence as String) as Node
        return .parseOne(sentence, Lexer())

    def parseOne(sentence as String, lexer as Lexer) as Node
        _tokens = lexer.lexOne(sentence)
        node = .sentence
        if .tokens.peek
            throw ParseException('Did not consume all input. Next token: [.tokens.peek]')
        return node

    ## Non-terminals

    def sentence as Node
        return Node('sentence', .nounPhrase, .verbPhrase)

    def nounPhrase as Node
        """
        NounPhrase -> Name | Pronoun | Determiner Adjective* Noun | NounPhrase PrepositionalPhrase
        """
        # The last alternative is left recursive, which plain recursive
        # descent cannot code directly, so the non-recursive alternatives are
        # parsed first and a trailing prepositional phrase is attached
        # afterward if one follows.
        node = _nounPhrase
        peek = .tokens.peek
        if peek and peek.category == 'preposition'
            node = Node('noun-phrase', node, .prepositionalPhrase)
        return node

    def _nounPhrase as Node
        # -> .name
        token = .name
        if token, return Node('noun-phrase', token)
        # -> .pronoun
        token = .pronoun
        if token, return Node('noun-phrase', token)
        # -> .determiner .adjective* .noun
        token = .determiner
        if token
            subnodes = List<of Part>()
            subnodes.add(token)
            while true
                adj = .adjective
                if adj, subnodes.add(adj)
                else, break
            token = .noun
            if token, subnodes.add(token)
            else, throw ParseException('Expecting a noun after adjectives.')
            return Node('noun-phrase', subnodes)
        throw ParseException('Expecting noun phrase.')

    def verbPhrase as Node
        """
        VerbPhrase -> Verb | Verb NounPhrase | VerbPhrase PrepositionalPhrase
        """
        verb = .verb
        if verb
            # check for noun phrase
            peek = .tokens.peek
            if peek and peek.category in {'determiner', 'name', 'pronoun'}
                node = Node('verb-phrase', verb, .nounPhrase)
            else
                node = Node('verb-phrase', verb)
            # check for prepositional phrase (the same trailing-phrase
            # technique as .nounPhrase, since this rule is also left recursive)
            peek = .tokens.peek
            if peek and peek.category == 'preposition'
                node = Node('verb-phrase', node, .prepositionalPhrase)
            return node
        throw ParseException('Expecting verb phrase.')

    def prepositionalPhrase as Node
        """
        PrepositionalPhrase -> Preposition NounPhrase
        """
        return Node('prepositional-phrase', .preposition, .nounPhrase)

    ## Terminals

    def adjective as Token?
        return .tokens.optional('adjective')

    def determiner as Token?
        return .tokens.optional('determiner')

    def name as Token?
        return .tokens.optional('name')

    def noun as Token?
        return .tokens.optional('noun')

    def preposition as Token?
        return .tokens.optional('preposition')

    def pronoun as Token?
        return .tokens.optional('pronoun')

    def verb as Token?
        return .tokens.optional('verb')


class ParseException inherits Exception

    cue init(message as String)
        .init(message, nil)

    cue init(message as String, innerExc as Exception?)
        base.init(message, innerExc)
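
# Adding a new word category follows the pattern above. A sketch, assuming a
# hypothetical category 'adverb' (not in the vocabulary as written): define a
# terminal method in SimpleEnglishParser alongside the others,
#
#     def adverb as Token?
#         return .tokens.optional('adverb')
#
# then add a line such as 'adverb = quickly slowly' to Lexer._categories
# below, and call .adverb from whichever non-terminal method should accept it.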

class Lexer

    test
        lx = Lexer()
        ts = lx.lexOne('you')
        assert ts.next.doesMatch('pronoun', 'you', 'you')
        assert ts.peek is nil
        ts = lx.lexOne(' You saw ')
        assert ts.next.doesMatch('pronoun', 'You', 'you')
        assert ts.next.doesMatch('verb', 'saw', 'saw')
        assert ts.peek is nil

    var _categories = [
        'pronoun = i you he she it me him her',
        'name = john mary',
        'adjective = big little old young blue green orange perspicuous',
        'determiner = the a an',
        'noun = ball face hall man noun orange saw table verb woman',
        'preposition = with for at on by of in',
        'verb = hit took liked saw saws walks',
    ]

    var _wordToCategory = Dictionary<of String, String>()

    cue init
        base.init
        _unpack

    def _unpack
        for line in _categories
            parts = line.toLower.split(c'=')
            cat = parts[0].trim
            for word in parts[1].split
                word = word.trim
                if word <> '', _wordToCategory[word] = cat
        assert _wordToCategory.count > 0
        # trace _wordToCategory

    def lexOne(sentence as String) as TokenStream
        tokens = List<of Token>()
        if sentence.endsWith('.'), sentence = sentence[:-1]
        for word in sentence.split
            word = word.trim
            if word == '', continue
            normalized = word.toLower
            cat = _wordToCategory.get(normalized, '_unknown')
            # trace cat, word, normalized
            tokens.add(Token(cat, word, normalized))
        return TokenStream(tokens)


class Part is abstract
    """
    The parser yields parts, which are either Nodes or Tokens. All parts have
    a .category.
    """

    cue init(category as String)
        base.init
        _category = category

    get category from var as String

    def print
        .print(0)

    def print(indent as int) is abstract


class Node inherits Part
    """
    Used to represent non-terminals in the grammar as recognized by the
    parser. In other words, nodes represent structure through their subnodes.
    """

    cue init(category as String, subnodes as vari Part)
        .init(category, subnodes to Part*)

    cue init(category as String, subnodes as Part*)
        base.init(category)
        _subnodes = subnodes

    get subnodes from var as Part*

    def print(indent as int) is override
        print String(c' ', 4*indent) stop
        print .category
        indent += 1
        for subnode in .subnodes
            subnode.print(indent)

    def toString as String is override
        sb = StringBuilder('[.category](')
        sep = ''
        for node in .subnodes
            sb.append('[sep][node]')
            sep = ', '
        sb.append(')')
        return sb.toString


class TokenStream
    """
    A token stream represents a linear stream of tokens that can be easily
    traversed and examined with .next, .optional and .peek.

    By setting .verbose to true, you can observe the consumption of tokens by
    the parser as it happens.
    """

    test
        ts = TokenStream([Token('a', 'a', 'a'), Token('b', 'b', 'b'), Token('c', 'c', 'c')])
        assert ts.peek.category == 'a'
        assert ts.next.category == 'a'
        assert ts.peek.category == 'b'
        assert ts.next.category == 'b'
        assert ts.peek.category == 'c'
        assert ts.next.category == 'c'
        assert ts.peek is nil

    var _tokens as List<of Token>

    var _index = -1

    cue init(tokens as Token*)
        base.init
        _tokens = List<of Token>(tokens)

    pro verbose from var = false

    def next as Token?
        """
        Returns the next token, or nil if there are none left.
        """
        if _index < _tokens.count, _index += 1
        if _index < _tokens.count
            result = _tokens[_index]
            if .verbose, print '<> TokenStream.next =', result
            return result
        return nil

    def optional(category as String) as Token?
        """
        Consumes and returns the next token, but only if its category matches;
        otherwise, returns nil and leaves the stream untouched.
        """
        if .peek and .peek.category == category
            result = .next
            if .verbose, print '<> TokenStream.optional =', result
            return result
        return nil

    def peek as Token?
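        """
        Returns the next token without consuming it, or nil if there are none left.
        """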
        if _index + 1 < _tokens.count, return _tokens[_index+1]
        return nil


class Token inherits Part

    cue init(category as String, originalText as String, normalizedText as String)
        base.init(category)
        _originalText, _normalizedText = originalText, normalizedText

    get originalText from var as String

    get normalizedText from var as String

    def doesMatch(category as String, originalText as String, normalizedText as String) as bool
        return .category == category and .originalText == originalText and _
            .normalizedText == normalizedText

    def print(indent as int) is override
        print String(c' ', 4*indent) stop
        print this

    def toString as String is override
        return '[.category]/"[.originalText]"'
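
# A minimal usage sketch of the classes above. Everything named here exists
# in this file; only the sentence and the variable names are made up:
#
#     parser = SimpleEnglishParser()
#     node = parser.parseOne('Mary saw the ball.')
#     node.print           # indented tree, as in the example output up top
#     print node.toString  # one-line form matched by the expected strings in .main
#
# A ParseException is thrown for anything the grammar or vocabulary does not
# cover, such as 'Mary saw Los Angeles.'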