Wiki

root/cobra/trunk/Samples/SimpleEnglishParser.cobra

Revision 2378, 10.3 KB (checked in by Chuck.Esterbrook, 2 years ago)

(minor) Added a doc note.

  • Property svn:eol-style set to native
Line 
1"""
2SimpleEnglishParser.cobra
3
4Parses a limited set of English sentences using a recursive descent parser.
5
6To run:
7
8    cobra SimpleEnglishParser.cobra
9
10Some test sentences are included. Example output:
11
12>> John took the orange.
13sentence
14    noun-phrase
15        name/"John"
16    verb-phrase
17        verb/"took"
18        noun-phrase
19            determiner/"the"
20            noun/"orange"
21
22
23== Grammar
24
25Sentence -> NounPhrase VerbPhrase
26NounPhrase -> Name | Pronoun | Determiner Adjective* Noun | NounPhrase PrepositionalPhrase
27VerbPhrase -> Verb | Verb NounPhrase | VerbPhrase PrepositionalPhrase
28PrepositionalPhrase -> Preposition NounPhrase
29
30See the Lexer class for the vocabulary of words that are recognized.
31
32
33== Limitations
34
35 * Technically, this is a parser for a formal language, but English is a natural language and there
36   are many, many English sentences that wouldn't fit the grammatical structure covered here.
37 * Does not intelligently handle groups of words like "Los Angeles" or "May 5th".
38 * Does not handle ambiguities.
39 * The vocabularly is hard coded in the source code instead of taken from an external source such as
40   text files or a WordNet database.
41 * The location of a token is not recorded.
42 * The location of a parse error is not reported.
43
44
45== To Do
46
47 * Even though this is a small sample application, it might still be nice to include the
48   100 most-used words in the base vocabulary.
49   See http://s.wsj.net/public/resources/documents/info-numbguy-sort.html
50
51
52== See Also
53
54 * http://en.wikipedia.org/wiki/Recursive_descent_parser
55 * This English grammar is an example taken from "Paradigms of AI Programming" 1992 by Norvig
56
57"""
58
59
60class SimpleEnglishParser
61
62    def main
63        verbose = true
64        cases = [
65            ['John saw Mary.', 'sentence(noun-phrase(name/"John"), verb-phrase(verb/"saw", noun-phrase(name/"Mary")))'],
66            ['John took the orange.', 'sentence(noun-phrase(name/"John"), verb-phrase(verb/"took", noun-phrase(determiner/"the", noun/"orange")))'],
67            ['The green ball hit John.', 'sentence(noun-phrase(determiner/"The", adjective/"green", noun/"ball"), verb-phrase(verb/"hit", noun-phrase(name/"John")))'],
68            ['The green ball hit John in the face.', 'sentence(noun-phrase(determiner/"The", adjective/"green", noun/"ball"), verb-phrase(verb/"hit", noun-phrase(noun-phrase(name/"John"), prepositional-phrase(preposition/"in", noun-phrase(determiner/"the", noun/"face")))))'],
69            ['The woman hit the man with a little green ball.', 'sentence(noun-phrase(determiner/"The", noun/"woman"), verb-phrase(verb/"hit", noun-phrase(noun-phrase(determiner/"the", noun/"man"), prepositional-phrase(preposition/"with", noun-phrase(determiner/"a", adjective/"little", adjective/"green", noun/"ball")))))'],
70            ['The woman hit the man with a little green ball on the table.', 'sentence(noun-phrase(determiner/"The", noun/"woman"), verb-phrase(verb/"hit", noun-phrase(noun-phrase(determiner/"the", noun/"man"), prepositional-phrase(preposition/"with", noun-phrase(noun-phrase(determiner/"a", adjective/"little", adjective/"green", noun/"ball"), prepositional-phrase(preposition/"on", noun-phrase(determiner/"the", noun/"table")))))))'],
71        ]
72        for input, expected in cases
73            if verbose
74                print
75                print '>>', input
76            s = .parseOne(input)
77            s.print
78            if verbose and s.toString <> expected
79                print 'expected:', expected
80                print 'actual:  ', s
81            assert s.toString == expected, input
82        if verbose
83            print
84            print 'done.'
85
86    get tokens from var as TokenStream?
87
88    ## Parse
89   
90    # to-do: def parseMany for multiple sentences
91   
92    def parseOne(sentence as String) as Node
93        return .parseOne(sentence, Lexer())
94       
95    def parseOne(sentence as String, lexer as Lexer) as Node
96        _tokens = lexer.lexOne(sentence)
97        node = .sentence
98        if .tokens.peek
99            throw ParseException('Did not consume all output. Next token: [.tokens.peek]')
100        return node
101
102    ## Non-terminals
103   
104    def sentence as Node
105        return Node('sentence', .nounPhrase, .verbPhrase)
106
107    def nounPhrase as Node
108        """
109        NounPhrase -> Name | Pronoun | Determiner Adjective* Noun | NounPhrase PrepositionalPhrase
110        """
111        node = _nounPhrase
112        peek = .tokens.peek
113        if peek and peek.category == 'preposition'
114            node = Node('noun-phrase', node, .prepositionalPhrase)
115        return node
116
117    def _nounPhrase as Node
118        # -> .name
119        token = .name
120        if token, return Node('noun-phrase', token)
121       
122        # -> .pronoun
123        token = .pronoun
124        if token, return Node('noun-phrase', token)
125
126        # -> .determiner .adjective* .noun
127        token = .determiner
128        if token
129            subnodes = List<of Part>()
130            subnodes.add(token)
131            while true
132                adj = .adjective
133                if adj, subnodes.add(adj)
134                else, break
135            token = .noun
136            if token, subnodes.add(token)
137            else, throw ParseException('Expecting a noun after adjectives.')
138            return Node('noun-phrase', subnodes)
139           
140        throw ParseException('Expecting noun phrase.')
141   
142    def verbPhrase as Node
143        """ VerbPhrase -> Verb | Verb NounPhrase | VerbPhrase PrepositionalPhrase """
144        verb = .verb
145        if verb
146            # check for noun phrase
147            peek = .tokens.peek
148            if peek and peek.category in {'determiner', 'name', 'pronoun'}
149                node = Node('verb-phrase', verb, .nounPhrase)
150            else
151                node = Node('verb-phrase', verb)
152            # check for prepositional phrase
153            peek = .tokens.peek
154            if peek and peek.category == 'preposition'
155                node = Node('verb-phrase', node, .prepositionalPhrase)
156            return node
157        throw ParseException('Expecting verb phrase.')
158   
159    def prepositionalPhrase as Node
160        """ PrepositionalPhrase -> Preposition NounPhrase """
161        return Node('prepositional-phrase', .preposition, .nounPhrase)
162
163    ## Terminals
164   
165    def adjective as Token?
166        return .tokens.optional('adjective')
167       
168    def determiner as Token?
169        return .tokens.optional('determiner')
170   
171    def name as Token?
172        return .tokens.optional('name')
173
174    def noun as Token?
175        return .tokens.optional('noun')
176       
177    def preposition as Token?
178        return .tokens.optional('preposition')
179       
180    def pronoun as Token?
181        return .tokens.optional('pronoun')
182   
183    def verb as Token?
184        return .tokens.optional('verb')
185
186
187class ParseException inherits Exception
188
189    cue init(message as String)
190        .init(message, nil)
191   
192    cue init(message as String, innerExc as Exception?)
193        base.init(message, innerExc)
194
195
196class Lexer
197
198    test
199        lx = Lexer()
200        ts = lx.lexOne('you')
201        assert ts.next.doesMatch('pronoun', 'you', 'you')
202        assert ts.peek is nil
203
204        ts = lx.lexOne('  You   saw  ')
205        assert ts.next.doesMatch('pronoun', 'You', 'you')
206        assert ts.next.doesMatch('verb', 'saw', 'saw')
207        assert ts.peek is nil
208
209    var _categories = [
210        'pronoun = i you he she it me him her',
211        'name = john mary',
212        'adjective = big little old young blue green orange perspicuous',
213        'determiner = the a an',
214        'noun = ball face hall man noun orange saw table verb woman',
215        'preposition = with for at on by of in',
216        'verb = hit took liked saw saws walks',
217    ]
218
219    var _wordToCategory = Dictionary<of String, String>()
220
221    cue init
222        base.init
223        _unpack
224       
225    def _unpack
226        for line in _categories
227            parts = line.toLower.split(c'=')
228            cat = parts[0].trim
229            for word in parts[1].split
230                word = word.trim
231                if word <> '', _wordToCategory[word] = cat
232        assert _wordToCategory.count > 0
233        # trace _wordToCategory
234       
235    def lexOne(sentence as String) as TokenStream
236        tokens = List<of Token>()
237        if sentence.endsWith('.'), sentence = sentence[:-1]
238        for word in sentence.split
239            word = word.trim
240            if word == '', continue
241            normalized = word.toLower
242            cat = _wordToCategory.get(normalized, '_unknown')
243            # trace cat, word, normalized
244            tokens.add(Token(cat, word, normalized))
245        return TokenStream(tokens)
246
247
248class Part is abstract
249    """
250    The parser yields parts which are either Nodes or Tokens.
251    All parts have a .category.
252    """
253
254    cue init(category as String)
255        base.init
256        _category = category
257       
258    get category from var as String
259
260    def print
261        .print(0)
262
263    def print(indent as int) is abstract
264
265
266class Node inherits Part
267    """
268    Used to represent non-terminals in the grammar as recognized by the parser.
269    In other words, nodes represent structure through their subnodes.
270    """
271
272    cue init(category as String, subnodes as vari Part)
273        .init(category, subnodes to Part*)
274   
275    cue init(category as String, subnodes as Part*)
276        base.init(category)
277        _subnodes = subnodes
278   
279    get subnodes from var as Part*
280
281    def print(indent as int) is override
282        print String(c' ', 4*indent) stop
283        print .category
284        indent += 1
285        for subnode in .subnodes
286            subnode.print(indent)
287
288    def toString as String is override
289        sb = StringBuilder('[.category](')
290        sep = ''
291        for node in .subnodes
292            sb.append('[sep][node]')
293            sep = ', '
294        sb.append(')')
295        return sb.toString
296
297
298class TokenStream
299    """
300    A token stream represents a linear stream of tokens that can be easily traversed and examined
301    with .next, .optional and .peek. By setting .verbose to true, you can observe the consumption of
302    tokens by the parser as it happpens.
303    """
304   
305    test
306        ts = TokenStream([Token('a', 'a', 'a'), Token('b', 'b', 'b'), Token('c', 'c', 'c')])
307        assert ts.peek.category == 'a'
308        assert ts.next.category == 'a'
309        assert ts.peek.category == 'b'
310        assert ts.next.category == 'b'
311        assert ts.peek.category == 'c'
312        assert ts.next.category == 'c'
313        assert ts.peek is nil
314
315    var _tokens as List<of Token>
316    var _index = -1
317   
318    cue init(tokens as Token*)
319        base.init
320        _tokens = List<of Token>(tokens)
321
322    pro verbose from var = false
323   
324    def next as Token?
325        """ Returns the next token, or nil if there are none left. """
326        if _index < _tokens.count, _index += 1
327        if _index < _tokens.count
328            result = _tokens[_index]
329            if .verbose, print '<> TokenStream.next =', result
330            return result
331        return nil
332
333    def optional(category as String) as Token?
334        if .peek and .peek.category == category
335            result = .next
336            if .verbose, print '<> TokenStream.optional =', result
337            return result
338        return nil
339       
340    def peek as Token?
341        if _index + 1 < _tokens.count, return _tokens[_index+1]
342        return nil
343
344
345class Token inherits Part
346
347    cue init(category as String, originalText as String, normalizedText as String)
348        base.init(category)
349        _originalText, _normalizedText = originalText, normalizedText
350
351    get originalText from var as String
352
353    get normalizedText from var as String
354
355    def doesMatch(category as String, originalText as String, normalizedText as String) as bool
356        return .category == category and .originalText == originalText and _
357            .normalizedText == normalizedText
358
359    def print(indent as int) is override
360        print String(c' ', 4*indent) stop
361        print this
362
363    def toString as String is override
364        return '[.category]/"[.originalText]"'
Note: See TracBrowser for help on using the browser.