Wiki

root/cobra/trunk/Samples/FindWords.cobra

Revision 2076, 9.5 KB (checked in by Chuck.Esterbrook, 3 years ago)

New FindWords sample program.
credit:Caligari,Chuck

  • Property svn:eol-style set to native
Line 
1"""
2FindWords
3  by Caligari
4  with mods by Chuck
5
6This tool can be used to find which files in a directory tree contain any
7of a list of words.
8
9The list of words to search for is provided by default in a file called
10"wordlist.txt" (located in the same directory as the executable) with one
11regular expression per line.
12
13A list of filename extensions to ignore can also be provided, if needed.
14That list has one extension per line (not including the "."), and is
15not a regular expression.
16
17A description of further options is available using the "-help" command-line argument.
18
19Comments in the code explain the use of regular expressions.
20"""
21
22use System.Text.RegularExpressions
23
24
class FindWords
    """
    Walks a directory tree and reports every directory name, file name and
    (optionally) file line that matches one of a list of regular expressions.

    The patterns are read from a word-list file (one regular expression per
    line). An optional second file lists filename extensions to skip.
    """

    # Path of the file holding one regular expression per line.
    var wordsFilename = ''
    # Path of the optional file holding one extension to ignore per line ('' = none).
    var ignoreFilename = ''
    # Root of the directory tree to scan.
    var searchDirectory = ''
    # When true, only file and directory names are checked, not file contents.
    var onlyNames as bool
    # When true, matches are reported per line; otherwise once per file.
    var showLines as bool
    # Pattern text -> Regex built from it (case-insensitive, compiled).
    var words = SortedDictionary<of String, Regex>()
    # Extensions to skip, each stored with its leading ".".
    var ignoreExtensions = List<of String>()
    var startDir as DirectoryInfo?
    # The word-list file itself; it is excluded from the scan.
    var listFile as FileInfo?


    def main
        """
        Program entry point: sets the defaults, applies the command line,
        loads the word and ignore lists, then scans the directory tree.
        """
        .wordsFilename = Path.combine(Path.getDirectoryName(CobraCore.exePath), "wordlist.txt")
        .ignoreFilename = ""
        .searchDirectory = Path.getFullPath(".")
        .onlyNames = false
        .showLines = true

        .processCommandLine
        .readRegExList
        .readExtToIgnore

        if .words.count == 0
            # Without any pattern nothing can match, so do not walk the tree
            # and open every file for no result. Any read error has already
            # been reported by readRegExList.
            print "No search words available; nothing to search for."
            return

        .listFile = FileInfo(.wordsFilename)
        .startDir = DirectoryInfo(.searchDirectory)

        .checkDirectory(.startDir to !)

        print "\nDone."


    def showSyntax
        """
        Prints the command-line help text and exits the process with code 0.
        """
        # Program name without its directory and extension.
        appName = Path.getFileNameWithoutExtension(CobraCore.commandLineArgs[0])
        print "Syntax:\n[appName] \[searchDirectory\] \[-l|-list listFilename\] \[-i|-ignore ignoreListFilename\] \[-n|-names\] \[-s|-summary\] \[-h|-help\]"
        print "  -l|-list file    filename of the list of words to search for"
        print "  -i|-ignore file  filename of an optional list of extensions for files to ignore"
        print "  -n|-names        only check file and directory names"
        print "  -s|-summary      only show files, not lines within files"
        print "  -h|-help         show this help text"
        CobraCore.exit(0)


    def processCommandLine
        """
        Applies the command-line arguments to the option fields.

        Recognized switches are -l/-list and -i/-ignore (each followed by a
        filename), -n/-name/-names, -s/-summary and -h/-help. The first
        argument that is neither a switch nor a switch's filename is taken as
        the search directory. Any problem is reported and followed by the
        help text (showSyntax exits the process).
        """
        args = CobraCore.commandLineArgs

        listArgs    = ["-l", "-list"]
        ignoreArgs  = ["-i", "-ignore"]
        nameArgs    = ["-n", "-name", "-names"]
        summaryArgs = ["-s", "-summary"]
        helpArgs    = ["-h", "-help"]
        allArgs     = listArgs.concated(ignoreArgs).concated(nameArgs).concated(summaryArgs).concated(helpArgs)

        # Set after -i / -l until the filename that must follow has been seen.
        lookingForIgnore = false
        lookingForList = false
        foundDir = false
        argError = false

        if args.count > 1
            for arg in args[1:]
                # A switch appearing where a filename was expected means the
                # previous -l / -i had no filename.
                if arg in allArgs and lookingForList
                    print "unable to determine filename for list of words, expected '-l filename'"
                    lookingForList = false
                    argError = true
                if arg in allArgs and lookingForIgnore
                    print "unable to determine filename for list of ignore extensions, expected '-i filename'"
                    lookingForIgnore = false
                    argError = true

                if arg in helpArgs
                    .showSyntax
                else if arg in nameArgs
                    .onlyNames = true
                else if arg in summaryArgs
                    .showLines = false
                else if arg in ignoreArgs
                    lookingForIgnore = true
                else if arg in listArgs
                    lookingForList = true
                else if arg.startsWith('-')
                    # startsWith is safe for an empty argument, unlike indexing arg[0].
                    print "unrecognized argument [arg]"
                    argError = true
                else
                    if lookingForIgnore
                        .ignoreFilename = arg
                        lookingForIgnore = false
                    else if lookingForList
                        .wordsFilename = arg
                        lookingForList = false
                    else if not foundDir  # must be the directory to look in
                        .searchDirectory = arg
                        foundDir = true
                    else  # already found directory
                        print "more than one directory provided on commandline, expected 'directory'"
                        argError = true

        # A trailing -l / -i with nothing after it is also an error.
        if lookingForList
            print "unable to determine filename for list of words, expected '-l filename'"
            argError = true
        if lookingForIgnore
            print "unable to determine filename for list of ignore extensions, expected '-i filename'"
            argError = true

        if argError
            print
            .showSyntax


    def readRegExList
        """
        Loads .words from the file named by .wordsFilename, one regular
        expression per non-empty line. A line that is not a valid regular
        expression is reported and skipped. An I/O error is reported and
        leaves .words with whatever was loaded so far.
        """
        try
            using wordsFile = StreamReader(.wordsFilename)

                wordLine = wordsFile.readLine

                while wordLine
                    # We can tell if there is a word on a line using an
                    # inline regular expression:
                    #   Regex.match(textToSearch, regularExpression)
                    # The textToSearch can be a hard-coded string,
                    # but most often it will be a String.
                    # The regularExpression is a string (hard-coded
                    # or not) that follows the regular expression
                    # syntax.
                    wordMatch = Regex.match(wordLine, r".+")

                    # The return from a match() call is a Match class
                    # object. The success property tells us whether the
                    # search located what we were looking for.
                    # The toString method gives us the resulting subString.
                    if wordMatch.success
                        pattern = wordMatch.toString
                        # We can create a Regex object which can later be
                        # used to execute a search with a given regular
                        # expression. In this case we are setting two
                        # options (ignoring case in the search we are
                        # creating, and pre-compiling the search, as we
                        # may be using it many times).
                        # Note that here we are saving the resulting
                        # Regex object to a Dictionary for later use.
                        # We could just as easily save the string itself,
                        # but this way we only pay the Regex creation cost
                        # once, rather than every time we carry out the
                        # search.
                        try
                            .words[pattern] = Regex(pattern, RegexOptions(Compiled, IgnoreCase))
                        catch ae as ArgumentException
                            # The Regex constructor rejects malformed patterns;
                            # skip the bad line instead of aborting the run.
                            print 'Skipping invalid regular expression "[pattern]": [ae.message]'

                    wordLine = wordsFile.readLine
        catch ioe as IOException
            .reportError(.wordsFilename, ioe)
            return
        print "\nFound [.words.count] search words.\n"


    def readExtToIgnore
        """
        Loads .ignoreExtensions from the file named by .ignoreFilename, if one
        was given. Each non-blank line is an extension without the leading
        "."; it is stored with the "." added.
        """
        # optionally find extensions to ignore
        if .ignoreFilename.length > 0
            try
                using ignoreFile = StreamReader(.ignoreFilename)
                    ignoreLine = ignoreFile.readLine
                    while ignoreLine
                        # Trim before testing so a whitespace-only line does
                        # not register the bogus extension ".".
                        ext = ignoreLine.trim
                        if ext.length > 0
                            .ignoreExtensions.add("." + ext)
                        ignoreLine = ignoreFile.readLine
            catch ioe as IOException
                .reportError(.ignoreFilename, ioe)
                return
            print "\nFound [.ignoreExtensions.count] ignored extensions.\n"


    def checkDirectory(curDir as DirectoryInfo)
        """
        Reports `curDir` if its name matches any search word, checks each of
        its files (except the word-list file and files with an ignored
        extension) and then recurses into its subdirectories. A directory
        that cannot be listed is reported and skipped.
        """
        if not curDir.exists
            print "Unable to find search directory [curDir.fullName]"
            return

        # check directory name
        # Here we use a regular expression match
        # to determine whether the directory names have
        # any of the words we are looking for.
        # Each directory name is tested against each of the Regex objects
        # in the dictionary we created earlier.
        foundWords = .matchingWords(curDir.name)
        if foundWords.count
            print "[curDir.fullName]: directory name may have [foundWords]"

        # Listing can fail (for example on a folder we are not allowed to
        # read); report it and carry on with the rest of the tree.
        try
            files = curDir.getFiles
            subDirs = curDir.getDirectories
        catch ioe as IOException
            .reportError(curDir.fullName, ioe)
            return
        catch uae as UnauthorizedAccessException
            .reportError(curDir.fullName, uae)
            return

        # check files
        for subFile in files
            # Note that here we check if the file we are about
            # to look at is our own list of words
            # or has an extension we are ignoring. Neither of these
            # tests use regular expressions, although we could
            # use the same scheme for the ignore extensions, if
            # we wanted to support regular expressions there as well.
            if subFile.fullName == .listFile.fullName or subFile.extension in .ignoreExtensions
                continue
            .checkFile(subFile, curDir)

        for subDir in subDirs
            .checkDirectory(subDir)


    def checkFile(curFile as FileInfo, curDir as DirectoryInfo)
        """
        Reports `curFile` if its name matches any search word and, unless
        .onlyNames is set, searches its contents: line by line when
        .showLines is set, otherwise the whole text at once. A file that
        cannot be read is reported and skipped. `curDir` is not used.
        """
        if not curFile.exists
            print "Unable to find file [curFile.fullName]"
            return

        # check filename
        # Here we use a regular expression match
        # to determine whether the filename has
        # any of the words we are looking for.
        foundWords = .matchingWords(curFile.name)
        if foundWords.count
            print "[curFile.fullName]: file name may have [foundWords]"

        if .onlyNames
            return

        # check contents
        # The using block closes the reader on every path out of it,
        # including when reading fails part-way through.
        try
            using openFile = StreamReader(curFile.fullName)
                if .showLines
                    # Finally, we use a regular expression match
                    # on the contents of each file for each
                    # of the words we want to find. We do this
                    # check for each line in the file, so that
                    # we can report the findings by line.
                    lineNum = 1
                    curLine = openFile.readLine
                    while curLine
                        foundWords = .matchingWords(curLine)
                        if foundWords.count
                            print "[curFile.fullName]([lineNum]) has [foundWords]"
                        curLine = openFile.readLine
                        lineNum += 1
                else  # not show lines
                    # When we don't need to report each line in a file
                    # we read the entire contents into a "line" and
                    # run the same regular expression match on it.
                    contents = openFile.readToEnd
                    foundWords = .matchingWords(contents)
                    if foundWords.count
                        print "[curFile.fullName] has [foundWords]"
        catch ioe as IOException
            .reportError(curFile.fullName, ioe)
        catch uae as UnauthorizedAccessException
            .reportError(curFile.fullName, uae)


    def matchingWords(text as String?) as List<of String> is private
        """
        Returns the search words (the keys of .words) whose Regex matches
        somewhere in `text`, in dictionary order. Returns an empty list when
        `text` is nil.
        """
        found = List<of String>()
        if text is nil
            return found
        for word, wordMatch in .words
            # isMatch only answers whether the pattern occurs in the text,
            # which is all we need here.
            if wordMatch.isMatch(text to !)
                found.add(word)
        return found


    def reportError(fileName, exc as Exception)
        """
        Prints a read-error message for `fileName` followed by the
        exception's message and a hint about the help option.
        """
        print 'Error reading "[fileName]":'
        print exc.message
        print 'Run with -h for options.'
Note: See TracBrowser for help on using the browser.