import string # Based on a class by Professor Norm Matloff, Dept. of Computer Science, University of California at Davis # http://heather.cs.ucdavis.edu/~matloff/py.html # http://heather.cs.ucdavis.edu/~matloff/Python/PLN/FastLanePython.pdf # Usage: # oliver = Textfile("c:/braude/python/projects/files/oliver_twist.txt") # => Create a Textfile object # # oliver.nlines # => Get number of lines in the file # # oliver.nwords # => Get number of words in the file # # oliver.nchars # => Get number of characters in the file # # oliver.freq['Fagin'] # => frequency of the word 'Fagin' in the text file # # oliver.find('Fagin') # => print all lines in which the word 'Fagin' appears # # oliver.most_common_words(n) # => returm list of n most common words in file # class Textfile: def __init__(self,filename): fp = open(filename, "r") # file pointer self.lines = fp.readlines() # list of all lines (public member) fp.close() self.name = filename # name of file (public member) self.nlines = len(self.lines) # number of lines (public member) self.nwords = 0 # number of words (public member). counts also non-alphabetic words and duplicates self.nchars = 0 # number of words (private member) self.freq = dict() # word frequency dictionary (starts empty, public member) self._prepare_data() # make all word/char calculations (private method) def words(self): # find list of words that are purely alphabetic (sorted) return sorted(self.freq.keys()) def find(self, target): "prints out all lines containing target" for line in self.lines: if line.find(target) >= 0: print line def most_common_words(self, n): # Return n most common words List = [(f, word) for word,f in self.freq.items()] List.sort() return [(word,f) for f,word in List[-n:]] # Private method! Should not be exposed to clients # make all necessary calculations for computing words and characters count # and at the same time build the word frequency dictionary def _prepare_data(self): for line in self.lines: self.nchars += len(line) words = line.split() self.nwords += len(words) for _word in words: word = _word.strip(string.punctuation) if not word.isalpha(): continue if word in self.freq: self.freq[word] += 1 else: self.freq[word] = 1 ######## TESTING THE CLASS ###################### def test1(): file = "D:/WORKSPACE/oliver_twist.txt" oliver = Textfile(file) print "number of lines:", oliver.nlines print "number of words:", oliver.nwords print "number of chars:", oliver.nchars print "The word 'Fagin' frequency:", oliver.freq['Fagin'] print "Print all lines in which the word 'Twist' appears" raw_input("Type to continue ...") oliver.find('Twist') print "Print all lines in which the word 'Fagin' appears" raw_input("Type to continue ...") oliver.find('Fagin') print "Print 12 most common words" raw_input("Type to continue ...") print oliver.most_common_words(12) if __name__ == "__main__": test1()