#------------------------------
# PROBLEM 5
#------------------------------
#
# Download the text file:
#
#   http://www.samyzaf.com/braude/PYTHON/projects/oliver_twist.txt
#
# (a) Write a function word_count(file) that prints the number of lines, words, and characters in a file.
#     Test it on the above file:
#     >>> word_count("oliver_twist.txt")
#     file: oliver_twist.txt
#     19191 lines 160999 words 916980 characters
#
# (b) Write a function word_frequency(file) which counts how many times each word appears in that book.
#     To make it simple: a word should consist only of English letters (no punctuation marks, hyphens, or quotes).
#     Hint: you should build a dictionary.
#     Hint: use Python's string.punctuation to remove punctuation characters from words.
#
#     Test your program by running it on the oliver_twist.txt book (you should get 12733 words!)
#     Try to sort the words by frequency (from most frequent to least frequent).
#
#     >>> word_frequency("oliver_twist.txt")
#     730 Oliver
#     303 gentleman
#     288 Fagin
#     53 Twist
#     36 Crackit
#     7 keyhole
#     4 funny
#     ...... (this is of course only a small part: there are 12733 words in this book!)
#
# (c) What is the most frequent 3-letter word in this book?
#     How many times does it appear in this book?
#
# (d) How many words occur more than 1000 times?
#     (don't count your output, write a program to find this!)
#----------------------------------------------
# SOLUTION:
#----------------------------------------------
import os
import string


# Part (a)
def word_count(file):
    """Print and return the line, word, and character counts of a text file.

    Words are whitespace-separated tokens (``str.split()``). Characters are
    counted per line as read, so line terminators are included in the total.

    Args:
        file: Path of the text file to read. The parameter name is kept
            as-is so existing keyword callers continue to work.

    Returns:
        A tuple ``(nlines, nwords, nchars)``.
    """
    nlines = 0
    nwords = 0
    nchars = 0
    # ``with`` guarantees the handle is closed even if reading raises.
    with open(file, "r") as f:
        for line in f:
            nlines += 1
            nwords += len(line.split())
            nchars += len(line)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("File: %s" % (file,))
    print("%d lines, %d words, %d characters" % (nlines, nwords, nchars))
    return nlines, nwords, nchars


# Part (b)
def word_frequency(filename):
    """Count how many times each word occurs in a text file.

    Each whitespace-separated token has leading and trailing
    ``string.punctuation`` characters stripped. Tokens that are then not
    purely alphabetic (``str.isalpha()``) are skipped, which drops empty
    tokens, numbers, and tokens with interior hyphens or apostrophes.
    Counting is case-sensitive.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping each word to its number of occurrences.
    """
    d = dict()
    with open(filename, "r") as f:
        for line in f:
            for w in line.split():
                # str.strip replaces the Python-2-only string.strip().
                word = w.strip(string.punctuation)
                if not word.isalpha():
                    continue
                d[word] = d.get(word, 0) + 1
    return d


# Part (c)
# Find the n most common words in the file
def most_common_words(filename, n, length=None):
    """Return the ``n`` most frequent words of a file.

    Args:
        filename: Path of the text file to read.
        n: Maximum number of words to return. Values <= 0 yield an empty
            list.
        length: Optional word length; when given, only words with exactly
            this many characters are considered (e.g. ``length=3`` with
            ``n=1`` finds the most frequent 3-letter word).

    Returns:
        A list of ``(count, word)`` tuples in ascending order, so the most
        frequent word is the last element. Ties are ordered by word.
    """
    # Guard needed because items[-0:] is items[0:], i.e. the whole list.
    if n <= 0:
        return []
    d = word_frequency(filename)
    items = [(value, key) for key, value in d.items()
             if length is None or len(key) == length]
    items.sort()
    return items[-n:]


# Part (d)
# Find all words whose frequency > n
def more_than_n(filename, n):
    """Return every word that occurs more than ``n`` times in a file.

    Args:
        filename: Path of the text file to read.
        n: Frequency threshold (exclusive).

    Returns:
        A list of ``(word, count)`` tuples, in the dict's iteration order.
    """
    d = word_frequency(filename)
    return [(word, d[word]) for word in d if d[word] > n]


# EXAMPLE USAGE
#----------------------------------------------
if __name__ == "__main__":
    filename = "D:/workspace/oliver_twist.txt"
    print("Testing Charles Dickens book: Oliver Twist")
    print("Counting lines, words, and characters:")
    print(word_count(filename))
    print("5 most common words:")
    print(most_common_words(filename, 5))
    print("Words whose frequency > 1000:")
    print(more_than_n(filename, 1000))