#---------------------------------------------------
# PROBLEM 4 : Advanced Solution for part (d)
#---------------------------------------------------
#
# Download the text file:
#
#     http://www.samyzaf.com/braude/PYTHON/projects/oliver_twist.txt
#
# (a) Write a function word_count(file) that prints the number of lines, words, and characters in a file.
#     Test it on the above file:
#     >>> word_count("oliver_twist.txt")
#     file: oliver_twist.txt
#     19191 lines 160999 words 916980 characters
#
# (b) Write a function word_frequency(file) which counts how many times each word appears in that book.
#     To make it simple: a word should consist only of English letters (no punctuation marks, hyphens, or quotes).
#     Hint: you should build a dictionary
#     Hint: Use Python string.punctuation to remove punctuation characters from words.
#
#     Test your program by running it on the oliver_twist.txt book (you should get 12733 words!)
#     Try to sort the words by frequency (from most frequent to least frequent).
#
#     >>> word_frequency("oliver_twist.txt")
#     730 Oliver
#     303 gentleman
#     288 Fagin
#     53 Twist
#     36 Crackit
#     7 keyhole
#     4 funny
#     ...... (this is of course only a small part: there are 12733 words in this book!)
#
# (c) What is the most frequent three-letter word in this book?
#     How many times does it appear in this book?
#
# (d) How many words occur more than 1000 times?
#     (don't count your output, write a program to find this!)
#---------------------------------------------------------
# ADVANCED SOLUTION
#---------------------------------------------------------
# This solution is based on the advanced re and collections modules:
#
#     http://docs.python.org/2/library/collections.html
#     http://docs.python.org/2/library/re.html
#
# The collections module contains a Counter class which does all the work for
# you, so it is an unfair solution ...
# re - regular expressions module: # this is an advanced module which we may not have time to do in class # but you are encouraged to read about it in: # # http://docs.python.org/2/library/re.html # import string, os, re from collections import Counter # Part (d) def most_common_words(textfile, n): "Find the n most common words in textfile" f = open(textfile) words = re.findall('\w+', f.read().lower()) f.close() counter = Counter(words) return counter.most_common(n) # EXAMPLE USAGE #---------------------------------------------- def test4(): print "Testing the short solution: most_common_words" file = "D:/BRAUDE/PYTHON/Projects/FILES/robinson_crusoe.txt" for item in most_common_words(file, 90): # word, frequency = item print "%-8s %d" % item #---------------------------------------------- if __name__ == "__main__": test4()