import string

# Based on a class by Professor Norm Matloff, Dept. of Computer Science, University of California at Davis
# http://heather.cs.ucdavis.edu/~matloff/py.html
# http://heather.cs.ucdavis.edu/~matloff/Python/PLN/FastLanePython.pdf

# Usage:
#   oliver = Textfile("c:/braude/python/projects/files/oliver_twist.txt")
#     => Create a Textfile object
#
#   oliver.nlines
#     => Get number of lines in the file
#
#   oliver.nwords
#     => Get number of words in the file
#
#   oliver.nchars
#     => Get number of characters in the file
#
#   oliver.freq['Fagin']
#     => frequency of the word 'Fagin' in the text file
#
#   oliver.find('Fagin')
#     => print all lines in which the word 'Fagin' appears
#
#   oliver.most_common_words(n)
#     => return a list of the n most common words in the file, as (word, count) pairs in ascending order of count
#

class Textfile:
    """Line, word and character statistics for a text file.

    The whole file is read into memory once, at construction time.

    Public attributes:
        name   -- the filename that was passed to the constructor
        lines  -- list of all lines, each keeping its trailing newline
        nlines -- number of lines
        nwords -- number of whitespace-separated tokens (non-alphabetic
                  tokens and duplicates are counted too)
        nchars -- number of characters, newlines included
        freq   -- dict mapping each purely alphabetic word (after stripping
                  surrounding punctuation) to its number of occurrences;
                  keys are case-sensitive
    """

    def __init__(self, filename):
        """Read `filename` and compute all statistics.

        Raises whatever `open` raises when the file cannot be read.
        """
        # `with` guarantees the file is closed even if reading fails
        with open(filename, "r") as fp:
            self.lines = fp.readlines()  # list of all lines (public member)
        self.name = filename             # name of file (public member)
        self.nlines = len(self.lines)    # number of lines (public member)
        self.nwords = 0                  # number of words (public member)
        self.nchars = 0                  # number of characters (public member)
        self.freq = dict()               # word frequency dictionary (public member)
        self._prepare_data()             # fill nwords, nchars and freq

    def words(self):
        """Return the sorted list of distinct purely alphabetic words."""
        return sorted(self.freq)

    def find(self, target):
        """Print every line that contains `target` as a substring.

        Lines keep their trailing newline, so printed lines are separated
        by a blank line.
        """
        for line in self.lines:
            if target in line:
                # single-argument form prints the same on Python 2 and 3
                print(line)

    def most_common_words(self, n):
        """Return the `n` most common words as (word, count) pairs.

        The pairs are in ascending order of count (the most common word is
        last); ties are ordered alphabetically.  Fewer than `n` pairs are
        returned when the file has fewer distinct words, and an empty list
        is returned when `n` is zero or negative.
        """
        # Guard needed because ranked[-0:] would be the *whole* list
        if n <= 0:
            return []
        ranked = sorted((count, word) for word, count in self.freq.items())
        return [(word, count) for count, word in ranked[-n:]]

    # Private method! Should not be exposed to clients
    # Makes all necessary calculations for the word and character counts
    # and at the same time builds the word frequency dictionary
    def _prepare_data(self):
        """Accumulate nchars, nwords and freq from self.lines."""
        for line in self.lines:
            self.nchars += len(line)
            tokens = line.split()
            self.nwords += len(tokens)
            for token in tokens:
                # drop leading/trailing punctuation such as quotes and commas
                word = token.strip(string.punctuation)
                # only purely alphabetic words enter the frequency table
                if word.isalpha():
                    self.freq[word] = self.freq.get(word, 0) + 1


######## TESTING THE CLASS ######################

def test1():
    """Interactive demo: print statistics for a hard-coded copy of Oliver Twist.

    Pauses for <Enter> before each of the longer listings.
    """
    def show(label, value):
        # prints label and value separated by a single space
        print("%s %s" % (label, value))

    def pause():
        raw_input("Type <Enter> to continue ...")

    path = "D:/WORKSPACE/oliver_twist.txt"
    book = Textfile(path)

    show("number of lines:", book.nlines)
    show("number of words:", book.nwords)
    show("number of chars:", book.nchars)
    show("The word 'Fagin' frequency:", book.freq['Fagin'])

    searches = (
        ("Print all lines in which the word 'Twist' appears", 'Twist'),
        ("Print all lines in which the word 'Fagin' appears", 'Fagin'),
    )
    for banner, word in searches:
        print(banner)
        pause()
        book.find(word)

    print("Print 12 most common words")
    pause()
    print(book.most_common_words(12))

# Run the interactive demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test1()