#!/usr/bin/env python
#=================================================================
# Count the number of occurrences of each word in a file
# Algorithm 1 - Sort all words, count consecutive equal words
# Algorithm 2 - Use a dictionary to keep count of words 
#=================================================================
import cgi                                  #cgi 
import cgitb                                #cgi with traceback error handling
from   timeit import default_timer			#used to time the process
cgitb.enable()                              #show formatted tracebacks in the browser on uncaught errors

#The header must be the first thing written to stdout.  The "\n" inside the
#string plus the newline print() adds give the blank line that ends the headers.
print("Content-Type: text/html \n")          #required http response header (w/ extra line)

defaultFile = "/home/sultans/web/bigdata/demo/hadoop/data/book1.txt"     #default input file

elements = cgi.FieldStorage()                           #obtain the http parameters
#NOTE(review): 'file' is taken from the request unchanged; display() echoes it
#into the page and read_file() passes it to open(), so any path readable by
#the web server process can be requested.
filename = elements.getvalue('file') or defaultFile     #if nothing entered, replace with default file   

#=================================================================
#display the html page
#=================================================================
def display():
    """Print the opening HTML of the page and the file-name form.

    The current value of the module-level ``filename`` is shown in the text
    box.  That value can come straight from the ``file`` request parameter,
    so it is HTML-escaped (quotes included) before being written into the
    single-quoted ``value`` attribute.  Nothing is returned.
    """
    import html                     #local import: escape() for the echoed value

    print("""
        <html>
        <head>
        <title>Word Count</title>    
        </head>
        <body bgcolor=lightyellow>
        <h2>Count the Number of Occurences for each Word</h2>
        <form action=wordCount_web.py method=GET>
        File Name <input type=text name=file size=50
    """)
    #quote=True also escapes ' and ", so the value cannot close the attribute
    print("value='" + html.escape(filename, quote=True) + "'>")
    print("<input type=submit value=Count>")
    print("</form>")                #close the form; the result tables follow it
    print("<br><br>")

#=================================================================
# read a file and create an array of words
#=================================================================
def read_file():
    """Load every whitespace-separated word of the input file.

    Reads the file named by the module-level ``filename`` and stores its
    words, in file order, in the module-level list ``allWords``.
    """
    global allWords

    allWords = []                           #start from an empty word list

    with open(filename) as source:          #the file is closed when this block ends
        text = source.read()                #whole file as a single string

    #split() with no argument breaks on any run of whitespace, newlines
    #included, so splitting the whole text gives the same words as
    #splitting it line by line
    allWords.extend(text.split())

#=================================================================
# Count the number of occurrences of each word in an array
# by sorting the words array, and counting consecutive words
#=================================================================
def count_words():
    """Print an HTML table of word counts using the sort-and-scan algorithm.

    Sorts the module-level ``allWords`` list in place, then counts each run
    of consecutive equal words and prints one word/count row per distinct
    word, in sorted order.  An empty list prints only the table header.
    Nothing is returned.
    """
    global allWords
    import html                         #local imports keep this block self-contained
    from itertools import groupby

    allWords.sort( )                    #sort the array.  For desc use a.sort(reverse=True)

    print("<table width=100 border=1 bgcolor=white>")
    print("<tr bgcolor=lightgray><td><b>Word<th>Count")

    #groupby yields one (word, run) pair per stretch of equal neighbours;
    #an empty list yields no pairs, so no stray empty row is printed
    for word, run in groupby(allWords):
        count = sum(1 for _ in run)     #length of the run of equal words
        #words come from the file, so escape &, < and > before writing them as HTML
        print("<tr><td>%s <th> %d" % (html.escape(word, quote=False), count))

    print("</table>")

#==================================================================================
# Count the number of occurrences of each word in an array  (another way)
# by creating a dictionary element for each word, and adding 1 for each occurrence
#==================================================================================
def count_words2():
    """Print an HTML table of word counts using a dictionary tally.

    Counts every word of the module-level ``allWords`` list and prints one
    word/count row per distinct word, in order of first appearance.  The
    list itself is not modified.  Nothing is returned.
    """
    global allWords
    import html                         #local imports keep this block self-contained
    from collections import Counter

    dictionary = Counter(allWords)      #word -> count; keeps first-seen order

    print("<table width=100 border=1 bgcolor=white>")
    print("<tr bgcolor=lightgray><td><b>Word<th>Count")

    for word, count in dictionary.items():
        #words come from the file, so escape &, < and > before writing them as HTML
        print("<tr><td>%s <th> %d" % (html.escape(word, quote=False), count))

    print("</table>")


#=================================================================================

#Page body, in output order: the form first, then the result table.
display()                       #print the page header and the file-name form
read_file()                     #fill allWords from the chosen file (not timed)

#Only the counting call sits between the two timer reads; because the
#counting functions print their table themselves, that output is timed too.
start = default_timer()
#count_words()                  #Algorithm 1 (sort and scan) - swap with the line below to compare
count_words2()                  #Algorithm 2 (dictionary)
end   = default_timer()

print("<br>")
print("Process took ", end-start, " seconds")    #elapsed time of the counting step



#=== link to see the python code =================================================
import os, sys
sys.path.insert(0,'/home/sultans/web/python/demo')   #make the demo directory importable (searched first)
#zCode is a project module that is not visible here; per the original comment
#it provides a function that displays the Python code - TODO confirm.
import zCode                          #import func to display the Python code
#fileName (capital N) is this script's own path, distinct from the
#'filename' global that holds the input file to count.
fileName = os.path.abspath(__file__)  #get absolute file name 
zCode.display(fileName)               #call it
#=================================================================================