#!/usr/bin/env python
#======================================================================
# Count the number of occurrences of each word in a file
# Clean up the word (lowercase it, eliminate special characters) 
# Display is ordered by highest number of occurrences
#======================================================================
import cgi                                  #cgi 
import cgitb                                #cgi with traceback error handling
import re                                   #for regular expression
cgitb.enable()                              #turn on cgitb traceback reporting for this script

print("Content-Type: text/html \n")          #required http response header (w/ extra line)

defaultFile = "/home/sultans/web/bigdata/demo/hadoop/data/book1.txt"     #default input file

elements = cgi.FieldStorage()                           #obtain the http parameters
#getfirst (not getvalue): getvalue returns a list when 'file' is sent more than once,
#which would break the string concatenation and open() done with filename later on
filename = elements.getfirst('file') or defaultFile     #if nothing entered, replace with default file

#=================================================================
#display: display the html page
#=================================================================
def display():
    """Print the top of the Word Count page and its file-name form.

    Reads the module-level ``filename`` and writes, to standard output, the
    page heading followed by a GET form whose text box is pre-filled with
    that name. The form, body and html elements are left open by this
    function.

    ``filename`` may come straight from the http request, so it is
    HTML-escaped (quotes included) before being placed inside the
    single-quoted ``value`` attribute; otherwise a name containing ``'``
    or ``>`` could break out of the attribute and inject markup.
    """
    import html                     #local import keeps this function self-contained

    print("""
        <html>
        <head>
        <title>Word Count</title>    
        <body bgcolor=lightyellow>
        <h2>Count the Number of Occurences for each Word<br>
        <font size=4><i><span title='Changed to lower case, and punctuation eliminated'>Words are normalized</span>, 
                                     and displayed by highest occurrence</i></font></h2>
        <form action=wordCount_web2.py method=GET>
        File Name <input type=text name=file size=50
    """)
    #quote=True also escapes ' and ", which matters because the attribute is quoted with '
    print("value='" + html.escape(filename, quote=True) + "'>")
    print("<input type=submit value=Count>")
    print("<br><br>")

#=================================================================
# Count the number of occurrences of each word in a file
#=================================================================
def count():
    """Print an HTML table of word frequencies for the file named by ``filename``.

    Every whitespace-separated token of the file is lower-cased and stripped
    of all characters other than a-z; tokens that end up empty are dropped.
    The remaining words are tallied and printed to standard output as table
    rows, highest count first and alphabetically within the same count.
    Words that occur only once are not listed; a closing message row stands
    in for them.

    A file that contains no alphabetic words produces just the table header
    and the closing message row (no empty "0 occurrences" row).

    Raises whatever ``open`` raises when the file cannot be read.
    """
    from collections import Counter     #local import keeps this function self-contained

    #NOTE(review): filename is taken from an http parameter at module level, so any
    #file readable by the web server can be requested; consider restricting the path.
    nonAlpha = re.compile("[^a-zA-Z]")  #compiled once, reused for every token
    tally    = Counter()                #word -> number of occurrences

    with open(filename) as source:                      #open the file for reading
        for line in source:                             #stream the file line by line
            for token in line.split():                  #split the line into tokens
                word = nonAlpha.sub("", token.lower())  #lowercase, drop non alpha characters
                if word:                                #ignore tokens with no letters left
                    tally[word] += 1

#== Re-sort ======================================

    #highest count first; equal counts stay in alphabetical order
    ranked = sorted(tally.items(), key=lambda item: (-item[1], item[0]))

#== Display ======================================

    print("<table width=100 border=1 bgcolor=white>")
    print("<tr bgcolor=lightgray><th>Word<th>Count")

    for word, occurrences in ranked:                    #for every (word, count) pair
        if occurrences < 2:                             #  sorted by count, so the rest occur once
            break
        print("<tr><td>%s <th> %d" % (word, occurrences))   #  print the word & count

    print("<tr><td><i>Eliminated words < 2 occurrences </i><th>1")        #display message
    print("</table>")

#=================================================================================

display()                             #print the page heading and the file-name form
count()                               #print the word/count table for the selected file



#=== link to see the python code =================================================
import os, sys
sys.path.insert(0,'/home/sultans/web/python/demo')    #search this directory first; presumably where zCode lives
import zCode                          #project-local helper, not visible here; per the original note it displays the Python code
fileName = os.path.abspath(__file__)  #absolute path of this script (not the 'filename' input file)
zCode.display(fileName)               #pass this script's path to zCode.display
#=================================================================================