"""Word-frequency counter for a plain-text file.

Reads a text file, lower-cases and tokenises it, counts how often each word
occurs, and reports the 20 most frequent words.  Two reports are available:

* ``main`` prints a plain ``word count`` listing over letters-only tokens.
* ``word_frequency`` drops common English words and prints a ``#`` histogram.
"""

import re
from collections import Counter

DEFAULT_PATH = 'sample.txt'
TOP_WORDS = 20
BAR_WIDTH = 50

# Anything that is neither an ASCII letter nor whitespace.
_NON_LETTER = re.compile(r'[^A-Za-z\s]')
# Anything that is not an ASCII letter or digit (whitespace included).
_NON_ALNUM = re.compile(r'[^A-Za-z0-9]')

# Words too common to be interesting in a frequency report.  A frozenset
# gives constant-time membership tests while filtering.
COMMON_WORDS = frozenset([
    'a', 'able', 'about', 'across', 'after', 'all', 'almost',
    'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
    'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did',
    'do', 'does', 'either', 'else', 'ever', 'every', 'for', 'from', 'get',
    'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how',
    'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just',
    'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must',
    'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on',
    'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say', 'says',
    'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the',
    'their', 'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to',
    'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when',
    'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
    'would', 'yet', 'you', 'your', 's',
])


def get_text(path=DEFAULT_PATH, letters_only=False):
    """Read *path* and return its words as a list of lower-case strings.

    Args:
        path: Text file to read.
        letters_only: When true, every character that is not an ASCII letter
            or whitespace is deleted (``"it's"`` -> ``"its"``, digits vanish).
            When false, every non-alphanumeric character becomes a word
            separator (``"it's"`` -> ``"it"``, ``"s"``; digits are kept).

    Returns:
        The words in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Non-ASCII characters are discarded below, so undecodable bytes can be
    # replaced instead of aborting the whole run.
    with open(path, encoding='utf-8', errors='replace') as handle:
        text = handle.read()
    if letters_only:
        text = _NON_LETTER.sub('', text)
    else:
        text = _NON_ALNUM.sub(' ', text)
    # split() already treats any run of whitespace as a single separator, so
    # no extra whitespace-collapsing pass is needed.
    return text.lower().split()


def ignore_words(file):
    """Return the words of *file* that are not in ``COMMON_WORDS``.

    Args:
        file: List of words, as returned by ``get_text``.  ``None`` falls
            back to reading the default file.

    Returns:
        A new list with the common words removed, order preserved.
    """
    if file is None:
        file = get_text()
    return [word for word in file if word not in COMMON_WORDS]


def count_text(a_list):
    """Return a dict mapping each word in *a_list* to its occurrence count.

    Keys appear in first-seen order, which keeps later tie-breaking stable.
    """
    return dict(Counter(a_list))


def _ranked(counts, limit=TOP_WORDS):
    """Return up to *limit* ``(word, count)`` pairs, highest count first.

    The sort is stable, so words with equal counts keep their input order.
    A *limit* of ``None`` returns every pair.
    """
    pairs = sorted(dict(counts).items(), key=lambda pair: pair[1],
                   reverse=True)
    return pairs[:limit]


def frequency(wordcount):
    """Return a dict of the 20 most frequent words, highest count first."""
    return dict(_ranked(wordcount))


def top_20(a_dict):
    """Return the 20 most frequent words as a list of ``(word, count)``."""
    return _ranked(a_dict)


def print_results(a_dict):
    """Print one ``word count`` line per entry, highest count first."""
    for word, count in _ranked(a_dict, limit=None):
        print(word + " " + str(count))


def normalize(a_list, normalize_to=BAR_WIDTH):
    """Return the count represented by one histogram character.

    Args:
        a_list: ``(word, count)`` pairs; the first pair is the reference.
        normalize_to: Bar length the first pair's count should map to.

    Returns:
        The first pair's count divided by *normalize_to*.

    Raises:
        IndexError: If *a_list* is empty.
    """
    return a_list[0][1] / normalize_to


def tables(a_list, normalize_to=BAR_WIDTH):
    """Print a text histogram of ``(word, count)`` pairs.

    Each bar is scaled so that the first pair's count is *normalize_to*
    characters wide.  Nothing is printed for an empty list.
    """
    if not a_list:
        return
    reference = int(a_list[0][1])
    for word, count in a_list:
        # Integer arithmetic: a str can only be repeated an int number of
        # times, and float floor division can drop a unit (7 // (7/50) == 49).
        if reference > 0:
            length = int(int(count) * normalize_to // reference)
        else:
            length = 0
        print(word.ljust(10), "|", "#" * length)


def main(path=DEFAULT_PATH):
    """Print the 20 most frequent letters-only words in *path*."""
    words = get_text(path, letters_only=True)
    print_results(frequency(count_text(words)))


def word_frequency(path=DEFAULT_PATH):
    """Print a histogram of the 20 most frequent uncommon words in *path*."""
    words = ignore_words(get_text(path))
    tables(top_20(count_text(words)))


if __name__ == '__main__':
    word_frequency()