Data preprocessor generator #23

Merged
merged 20 commits on Oct 12, 2020

20 commits
4905e1d
Merge pull request #13 from umd-fire-coml/FTR-environment-checker
chromestone Sep 22, 2020
17500ea
implemented data downloader and updated .gitignore file
KeithMaxwellZ Sep 25, 2020
cecada1
implemented data parser
KeithMaxwellZ Sep 25, 2020
c847fad
Merge pull request #14 from umd-fire-coml/data-checker-downloader
KeithMaxwell-Z Sep 25, 2020
4cdd2a8
updated dictionary and dataset downloader to make it more organized a…
KeithMaxwellZ Sep 26, 2020
d8c2d0c
updated dictionary and dataset downloader to make it more organized a…
KeithMaxwellZ Sep 26, 2020
5c5c946
updated .gitignore file
KeithMaxwellZ Sep 26, 2020
4c444ce
rephrased comments to make it nicer
KeithMaxwellZ Sep 26, 2020
53a6220
rephrased comments to make it nicer
KeithMaxwellZ Sep 26, 2020
9451d03
fixed a bug with dir path
KeithMaxwellZ Sep 26, 2020
6a0fe51
updated .gitignore file
KeithMaxwellZ Sep 26, 2020
6b9feed
implemented dataset/dictionary file validation script and fixed a bug…
KeithMaxwellZ Sep 29, 2020
637987c
Merge pull request #20 from umd-fire-coml/data-checker--validation-sc…
jzchan132 Sep 30, 2020
0a1757d
Update README.md
jzchan132 Sep 30, 2020
7d0e6d5
Merge pull request #21 from umd-fire-coml/FTR-data-checker
jzchan132 Sep 30, 2020
dd62dde
Create data_generator.py
jzchan132 Sep 30, 2020
cb5afa9
Update data_generator.py
jzchan132 Sep 30, 2020
bf69b7a
Adding init and skeleton structure
umcptasa Oct 7, 2020
df91f96
Finished implementation for data_generator (Implemented "generate_bat…
umcptasa Oct 12, 2020
09b1d7c
Merge branch 'FTR-data-preprocessor' into data-preprocessor-generator
jzchan132 Oct 12, 2020
2 changes: 2 additions & 0 deletions .gitignore
@@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/
/data/
/data.zip
4 changes: 4 additions & 0 deletions ModuleException.py
@@ -0,0 +1,4 @@
class ModuleException(Exception):
def __init__(self, msg: str):
super().__init__(msg)
self.msg = msg
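For illustration (not part of the diff), the exception behaves like any other Python exception; the message here is an arbitrary example:

    from ModuleException import ModuleException

    try:
        raise ModuleException("config file is missing")
    except ModuleException as err:
        print(err.msg)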
3 changes: 2 additions & 1 deletion README.md
@@ -1 +1,2 @@
# 2020-Text-Generation
# 2020-Text-Generation
A series of products beginning from predicting the rest of a word given a few characters.
24 changes: 24 additions & 0 deletions data_checker_script.py
@@ -0,0 +1,24 @@
import xml.etree.ElementTree as ET


def data_parser():
    """ Parse the NYSK xml dataset and print each document's id and title """
    path = "data/nysk.xml"
    with open(path, "r", encoding="utf-8") as f:
        doc = ET.ElementTree(file=f)

    r = doc.getroot()
    # Print the total number of documents in the dataset
    print(len(r))

    for item in r:
        # Extract each field; only the id and title are printed below
        news_id = item.findtext('docid')
        source = item.findtext('source')
        url = item.findtext('url')
        title = item.findtext('title')
        summary = item.findtext('summary')
        text = item.findtext('text')

        print(f"{news_id} -- {title}")


if __name__ == "__main__":
data_parser()
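For context, here is a minimal stand-in for the XML layout the checker assumes, inferred from the fields it reads; the root tag name and field values are assumptions, since the real file comes from the UCI archive:

    import xml.etree.ElementTree as ET

    # Hypothetical miniature of data/nysk.xml
    sample = """
    <nysk>
      <document>
        <docid>1</docid>
        <source>example</source>
        <url>http://example.com/article</url>
        <title>Sample title</title>
        <summary>Sample summary</summary>
        <text>Sample body text.</text>
      </document>
    </nysk>
    """

    root = ET.fromstring(sample)
    for item in root:
        print(f"{item.findtext('docid')} -- {item.findtext('title')}")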
35 changes: 35 additions & 0 deletions data_generator.py
@@ -0,0 +1,35 @@
import numpy as np
import xml.etree.ElementTree as ET


class DataGenerator:
    def __init__(self, doc_ids, batch_size=32, shuffle=True):
        """ Initialization """
        self.batch_size = batch_size
        self.doc_ids = doc_ids
        self.shuffle = shuffle
        self.on_epoch_end()

    def generate_batch(self):
        # Create an ElementTree from the xml file
        data_tree = ET.parse('nysk.xml')
        data_root = data_tree.getroot()

        # Open the output file for writing
        word_set = open("WordDataset.txt", "w")

        # Find all document tags and their corresponding text contents
        for description in data_root.findall('document'):
            doc_text = description.find('text').text
            # Split the document's text content into individual words
            split_words = doc_text.split()
            for word in split_words:
                word_set.write(word + "\n")
        word_set.close()

    def on_epoch_end(self):
        # Reshuffle the document ids between epochs when requested
        if self.shuffle:
            np.random.shuffle(self.doc_ids)
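A quick usage sketch of the generator above; the doc_ids values are placeholders, and nysk.xml must already sit in the working directory:

    # Illustrative only: doc_ids here are arbitrary integers
    gen = DataGenerator(doc_ids=list(range(100)), batch_size=32, shuffle=True)
    gen.generate_batch()   # writes one word per line to WordDataset.txt
    gen.on_epoch_end()     # reshuffles doc_ids between epochs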
131 changes: 131 additions & 0 deletions eng_dictionary.py
@@ -0,0 +1,131 @@
from enum import Enum, unique
from ModuleException import ModuleException


@unique
class word_class(Enum):
""" Enums for tracking all word classes """
N = "Noun"
VERB = "Verb"
ADJ = "Adjective"
ADV = "Adverb"
P = "Pronoun"
PP = "Preposition"
CONJ = "Conjunction"

def __str__(self):
return self.value

def __repr__(self):
return self.value

def __eq__(self, other):
return self.value == other.value


# The class for a word, stores the word itself and the class(es) it belongs to
class e_word:
"""
The class that stores each English word

Args:
word: The string that stores actual word
wc: The list that stores the possible word class(es) the word belongs to
"""
def __init__(self, w: str):
self.word = w
self.wc = []
self.__dict__ = {"word": self.word, "wc": self.wc}

    def __len__(self):
        """ Return the length of the word """
        return len(self.word)

    def __getitem__(self, item):
        """ Return the letter(s) at the given index or slice """
        return self.word[item]

    def __str__(self):
        """ Override the default __str__ method """
        return f"[{self.word}, {self.wc}]"

    def __repr__(self):
        """ Override the default __repr__ method """
        return f"[{self.word}, {self.wc}]"

def add_wc(self, wc: list):
""" Add specific word classes to this word """
for i in wc:
# Check if the class already exists
if i not in self.wc:
self.wc.append(i)

# Overwrites its __dict__ property for easier serialization
self.__dict__["wc"] = self.wc


class en_dict:
    """
    The English dictionary class, stored as a letter trie

    Attributes:
        d: the nested dict that stores all the words
    """
def __init__(self):
self.d = {}

def add_word(self, word):
""" Add a word into the dictionary """
if type(word) is e_word:
self.__add_to_dict(self.d, word, 0)
elif type(word) is list:
for i in word:
en_dict.__add_to_dict(self.d, i, 0)

@staticmethod
def __add_to_dict(present: dict, temp_word: e_word, pos: int):
if len(temp_word) == pos:
if "word" in present.keys():
present["word"].add_wc(temp_word.wc)
else:
present["word"] = temp_word
else:
if temp_word[pos] in present.keys():
en_dict.__add_to_dict(present[temp_word[pos]], temp_word, pos + 1)
else:
t = {"word": temp_word}
for i in range(1, len(temp_word) - pos):
t = {temp_word[-i]: t}
present[temp_word[pos]] = t

    def lookup_word(self, target: str):
        """ Look up a word to see if it's in the dictionary """
return en_dict.__lookup(self.d, target, 0)

@staticmethod
def __lookup(present: dict, word: str, pos: int):
if pos == len(word):
if "word" in present.keys():
return True
else:
return False
else:
if not word[pos].isalpha():
return False
if word[pos] in present.keys():
return en_dict.__lookup(present[word[pos]], word, pos + 1)
else:
return False

def add_all(self, other):
""" Add the words from another en_dict object into this one """
if type(other) is en_dict:
self.__add_from_other(other.d)
else:
raise ModuleException("The argument should be an en_dict object!")

def __add_from_other(self, other: dict):
for i in other.keys():
if i == "word":
self.add_word(other[i])
else:
self.__add_from_other(other[i])
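A short, self-contained sketch of how these classes fit together; the word and classes below are arbitrary examples:

    from eng_dictionary import en_dict, e_word, word_class

    d = en_dict()
    w = e_word("run")
    w.add_wc([word_class.N, word_class.VERB])
    d.add_word(w)

    print(d.lookup_word("run"))   # True
    print(d.lookup_word("ru"))    # False: no complete word ends at this node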
2 changes: 1 addition & 1 deletion environment.yml
@@ -7,4 +7,4 @@ dependencies:
- numpy
- matplotlib
- tqdm
- gensim
- gensim
172 changes: 172 additions & 0 deletions file_downloader.py
@@ -0,0 +1,172 @@
from bs4 import BeautifulSoup as bs
import requests
import re
import tqdm
import string
import zipfile
import os
import pickle

from eng_dictionary import en_dict, e_word, word_class


def fetch(letter: str, base_dict=None, err_list=None):
    """ Fetch the dictionary entries for the letter provided

    Args:
        letter: All words starting with this letter will be downloaded
        base_dict: The base en_dict object to add words into (a fresh one is created when omitted)
        err_list: The list used to collect all words that don't have a recognized word class

    Returns:
        The base en_dict object and the error list

    """
    # Create fresh objects per call to avoid the mutable-default-argument pitfall
    if base_dict is None:
        base_dict = en_dict()
    if err_list is None:
        err_list = []

# Get the HTML response
url = f"http://www.mso.anu.edu.au/~ralph/OPTED/v003/wb1913_{letter}.html"
r = requests.get(url)
s = bs(r.text, "html.parser")

# Analyze the document and find specific contents using regex
for i in tqdm.tqdm(s.find_all("p")):
prop_match = re.search(r"<i>.*</i>", str(i))
word_match = re.search(r"<b>.*</b>", str(i))
if prop_match is None or word_match is None:
continue

# Get the actual content, without HTML tags
prep = get_content(prop_match.group())
word = get_content(word_match.group())

        # Analyze the possible classes of the word and collect them in a list;
        # match tags as whole tokens so e.g. "adv." doesn't also match "v."
        tags = prep.split()
        t_list = []
        if "n." in tags:
            t_list.append(word_class.N)
        if "v." in tags:
            t_list.append(word_class.VERB)
        # OPTED marks adjectives as "a."
        if "adj." in tags or "a." in tags:
            t_list.append(word_class.ADJ)
        if "adv." in tags:
            t_list.append(word_class.ADV)
        if "obj." in tags:
            t_list.append(word_class.P)
        if "prep." in tags:
            t_list.append(word_class.PP)
        if "conj." in tags:
            t_list.append(word_class.CONJ)

        # If no recognized class was found, record the word in the error list
if len(t_list) == 0:
err_list.append([word, prep])
continue
# Otherwise create a new e_word object to store it
else:
t_word = e_word(word)
t_word.add_wc(t_list)

base_dict.add_word(t_word)

return base_dict, err_list


def get_content(raw: str):
    """ Strip the surrounding HTML tags (e.g. <i>...</i>) from a matched string """
    return raw[3: len(raw) - 4]


def dict_downloader(overwrite=False):
    """ The downloader for the English dictionary

    Args:
        overwrite: When True, the function overwrites the original file if the file exists,
            otherwise it skips that file

    Returns:
        The error list collected across all letters (pickle files are written as a side effect)
    """
e = None
data_dir = "data/en_dictionary"

# Create the dir if not exist
if not os.path.isdir(data_dir):
os.mkdir(data_dir)

# iterate through all letters and download the dictionary
for s in string.ascii_lowercase:
f_name = f"{data_dir}/dict_{s}.pickle"
print(f"Present letter: {s}")

# Check if file exists
if not os.path.isfile(f_name) or overwrite:
temp, e = fetch(s, err_list=e)

with open(f_name, "wb") as f:
pickle.dump(temp, f)
else:
print(f"Dict pickle file '{f_name}' already exist")
return e


def dataset_downloader(overwrite=False):
    """
    Args:
        overwrite: When True, the function overwrites the original file if the file exists,
            otherwise it skips that file

    Returns:
        Doesn't return anything
    """
file_name = "data.zip"
extract_path = "data/dataset/"

if not os.path.isdir(extract_path):
os.mkdir(extract_path)

if not os.path.isfile(file_name):
print("Downloading")

# Get the response and download the file
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00260/nysk.xml.zip"
r = requests.get(url)

with open(file_name, "wb") as f:
f.write(r.content)
print("Downloading finished, starts extracting")
else:
print("Compressed file already exists, starts extracting")

    # Check if the extracted dataset already exists (the zip holds nysk.xml)
    if not os.path.isfile(f"{extract_path}nysk.xml") or overwrite:
        zf = zipfile.ZipFile(file_name)
        if not os.path.isdir(extract_path):
            os.mkdir(extract_path)

        for n in zf.namelist():
            zf.extract(n, extract_path)
        zf.close()
print("Finished")
else:
print("Dataset already exists")


def downloader():
    """ Main method for downloading all required files """
if not os.path.isdir("data"):
os.mkdir("data")

error_file_name = "data/en_dictionary/error_list.pickle"

error = dict_downloader()

with open(error_file_name, "wb") as f:
pickle.dump(error, f)

dataset_downloader()
return


if __name__ == "__main__":
downloader()
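A minimal sketch of consuming the downloader's output, assuming the per-letter pickles written by dict_downloader() are in place and eng_dictionary is importable:

    import pickle

    # Load the pickled trie for words starting with "a" and test a lookup
    with open("data/en_dictionary/dict_a.pickle", "rb") as f:
        letter_a = pickle.load(f)  # an en_dict instance

    print(letter_a.lookup_word("apple"))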