Data preprocessor generator #23

Merged
merged 20 commits on Oct 12, 2020

20 commits
4905e1d
Merge pull request #13 from umd-fire-coml/FTR-environment-checker
chromestone Sep 22, 2020
17500ea
implemented data downloader and updated .gitignore file
KeithMaxwellZ Sep 25, 2020
cecada1
implemented data parser
KeithMaxwellZ Sep 25, 2020
c847fad
Merge pull request #14 from umd-fire-coml/data-checker-downloader
KeithMaxwell-Z Sep 25, 2020
4cdd2a8
updated dictionary and dataset downloader to make it more organized a…
KeithMaxwellZ Sep 26, 2020
d8c2d0c
updated dictionary and dataset downloader to make it more organized a…
KeithMaxwellZ Sep 26, 2020
5c5c946
updated .gitignore file
KeithMaxwellZ Sep 26, 2020
4c444ce
rephrased comments to make it nicer
KeithMaxwellZ Sep 26, 2020
53a6220
rephrased comments to make it nicer
KeithMaxwellZ Sep 26, 2020
9451d03
fixed a bug with dir path
KeithMaxwellZ Sep 26, 2020
6a0fe51
updated .gitignore file
KeithMaxwellZ Sep 26, 2020
6b9feed
implemented dataset/dictionary file validation script and fixed a bug…
KeithMaxwellZ Sep 29, 2020
637987c
Merge pull request #20 from umd-fire-coml/data-checker--validation-sc…
jzchan132 Sep 30, 2020
0a1757d
Update README.md
jzchan132 Sep 30, 2020
7d0e6d5
Merge pull request #21 from umd-fire-coml/FTR-data-checker
jzchan132 Sep 30, 2020
dd62dde
Create data_generator.py
jzchan132 Sep 30, 2020
cb5afa9
Update data_generator.py
jzchan132 Sep 30, 2020
bf69b7a
Adding init and skeleton structure
umcptasa Oct 7, 2020
df91f96
Finished implementation for data_generator (Implemented "generate_bat…
umcptasa Oct 12, 2020
09b1d7c
Merge branch 'FTR-data-preprocessor' into data-preprocessor-generator
jzchan132 Oct 12, 2020
2 changes: 2 additions & 0 deletions .gitignore
@@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/
/data/
/data.zip
4 changes: 4 additions & 0 deletions ModuleException.py
@@ -0,0 +1,4 @@
class ModuleException(Exception):
def __init__(self, msg: str):
super().__init__(msg)
self.msg = msg
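For illustration (not part of the diff), the exception behaves like any other Python exception; the message here is an arbitrary example:

    from ModuleException import ModuleException

    try:
        raise ModuleException("config file is missing")
    except ModuleException as err:
        print(err.msg)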
3 changes: 2 additions & 1 deletion README.md
@@ -1 +1,2 @@
# 2020-Text-Generation
# 2020-Text-Generation
A series of products beginning from predicting the rest of a word given a few characters.
24 changes: 24 additions & 0 deletions data_checker_script.py
@@ -0,0 +1,24 @@
import xml.etree.ElementTree as ET


def data_parser():
    """ Parse the NYSK xml dataset and print each document's id and title """
    path = "data/nysk.xml"
    with open(path, "r", encoding="utf-8") as f:
        doc = ET.ElementTree(file=f)

    r = doc.getroot()
    # Print the total number of documents in the dataset
    print(len(r))

    for item in r:
        # Extract each field; only the id and title are printed below
        news_id = item.findtext('docid')
        source = item.findtext('source')
        url = item.findtext('url')
        title = item.findtext('title')
        summary = item.findtext('summary')
        text = item.findtext('text')

        print(f"{news_id} -- {title}")


if __name__ == "__main__":
data_parser()
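For context, here is a minimal stand-in for the XML layout the checker assumes, inferred from the fields it reads; the root tag name and field values are assumptions, since the real file comes from the UCI archive:

    import xml.etree.ElementTree as ET

    # Hypothetical miniature of data/nysk.xml
    sample = """
    <nysk>
      <document>
        <docid>1</docid>
        <source>example</source>
        <url>http://example.com/article</url>
        <title>Sample title</title>
        <summary>Sample summary</summary>
        <text>Sample body text.</text>
      </document>
    </nysk>
    """

    root = ET.fromstring(sample)
    for item in root:
        print(f"{item.findtext('docid')} -- {item.findtext('title')}")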
35 changes: 35 additions & 0 deletions data_generator.py
@@ -0,0 +1,35 @@
import numpy as np
import xml.etree.ElementTree as ET


class DataGenerator:
    def __init__(self, doc_ids, batch_size=32, shuffle=True):
        """ Initialization """
        self.batch_size = batch_size
        self.doc_ids = doc_ids
        self.shuffle = shuffle
        self.on_epoch_end()

    def generate_batch(self):
        # Create an ElementTree from the xml file
        data_tree = ET.parse('nysk.xml')
        data_root = data_tree.getroot()

        # Open the output file for writing
        word_set = open("WordDataset.txt", "w")

        # Find all document tags and their corresponding text contents
        for description in data_root.findall('document'):
            doc_text = description.find('text').text
            # Split the document's text content into individual words
            split_words = doc_text.split()
            for word in split_words:
                word_set.write(word + "\n")
        word_set.close()

    def on_epoch_end(self):
        # Reshuffle the document ids between epochs when requested
        if self.shuffle:
            np.random.shuffle(self.doc_ids)
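A quick usage sketch of the generator above; the doc_ids values are placeholders, and nysk.xml must already sit in the working directory:

    # Illustrative only: doc_ids here are arbitrary integers
    gen = DataGenerator(doc_ids=list(range(100)), batch_size=32, shuffle=True)
    gen.generate_batch()   # writes one word per line to WordDataset.txt
    gen.on_epoch_end()     # reshuffles doc_ids between epochs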
131 changes: 131 additions & 0 deletions eng_dictionary.py
@@ -0,0 +1,131 @@
from enum import Enum, unique
from ModuleException import ModuleException


@unique
class word_class(Enum):
""" Enums for tracking all word classes """
N = "Noun"
VERB = "Verb"
ADJ = "Adjective"
ADV = "Adverb"
P = "Pronoun"
PP = "Preposition"
CONJ = "Conjunction"

def __str__(self):
return self.value

def __repr__(self):
return self.value

def __eq__(self, other):
return self.value == other.value


# The class for a word, stores the word itself and the class(es) it belongs to
class e_word:
"""
The class that stores each English word

Args:
word: The string that stores actual word
wc: The list that stores the possible word class(es) the word belongs to
"""
def __init__(self, w: str):
self.word = w
self.wc = []
self.__dict__ = {"word": self.word, "wc": self.wc}

    def __len__(self):
        """ Return the length of the word """
        return len(self.word)

    def __getitem__(self, item):
        """ Return the letter(s) at the given index or slice """
        return self.word[item]

    def __str__(self):
        """ Override the default __str__ method """
        return f"[{self.word}, {self.wc}]"

    def __repr__(self):
        """ Override the default __repr__ method """
        return f"[{self.word}, {self.wc}]"

def add_wc(self, wc: list):
""" Add specific word classes to this word """
for i in wc:
# Check if the class already exists
if i not in self.wc:
self.wc.append(i)

# Overwrites its __dict__ property for easier serialization
self.__dict__["wc"] = self.wc


class en_dict:
    """
    The English dictionary class, stored as a letter trie

    Attributes:
        d: the nested dict that stores all the words
    """
def __init__(self):
self.d = {}

def add_word(self, word):
""" Add a word into the dictionary """
if type(word) is e_word:
self.__add_to_dict(self.d, word, 0)
elif type(word) is list:
for i in word:
en_dict.__add_to_dict(self.d, i, 0)

@staticmethod
def __add_to_dict(present: dict, temp_word: e_word, pos: int):
if len(temp_word) == pos:
if "word" in present.keys():
present["word"].add_wc(temp_word.wc)
else:
present["word"] = temp_word
else:
if temp_word[pos] in present.keys():
en_dict.__add_to_dict(present[temp_word[pos]], temp_word, pos + 1)
else:
t = {"word": temp_word}
for i in range(1, len(temp_word) - pos):
t = {temp_word[-i]: t}
present[temp_word[pos]] = t

    def lookup_word(self, target: str):
        """ Look up a word to see if it's in the dictionary """
return en_dict.__lookup(self.d, target, 0)

@staticmethod
def __lookup(present: dict, word: str, pos: int):
if pos == len(word):
if "word" in present.keys():
return True
else:
return False
else:
if not word[pos].isalpha():
return False
if word[pos] in present.keys():
return en_dict.__lookup(present[word[pos]], word, pos + 1)
else:
return False

def add_all(self, other):
""" Add the words from another en_dict object into this one """
if type(other) is en_dict:
self.__add_from_other(other.d)
else:
raise ModuleException("The argument should be an en_dict object!")

def __add_from_other(self, other: dict):
for i in other.keys():
if i == "word":
self.add_word(other[i])
else:
self.__add_from_other(other[i])
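A short, self-contained sketch of how these classes fit together; the word and classes below are arbitrary examples:

    from eng_dictionary import en_dict, e_word, word_class

    d = en_dict()
    w = e_word("run")
    w.add_wc([word_class.N, word_class.VERB])
    d.add_word(w)

    print(d.lookup_word("run"))   # True
    print(d.lookup_word("ru"))    # False: no complete word ends at this node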
2 changes: 1 addition & 1 deletion environment.yml
@@ -7,4 +7,4 @@ dependencies:
- numpy
- matplotlib
- tqdm
- gensim
- gensim
172 changes: 172 additions & 0 deletions file_downloader.py
@@ -0,0 +1,172 @@
from bs4 import BeautifulSoup as bs
import requests
import re
import tqdm
import string
import zipfile
import os
import pickle

from eng_dictionary import en_dict, e_word, word_class


def fetch(letter: str, base_dict=None, err_list=None):
    """ Fetch the dictionary entries for the letter provided

    Args:
        letter: All words starting with this letter will be downloaded
        base_dict: The base en_dict object to add words into (a fresh one is created when omitted)
        err_list: The list used to collect all words that don't have a recognized word class

    Returns:
        The base en_dict object and the error list

    """
    # Create fresh objects per call to avoid the mutable-default-argument pitfall
    if base_dict is None:
        base_dict = en_dict()
    if err_list is None:
        err_list = []

# Get the HTML response
url = f"http://www.mso.anu.edu.au/~ralph/OPTED/v003/wb1913_{letter}.html"
r = requests.get(url)
s = bs(r.text, "html.parser")

# Analyze the document and find specific contents using regex
for i in tqdm.tqdm(s.find_all("p")):
prop_match = re.search(r"<i>.*</i>", str(i))
word_match = re.search(r"<b>.*</b>", str(i))
if prop_match is None or word_match is None:
continue

# Get the actual content, without HTML tags
prep = get_content(prop_match.group())
word = get_content(word_match.group())

        # Analyze the possible classes of the word and collect them in a list;
        # match tags as whole tokens so e.g. "adv." doesn't also match "v."
        tags = prep.split()
        t_list = []
        if "n." in tags:
            t_list.append(word_class.N)
        if "v." in tags:
            t_list.append(word_class.VERB)
        # OPTED marks adjectives as "a."
        if "adj." in tags or "a." in tags:
            t_list.append(word_class.ADJ)
        if "adv." in tags:
            t_list.append(word_class.ADV)
        if "obj." in tags:
            t_list.append(word_class.P)
        if "prep." in tags:
            t_list.append(word_class.PP)
        if "conj." in tags:
            t_list.append(word_class.CONJ)

        # If no recognized class was found, record the word in the error list
if len(t_list) == 0:
err_list.append([word, prep])
continue
# Otherwise create a new e_word object to store it
else:
t_word = e_word(word)
t_word.add_wc(t_list)

base_dict.add_word(t_word)

return base_dict, err_list


def get_content(raw: str):
    """ Strip the surrounding HTML tags (e.g. <i>...</i>) from a matched string """
    return raw[3: len(raw) - 4]


def dict_downloader(overwrite=False):
    """ The downloader for the English dictionary

    Args:
        overwrite: When True, the function overwrites the original file if the file exists,
            otherwise it skips that file

    Returns:
        The error list collected across all letters (pickle files are written as a side effect)
    """
e = None
data_dir = "data/en_dictionary"

# Create the dir if not exist
if not os.path.isdir(data_dir):
os.mkdir(data_dir)

# iterate through all letters and download the dictionary
for s in string.ascii_lowercase:
f_name = f"{data_dir}/dict_{s}.pickle"
print(f"Present letter: {s}")

# Check if file exists
if not os.path.isfile(f_name) or overwrite:
temp, e = fetch(s, err_list=e)

with open(f_name, "wb") as f:
pickle.dump(temp, f)
else:
print(f"Dict pickle file '{f_name}' already exist")
return e


def dataset_downloader(overwrite=False):
    """
    Args:
        overwrite: When True, the function overwrites the original file if the file exists,
            otherwise it skips that file

    Returns:
        Doesn't return anything
    """
file_name = "data.zip"
extract_path = "data/dataset/"

if not os.path.isdir(extract_path):
os.mkdir(extract_path)

if not os.path.isfile(file_name):
print("Downloading")

# Get the response and download the file
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00260/nysk.xml.zip"
r = requests.get(url)

with open(file_name, "wb") as f:
f.write(r.content)
print("Downloading finished, starts extracting")
else:
print("Compressed file already exists, starts extracting")

    # Check if the extracted dataset already exists (the zip holds nysk.xml)
    if not os.path.isfile(f"{extract_path}nysk.xml") or overwrite:
        zf = zipfile.ZipFile(file_name)
        if not os.path.isdir(extract_path):
            os.mkdir(extract_path)

        for n in zf.namelist():
            zf.extract(n, extract_path)
        zf.close()
print("Finished")
else:
print("Dataset already exists")


def downloader():
    """ Main method for downloading all required files """
if not os.path.isdir("data"):
os.mkdir("data")

error_file_name = "data/en_dictionary/error_list.pickle"

error = dict_downloader()

with open(error_file_name, "wb") as f:
pickle.dump(error, f)

dataset_downloader()
return


if __name__ == "__main__":
downloader()
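A minimal sketch of consuming the downloader's output, assuming the per-letter pickles written by dict_downloader() are in place and eng_dictionary is importable:

    import pickle

    # Load the pickled trie for words starting with "a" and test a lookup
    with open("data/en_dictionary/dict_a.pickle", "rb") as f:
        letter_a = pickle.load(f)  # an en_dict instance

    print(letter_a.lookup_word("apple"))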