Merge remote-tracking branch 'origin/main' into fix-model-test-render
GwydionJon committed Sep 5, 2023
2 parents bb792a1 + cce0d14 commit c555644
Showing 22 changed files with 79,802 additions and 636 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -133,3 +133,5 @@ data/
.vscode/settings.json
notebooks/my_model/

# keep test data
!/moralization/data
1 change: 0 additions & 1 deletion moralization/analyse.py
@@ -16,7 +16,6 @@ def _return_span_analyzer(doc_dict: dict) -> SpanAnalyzer:
for doc in doc_dict.values():
# doc.spans.pop("paragraphs", None)
doc.spans.pop("KOMMENTAR", None)
doc.spans.pop("KAT5-Forderung implizit", None)
doc_list.append(doc)

return SpanAnalyzer(doc_list)

40,586 changes: 40,586 additions & 0 deletions moralization/data/large_input_data/Kommentare-pos-RR-neu-optimiert-CK.xmi

Large diffs are not rendered by default.

240 changes: 131 additions & 109 deletions moralization/data_manager.py

Large diffs are not rendered by default.

298 changes: 151 additions & 147 deletions moralization/input_data.py

Large diffs are not rendered by default.

46 changes: 7 additions & 39 deletions moralization/plot.py
@@ -442,64 +442,32 @@ def __init__(self, data_manager):
self.app.layout = html.Div(
[
"Interactive Visualization",
dcc.Dropdown(["all", "train", "test"], value="all", id="dropdown_mode"),
dcc.Dropdown(id="dropdown_span_cat"),
dcc.Markdown(id="markdown_displacy", dangerously_allow_html=True),
]
)

# Define the callback to update the dropdown_span_cat options and default value based on the selected mode
self.app.callback(
Output("dropdown_span_cat", "options"),
Output("dropdown_span_cat", "value"),
Input("dropdown_mode", "value"),
)(self.change_mode)

# Define the callback to update the visualization based on the selected span category and mode
# Define the callback to update the visualization based on the selected span category
self.app.callback(
Output("markdown_displacy", "children"),
Input("dropdown_span_cat", "value"),
State("dropdown_mode", "value"),
)(self.change_span_cat)

def change_mode(self, mode) -> tuple:
"""
Changes the mode of the visualization.
This method retrieves the span categories for the selected mode and returns them as
options for the dropdown_span_cat dropdown.
Args:
mode (str): The selected mode of the visualization.
Returns:
A tuple containing a list of span categories and the default value for the dropdown_span_cat.
"""
span_cats = []

# Retrieve the span categories for the selected mode
for doc in self.data_manager.doc_dict.values():
[span_cats.append(span_cat) for span_cat in list(doc.spans.keys())]

span_cats = list(set(span_cats))
return sorted(span_cats), "sc"

def change_span_cat(self, span_cat, mode) -> str:
def change_span_cat(self, span_cat):
"""
Changes the selected span category.
This method visualizes the selected span category for the selected mode and returns the
This method visualizes the selected span category and returns the
visualized data as an HTML document.
Args:
span_cat (str): The selected span category.
mode (str): The selected mode of the visualization.
Returns:
The visualized data as an HTML document.
"""
# Visualize the selected span category for the selected mode
html_doc = self.data_manager.visualize_data(_type=mode, spans_key=span_cat)
# Visualize the selected span category
html_doc = self.data_manager.visualize_data(spans_key=span_cat)
html_doc = html_doc.replace("\n", " ")
return html_doc

@@ -518,7 +486,7 @@ def run_app(self, port=8052):
)


def visualize_data(doc_dict, style="span", spans_key="sc", use_notebook=False):
def return_displacy_visualization(doc_dict, style="span", spans_key="sc"):
"""Use the displacy class offered by spacy to visualize the current dataset.
use SpacySetup.span_keys to show possible keys or use 'sc' for all.
Expand Down Expand Up @@ -567,5 +535,5 @@ def visualize_data(doc_dict, style="span", spans_key="sc", use_notebook=False):
[doc for doc in doc_dict.values()],
style=style,
options={"spans_key": spans_key},
jupyter=use_notebook,
# jupyter=use_notebook,
)
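
A minimal usage sketch for the renamed helper above. The toy doc_dict, its span contents, and the assumption that the helper hands displacy's markup back to the caller are illustrative and not part of this commit:

import spacy
from moralization.plot import return_displacy_visualization

nlp = spacy.blank("en")
doc = nlp("Moralization is discussed in this sentence.")
# attach one span to the default span group "sc"; characters 0-12 cover "Moralization"
doc.spans["sc"] = [doc.char_span(0, 12, label="example")]
doc_dict = {"doc_1": doc}

# outside a notebook, displacy.render returns the markup as an HTML string;
# with the jupyter argument left out it auto-detects Jupyter and renders inline instead
html_doc = return_displacy_visualization(doc_dict, style="span", spans_key="sc")
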
242 changes: 204 additions & 38 deletions moralization/spacy_data_handler.py
@@ -1,26 +1,199 @@
from spacy.tokens import DocBin
from spacy.tokens import DocBin, Doc
from pathlib import Path
from tempfile import mkdtemp
import datasets
import spacy


class SpacyDataHandler:
"""Helper class to organize and prepare spacy trainings data."""
"""Helper class to organize and prepare spacy train and test data."""

@staticmethod
def docbin_from_dataset(
data_set: datasets.Dataset,
task: str,
data_split: str = "train",
output_dir: Path = None,
overwrite: bool = False,
column_names: list = None,
check_docs: bool = False,
) -> Path:
"""Create a DocBin from a Dataset and save it to disk.
This uses the span begin, span end, and span label columns from the
Dataset. The complication here is that span begin and end are given as
token id, whereas the doc requires the character id. So we need to count
the characters in the sentence and assign accordingly.
Also, each sentence creates its own doc which is then appended to the
overall doc. It is done like this to avoid having to factor in all the
previous character counts from prior sentences. If this becomes too slow
for large corpora, we can think about first parsing the lists and correcting
the character count and then feeding everything into a doc at once.
Args:
data_set (datasets.Dataset): The dataframe to be converted into a DocBin.
task (str): The name of the SpanGroup (task that is targeted in the training).
data_split (str, optional): The split of the data that is exported. Defaults to "train".
output_dir (Path, optional): Path of the output directory where the data is saved, defaults to None.
If None the working directory is used.
overwrite (bool, optional): Whether or not the spacy files should be written
even if the file is already present.
column_names (list, optional): The column names of the feature, label, and span columns.
Defaults to None, in which case it will be set to
["Sentences", "Labels", "Span_begin", "Span_end", "Span_label"].
check_docs (bool, optional): Check all the spans inside the doc and print. Defaults to False.
Returns:
Path: The path to the spacy formatted data."""
nlp = spacy.blank("en")

if not column_names:
column_names = [
"Sentences",
"Labels",
"Span_begin",
"Span_end",
"Span_label",
]
# first create a list from the dataset "Sentences" column for train and test
textlist = SpacyDataHandler._get_list_from_dataset_column(
data_set[data_split], column_names[0]
)
# similarly for the spans
span_begin_list = SpacyDataHandler._get_list_from_dataset_column(
data_set[data_split], column_names[2]
)
span_end_list = SpacyDataHandler._get_list_from_dataset_column(
data_set[data_split], column_names[3]
)
span_label_list = SpacyDataHandler._get_list_from_dataset_column(
data_set[data_split], column_names[4]
)

# we create one doc container for each sentence, then concatenate them together
# it could be faster to first concatenate the lists and adjust the span_begin and span_end
# token numbers, but that is messier, so we do it the slow way first
doclist = []
for i in range(len(textlist)):
# find out if there is an annotation for that sentence
# if yes, create span
if span_begin_list[i] != [0]:
# join the tokens in the sentence together with whitespace and create doc
merged_tokens = " ".join(textlist[i])
doc = nlp(merged_tokens)
# create a new span group "task"
doc.spans[task] = []

# get the character ids for each of the spans in the sentence
for j in range(len(span_begin_list[i])):
(
char_span_begin,
char_span_end,
substring,
) = SpacyDataHandler._get_character_ids(
merged_tokens,
textlist[i],
span_begin_list[i][j],
span_end_list[i][j],
)
# check that this will return the same string as merged_tokens
SpacyDataHandler._check_same_string(
merged_tokens[char_span_begin:char_span_end],
substring,
merged_tokens,
)
span = doc.char_span(
char_span_begin, char_span_end, span_label_list[i][j]
)
# check that span text is the same as substring
SpacyDataHandler._check_same_string(
span.text, substring, merged_tokens
)
doc.spans[task].append(span)
doclist.append(doc)
if check_docs:
SpacyDataHandler._check_docs(doc, task)
# now merge all the docs for each sentence into one DocBin
# the file is named "train" for training and "dev" for testing
# for now, we do not have further filenames
outfilename = "train" if data_split == "train" else "dev"
data_path = SpacyDataHandler.export_training_testing_data(
doclist, outfilename, output_dir, overwrite
)
return data_path
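
A sketch of how docbin_from_dataset might be called. The toy dataset layout is inferred from the method above (token lists, 1-based token indices with Span_end pointing one past the last span token, and [0] marking sentences without annotation); it is an assumption, not the real corpus format:

import datasets
from moralization.spacy_data_handler import SpacyDataHandler

train = datasets.Dataset.from_dict(
    {
        "Sentences": [["I", "like", "green", "apples", "."], ["No", "spans", "here", "."]],
        "Labels": [[0, 0, 1, 1, 0], [0, 0, 0, 0]],
        "Span_begin": [[3], [0]],
        "Span_end": [[5], [0]],
        "Span_label": [["example_label"], [""]],
    }
)
data_set = datasets.DatasetDict({"train": train})

# writes <output_dir>/train.spacy and returns its path;
# the second sentence has Span_begin == [0] and is therefore skipped
train_path = SpacyDataHandler.docbin_from_dataset(
    data_set,
    task="example_task",
    data_split="train",
    output_dir="spacy_out",
    overwrite=True,
)
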

@staticmethod
def _get_list_from_dataset_column(data, column):
# we try this; if it fails, the column names are likely incorrect
try:
mylist = data[column]
except KeyError:
raise ValueError(
"Could not generate a list of the text input, likely the given column name {}\
does not match any of {} in the dataset.".format(
column, data.column_names
)
)
return mylist

@staticmethod
def _check_same_string(new_string, old_string, complete_string):
if new_string != old_string:
raise RuntimeError(
"Could not match *{}* and *{}* inside *{}*".format(
new_string,
old_string,
complete_string,
)
)

@staticmethod
def _check_docs(doc: Doc, task):
# go through the doc and print all spans and labels
for span in doc.spans[task]:
print("""Span is: "{}", with label: "{}".""".format(span, span.label_))

@staticmethod
def _get_character_ids(merged_tokens, text, span_begin, span_end):
# figure out which list indices match the span tokens
# first put the token substring together
substring = ""
# sometimes annotations are longer than one sentence
# we cannot account for multi-sentence spans at the moment,
# so we truncate the annotation at the end of the sentence
if len(text) < span_end - 1:
span_end = len(text) + 1
# account for Python counting, and span_end signifies
# first token that is not in the span
for j in range(span_begin - 1, span_end - 1):
substring = " ".join([substring, text[j]])
# do not account for whitespace in beginning of span, otherwise spacy will return None
# this whitespace is added in first iteration of join
substring = substring.strip()
temp = merged_tokens.split(substring)
# now we need the character count of the beginning of sentence up to the
# annotated span; and the character count for the span
# sometimes no split could be found, because the complete string is replicated in the annotation
char_span_begin = len(temp[0]) if len(temp) > 1 else 0
# whitespace in beginning of substring
char_span_end = len(substring) + char_span_begin
return char_span_begin, char_span_end, substring
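
The helper above recovers character offsets by joining the tokens with single spaces and then splitting on the reconstructed substring. The same arithmetic can be sketched directly, since every token before the span contributes its length plus one joining space; this is an illustrative sketch, not the code used in this commit:

tokens = ["I", "like", "green", "apples", "."]
span_begin, span_end = 3, 5  # 1-based token ids; span_end points one past the last span token

# every token before the span contributes len(token) + 1 characters (token plus joining space)
char_span_begin = sum(len(tok) + 1 for tok in tokens[: span_begin - 1])
substring = " ".join(tokens[span_begin - 1 : span_end - 1])
char_span_end = char_span_begin + len(substring)

merged_tokens = " ".join(tokens)
# char_span_begin == 7 and char_span_end == 19, i.e. "green apples"
assert merged_tokens[char_span_begin:char_span_end] == substring == "green apples"
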

@staticmethod
def export_training_testing_data(
self, train_dict, test_dict, output_dir=None, overwrite=False
):
doclist: list, filename: str, output_dir, overwrite
) -> Path:
"""Convert a list of spacy docs to a serialisable DocBin object and save it to disk.
Automatically processes training and testing files.
Args:
train_dict(dict): internally handled data storage.
test_dict(dict): internally handled data storage.
output_dir(list[Path], optional): Path of the output directory where the data is saved, defaults to None.
If None the working directory is used.
overwrite(bool, optional): wether or not the spacy files should be written
even if files are already present.
doclist (list): List of one doc per sentence.
filename (str): Name of the file to write.
output_dir (Path): Path of the output directory where the data is saved.
overwrite (bool): Whether or not the spacy files should be written
even if the file is already present.
Return:
db_files(list[Path]) the location of the written files.
Path: the location of the written file.
"""

if output_dir is None:
@@ -29,32 +29,202 @@ def export_training_testing_data(
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True)

train_filename = output_dir / "train.spacy"
dev_filename = output_dir / "dev.spacy"
file = filename + ".spacy"
out_filename = output_dir / file

# check if files already exists, only if overwrite is False:

if overwrite is False:
if train_filename.exists() or dev_filename.exists():
if out_filename.exists():
raise FileExistsError(
"The given directory already has a training and testing file."
"The given directory already has an exported DocBin file with name {}.".format(
filename
)
+ " Please choose a new directory or set overwrite to True."
+ f"Given directory is: {output_dir}"
)

db_train = DocBin()
db_test = DocBin()
db_out = DocBin()

for doc_train, doc_test in zip(train_dict.values(), test_dict.values()):
db_train.add(doc_train)
db_test.add(doc_test)
for doc in doclist:
db_out.add(doc)

db_train.to_disk(train_filename)
db_test.to_disk(dev_filename)
self.db_files = [train_filename, dev_filename]
return self.db_files
db_out.to_disk(out_filename)
return out_filename

def _check_files(self, input_dir=None, train_file=None, test_file=None):
@staticmethod
def _check_files(input_dir: Path = None, train_file=None, test_file=None):
if input_dir is None and test_file is None and train_file is None:
raise FileNotFoundError(
"Please provide either a directory or the file locations."
@@ -87,12 +87,258 @@ def _check_files(self, input_dir=None, train_file=None, test_file=None):
# if not we search in the current or given working directory
if input_dir is None:
input_dir = Path.cwd()
else:
input_dir = Path(input_dir)

# search the directory for the files.

input_dir = Path(input_dir)
if (input_dir / train_file).exists():
db_train = input_dir / train_file
else:
@@ -105,9 +105,270 @@

return db_train, db_test

def import_training_testing_data(
self, input_dir=None, train_file=None, test_file=None
):
db_train, db_test = self._check_files(input_dir, train_file, test_file)
self.db_files = [db_train, db_test]
return self.db_files
@staticmethod
def import_training_testing_data(input_dir=None, train_file=None, test_file=None):
db_train, db_test = SpacyDataHandler._check_files(
input_dir, train_file, test_file
)
db_files = [db_train, db_test]
return db_files
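
The importer above only returns the file paths; reading the serialized docs back goes through DocBin. A hedged sketch, assuming both files exist (e.g. from running docbin_from_dataset once per split) and using a blank "en" vocab to mirror the blank pipeline used when the DocBin was written:

import spacy
from spacy.tokens import DocBin
from moralization.spacy_data_handler import SpacyDataHandler

train_path, dev_path = SpacyDataHandler.import_training_testing_data(
    input_dir="spacy_out", train_file="train.spacy", test_file="dev.spacy"
)

nlp = spacy.blank("en")
# DocBin.from_disk returns the DocBin itself; get_docs rebuilds the Doc objects
train_docs = list(DocBin().from_disk(train_path).get_docs(nlp.vocab))
dev_docs = list(DocBin().from_disk(dev_path).get_docs(nlp.vocab))
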