Merge remote-tracking branch 'origin/main' into fix-model-test-render
GwydionJon committed Sep 5, 2023
2 parents bb792a1 + cce0d14 commit c555644
Showing 22 changed files with 79,802 additions and 636 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -133,3 +133,5 @@ data/
.vscode/settings.json
notebooks/my_model/

# keep test data
!/moralization/data
1 change: 0 additions & 1 deletion moralization/analyse.py
@@ -16,7 +16,6 @@ def _return_span_analyzer(doc_dict: dict) -> SpanAnalyzer:
for doc in doc_dict.values():
# doc.spans.pop("paragraphs", None)
doc.spans.pop("KOMMENTAR", None)
doc.spans.pop("KAT5-Forderung implizit", None)
doc_list.append(doc)

return SpanAnalyzer(doc_list)

40,586 changes: 40,586 additions & 0 deletions moralization/data/large_input_data/Kommentare-pos-RR-neu-optimiert-CK.xmi

Large diffs are not rendered by default.

240 changes: 131 additions & 109 deletions moralization/data_manager.py

Large diffs are not rendered by default.

298 changes: 151 additions & 147 deletions moralization/input_data.py

Large diffs are not rendered by default.

46 changes: 7 additions & 39 deletions moralization/plot.py
@@ -442,64 +442,32 @@ def __init__(self, data_manager):
self.app.layout = html.Div(
[
"Interactive Visualization",
dcc.Dropdown(["all", "train", "test"], value="all", id="dropdown_mode"),
dcc.Dropdown(id="dropdown_span_cat"),
dcc.Markdown(id="markdown_displacy", dangerously_allow_html=True),
]
)

# Define the callback to update the dropdown_span_cat options and default value based on the selected mode
self.app.callback(
Output("dropdown_span_cat", "options"),
Output("dropdown_span_cat", "value"),
Input("dropdown_mode", "value"),
)(self.change_mode)

# Define the callback to update the visualization based on the selected span category and mode
# Define the callback to update the visualization based on the selected span category
self.app.callback(
Output("markdown_displacy", "children"),
Input("dropdown_span_cat", "value"),
State("dropdown_mode", "value"),
)(self.change_span_cat)

def change_mode(self, mode) -> tuple:
"""
Changes the mode of the visualization.
This method retrieves the span categories for the selected mode and returns them as
options for the dropdown_span_cat dropdown.
Args:
mode (str): The selected mode of the visualization.
Returns:
A tuple containing a list of span categories and the default value for the dropdown_span_cat.
"""
span_cats = []

# Retrieve the span categories for the selected mode
for doc in self.data_manager.doc_dict.values():
[span_cats.append(span_cat) for span_cat in list(doc.spans.keys())]

span_cats = list(set(span_cats))
return sorted(span_cats), "sc"

def change_span_cat(self, span_cat, mode) -> str:
def change_span_cat(self, span_cat):
"""
Changes the selected span category.
This method visualizes the selected span category for the selected mode and returns the
This method visualizes the selected span category and returns the
visualized data as an HTML document.
Args:
span_cat (str): The selected span category.
mode (str): The selected mode of the visualization.
Returns:
The visualized data as an HTML document.
"""
# Visualize the selected span category for the selected mode
html_doc = self.data_manager.visualize_data(_type=mode, spans_key=span_cat)
# Visualize the selected span category
html_doc = self.data_manager.visualize_data(spans_key=span_cat)
html_doc = html_doc.replace("\n", " ")
return html_doc

@@ -518,7 +486,7 @@ def run_app(self, port=8052):
)


def visualize_data(doc_dict, style="span", spans_key="sc", use_notebook=False):
def return_displacy_visualization(doc_dict, style="span", spans_key="sc"):
"""Use the displacy class offered by spacy to visualize the current dataset.
use SpacySetup.span_keys to show possible keys or use 'sc' for all.
Expand Down Expand Up @@ -567,5 +535,5 @@ def visualize_data(doc_dict, style="span", spans_key="sc", use_notebook=False):
[doc for doc in doc_dict.values()],
style=style,
options={"spans_key": spans_key},
jupyter=use_notebook,
# jupyter=use_notebook,
)
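
A minimal usage sketch for the renamed helper above. The toy doc_dict, its span contents, and the assumption that the helper hands displacy's markup back to the caller are illustrative and not part of this commit:

import spacy
from moralization.plot import return_displacy_visualization

nlp = spacy.blank("en")
doc = nlp("Moralization is discussed in this sentence.")
# attach one span to the default span group "sc"; characters 0-12 cover "Moralization"
doc.spans["sc"] = [doc.char_span(0, 12, label="example")]
doc_dict = {"doc_1": doc}

# outside a notebook, displacy.render returns the markup as an HTML string;
# with the jupyter argument left out it auto-detects Jupyter and renders inline instead
html_doc = return_displacy_visualization(doc_dict, style="span", spans_key="sc")
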
242 changes: 204 additions & 38 deletions moralization/spacy_data_handler.py
@@ -1,26 +1,199 @@
from spacy.tokens import DocBin
from spacy.tokens import DocBin, Doc
from pathlib import Path
from tempfile import mkdtemp
import datasets
import spacy


class SpacyDataHandler:
"""Helper class to organize and prepare spacy trainings data."""
"""Helper class to organize and prepare spacy train and test data."""

@staticmethod
def docbin_from_dataset(
data_set: datasets.Dataset,
task: str,
data_split: str = "train",
output_dir: Path = None,
overwrite: bool = False,
column_names: list = None,
check_docs: bool = False,
) -> Path:
"""Create a DocBin from a Dataset and save it to disk.
This uses the span begin, span end, and span label columns from the
Dataset. The complication here is that span begin and end are given as
token id, whereas the doc requires the character id. So we need to count
the characters in the sentence and assign accordingly.
Also, each sentence creates its own doc which is then appended to the
overall doc. It is done like this to avoid having to factor in all the
previous character counts from prior sentences. If this becomes too slow
for large corpora, we can think about first parsing the lists and correcting
the character count and then feeding everything into a doc at once.
Args:
data_set (datasets.Dataset): The dataframe to be converted into a DocBin.
task (str): The name of the SpanGroup (task that is targeted in the training).
data_split (str, optional): The split of the data that is exported. Defaults to "train".
output_dir (Path, optional): Path of the output directory where the data is saved, defaults to None.
If None the working directory is used.
overwrite (bool, optional): Whether or not the spacy files should be written
even if the file is already present.
column_names (list, optional): The column names of the feature, label, and span columns.
Defaults to None, in which case it will be set to
["Sentences", "Labels", "Span_begin", "Span_end", "Span_label"].
check_docs (bool, optional): Check all the spans inside the doc and print. Defaults to False.
Returns:
Path: The path to the spacy formatted data."""
nlp = spacy.blank("en")

if not column_names:
column_names = [
"Sentences",
"Labels",
"Span_begin",
"Span_end",
"Span_label",
]
# first create a list from the dataset "Sentences" column for train and test
textlist = SpacyDataHandler._get_list_from_dataset_column(
data_set[data_split], column_names[0]
)
# similarly for the spans
span_begin_list = SpacyDataHandler._get_list_from_dataset_column(
data_set[data_split], column_names[2]
)
span_end_list = SpacyDataHandler._get_list_from_dataset_column(
data_set[data_split], column_names[3]
)
span_label_list = SpacyDataHandler._get_list_from_dataset_column(
data_set[data_split], column_names[4]
)

# we create one doc container for each sentence, then concatenate them together
# it could be faster to first concatenate the lists and adjust the span_begin and span_end
# token numbers, but that is messier, so we do it the slow way first
doclist = []
for i in range(len(textlist)):
# find out if there is an annotation for that sentence
# if yes, create span
if span_begin_list[i] != [0]:
# join the tokens in the sentence together with whitespace and create doc
merged_tokens = " ".join(textlist[i])
doc = nlp(merged_tokens)
# create a new span group "task"
doc.spans[task] = []

# get the character ids for each of the spans in the sentence
for j in range(len(span_begin_list[i])):
(
char_span_begin,
char_span_end,
substring,
) = SpacyDataHandler._get_character_ids(
merged_tokens,
textlist[i],
span_begin_list[i][j],
span_end_list[i][j],
)
# check that this will return the same string as merged_tokens
SpacyDataHandler._check_same_string(
merged_tokens[char_span_begin:char_span_end],
substring,
merged_tokens,
)
span = doc.char_span(
char_span_begin, char_span_end, span_label_list[i][j]
)
# check that span text is the same as substring
SpacyDataHandler._check_same_string(
span.text, substring, merged_tokens
)
doc.spans[task].append(span)
doclist.append(doc)
if check_docs:
SpacyDataHandler._check_docs(doc, task)
# now merge all the docs for each sentence into one DocBin
# the file is named "train" for training and "dev" for testing
# for now, we do not have further filenames
outfilename = "train" if data_split == "train" else "dev"
data_path = SpacyDataHandler.export_training_testing_data(
doclist, outfilename, output_dir, overwrite
)
return data_path
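
A sketch of how docbin_from_dataset might be called. The toy dataset layout is inferred from the method above (token lists, 1-based token indices with Span_end pointing one past the last span token, and [0] marking sentences without annotation); it is an assumption, not the real corpus format:

import datasets
from moralization.spacy_data_handler import SpacyDataHandler

train = datasets.Dataset.from_dict(
    {
        "Sentences": [["I", "like", "green", "apples", "."], ["No", "spans", "here", "."]],
        "Labels": [[0, 0, 1, 1, 0], [0, 0, 0, 0]],
        "Span_begin": [[3], [0]],
        "Span_end": [[5], [0]],
        "Span_label": [["example_label"], [""]],
    }
)
data_set = datasets.DatasetDict({"train": train})

# writes <output_dir>/train.spacy and returns its path;
# the second sentence has Span_begin == [0] and is therefore skipped
train_path = SpacyDataHandler.docbin_from_dataset(
    data_set,
    task="example_task",
    data_split="train",
    output_dir="spacy_out",
    overwrite=True,
)
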

@staticmethod
def _get_list_from_dataset_column(data, column):
# we try this; if it fails, the column names are likely incorrect
try:
mylist = data[column]
except KeyError:
raise ValueError(
"Could not generate a list of the text input, likely the given column name {}\
does not match any of {} in the dataset.".format(
column, data.column_names
)
)
return mylist

@staticmethod
def _check_same_string(new_string, old_string, complete_string):
if new_string != old_string:
raise RuntimeError(
"Could not match *{}* and *{}* inside *{}*".format(
new_string,
old_string,
complete_string,
)
)

@staticmethod
def _check_docs(doc: Doc, task):
# go through the doc and print all spans and labels
for span in doc.spans[task]:
print("""Span is: "{}", with label: "{}".""".format(span, span.label_))

@staticmethod
def _get_character_ids(merged_tokens, text, span_begin, span_end):
# figure out which list indices match the span tokens
# first put the token substring together
substring = ""
# sometimes annotations are longer than one sentence
# we cannot account for multi-sentence spans at the moment,
# so we truncate the annotation at the end of the sentence
if len(text) < span_end - 1:
span_end = len(text) + 1
# account for Python counting, and span_end signifies
# first token that is not in the span
for j in range(span_begin - 1, span_end - 1):
substring = " ".join([substring, text[j]])
# do not account for whitespace in beginning of span, otherwise spacy will return None
# this whitespace is added in first iteration of join
substring = substring.strip()
temp = merged_tokens.split(substring)
# now we need the character count of the beginning of sentence up to the
# annotated span; and the character count for the span
# sometimes no split could be found, because the complete string is replicated in the annotation
char_span_begin = len(temp[0]) if len(temp) > 1 else 0
# whitespace in beginning of substring
char_span_end = len(substring) + char_span_begin
return char_span_begin, char_span_end, substring
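
The helper above recovers character offsets by joining the tokens with single spaces and then splitting on the reconstructed substring. The same arithmetic can be sketched directly, since every token before the span contributes its length plus one joining space; this is an illustrative sketch, not the code used in this commit:

tokens = ["I", "like", "green", "apples", "."]
span_begin, span_end = 3, 5  # 1-based token ids; span_end points one past the last span token

# every token before the span contributes len(token) + 1 characters (token plus joining space)
char_span_begin = sum(len(tok) + 1 for tok in tokens[: span_begin - 1])
substring = " ".join(tokens[span_begin - 1 : span_end - 1])
char_span_end = char_span_begin + len(substring)

merged_tokens = " ".join(tokens)
# char_span_begin == 7 and char_span_end == 19, i.e. "green apples"
assert merged_tokens[char_span_begin:char_span_end] == substring == "green apples"
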

@staticmethod
def export_training_testing_data(
self, train_dict, test_dict, output_dir=None, overwrite=False
):
doclist: list, filename: str, output_dir, overwrite
) -> Path:
"""Convert a list of spacy docs to a serialisable DocBin object and save it to disk.
Automatically processes training and testing files.
Args:
train_dict(dict): internally handled data storage.
test_dict(dict): internally handled data storage.
output_dir(list[Path], optional): Path of the output directory where the data is saved, defaults to None.
If None the working directory is used.
overwrite(bool, optional): wether or not the spacy files should be written
even if files are already present.
doclist (list): List of one doc per sentence.
filename (str): Name of the file to write.
output_dir (Path): Path of the output directory where the data is saved.
overwrite (bool): Whether or not the spacy files should be written
even if the file is already present.
Return:
db_files(list[Path]) the location of the written files.
Path: the location of the written file.
"""

if output_dir is None:
@@ -29,32 +29,202 @@ def export_training_testing_data(
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True)

train_filename = output_dir / "train.spacy"
dev_filename = output_dir / "dev.spacy"
file = filename + ".spacy"
out_filename = output_dir / file

# check if files already exists, only if overwrite is False:

if overwrite is False:
if train_filename.exists() or dev_filename.exists():
if out_filename.exists():
raise FileExistsError(
"The given directory already has a training and testing file."
"The given directory already has an exported DocBin file with name {}.".format(
filename
)
+ " Please choose a new directory or set overwrite to True."
+ f"Given directory is: {output_dir}"
)

db_train = DocBin()
db_test = DocBin()
db_out = DocBin()

for doc_train, doc_test in zip(train_dict.values(), test_dict.values()):
db_train.add(doc_train)
db_test.add(doc_test)
for doc in doclist:
db_out.add(doc)

db_train.to_disk(train_filename)
db_test.to_disk(dev_filename)
self.db_files = [train_filename, dev_filename]
return self.db_files
db_out.to_disk(out_filename)
return out_filename

def _check_files(self, input_dir=None, train_file=None, test_file=None):
@staticmethod
def _check_files(input_dir: Path = None, train_file=None, test_file=None):
if input_dir is None and test_file is None and train_file is None:
raise FileNotFoundError(
"Please provide either a directory or the file locations."
@@ -87,12 +87,258 @@ def _check_files(self, input_dir=None, train_file=None, test_file=None):
# if not we search in the current or given working directory
if input_dir is None:
input_dir = Path.cwd()
else:
input_dir = Path(input_dir)

# search the directory for the files.

input_dir = Path(input_dir)
if (input_dir / train_file).exists():
db_train = input_dir / train_file
else:
@@ -105,9 +105,270 @@

return db_train, db_test

def import_training_testing_data(
self, input_dir=None, train_file=None, test_file=None
):
db_train, db_test = self._check_files(input_dir, train_file, test_file)
self.db_files = [db_train, db_test]
return self.db_files
@staticmethod
def import_training_testing_data(input_dir=None, train_file=None, test_file=None):
db_train, db_test = SpacyDataHandler._check_files(
input_dir, train_file, test_file
)
db_files = [db_train, db_test]
return db_files
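
The importer above only returns the file paths; reading the serialized docs back goes through DocBin. A hedged sketch, assuming both files exist (e.g. from running docbin_from_dataset once per split) and using a blank "en" vocab to mirror the blank pipeline used when the DocBin was written:

import spacy
from spacy.tokens import DocBin
from moralization.spacy_data_handler import SpacyDataHandler

train_path, dev_path = SpacyDataHandler.import_training_testing_data(
    input_dir="spacy_out", train_file="train.spacy", test_file="dev.spacy"
)

nlp = spacy.blank("en")
# DocBin.from_disk returns the DocBin itself; get_docs rebuilds the Doc objects
train_docs = list(DocBin().from_disk(train_path).get_docs(nlp.vocab))
dev_docs = list(DocBin().from_disk(dev_path).get_docs(nlp.vocab))
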