refactor span assignment from cas to doc, exclude specified labels #136

Merged · 38 commits · Aug 29, 2023
Commits (38)
a7875ed
refactor span assignment from cas to doc, exclude specified labels
iulusoy Aug 18, 2023
073051a
adjust tests for discarded labels and missing split
iulusoy Aug 18, 2023
7f5609d
adjust tests for discarded labels and missing split
iulusoy Aug 18, 2023
b732238
remove obsolete line
iulusoy Aug 18, 2023
a7fa234
Merge branch 'main' into refactor-doc-from-cas
iulusoy Aug 18, 2023
6f670a1
create span lists
iulusoy Aug 21, 2023
0af4f9d
place spans in dataset for spacy
iulusoy Aug 22, 2023
c339f23
added explanation for empty span warning
GwydionJon Aug 23, 2023
4f0f2d9
passed merge dict and task to _merge_span_categories
GwydionJon Aug 23, 2023
b7384e7
added merge dict example to spacy model notebook
GwydionJon Aug 23, 2023
7e67b97
changed visualize data function name
GwydionJon Aug 23, 2023
f173143
added kat5 implizit to task 5
GwydionJon Aug 23, 2023
d270bf5
merge major restructure of data flow for test/train split spacy
iulusoy Aug 25, 2023
616e0dd
major restructure of data flow for test/train split spacy set default…
iulusoy Aug 25, 2023
7943932
todo for task and data source information
iulusoy Aug 25, 2023
4e7b1d7
update test for merge dict changes
iulusoy Aug 25, 2023
9916fd5
make SpacyDataHandler methods static
iulusoy Aug 25, 2023
7697ea2
removed removing of kat5 forderung implizit from _return_span_analyzer
GwydionJon Aug 25, 2023
4c3a77c
Merge branch 'refactor-doc-from-cas' of https://github.com/ssciwr/mor…
GwydionJon Aug 25, 2023
87870e1
keep instance of tdh class
iulusoy Aug 25, 2023
130015f
add test for cas_to_doc
iulusoy Aug 25, 2023
b3ab215
fixed merge issue
GwydionJon Aug 25, 2023
21cb153
first cleanups
iulusoy Aug 25, 2023
66773dc
fix test for array size
iulusoy Aug 25, 2023
104d83f
refactor assign span
iulusoy Aug 28, 2023
5b3a6e4
simplify docbin from dataset
iulusoy Aug 28, 2023
fe99c3d
fix test fluke with too small test data
iulusoy Aug 28, 2023
95ace29
pass column names to docbin generation
iulusoy Aug 28, 2023
6208631
pass column names to docbin generation
iulusoy Aug 28, 2023
fae06dc
get rid of obsolete path type conversion
iulusoy Aug 28, 2023
1d47f7c
add selected labels, task and filenames into dataset description
iulusoy Aug 28, 2023
97a241d
add selected labels, task and filenames into dataset description
iulusoy Aug 28, 2023
44efb35
correct variable passing in tests
iulusoy Aug 28, 2023
88552c8
check task in model is same as in data spacy
iulusoy Aug 29, 2023
4efb4ba
reduce code smells
iulusoy Aug 29, 2023
4eb51d6
remove duplicate method call
iulusoy Aug 29, 2023
a1d0dd2
remove outdated comment
iulusoy Aug 29, 2023
b7be1c7
update transformers notebook for new dataflow
iulusoy Aug 29, 2023
2 changes: 2 additions & 0 deletions .gitignore
@@ -133,3 +133,5 @@ data/
.vscode/settings.json
notebooks/my_model/

# keep test data
!/moralization/data
1 change: 0 additions & 1 deletion moralization/analyse.py
@@ -15,7 +15,6 @@ def _return_span_analyzer(doc_dict):
for doc in doc_dict.values():
# doc.spans.pop("paragraphs", None)
doc.spans.pop("KOMMENTAR", None)
doc.spans.pop("KAT5-Forderung implizit", None)
doc_list.append(doc)

return SpanAnalyzer(doc_list)
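
For context on the deleted line: `doc.spans` behaves like a dictionary of span groups, so `pop(key, None)` silently drops a category before the docs reach the `SpanAnalyzer`. A minimal illustration with a made-up span group:

import spacy

nlp = spacy.blank("de")
doc = nlp("Ein kurzer Beispieltext.")
doc.spans["KOMMENTAR"] = [doc.char_span(0, 3, label="KOMMENTAR")]

# pop(key, None) removes the group if present and is a no-op otherwise,
# so the excluded category never reaches the analyzer.
doc.spans.pop("KOMMENTAR", None)
assert "KOMMENTAR" not in doc.spans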


40,586 changes: 40,586 additions & 0 deletions moralization/data/large_input_data/Kommentare-pos-RR-neu-optimiert-CK.xmi

Large diffs are not rendered by default.

238 changes: 130 additions & 108 deletions moralization/data_manager.py

Large diffs are not rendered by default.

279 changes: 150 additions & 129 deletions moralization/input_data.py
@@ -158,115 +158,126 @@
"Protagonistinnen3": "KAT3-own/other",
"KommunikativeFunktion": "KAT4-Kommunikative Funktion",
"Forderung": "KAT5-Forderung explizit",
# "KAT5Ausformulierung": "KAT5-Forderung implizit",
"KAT5Ausformulierung": "KAT5-Forderung implizit",
# "Kommentar": "KOMMENTAR",
}

nlp = spacy_load_model(language_model)
doc = nlp(cas.sofa_string)

doc_train = nlp(cas.sofa_string)
doc_test = nlp(cas.sofa_string)

# add original cassis sentence as paragraph span
# initialize the SpanGroup objects
doc.spans["sc"] = []
doc.spans["paragraphs"] = []
for cat in map_expressions.values():
doc.spans[cat] = []

# now put the paragraphs (instances/segments) into the SpanGroup "paragraphs"
# these are defined as cas sentences in the input
sentence_type = ts.get_type(
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
)
paragraph_list = cas.select(sentence_type.name)
doc = InputOutput._get_paragraphs(doc, paragraph_list)

# initialize all span categories
for doc_object in [doc, doc_train, doc_test]:
doc_object.spans["sc"] = []
doc_object.spans["paragraphs"] = []
for cat in map_expressions.values():
doc_object.spans[cat] = []

paragraph_list = cas.select(sentence_type.name)
for paragraph in paragraph_list:
doc_object.spans["paragraphs"].append(
doc_object.char_span(
paragraph.begin,
paragraph.end,
label="paragraph",
)
)
# now put the different categories of the custom spans (i.e. Kat1, etc.) into
# SpanGroups
span_type = ts.get_type("custom.Span")

span_list = cas.select(span_type.name)

doc, doc_train, doc_test = InputOutput._split_train_test(
doc, doc_train, doc_test, span_list, map_expressions
)

return doc, doc_train, doc_test
# now assign the spans and labels in the doc object from the cas object
doc = InputOutput._assign_span_labels(doc, span_list, map_expressions)
return doc

@staticmethod
def _split_train_test(doc, doc_train, doc_test, span_list, map_expressions):
# every n-th entry is put as a test value
n_test = 5
n_start = 0
def _get_paragraphs(doc, paragraph_list):
# add original cassis sentence as paragraph span
for paragraph in paragraph_list:
doc.spans["paragraphs"].append(
doc.char_span(
paragraph.begin,
paragraph.end,
label="paragraph",
)
)
return doc
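
As a reading aid, this is roughly how a `cas` object with `.begin`/`.end` character offsets is obtained before `_get_paragraphs` runs (a sketch using the dkpro-cassis API; the file names are placeholders):

from cassis import load_typesystem, load_cas_from_xmi

with open("TypeSystem.xml", "rb") as f:      # placeholder file name
    ts = load_typesystem(f)
with open("example.xmi", "rb") as f:         # placeholder file name
    cas = load_cas_from_xmi(f, typesystem=ts)

sentence_type = ts.get_type(
    "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
)
# each annotation carries character offsets via .begin and .end
paragraph_list = cas.select(sentence_type.name)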

@staticmethod
def _assign_span_labels(doc, span_list, map_expressions):
# put the custom spans into the categories
# we also need to delete "Moralisierung" and "Keine Moralisierung"
labels_to_delete = ["Keine Moralisierung", "Moralisierung"]
for span in span_list:
for cat_old, cat_new in map_expressions.items():
# not all of these categories have values in every span.
if span[cat_old]:
if span[cat_old] and span[cat_old] not in labels_to_delete:
# we need to attach each span category on its own, as well as all together in "sc"

char_span = doc.char_span(
span.begin,
span.end,
label=span[cat_old],
char_span = InputOutput._get_char_span(cat_old, doc, span)
doc = InputOutput._append_char_span(
doc, cat_new, char_span, span, cat_old
)
if char_span:
doc.spans[cat_new].append(char_span)
doc.spans["sc"].append(char_span)
n_start = n_start + 1

if n_start % n_test != 0:
char_span_train = doc_train.char_span(
span.begin,
span.end,
label=span[cat_old],
)
doc_train.spans[cat_new].append(char_span_train)
doc_train.spans["sc"].append(char_span_train)
else:
char_span_test = doc_test.char_span(
span.begin,
span.end,
label=span[cat_old],
)
doc_test.spans[cat_new].append(char_span_test)
doc_test.spans["sc"].append(char_span_test)

# char_span returns None when the given indices do not match a token begin and end.
# e.g. ".Ich" instead of ". Ich"
elif char_span is None:
logging_warning = f"The char span for {span.get_covered_text()} ({span}) returned None.\n"
logging_warning += (
"It might be due to a mismatch between char indices. \n"
)
if logging.root.level > logging.DEBUG:
logging_warning += "Skipping span! Enable Debug Logging for more information."

logging.warning(logging_warning)
logging.debug(
f"""Token should be: \n \t'{span.get_covered_text()}', but is '{
doc.char_span(
span.begin,
span.end,
alignment_mode="expand",
label=span[cat_old],

)}'\n"""
)

# create test and train set:

return doc, doc_train, doc_test
return doc

@staticmethod
def _get_char_span(cat_old, doc, span):
# KAT5 implicit has a long string inside the label;
# we need to delete this string and use the label "implizit" instead
if cat_old == "KAT5Ausformulierung":
char_span = doc.char_span(
span.begin,
span.end,
label="implizit",
)
else:
char_span = doc.char_span(
span.begin,
span.end,
label=span[cat_old],
)
return char_span

@staticmethod
def _append_char_span(doc, cat_new, char_span, span, cat_old):
if char_span:
doc.spans[cat_new].append(char_span)
doc.spans["sc"].append(char_span)
# char_span returns None when the given indices do not match a token begin and end,
# e.g. ".Ich" instead of ". Ich".
# The problem stems from a mismatch between spacy token beginnings and cassis token beginnings,
# likely because spacy tokenizes on whitespace while cassis tokenizes on punctuation.
# This leads to a mismatch between the token indices:
# where spacy sees ".Ich" as a single token,
# cassis returns only the indices of "I" and "h" as start and end points,
# so spacy complains that the start index is not actually the beginning of a token.
# We could try to fix this by reducing the index by 1 and checking whether the token is complete.
# However, this would yield tokens that are not actual words and
# thus are not useful for training.
# Instead, print a warning that this span cannot be used.
elif char_span is None:
InputOutput._warn_empty_span(doc, span, cat_old)
return doc

@staticmethod
def _warn_empty_span(doc, span, cat_old):
logging_warning = (
f"The char span for {span.get_covered_text()} ({span}) returned None.\n"
)
logging_warning += "It might be due to a mismatch between char indices. \n"
if logging.root.level > logging.DEBUG:
logging_warning += (
"Skipping span! Enable Debug Logging for more information."
)
logging.warning(logging_warning)
logging.debug(
f"""Token should be: \n \t'{span.get_covered_text()}', but is '{
doc.char_span(
span.begin,
span.end,
alignment_mode="expand",
label=span[cat_old],
)}'\n"""
)
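
The following self-contained snippet reproduces the behavior the warning describes: spaCy's `char_span` returns `None` when character offsets do not coincide with token boundaries, and `alignment_mode="expand"` (used in the debug message above) snaps to the enclosing tokens instead. The sentence is made up for illustration:

import spacy

nlp = spacy.blank("de")
doc = nlp("Das ist ein Test.")

# offsets 1..7 cover "as ist", which starts mid-token -> None
assert doc.char_span(1, 7) is None

# with alignment_mode="expand", spaCy widens to full tokens -> "Das ist"
print(doc.char_span(1, 7, alignment_mode="expand"))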

@staticmethod
def files_to_docs(
data_files: List or str, ts: object, language_model: str = "de_core_news_sm"
data_files: List, ts: object, language_model: str = "de_core_news_sm"
):
"""

@@ -280,36 +291,39 @@

"""
doc_dict = {}
train_dict = {}
test_dict = {}

for file in data_files:
logging.info(f"Reading ./{file}")
try:
cas, file_type = InputOutput.read_cas_file(file, ts)
doc, doc_train, doc_test = InputOutput.cas_to_doc(
cas, ts, language_model
)
cas, _ = InputOutput.read_cas_file(file, ts)
doc = InputOutput.cas_to_doc(cas, ts, language_model)
doc_dict[file.stem] = doc
train_dict[file.stem] = doc_train
test_dict[file.stem] = doc_test

except XMLSyntaxError as e:
logging.warning(
f"WARNING: skipping file '{file}' due to XMLSyntaxError: {e}"
)

return doc_dict, train_dict, test_dict
return doc_dict

@staticmethod
def _merge_span_categories(doc_dict, merge_dict=None):
def _merge_span_categories(doc_dict, merge_dict=None, task=None):
"""Take the new_dict_cat dict and add its key as a main_cat to data_dict.
The values are the total sub_dict_entries of the given list.

Args:
doc_dict(dict: doc): The provided doc dict.
new_dict_cat(dict): map new category to list of existing_categories.

doc_dict (dict): The provided doc dict.
merge_dict (dict, optional): map a new category to a list of existing categories, e.g.:
merge_dict = {
"task1": ["KAT1-Moralisierendes Segment"],
"task2": ["KAT2-Moralwerte", "KAT2-Subjektive Ausdrücke"],
"task3": ["KAT3-Rolle", "KAT3-Gruppe", "KAT3-own/other"],
"task4": ["KAT4-Kommunikative Funktion"],
"task5": ["KAT5-Forderung explizit", "KAT5-Forderung implizit"],
}
Defaults to None.
task (str, optional): The task for which the labels are selected.
Defaults to None, in which case "task1" is used.
Return:
dict: The data_dict with new span categories.
"""
@@ -319,49 +333,56 @@
"task2": ["KAT2-Moralwerte", "KAT2-Subjektive Ausdrücke"],
"task3": ["KAT3-Rolle", "KAT3-Gruppe", "KAT3-own/other"],
"task4": ["KAT4-Kommunikative Funktion"],
"task5": ["KAT5-Forderung explizit"],
"task5": ["KAT5-Forderung explizit", "KAT5-Forderung implizit"],
}
if task is None:
task = "task1"

if task not in merge_dict.keys():
raise KeyError(
f"{task} not in merge_dict. Please provide a valid task or include the given task in the merge dict."
)

# now we only need to merge categories for the given task.
merge_categories = merge_dict[task]

for file in doc_dict.keys():
# initialize new span_groups
for cat in merge_dict.keys():
doc_dict[file].spans[cat] = []

for new_main_cat, new_cat_entries in merge_dict.items():
if new_cat_entries == "all":
for main_cat in list(doc_dict[file].spans.keys()):
doc_dict[file].spans[new_main_cat].extend(
doc_dict[file].spans[main_cat]
)
else:
for old_main_cat in new_cat_entries:
doc_dict[file].spans[new_main_cat].extend(
doc_dict[file].spans[old_main_cat]
)
# initialize new span_group
doc_dict[file].spans[task] = []

for old_main_cat in merge_categories:
try:
doc_dict[file].spans[task].extend(
doc_dict[file].spans[old_main_cat]
)

except KeyError:
raise KeyError(
f"{old_main_cat} not found in doc_dict[file].spans which"
+ f" has {list(doc_dict[file].spans.keys())} as keys."
)
return doc_dict
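
A small sketch of what `_merge_span_categories` does for one file (hypothetical `doc_dict`; defaults as in the diff above): the span groups listed under the chosen task are concatenated into one new group named after the task:

merge_dict = {
    "task5": ["KAT5-Forderung explizit", "KAT5-Forderung implizit"],
}
doc_dict = InputOutput._merge_span_categories(
    doc_dict, merge_dict=merge_dict, task="task5"
)
# doc_dict[file].spans["task5"] now holds the union of both KAT5 groups.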

@staticmethod
def read_data(dir: str, language_model: str = "de_core_news_sm"):
def read_data(
dir: str, language_model: str = "de_core_news_sm", merge_dict=None, task=None
):
"""Convenience method to handle input reading in one go.

Args:
dir (str): Path to the data directory.
language_model (str, optional): Language model of the corpus that is being read.
Defaults to "de_core_news_sm" (German).

dir (str): Path to the data directory.
language_model (str, optional): Language model of the corpus that is being read.
Defaults to "de_core_news_sm" (German).
merge_dict (dict, optional): map a new category to a list of existing categories.
task (str, optional): which task to use in the merge. Defaults to None.
Returns:
doc_dict (dict): Dictionary with all the available data in one place.
train_dict (dict): Dictionary with only the spans that are used for training.
test_dict (dict): Dictionary with only the spans that are used for testing.
"""
data_files, ts_file = InputOutput.get_multiple_input(dir)
# read in the ts
ts = InputOutput.read_typesystem(ts_file)
doc_dict, train_dict, test_dict = InputOutput.files_to_docs(
doc_dict = InputOutput.files_to_docs(
data_files, ts, language_model=language_model
)

for dict_ in [doc_dict, train_dict, test_dict]:
dict_ = InputOutput._merge_span_categories(dict_)

return doc_dict, train_dict, test_dict
doc_dict = InputOutput._merge_span_categories(doc_dict, merge_dict, task)
return doc_dict
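
Downstream code then works with the merged task group on each Doc, for example (a hedged sketch; the span group name follows the defaults above, the path is a placeholder):

doc_dict = InputOutput.read_data("path/to/data", task="task1")
for name, doc in doc_dict.items():
    # every selected span carries the label assigned in _assign_span_labels
    for span in doc.spans["task1"]:
        print(name, span.text, span.label_)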