From 1c49c4218bc29e44cd84ccaa1e5319c1f02e474b Mon Sep 17 00:00:00 2001
From: Mathijs Boezer
Date: Tue, 14 Mar 2023 13:39:09 +0100
Subject: [PATCH 1/7] Implement optional char indices

---
 .../CrossLingualPredictor.py                  | 35 +++++++++++++++++--
 .../examples/test_char_indices.py             |  7 ++++
 tests/test_model_configs.py                   |  4 +++
 3 files changed, 43 insertions(+), 3 deletions(-)
 create mode 100644 crosslingual_coreference/examples/test_char_indices.py

diff --git a/crosslingual_coreference/CrossLingualPredictor.py b/crosslingual_coreference/CrossLingualPredictor.py
index eba2985..809627e 100644
--- a/crosslingual_coreference/CrossLingualPredictor.py
+++ b/crosslingual_coreference/CrossLingualPredictor.py
@@ -43,12 +43,14 @@ def __init__(
         model_name: str = "minilm",
         chunk_size: Union[int, None] = None,  # determines the # sentences per batch
         chunk_overlap: int = 2,  # determines the # of overlapping sentences per chunk
+        char_indices: bool = False,
     ) -> None:
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.language = language
         self.filename = None
         self.device = device
+        self.char_indices = char_indices
         self.model_url = MODELS[model_name]["url"]
         self.resolver = Resolver()
         self.download_model()
@@ -129,6 +131,19 @@ def predict(self, text: str) -> dict:
 
         prediction = {"clusters": merged_clusters, "resolved_text": resolved_text, "cluster_heads": heads}
 
+        if self.char_indices:
+            # clusters
+            for cluster in prediction["clusters"]:
+                for span in cluster:
+                    span[0] = doc[span[0]].idx
+                    span[1] = doc[span[1] - 1].idx + len(doc[span[1] - 1])
+
+            # cluster heads
+            for head_key in prediction["cluster_heads"].keys():
+                prediction["cluster_heads"][head_key][0] = doc[prediction["cluster_heads"][head_key][0]].idx
+                prediction["cluster_heads"][head_key][1] = doc[prediction["cluster_heads"][head_key][1] - 1].idx \
+                    + len(doc[prediction["cluster_heads"][head_key][1] - 1])
+
         return prediction
 
     def pipe(self, texts: List[str]):
@@ -152,9 +167,23 @@ def pipe(self, texts: List[str]):
             json_predictions = self.predictor.predict_batch_json(json_batch)
             clusters_predictions = [prediction.get("clusters") for prediction in json_predictions]
 
-            for spacy_doc, cluster in zip(spacy_document_list, clusters_predictions):
-                resolved_text, heads = self.resolver.replace_corefs(spacy_doc, cluster)
-                predictions.append({"clusters": cluster, "resolved_text": resolved_text, "cluster_heads": heads})
+            for spacy_doc, clusters in zip(spacy_document_list, clusters_predictions):
+                resolved_text, heads = self.resolver.replace_corefs(spacy_doc, clusters)
+
+                if self.char_indices:
+                    # clusters
+                    for cluster in clusters:
+                        for span in cluster:
+                            span[0] = spacy_doc[span[0]].idx
+                            span[1] = spacy_doc[span[1] - 1].idx + len(spacy_doc[span[1] - 1])
+
+                    # cluster heads
+                    for head_key in heads.keys():
+                        heads[head_key][0] = spacy_doc[heads[head_key][0]].idx
+                        heads[head_key][1] = spacy_doc[heads[head_key][1] - 1].idx \
+                            + len(spacy_doc[heads[head_key][1] - 1])
+
+                predictions.append({"clusters": clusters, "resolved_text": resolved_text, "cluster_heads": heads})
 
         return predictions
 
diff --git a/crosslingual_coreference/examples/test_char_indices.py b/crosslingual_coreference/examples/test_char_indices.py
new file mode 100644
index 0000000..6a6d60d
--- /dev/null
+++ b/crosslingual_coreference/examples/test_char_indices.py
@@ -0,0 +1,7 @@
+from crosslingual_coreference import Predictor
+
+from .data import texts
+
+predictor = Predictor(language="nl_core_news_sm", chunk_size=2500, char_indices=True)
+
+print(predictor.pipe(texts))
diff --git a/tests/test_model_configs.py b/tests/test_model_configs.py
index 3330939..de0f034 100644
--- a/tests/test_model_configs.py
+++ b/tests/test_model_configs.py
@@ -8,3 +8,7 @@ def test_standalone_chunking():
 
 def test_spacy():
     from crosslingual_coreference.examples import test_spacy  # noqa: F401
+
+
+def test_char_ranges():
+    from crosslingual_coreference.examples import test_char_indices  # noqa: F401
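The block added to predict() derives character offsets from spaCy tokens: the start is the first token's idx, the end is the last token's idx plus its length. A standalone sketch of the same arithmetic, using this patch's exclusive-end convention (the sample text and the en_core_web_sm model are assumptions, not part of the patch):

    # Token-to-character conversion, same arithmetic as the hunk above.
    # Sample text and model are illustrative only.
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Sarah lost her keys.")

    token_span = [0, 1]  # [start, exclusive end] covering the token "Sarah"
    start_char = doc[token_span[0]].idx
    end_char = doc[token_span[1] - 1].idx + len(doc[token_span[1] - 1])

    assert doc.text[start_char:end_char] == "Sarah"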
From 66313ee1a1968166611e0211dbec0aad7ef0b948 Mon Sep 17 00:00:00 2001
From: Mathijs Boezer
Date: Tue, 14 Mar 2023 13:58:47 +0100
Subject: [PATCH 2/7] Fix pipe flow for char indices

---
 crosslingual_coreference/CrossLingualPredictorSpacy.py | 3 ++-
 crosslingual_coreference/__init__.py                   | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/crosslingual_coreference/CrossLingualPredictorSpacy.py b/crosslingual_coreference/CrossLingualPredictorSpacy.py
index 7fa6247..32f49f9 100644
--- a/crosslingual_coreference/CrossLingualPredictorSpacy.py
+++ b/crosslingual_coreference/CrossLingualPredictorSpacy.py
@@ -14,8 +14,9 @@ def __init__(
         model_name: str = "minilm",
         chunk_size: Union[int, None] = None,
         chunk_overlap: int = 2,
+        char_indices: bool = False,
     ) -> None:
-        super().__init__(language, device, model_name, chunk_size, chunk_overlap)
+        super().__init__(language, device, model_name, chunk_size, chunk_overlap, char_indices)
         Doc.set_extension("coref_clusters", default=None, force=True)
         Doc.set_extension("resolved_text", default=None, force=True)
         Doc.set_extension("cluster_heads", default=None, force=True)
diff --git a/crosslingual_coreference/__init__.py b/crosslingual_coreference/__init__.py
index 173fb97..21d0a17 100644
--- a/crosslingual_coreference/__init__.py
+++ b/crosslingual_coreference/__init__.py
@@ -24,6 +24,7 @@
         "model_name": "minilm",
         "chunk_size": None,
         "chunk_overlap": 2,
+        "char_indices": False,
     },
 )
 def make_crosslingual_coreference(
@@ -33,6 +34,7 @@ def make_crosslingual_coreference(
     model_name: str,
     chunk_size: Union[int, None],
     chunk_overlap: int,
+    char_indices: bool,
 ):
     return SpacyPredictor(
         language=nlp.path.name.split("-")[0],
@@ -40,4 +42,5 @@ def make_crosslingual_coreference(
         model_name=model_name,
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
+        char_indices=char_indices
     )

From c6bb9e10a0c3e2974372d7227f6f8181bde820f0 Mon Sep 17 00:00:00 2001
From: Mathijs Boezer
Date: Tue, 14 Mar 2023 14:17:45 +0100
Subject: [PATCH 3/7] extra comma

---
 crosslingual_coreference/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crosslingual_coreference/__init__.py b/crosslingual_coreference/__init__.py
index 21d0a17..f5c3562 100644
--- a/crosslingual_coreference/__init__.py
+++ b/crosslingual_coreference/__init__.py
@@ -42,5 +42,5 @@ def make_crosslingual_coreference(
         model_name=model_name,
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
-        char_indices=char_indices
+        char_indices=char_indices,
     )
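With patches 2 and 3 applied, the flag is reachable through the spaCy factory config. A usage sketch, assuming the "xx_coref" factory name from the library's README (the pipeline model, config values, and text are illustrative; patch 4 below removes this flag again and makes character indices the default):

    import spacy
    import crosslingual_coreference  # noqa: F401  -- registers the factory

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
        "xx_coref",  # factory name per the library's README
        config={"chunk_size": 2500, "chunk_overlap": 2, "char_indices": True},
    )

    doc = nlp("Sarah lost her keys, so she retraced her steps.")
    print(doc._.coref_clusters)  # spans as [start_char, end_char] with the flag on
    print(doc._.resolved_text)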
From 6c8d52a9c1af246a10263010f2500f639df0d9b5 Mon Sep 17 00:00:00 2001
From: Mathijs Boezer
Date: Mon, 20 Mar 2023 10:39:44 +0100
Subject: [PATCH 4/7] Work on making char indices the default

---
 .../CrossLingualPredictor.py                  | 65 +++++++++++--------
 .../CrossLingualPredictorSpacy.py             |  3 +-
 crosslingual_coreference/__init__.py          |  3 ---
 .../examples/test_char_indices.py             |  7 -------
 tests/test_model_configs.py                   |  4 ----
 5 files changed, 39 insertions(+), 43 deletions(-)
 delete mode 100644 crosslingual_coreference/examples/test_char_indices.py

diff --git a/crosslingual_coreference/CrossLingualPredictor.py b/crosslingual_coreference/CrossLingualPredictor.py
index 809627e..f555a14 100644
--- a/crosslingual_coreference/CrossLingualPredictor.py
+++ b/crosslingual_coreference/CrossLingualPredictor.py
@@ -1,6 +1,6 @@
 import itertools
 import pathlib
-from typing import List, Union
+from typing import List, Tuple, Union
 
 import requests
 import tqdm  # progress bar
@@ -128,22 +128,10 @@ def predict(self, text: str) -> dict:
         merged_clusters = self.merge_clusters(corrected_clusters)
 
         resolved_text, heads = self.resolver.replace_corefs(doc, merged_clusters)
+        merged_clusters, heads = self.convert_indices(merged_clusters, heads, doc)
 
         prediction = {"clusters": merged_clusters, "resolved_text": resolved_text, "cluster_heads": heads}
 
-        if self.char_indices:
-            # clusters
-            for cluster in prediction["clusters"]:
-                for span in cluster:
-                    span[0] = doc[span[0]].idx
-                    span[1] = doc[span[1] - 1].idx + len(doc[span[1] - 1])
-
-            # cluster heads
-            for head_key in prediction["cluster_heads"].keys():
-                prediction["cluster_heads"][head_key][0] = doc[prediction["cluster_heads"][head_key][0]].idx
-                prediction["cluster_heads"][head_key][1] = doc[prediction["cluster_heads"][head_key][1] - 1].idx \
-                    + len(doc[prediction["cluster_heads"][head_key][1] - 1])
-
         return prediction
 
     def pipe(self, texts: List[str]):
@@ -169,19 +157,7 @@ def pipe(self, texts: List[str]):
 
             for spacy_doc, clusters in zip(spacy_document_list, clusters_predictions):
                 resolved_text, heads = self.resolver.replace_corefs(spacy_doc, clusters)
-
-                if self.char_indices:
-                    # clusters
-                    for cluster in clusters:
-                        for span in cluster:
-                            span[0] = spacy_doc[span[0]].idx
-                            span[1] = spacy_doc[span[1] - 1].idx + len(spacy_doc[span[1] - 1])
-
-                    # cluster heads
-                    for head_key in heads.keys():
-                        heads[head_key][0] = spacy_doc[heads[head_key][0]].idx
-                        heads[head_key][1] = spacy_doc[heads[head_key][1] - 1].idx \
-                            + len(spacy_doc[heads[head_key][1] - 1])
+                clusters, heads = self.convert_indices(clusters, heads, spacy_doc)
 
                 predictions.append({"clusters": clusters, "resolved_text": resolved_text, "cluster_heads": heads})
 
@@ -255,3 +231,38 @@ def merge_clusters(
             main_doc_clus.sort()
             main_doc_clus = list(k for k, _ in itertools.groupby(main_doc_clus))
         return main_doc_clus
+
+    @staticmethod
+    def convert_indices(merged_clusters: List[List[List[int]]], heads: dict, spacy_doc: Doc) -> Tuple[list, dict]:
+        """Convert indices from token to character level
+
+        Args:
+            merged_clusters (List[List[List[int]]]): List of clusters
+            heads (Dict[List[int]]): Dictionary of cluster heads
+            spacy_doc (Doc): Spacy doc object
+
+        Returns:
+            List[List[List[int]]], Dict[List[int]]: Tuple of converted clusters and heads
+        """
+        char_merged_clusters = []
+        char_heads = {}
+
+        # clusters
+        for cluster in merged_clusters:
+            for span in cluster:
+                char_span = [-1, -1]
+
+                char_span[0] = spacy_doc[span[0]].idx
+                char_span[1] = spacy_doc[span[1] - 1].idx + len(spacy_doc[span[1] - 1])
+
+                char_merged_clusters.append(char_span)
+
+        # cluster heads
+        for head_key in heads.keys():
+            char_heads[head_key] = [-1, -1]
+
+            char_heads[head_key][0] = spacy_doc[heads[head_key][0]].idx
+            char_heads[head_key][1] = spacy_doc[heads[head_key][1] - 1].idx \
+                + len(spacy_doc[heads[head_key][1] - 1])
+
+        return char_merged_clusters, char_heads
diff --git a/crosslingual_coreference/CrossLingualPredictorSpacy.py b/crosslingual_coreference/CrossLingualPredictorSpacy.py
index 32f49f9..7fa6247 100644
--- a/crosslingual_coreference/CrossLingualPredictorSpacy.py
+++ b/crosslingual_coreference/CrossLingualPredictorSpacy.py
@@ -14,9 +14,8 @@ def __init__(
         model_name: str = "minilm",
         chunk_size: Union[int, None] = None,
         chunk_overlap: int = 2,
-        char_indices: bool = False,
     ) -> None:
-        super().__init__(language, device, model_name, chunk_size, chunk_overlap, char_indices)
+        super().__init__(language, device, model_name, chunk_size, chunk_overlap)
         Doc.set_extension("coref_clusters", default=None, force=True)
         Doc.set_extension("resolved_text", default=None, force=True)
         Doc.set_extension("cluster_heads", default=None, force=True)
diff --git a/crosslingual_coreference/__init__.py b/crosslingual_coreference/__init__.py
index f5c3562..173fb97 100644
--- a/crosslingual_coreference/__init__.py
+++ b/crosslingual_coreference/__init__.py
@@ -24,7 +24,6 @@
         "model_name": "minilm",
         "chunk_size": None,
         "chunk_overlap": 2,
-        "char_indices": False,
     },
 )
 def make_crosslingual_coreference(
@@ -34,7 +33,6 @@ def make_crosslingual_coreference(
     model_name: str,
     chunk_size: Union[int, None],
     chunk_overlap: int,
-    char_indices: bool,
 ):
     return SpacyPredictor(
         language=nlp.path.name.split("-")[0],
@@ -42,5 +40,4 @@ def make_crosslingual_coreference(
         model_name=model_name,
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
-        char_indices=char_indices,
     )
diff --git a/crosslingual_coreference/examples/test_char_indices.py b/crosslingual_coreference/examples/test_char_indices.py
deleted file mode 100644
index 6a6d60d..0000000
--- a/crosslingual_coreference/examples/test_char_indices.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from crosslingual_coreference import Predictor
-
-from .data import texts
-
-predictor = Predictor(language="nl_core_news_sm", chunk_size=2500, char_indices=True)
-
-print(predictor.pipe(texts))
diff --git a/tests/test_model_configs.py b/tests/test_model_configs.py
index de0f034..3330939 100644
--- a/tests/test_model_configs.py
+++ b/tests/test_model_configs.py
@@ -8,7 +8,3 @@ def test_standalone_chunking():
 
 def test_spacy():
     from crosslingual_coreference.examples import test_spacy  # noqa: F401
-
-
-def test_char_ranges():
-    from crosslingual_coreference.examples import test_char_indices  # noqa: F401
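One thing to note in the convert_indices() introduced above: char_merged_clusters.append(char_span) sits inside the inner "for span in cluster" loop, so the result is a flat list of spans rather than the nested List[List[List[int]]] the docstring promises. Patch 5 below restores the nesting. The shape difference, illustrated with plain lists (no spaCy needed; the values are made up):

    clusters = [[[0, 5], [10, 13]], [[20, 24]]]

    # Patch 4 behaviour: every span is appended to one top-level list.
    flat = [span for cluster in clusters for span in cluster]
    print(flat)      # [[0, 5], [10, 13], [20, 24]] -- cluster boundaries lost
    print(clusters)  # [[[0, 5], [10, 13]], [[20, 24]]] -- nesting patch 5 restores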
From c941b546b8c2670e70ea9ed75d7ba722b419a932 Mon Sep 17 00:00:00 2001
From: Mathijs Boezer
Date: Mon, 20 Mar 2023 11:07:47 +0100
Subject: [PATCH 5/7] Clean up, use spacy span

---
 crosslingual_coreference/CrossLingualPredictor.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/crosslingual_coreference/CrossLingualPredictor.py b/crosslingual_coreference/CrossLingualPredictor.py
index f555a14..2f8c6ab 100644
--- a/crosslingual_coreference/CrossLingualPredictor.py
+++ b/crosslingual_coreference/CrossLingualPredictor.py
@@ -5,7 +5,7 @@
 import requests
 import tqdm  # progress bar
 from allennlp.predictors.predictor import Predictor
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 
 from .CorefResolver import CorefResolver as Resolver
 
@@ -249,20 +249,15 @@ def convert_indices(merged_clusters: List[List[List[int]]], heads: dict, spacy_doc: Doc) -> Tuple[list, dict]:
 
         # clusters
         for cluster in merged_clusters:
+            char_cluster = []
             for span in cluster:
-                char_span = [-1, -1]
-
-                char_span[0] = spacy_doc[span[0]].idx
-                char_span[1] = spacy_doc[span[1] - 1].idx + len(spacy_doc[span[1] - 1])
-
-                char_merged_clusters.append(char_span)
+                spacy_span = Span(spacy_doc, span[0], span[1] + 1)
+                char_cluster.append([spacy_span.start_char, spacy_span.end_char])
+            char_merged_clusters.append(char_cluster)
 
         # cluster heads
         for head_key in heads.keys():
-            char_heads[head_key] = [-1, -1]
-
-            char_heads[head_key][0] = spacy_doc[heads[head_key][0]].idx
-            char_heads[head_key][1] = spacy_doc[heads[head_key][1] - 1].idx \
-                + len(spacy_doc[heads[head_key][1] - 1])
+            span = Span(spacy_doc, heads[head_key][0], heads[head_key][1] + 1)
+            char_heads[head_key] = [span.start_char, span.end_char]
 
         return char_merged_clusters, char_heads
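Besides restoring the cluster nesting, this rewrite shifts the end-index convention: the removed arithmetic took the last token to be span[1] - 1 (exclusive end), while Span(spacy_doc, span[0], span[1] + 1) takes span[1] itself as the last token (inclusive end). A minimal sketch of the Span-based conversion (sample text and model are assumptions):

    import spacy
    from spacy.tokens import Span

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Sarah lost her keys.")

    token_start, token_end = 0, 0  # inclusive token indices covering "Sarah"
    span = Span(doc, token_start, token_end + 1)  # Span's constructor takes an exclusive end

    print([span.start_char, span.end_char])         # [0, 5]
    print(doc.text[span.start_char:span.end_char])  # Sarah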
From 8e553cbe4c26661ab7a6bbb4193e0c3dff02322c Mon Sep 17 00:00:00 2001
From: Mathijs Boezer
Date: Mon, 20 Mar 2023 11:12:04 +0100
Subject: [PATCH 6/7] forgot to remove one parameter mention

---
 crosslingual_coreference/CrossLingualPredictor.py | 2 --
 test2.py                                          | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)
 create mode 100644 test2.py

diff --git a/crosslingual_coreference/CrossLingualPredictor.py b/crosslingual_coreference/CrossLingualPredictor.py
index 2f8c6ab..fbeb9a4 100644
--- a/crosslingual_coreference/CrossLingualPredictor.py
+++ b/crosslingual_coreference/CrossLingualPredictor.py
@@ -43,14 +43,12 @@ def __init__(
         model_name: str = "minilm",
         chunk_size: Union[int, None] = None,  # determines the # sentences per batch
         chunk_overlap: int = 2,  # determines the # of overlapping sentences per chunk
-        char_indices: bool = False,
     ) -> None:
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.language = language
         self.filename = None
         self.device = device
-        self.char_indices = char_indices
         self.model_url = MODELS[model_name]["url"]
         self.resolver = Resolver()
         self.download_model()
diff --git a/test2.py b/test2.py
new file mode 100644
index 0000000..f69578e
--- /dev/null
+++ b/test2.py
@@ -0,0 +1 @@
+for cluster in doc._.coref_clusters:

From d4494b29a1d806609e533a1d9822eea96fcb82f9 Mon Sep 17 00:00:00 2001
From: Mathijs Boezer
Date: Mon, 20 Mar 2023 11:28:38 +0100
Subject: [PATCH 7/7] remove temp test file

---
 test2.py | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 test2.py

diff --git a/test2.py b/test2.py
deleted file mode 100644
index f69578e..0000000
--- a/test2.py
+++ /dev/null
@@ -1 +0,0 @@
-for cluster in doc._.coref_clusters:
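After the full series, predict() and pipe() always route their output through convert_indices(), so cluster spans are character offsets that can be sliced straight from the input text. An end-to-end sketch (the language, device, and sample text are assumptions, not part of the series):

    from crosslingual_coreference import Predictor

    text = "Do not forget about Momofuku Ando! He created instant noodles."

    # language/device/model_name values here are illustrative
    predictor = Predictor(language="en_core_web_sm", device=-1, model_name="minilm")

    prediction = predictor.predict(text)
    for cluster in prediction["clusters"]:
        # each span is [start_char, end_char], so plain slicing recovers the mention
        print([text[start:end] for start, end in cluster])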