Merge pull request #288 from dice-group/develop

New Release
dice-group · Dec 3, 2024 · 18451ea · 18451ea
2 parents 7001e31 + e4a0bd1
commit 18451ea
Show file tree

Hide file tree

Showing 62 changed files with 3,139 additions and 1,893 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,19 @@
+[run]
+omit =
+    tests/*
+    /tmp/*
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    if TYPE_CHECKING:
+    class .*\bProtocol\):
+    @(abc\.)?abstractmethod
+    pass
diff --git a/.github/workflows/github-actions-python-package.yml b/.github/workflows/github-actions-python-package.yml
@@ -22,8 +22,13 @@ jobs:
 
       - name: Lint with ruff
         run: |
-          ruff  check dicee/ --select=E501 --line-length=200
+          ruff  check dicee/ --line-length=200
       - name: Test with pytest
         run: |
           wget https://files.dice-research.org/datasets/dice-embeddings/KGs.zip --no-check-certificate && unzip KGs.zip
           python -m pytest -p no:warnings -x
+      - name: Coverage report
+        run: |
+          pip install coverage
+          coverage run -m pytest -p no:warnings -x
+          coverage report -m 
diff --git a/README.md b/README.md
diff --git a/dicee/__init__.py b/dicee/__init__.py
@@ -4,4 +4,4 @@
 from .executer import Execute # noqa
 from .dataset_classes import * # noqa
 from .query_generator import QueryGenerator # noqa
-__version__ = '0.1.4'
+__version__ = '0.1.5'
diff --git a/dicee/__main__.py b/dicee/__main__.py
@@ -0,0 +1,6 @@
+# dicee/__main__.py
+
+from dicee.scripts.run import main # Import the main entry point of dicee
+
+if __name__ == "__main__":
+    main()  # Call the main function to execute the program logic
diff --git a/dicee/abstracts.py b/dicee/abstracts.py
@@ -26,6 +26,8 @@ def __init__(self, args, callbacks):
         self.attributes = args
         self.callbacks = callbacks
         self.is_global_zero = True
+        self.global_rank=0
+        self.local_rank = 0
         # Set True to use Model summary callback of pl.
         torch.manual_seed(self.attributes.random_seed)
         torch.cuda.manual_seed_all(self.attributes.random_seed)
@@ -178,21 +180,13 @@ def __init__(self, path: str = None, url: str = None, construct_ensemble: bool =
             self.num_relations = len(self.relation_to_idx)
             self.entity_to_idx: dict
             self.relation_to_idx: dict
-            assert list(self.entity_to_idx.values()) == list(range(0, len(self.entity_to_idx)))
-            assert list(self.relation_to_idx.values()) == list(range(0, len(self.relation_to_idx)))
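+            # Indices must be exactly 0, ..., n-1; sort first because the dict's insertion order need not follow index order.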
+            assert sorted(list(self.entity_to_idx.values())) == list(range(0, len(self.entity_to_idx)))
+            assert sorted(list(self.relation_to_idx.values())) == list(range(0, len(self.relation_to_idx)))
 
             self.idx_to_entity = {v: k for k, v in self.entity_to_idx.items()}
             self.idx_to_relations = {v: k for k, v in self.relation_to_idx.items()}
 
-        # See https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
-        # @TODO: Ignore temporalryIf file exists
-        # if os.path.exists(self.path + '/train_set.npy'):
-        #    self.train_set = np.load(file=self.path + '/train_set.npy', mmap_mode='r')
-
-        # if apply_semantic_constraint:
-        #    (self.domain_constraints_per_rel, self.range_constraints_per_rel,
-        #     self.domain_per_rel, self.range_per_rel) = create_constraints(self.train_set)
-
     def get_eval_report(self) -> dict:
         return load_json(self.path + "/eval_report.json")
 
@@ -253,17 +247,6 @@ def get_padded_bpe_triple_representation(self, triples: List[List[str]]) -> Tupl
             padded_bpe_t.append(self.get_bpe_token_representation(str_o))
         return padded_bpe_h, padded_bpe_r, padded_bpe_t
 
-    def get_domain_of_relation(self, rel: str) -> List[str]:
-        x = [self.idx_to_entity[i] for i in self.domain_per_rel[self.relation_to_idx[rel]]]
-        res = set(x)
-        assert len(x) == len(res)
-        return res
-
-    def get_range_of_relation(self, rel: str) -> List[str]:
-        x = [self.idx_to_entity[i] for i in self.range_per_rel[self.relation_to_idx[rel]]]
-        res = set(x)
-        assert len(x) == len(res)
-        return res
 
     def set_model_train_mode(self) -> None:
         """

diff --git a/dicee/analyse_experiments.py b/dicee/analyse_experiments.py
@@ -1,4 +1,7 @@
-""" This script should be moved to dicee/scripts"""
+""" This script should be moved to dicee/scripts
+Example:
+python dicee/analyse_experiments.py --dir Experiments --features "model" "trainMRR" "testMRR"
+"""
 import os
 import json
 import pandas as pd
@@ -120,19 +123,13 @@ def analyse(args):
         if os.path.isdir(full_path) is False:
             continue
 
-
         with open(f'{full_path}/configuration.json', 'r') as f:
             config = json.load(f)
-
-        try:
-            with open(f'{full_path}/report.json', 'r') as f:
-                report = json.load(f)
-                report = {i: report[i] for i in ['Runtime', 'NumParam']}
-            with open(f'{full_path}/eval_report.json', 'r') as f:
-                eval_report = json.load(f)
-        except FileNotFoundError:
-            print("NOT found")
-            continue
+        with open(f'{full_path}/report.json', 'r') as f:
+            report = json.load(f)
+            report = {i: report[i] for i in ['Runtime', 'NumParam']}
+        with open(f'{full_path}/eval_report.json', 'r') as f:
+            eval_report = json.load(f)
         config.update(eval_report)
         config.update(report)
         if "Train" in config:
@@ -160,10 +157,9 @@ def analyse(args):
     # print(df.columns)
     try:
         df_features = df[args.features]
-    except:
+    except KeyError:
         print(f"--features ({args.features}) is not a subset of {df.columns}")
-        exit(1)
-
+        raise KeyError
     print(df_features.to_latex(index=False, float_format="%.3f"))
     path_to_save = args.dir + '/summary.csv'
     df_features.to_csv(path_or_buf=path_to_save)

diff --git a/dicee/callbacks.py b/dicee/callbacks.py
@@ -166,9 +166,10 @@ def on_fit_end(self, trainer, model):
         if self.initial_eval_setting:
             # ADD this info back
             trainer.evaluator.args.eval_model = self.initial_eval_setting
-
-        param_ensemble = torch.load(f"{self.path}/aswa.pt", torch.device("cpu"))
-        model.load_state_dict(param_ensemble)
+
+        if trainer.global_rank==trainer.local_rank==0:
+            param_ensemble = torch.load(f"{self.path}/aswa.pt", torch.device("cpu"))
+            model.load_state_dict(param_ensemble)
 
     @staticmethod
     def compute_mrr(trainer, model) -> float:
@@ -241,6 +242,10 @@ def decide(self, running_model_state_dict, ensemble_state_dict, val_running_mode
             return True
 
     def on_train_epoch_end(self, trainer, model):
+
+        if (trainer.global_rank == trainer.local_rank == 0) is False:
+            return None
+
         # (1) Increment epoch counter
         self.epoch_count += 1
         # (2) Save the given eval setting if it is not saved.

diff --git a/dicee/config.py b/dicee/config.py
@@ -50,6 +50,9 @@ def __init__(self, **kwargs):
         self.backend: str = "pandas"
         """Backend to read, process, and index input knowledge graph. pandas, polars and rdflib available"""
 
+        self.separator: str = "\s+"
+        """separator for extracting head, relation and tail from a triple"""
+
         self.trainer: str = 'torchCPUTrainer'
         """Trainer for knowledge graph embedding model"""
 
@@ -82,7 +85,6 @@ def __init__(self, **kwargs):
 
         self.label_smoothing_rate: float = 0.0
 
-
         self.num_core: int = 0
         """Number of CPUs to be used in the mini-batch loading process"""
 
@@ -136,6 +138,9 @@ def __init__(self, **kwargs):
         self.continual_learning=None
         "Path of a pretrained model size of LLM"
 
+        self.auto_batch_finding=False
+        "A flag for using auto batch finding"
+
     def __iter__(self):
         # Iterate
         for k, v in self.__dict__.items():