Skip to content

Commit

Permalink
Merge pull request #288 from dice-group/develop
Browse files Browse the repository at this point in the history
New Release
  • Loading branch information
Demirrr authored Dec 3, 2024
2 parents 7001e31 + e4a0bd1 commit 18451ea
Show file tree
Hide file tree
Showing 62 changed files with 3,139 additions and 1,893 deletions.
19 changes: 19 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[run]
omit =
tests/*
/tmp/*

[report]
exclude_lines =
pragma: no cover
def __repr__
if self.debug:
if settings.DEBUG
raise AssertionError
raise NotImplementedError
if 0:
if __name__ == .__main__.:
if TYPE_CHECKING:
class .*\bProtocol\):
@(abc\.)?abstractmethod
pass
7 changes: 6 additions & 1 deletion .github/workflows/github-actions-python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,13 @@ jobs:
- name: Lint with ruff
run: |
ruff check dicee/ --select=E501 --line-length=200
ruff check dicee/ --line-length=200
- name: Test with pytest
run: |
wget https://files.dice-research.org/datasets/dice-embeddings/KGs.zip --no-check-certificate && unzip KGs.zip
python -m pytest -p no:warnings -x
- name: Coverage report
run: |
pip install coverage
coverage run -m pytest -p no:warnings -x
coverage report -m
362 changes: 311 additions & 51 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dicee/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from .executer import Execute # noqa
from .dataset_classes import * # noqa
from .query_generator import QueryGenerator # noqa
__version__ = '0.1.4'
__version__ = '0.1.5'
6 changes: 6 additions & 0 deletions dicee/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# dicee/__main__.py
# Package entry-point shim: executing the package as a script hands control to ``main``.

from dicee.scripts.run import main  # main entry point of dicee

if __name__ == "__main__":
    # Guarded so that merely importing this module does not start a run.
    main()
27 changes: 5 additions & 22 deletions dicee/abstracts.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def __init__(self, args, callbacks):
self.attributes = args
self.callbacks = callbacks
self.is_global_zero = True
self.global_rank=0
self.local_rank = 0
# Set True to use Model summary callback of pl.
torch.manual_seed(self.attributes.random_seed)
torch.cuda.manual_seed_all(self.attributes.random_seed)
Expand Down Expand Up @@ -178,21 +180,13 @@ def __init__(self, path: str = None, url: str = None, construct_ensemble: bool =
self.num_relations = len(self.relation_to_idx)
self.entity_to_idx: dict
self.relation_to_idx: dict
assert list(self.entity_to_idx.values()) == list(range(0, len(self.entity_to_idx)))
assert list(self.relation_to_idx.values()) == list(range(0, len(self.relation_to_idx)))
# Indices must be exactly the integers 0, ..., n-1 (n = number of entries).
assert sorted(list(self.entity_to_idx.values())) == list(range(0, len(self.entity_to_idx)))
assert sorted(list(self.relation_to_idx.values())) == list(range(0, len(self.relation_to_idx)))

self.idx_to_entity = {v: k for k, v in self.entity_to_idx.items()}
self.idx_to_relations = {v: k for k, v in self.relation_to_idx.items()}

# See https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
# @TODO: Temporarily ignored. If the file exists, load it as a read-only memory map:
# if os.path.exists(self.path + '/train_set.npy'):
# self.train_set = np.load(file=self.path + '/train_set.npy', mmap_mode='r')

# if apply_semantic_constraint:
# (self.domain_constraints_per_rel, self.range_constraints_per_rel,
# self.domain_per_rel, self.range_per_rel) = create_constraints(self.train_set)

def get_eval_report(self) -> dict:
    """Load the evaluation report stored as ``eval_report.json`` under ``self.path``.

    Returns
    -------
    dict
        Whatever ``load_json`` returns for ``<self.path>/eval_report.json``.
    """
    report_file = self.path + "/eval_report.json"
    return load_json(report_file)

Expand Down Expand Up @@ -253,17 +247,6 @@ def get_padded_bpe_triple_representation(self, triples: List[List[str]]) -> Tupl
padded_bpe_t.append(self.get_bpe_token_representation(str_o))
return padded_bpe_h, padded_bpe_r, padded_bpe_t

def get_domain_of_relation(self, rel: str) -> set:
    """Return the entity labels stored as the domain of relation ``rel``.

    The previous annotation promised ``List[str]`` although a ``set`` has
    always been returned; the annotation now matches the actual return type.

    Parameters
    ----------
    rel : str
        Relation label; must be a key of ``self.relation_to_idx``.

    Returns
    -------
    set
        Entity labels (looked up through ``self.idx_to_entity``) for every
        index listed in ``self.domain_per_rel`` for this relation.

    Raises
    ------
    KeyError
        If ``rel`` is not present in ``self.relation_to_idx``.
    AssertionError
        If the stored domain maps to duplicate entity labels.
    """
    # NOTE(review): no assignment of ``self.domain_per_rel`` is visible near this
    # method -- confirm it is populated before this is called.
    rel_idx = self.relation_to_idx[rel]
    labels = [self.idx_to_entity[i] for i in self.domain_per_rel[rel_idx]]
    unique_labels = set(labels)
    # Sanity check: de-duplicating must not drop anything.
    assert len(labels) == len(unique_labels)
    return unique_labels

def get_range_of_relation(self, rel: str) -> set:
    """Return the entity labels stored as the range of relation ``rel``.

    The previous annotation promised ``List[str]`` although a ``set`` has
    always been returned; the annotation now matches the actual return type.

    Parameters
    ----------
    rel : str
        Relation label; must be a key of ``self.relation_to_idx``.

    Returns
    -------
    set
        Entity labels (looked up through ``self.idx_to_entity``) for every
        index listed in ``self.range_per_rel`` for this relation.

    Raises
    ------
    KeyError
        If ``rel`` is not present in ``self.relation_to_idx``.
    AssertionError
        If the stored range maps to duplicate entity labels.
    """
    # NOTE(review): no assignment of ``self.range_per_rel`` is visible near this
    # method -- confirm it is populated before this is called.
    rel_idx = self.relation_to_idx[rel]
    labels = [self.idx_to_entity[i] for i in self.range_per_rel[rel_idx]]
    unique_labels = set(labels)
    # Sanity check: de-duplicating must not drop anything.
    assert len(labels) == len(unique_labels)
    return unique_labels

def set_model_train_mode(self) -> None:
"""
Expand Down
26 changes: 11 additions & 15 deletions dicee/analyse_experiments.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
""" This script should be moved to dicee/scripts"""
""" This script should be moved to dicee/scripts
Example:
python dicee/analyse_experiments.py --dir Experiments --features "model" "trainMRR" "testMRR"
"""
import os
import json
import pandas as pd
Expand Down Expand Up @@ -120,19 +123,13 @@ def analyse(args):
if os.path.isdir(full_path) is False:
continue


with open(f'{full_path}/configuration.json', 'r') as f:
config = json.load(f)

try:
with open(f'{full_path}/report.json', 'r') as f:
report = json.load(f)
report = {i: report[i] for i in ['Runtime', 'NumParam']}
with open(f'{full_path}/eval_report.json', 'r') as f:
eval_report = json.load(f)
except FileNotFoundError:
print("NOT found")
continue
with open(f'{full_path}/report.json', 'r') as f:
report = json.load(f)
report = {i: report[i] for i in ['Runtime', 'NumParam']}
with open(f'{full_path}/eval_report.json', 'r') as f:
eval_report = json.load(f)
config.update(eval_report)
config.update(report)
if "Train" in config:
Expand Down Expand Up @@ -160,10 +157,9 @@ def analyse(args):
# print(df.columns)
try:
df_features = df[args.features]
except:
except KeyError:
print(f"--features ({args.features}) is not a subset of {df.columns}")
exit(1)

raise KeyError
print(df_features.to_latex(index=False, float_format="%.3f"))
path_to_save = args.dir + '/summary.csv'
df_features.to_csv(path_or_buf=path_to_save)
Expand Down
11 changes: 8 additions & 3 deletions dicee/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,10 @@ def on_fit_end(self, trainer, model):
if self.initial_eval_setting:
# ADD this info back
trainer.evaluator.args.eval_model = self.initial_eval_setting

param_ensemble = torch.load(f"{self.path}/aswa.pt", torch.device("cpu"))
model.load_state_dict(param_ensemble)

if trainer.global_rank==trainer.local_rank==0:
param_ensemble = torch.load(f"{self.path}/aswa.pt", torch.device("cpu"))
model.load_state_dict(param_ensemble)

@staticmethod
def compute_mrr(trainer, model) -> float:
Expand Down Expand Up @@ -241,6 +242,10 @@ def decide(self, running_model_state_dict, ensemble_state_dict, val_running_mode
return True

def on_train_epoch_end(self, trainer, model):

if (trainer.global_rank == trainer.local_rank == 0) is False:
return None

# (1) Increment epoch counter
self.epoch_count += 1
# (2) Save the given eval setting if it is not saved.
Expand Down
7 changes: 6 additions & 1 deletion dicee/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def __init__(self, **kwargs):
self.backend: str = "pandas"
"""Backend to read, process, and index input knowledge graph. pandas, polars and rdflib available"""

self.separator: str = "\s+"
"""separator for extracting head, relation and tail from a triple"""

self.trainer: str = 'torchCPUTrainer'
"""Trainer for knowledge graph embedding model"""

Expand Down Expand Up @@ -82,7 +85,6 @@ def __init__(self, **kwargs):

self.label_smoothing_rate: float = 0.0


self.num_core: int = 0
"""Number of CPUs to be used in the mini-batch loading process"""

Expand Down Expand Up @@ -136,6 +138,9 @@ def __init__(self, **kwargs):
self.continual_learning=None
"Path of a pretrained model size of LLM"

self.auto_batch_finding=False
"A flag for using auto batch finding"

def __iter__(self):
# Iterate
for k, v in self.__dict__.items():
Expand Down
Loading

0 comments on commit 18451ea

Please sign in to comment.