Commit a10fae8

Update NCES2 and ROCES scripts. Training NCES2 and ROCES works!
Jean-KOUAGOU committed Dec 18, 2024
1 parent 94a06e6 commit a10fae8
Showing 5 changed files with 83 additions and 76 deletions.

examples/train_nces.py: 13 changes (7 additions, 6 deletions)
@@ -54,15 +54,15 @@ def start(args):
         print("Could not find training data. Will generate some data and train.")
         training_data = NCES2.generate_training_data(knowledge_base_path, beyond_alc=True)
     if args.synthesizer == "NCES":
-        nces = NCES(knowledge_base_path=knowledge_base_path, learner_names=args.models, path_of_embeddings=path_of_embeddings,
-                    max_length=48, proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, verbose=True, load_pretrained=args.load_pretrained)
+        synthesizer = NCES(knowledge_base_path=knowledge_base_path, learner_names=args.models, path_of_embeddings=path_of_embeddings,
+                           max_length=48, proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=args.load_pretrained, verbose=True)
     elif args.synthesizer == "NCES2":
-        nces = NCES2(knowledge_base_path=knowledge_base_path, max_length=48, proj_dim=128,
+        synthesizer = NCES2(knowledge_base_path=knowledge_base_path, path_of_trained_models=args.path_of_trained_models, nces2_or_roces=True, max_length=48, proj_dim=128,
                      drop_prob=0.1, num_heads=4, num_seeds=1, m=32, verbose=True, load_pretrained=args.load_pretrained)
     else:
-        nces = ROCES(knowledge_base_path=knowledge_base_path, k=5, path_of_trained_models="", max_length=48, proj_dim=128,
-                     drop_prob=0.1, num_heads=4, num_seeds=1, m=32, verbose=True, load_pretrained=args.load_pretrained)
-    nces.train(training_data, epochs=args.epochs, learning_rate=args.learning_rate, num_workers=2, save_model=True)
+        synthesizer = ROCES(knowledge_base_path=knowledge_base_path, path_of_trained_models=args.path_of_trained_models, nces2_or_roces=True, k=5, max_length=48, proj_dim=128,
+                            drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=args.load_pretrained, verbose=True)
+    synthesizer.train(training_data, epochs=args.epochs, learning_rate=args.learning_rate, num_workers=2, save_model=True)


 if __name__ == '__main__':
@@ -71,6 +71,7 @@ def start(args):
     parser.add_argument('--embeddings', type=str, nargs='+', default=None, help='Paths of embeddings for each KB.')
     parser.add_argument('--synthesizer', type=str, default="NCES", help='Neural synthesizer to train')
     parser.add_argument('--path_train_data', type=str, help='Path to training data')
+    parser.add_argument('--path_of_trained_models', type=str, default=None, help='Path to training data')
    parser.add_argument('--models', type=str, nargs='+', default=['SetTransformer', 'LSTM', 'GRU'],
                        help='Neural models')
    parser.add_argument('--load_pretrained', type=str2bool, default=False, help='Whether to load the pretrained model')
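For orientation, a condensed restatement of what the script's NCES2 branch now does. The knowledge base path is a placeholder, the hyperparameters simply mirror the values in the diff above, and the import assumes the repository layout ontolearn/concept_learner.py:

from ontolearn.concept_learner import NCES2

knowledge_base_path = "KGs/Family/family.owl"  # placeholder path
# The script regenerates training data when none is found on disk.
training_data = NCES2.generate_training_data(knowledge_base_path, beyond_alc=True)
# path_of_trained_models=None and load_pretrained=False start from randomly initialized weights.
synthesizer = NCES2(knowledge_base_path=knowledge_base_path,
                    path_of_trained_models=None,
                    nces2_or_roces=True,  # extends the vocabulary beyond ALC
                    max_length=48, proj_dim=128, drop_prob=0.1,
                    num_heads=4, num_seeds=1, m=32,
                    load_pretrained=False, verbose=True)
synthesizer.train(training_data, epochs=50, learning_rate=1e-4, num_workers=2, save_model=True)

See also the remark after the concept_learner.py diff on how generate_training_data handles its new storage_dir default.
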

ontolearn/base_nces.py: 41 changes (22 additions, 19 deletions)
@@ -30,11 +30,12 @@
 from torch.functional import F
 from torch.nn.utils.rnn import pad_sequence
 from abc import abstractmethod
+import re


 class BaseNCES:

-    def __init__(self, knowledge_base_path, quality_func, num_predictions, proj_dim=128, drop_prob=0.1,
+    def __init__(self, knowledge_base_path, nces2_or_roces, quality_func, num_predictions, proj_dim=128, drop_prob=0.1,
                  num_heads=4, num_seeds=1, m=32, ln=False, learning_rate=1e-4, decay_rate=0.0, clip_value=5.0,
                  batch_size=256, num_workers=4, max_length=48, load_pretrained=True, verbose: int = 0):
         kb = KnowledgeBase(path=knowledge_base_path)
@@ -45,6 +46,10 @@ def __init__(self, knowledge_base_path, quality_func, num_predictions, proj_dim=
         self.atomic_concept_names = atomic_concept_names
         role_names = [rel.iri.get_remainder() for rel in kb.ontology.object_properties_in_signature()]
         vocab = atomic_concept_names + role_names + ['⊔', '⊓', '∃', '∀', '¬', '⊤', '⊥', '.', ' ', '(', ')']
+        if nces2_or_roces:
+            concrete_role_names = [rel.iri.get_remainder() for rel in kb.ontology.data_properties_in_signature()]
+            vocab.extend(concrete_role_names)
+            vocab.extend(['⁻', '≤', '≥', 'True', 'False', 'true', 'false', '{', '}', ':', '[', ']', 'double', 'integer', 'date', 'xsd'])
         vocab = sorted(vocab) + ['PAD']
         self.knowledge_base_path = knowledge_base_path
         self.kb = kb
@@ -76,24 +81,22 @@ def find_optimal_number_of_examples(kb):
             return min(kb.individuals_count()//2, 1000)
         return kb.individuals_count()

-    def collate_batch(self, batch): # pragma: no cover
-        pos_emb_list = []
-        neg_emb_list = []
-        target_labels = []
-        for pos_emb, neg_emb, label in batch:
-            if pos_emb.ndim != 2:
-                pos_emb = pos_emb.reshape(1, -1)
-            if neg_emb.ndim != 2:
-                neg_emb = neg_emb.reshape(1, -1)
-            pos_emb_list.append(pos_emb)
-            neg_emb_list.append(neg_emb)
-            target_labels.append(label)
-        pos_emb_list[0] = F.pad(pos_emb_list[0], (0, 0, 0, self.num_examples - pos_emb_list[0].shape[0]), "constant", 0)
-        pos_emb_list = pad_sequence(pos_emb_list, batch_first=True, padding_value=0)
-        neg_emb_list[0] = F.pad(neg_emb_list[0], (0, 0, 0, self.num_examples - neg_emb_list[0].shape[0]), "constant", 0)
-        neg_emb_list = pad_sequence(neg_emb_list, batch_first=True, padding_value=0)
-        target_labels = pad_sequence(target_labels, batch_first=True, padding_value=-100)
-        return pos_emb_list, neg_emb_list, target_labels
+    def add_data_values(self, data):
+        print("\nUpdating vocabulary based on training data...\n")
+        quantified_restriction_values = [str(i) for i in range(1,12)]
+        vocab = list(self.vocab.keys())
+        vocab.extend(quantified_restriction_values)
+        values = set()
+        for ce, examples in data:
+            if '[' in ce:
+                for val in re.findall("\[(.*?)\]", ce):
+                    values.add(val.split(' ')[-1])
+        vocab.extend(list(values))
+        vocab = sorted(vocab)
+        self.inv_vocab = np.array(vocab, dtype='object')
+        self.vocab = {vocab[i]: i for i in range(len(vocab))}
+        print("Done.\n")
+

     def collate_batch_inference(self, batch): # pragma: no cover
         pos_emb_list = []
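The new add_data_values hook extends the vocabulary with the quantified-restriction numbers '1' to '11' and with every literal value that appears inside a [...] data restriction in the training class expressions. A standalone sketch of that harvesting step (the class-expression string below is made up for illustration):

import re

ce = "∃ hasAge.[xsd:integer ≥ 25]"  # hypothetical ALCHIQ(D) expression with a data restriction
values = set()
if '[' in ce:
    for val in re.findall(r"\[(.*?)\]", ce):
        values.add(val.split(' ')[-1])
print(sorted(values))  # ['25'] -> such strings are appended to the vocabulary before it is re-sorted
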

ontolearn/concept_learner.py: 70 changes (39 additions, 31 deletions)
@@ -796,13 +796,13 @@ def train(self, data: Iterable[List[Tuple]], epochs=300, batch_size=256, learnin
 class NCES(BaseNCES):
     """Neural Class Expression Synthesis."""

-    def __init__(self, knowledge_base_path,
+    def __init__(self, knowledge_base_path, nces2_or_roces=False,
                  quality_func: Optional[AbstractScorer] = None, num_predictions=5,
                  learner_names=["SetTransformer"], path_of_embeddings="", proj_dim=128, rnn_n_layers=2,
                  drop_prob=0.1, num_heads=4, num_seeds=1, m=32, ln=False,
                  learning_rate=1e-4, decay_rate=0.0, clip_value=5.0, batch_size=256, num_workers=4,
                  max_length=48, load_pretrained=True, sorted_examples=False, verbose: int = 0):
-        super().__init__(knowledge_base_path, quality_func, num_predictions, proj_dim, drop_prob,
+        super().__init__(knowledge_base_path, nces2_or_roces, quality_func, num_predictions, proj_dim, drop_prob,
                          num_heads, num_seeds, m, ln, learning_rate, decay_rate, clip_value,
                          batch_size, num_workers, max_length, load_pretrained, verbose)
@@ -866,7 +866,7 @@ def get_synthesizer(self, path=None):
             elif num_loaded_models > 0:
                 print("Some model weights could not be loaded. Successful ones are: ", loaded_model_names)
             else:
-                print("!!!Returning untrained models, could not load pretrained")
+                print("!!!No pretrained weights, initializing models with random weights")
             return Models

         elif len(glob.glob(path+"/*.pt")) == 0:
@@ -894,10 +894,10 @@ def get_synthesizer(self, path=None):
                 print("Some model weights could not be loaded. Successful ones are: ", loaded_model_names)
                 return Models
             else:
-                print("!!!Returning untrained models, could not load pretrained")
+                print("!!!No pretrained weights were provided, initializing models with random weights")
                 return Models
         else:
-            print("!!!Returning untrained models, could not load pretrained weights. Check the `load_pretrained parameter` or train the models using NCES.train(data).")
+            print("!!!No pretrained weights were provided, initializing models with random weights.")
             return Models

@@ -1114,14 +1114,14 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_
 class NCES2(BaseNCES):
     """Neural Class Expression Synthesis in ALCHIQ(D)."""

-    def __init__(self, knowledge_base_path,
+    def __init__(self, knowledge_base_path, nces2_or_roces=True,
                  quality_func: Optional[AbstractScorer] = None, num_predictions=5,
-                 path_of_trained_models="", proj_dim=128, drop_prob=0.1,
+                 path_of_trained_models=None, proj_dim=128, drop_prob=0.1,
                  num_heads=4, num_seeds=1, m=[32, 64, 128], ln=False, embedding_dim=256, sampling_strategy="nces2",
                  input_dropout=0.0, feature_map_dropout=0.1, kernel_size=4, num_of_output_channels=32,
                  learning_rate=1e-4, decay_rate=0.0, clip_value=5.0, batch_size=256, num_workers=4,
                  max_length=48, load_pretrained=True, verbose: int = 0):
-        super().__init__(knowledge_base_path, quality_func, num_predictions, proj_dim, drop_prob,
+        super().__init__(knowledge_base_path, nces2_or_roces, quality_func, num_predictions, proj_dim, drop_prob,
                          num_heads, num_seeds, m, ln, learning_rate, decay_rate, clip_value,
                          batch_size, num_workers, max_length, load_pretrained, verbose)

@@ -1138,36 +1138,39 @@ def __init__(self, knowledge_base_path,
         self.kernel_size = kernel_size
         self.num_of_output_channels = num_of_output_channels
         self._maybe_load_pretrained_config()
-        self.model = self.get_synthesizer()
+        self.model = self.get_synthesizer(path_of_trained_models)

     def _maybe_load_pretrained_config(self, path=None):
         if isinstance(self.m, int):
             self.m = [self.m]
         if path:
             possible_checkpoints = glob.glob(path + "/*.pt")
-        else:
+        elif self.path_of_trained_models:
             possible_checkpoints = glob.glob(self.path_of_trained_models + "/*.pt")
         if self.load_pretrained and len(possible_checkpoints):
             stop_outer_loop = False
             for file_name in possible_checkpoints:
                 for m in self.m:
-                    if m in file_name:
+                    if str(m) in file_name and "emb" in file_name:
                         try:
                             weights = torch.load(file_name, map_location=self.device, weights_only=True)
-                            num_ents, half_emb_dim = weights["emb_ent_real"].weight.data.shape
-                            num_rels, _ = weights["emb_rel_real"].weight.data.shape
+                            #print(weights)
+                            num_ents, half_emb_dim = weights["emb_ent_real.weight"].shape
+                            num_rels, _ = weights["emb_rel_real.weight"].shape
                             self.embedding_dim = 2*half_emb_dim
                             self.num_entities = num_ents
                             self.num_relations = num_rels
                             stop_outer_loop = True
+                            print("Updated number of entities and relation types in embedding model!\n")
                             break
-                        except:
+                        except Exception as e:
+                            print(e)
                             pass
                 if stop_outer_loop:
                     break

-    def get_synthesizer(self, path=None):
+    def get_synthesizer(self, path=None, verbose=True):

         Models = {str(m): {"emb_model": ConEx(self.embedding_dim, self.num_entities, self.num_relations, self.input_dropout,
                                               self.feature_map_dropout, self.kernel_size, self.num_of_output_channels),
@@ -1183,14 +1186,14 @@ def get_synthesizer(self, path=None):
                 else:
                     for file_name in possible_checkpoints:
                         for m in self.m:
-                            if m in file_name:
-                                try:
+                            if str(m) in file_name:
+                                if not "emb" in file_name:
                                     weights = torch.load(file_name, map_location=self.device, weights_only=True)
                                     model = Models[str(m)]["model"]
                                     model.load_state_dict(weights)
                                     Models[str(m)]["model"] = model
                                     num_loaded += 1
-                                except Exception:
+                                else:
                                     weights = torch.load(file_name, map_location=self.device, weights_only=True)
                                     emb_model = Models[str(m)]["emb_model"]
                                     emb_model.load_state_dict(weights)
@@ -1203,7 +1206,7 @@ def get_synthesizer(self, path=None):
                     print(f"\nLoaded {self.name} weights!\n")
                     return Models
                 else:
-                    print("!!!Returning untrained models, could not load pretrained models")
+                    print("!!!No pretrained weights were provided, initializing models with random weights")
                     return Models

             elif len(glob.glob(path + "/*.pt")) == 0:
@@ -1214,14 +1217,14 @@ def get_synthesizer(self, path=None):
                 num_loaded = 0
                 for file_name in possible_checkpoints:
                     for m in self.m:
-                        if m in file_name:
-                            try:
+                        if str(m) in file_name:
+                            if not "emb" in file_name:
                                 weights = torch.load(file_name, map_location=self.device, weights_only=True)
                                 model = Models[str(m)]["model"]
                                 model.load_state_dict(weights)
                                 Models[str(m)]["model"] = model
                                 num_loaded += 1
-                            except Exception:
+                            else:
                                 weights = torch.load(file_name, map_location=self.device, weights_only=True)
                                 emb_model = Models[str(m)]["emb_model"]
                                 emb_model.load_state_dict(weights)
@@ -1230,10 +1233,11 @@ def get_synthesizer(self, path=None):
                 print(f"\nLoaded {self.name} weights!\n")
                 return Models
             else:
-                print("!!!Returning untrained models, could not load pretrained")
+                print("!!!No pretrained weights were found, initializing models with random weights")
                 return Models
         else:
-            print(f"!!!Returning untrained models, could not load any pretrained model. Check the `load_pretrained parameter` or train the models using {self.name}.train(data).")
+            if verbose:
+                print(f"!!!No pretrained weights were provided, initializing models with random weights")
             return Models
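In both loading branches above, checkpoints are matched to models purely by file name: a *.pt file whose name contains "emb" is loaded into the ConEx embedding model for the matching value of m, and every other matching file is loaded into the synthesizer itself. A small sketch of that naming convention (directory and file names are illustrative, not taken from this commit):

import glob

path_of_trained_models = "NCES2-Experiment-10-30-00/trained_models"  # placeholder directory
for file_name in glob.glob(path_of_trained_models + "/*.pt"):
    for m in [32, 64, 128]:
        if str(m) in file_name:
            kind = "embedding model (ConEx)" if "emb" in file_name else "synthesizer"
            print(f"{file_name} -> {kind} for m={m}")
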


@@ -1421,7 +1425,9 @@ def fit_from_iterable(self, data: Union[List[Tuple[str, Set[OWLNamedIndividual],
         return predictions_as_owl_class_expressions

     @staticmethod
-    def generate_training_data(kb_path, num_lps=1000, beyond_alc=False, storage_dir="./Training_Data"):
+    def generate_training_data(kb_path, num_lps=1000, beyond_alc=False, storage_dir=None):
+        if storage_path is None:
+            storage_path = f"./Training_Data_{self.name}"
         lp_gen = LPGen(kb_path=kb_path, max_num_lps=num_lps, beyond_alc=beyond_alc, storage_dir=storage_dir)
         lp_gen.generate()
         print("Loading generated data...")
@@ -1439,35 +1445,37 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_
         num_workers = max(0,os.cpu_count()-1)
         if storage_path is None:
             currentDateAndTime = datetime.now()
-            storage_path = f'NCES-Experiment-{currentDateAndTime.strftime("%H-%M-%S")}'
+            storage_path = f'{self.name}-Experiment-{currentDateAndTime.strftime("%H-%M-%S")}'
         if not os.path.exists(storage_path):
             os.mkdir(storage_path)
         self.trained_models_path = storage_path+"/trained_models"
         if batch_size is None:
             batch_size = self.batch_size
         if data is None:
             data = self.generate_training_data(self.knowledge_base_path, num_lps=num_lps, beyond_alc=True, storage_dir=storage_path)
+        self.add_data_values(data) # Add data values based on training data
+        self.model = self.get_synthesizer(verbose=False)
         trainer = NCESTrainer(self, epochs=epochs, batch_size=batch_size, learning_rate=learning_rate, decay_rate=decay_rate,
                               clip_value=clip_value, num_workers=num_workers, storage_path=storage_path)
-        trainer.train(data, save_model, optimizer, record_runtime)
+        trainer.train(data=data, save_model=save_model, optimizer=optimizer, record_runtime=record_runtime)



 class ROCES(NCES2):
     """Robust Class Expression Synthesis in Description Logics via Iterative Sampling."""

-    def __init__(self, knowledge_base_path,
+    def __init__(self, knowledge_base_path, nces2_or_roces=True,
                  quality_func: Optional[AbstractScorer] = None, num_predictions=5, k=5,
                  path_of_trained_models="", proj_dim=128, rnn_n_layers=2, drop_prob=0.1,
                  num_heads=4, num_seeds=1, m=[32, 64, 128], ln=False, embedding_dim=256, sampling_strategy="p",
                  input_dropout=0.0, feature_map_dropout=0.1, kernel_size=4, num_of_output_channels=32,
                  learning_rate=1e-4, decay_rate=0.0, clip_value=5.0, batch_size=256, num_workers=4,
-                 max_length=48, load_pretrained=True, sorted_examples=False, verbose: int = 0):
-        super().__init__(knowledge_base_path,
+                 max_length=48, load_pretrained=True, verbose: int = 0):
+        super().__init__(knowledge_base_path, nces2_or_roces,
                          quality_func, num_predictions, path_of_trained_models, proj_dim, drop_prob,
                          num_heads, num_seeds, m, ln, embedding_dim, sampling_strategy, input_dropout, feature_map_dropout,
                          kernel_size, num_of_output_channels, learning_rate, decay_rate, clip_value, batch_size,
-                         num_workers, max_length, load_pretrained, sorted_examples, verbose)
+                         num_workers, max_length, load_pretrained, verbose)

         self.name = "ROCES"
         self.k = k
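ROCES specializes NCES2: the constructor now forwards nces2_or_roces=True and no longer accepts sorted_examples, and its distinguishing knobs are k and the "p" sampling strategy. A minimal construction sketch (the path is a placeholder and load_pretrained=False keeps the weights randomly initialized):

from ontolearn.concept_learner import ROCES

synthesizer = ROCES(knowledge_base_path="KGs/Family/family.owl",  # placeholder path
                    k=5,  # ROCES-specific sampling parameter
                    m=[32, 64, 128],
                    load_pretrained=False,
                    verbose=True)
print(synthesizer.name, synthesizer.k)  # ROCES 5

Calling synthesizer.train(...) then follows the NCES2.train path shown above: vocabulary update via add_data_values, model re-initialization via get_synthesizer, then NCESTrainer.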