diff --git a/examples/train_nces.py b/examples/train_nces.py index f7678dd9..0b388217 100644 --- a/examples/train_nces.py +++ b/examples/train_nces.py @@ -54,15 +54,15 @@ def start(args): print("Could not find training data. Will generate some data and train.") training_data = NCES2.generate_training_data(knowledge_base_path, beyond_alc=True) if args.synthesizer == "NCES": - nces = NCES(knowledge_base_path=knowledge_base_path, learner_names=args.models, path_of_embeddings=path_of_embeddings, - max_length=48, proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, verbose=True, load_pretrained=args.load_pretrained) + synthesizer = NCES(knowledge_base_path=knowledge_base_path, learner_names=args.models, path_of_embeddings=path_of_embeddings, + max_length=48, proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=args.load_pretrained, verbose=True) elif args.synthesizer == "NCES2": - nces = NCES2(knowledge_base_path=knowledge_base_path, max_length=48, proj_dim=128, + synthesizer = NCES2(knowledge_base_path=knowledge_base_path, path_of_trained_models=args.path_of_trained_models, nces2_or_roces=True, max_length=48, proj_dim=128, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, verbose=True, load_pretrained=args.load_pretrained) else: - nces = ROCES(knowledge_base_path=knowledge_base_path, k=5, path_of_trained_models="", max_length=48, proj_dim=128, - drop_prob=0.1, num_heads=4, num_seeds=1, m=32, verbose=True, load_pretrained=args.load_pretrained) - nces.train(training_data, epochs=args.epochs, learning_rate=args.learning_rate, num_workers=2, save_model=True) + synthesizer = ROCES(knowledge_base_path=knowledge_base_path, path_of_trained_models=args.path_of_trained_models, nces2_or_roces=True, k=5, max_length=48, proj_dim=128, + drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=args.load_pretrained, verbose=True) + synthesizer.train(training_data, epochs=args.epochs, learning_rate=args.learning_rate, num_workers=2, save_model=True) if __name__ == '__main__': @@ -71,6 +71,7 @@ def start(args): parser.add_argument('--embeddings', type=str, nargs='+', default=None, help='Paths of embeddings for each KB.') parser.add_argument('--synthesizer', type=str, default="NCES", help='Neural synthesizer to train') parser.add_argument('--path_train_data', type=str, help='Path to training data') + parser.add_argument('--path_of_trained_models', type=str, default=None, help='Path to directory containing trained model checkpoints') parser.add_argument('--models', type=str, nargs='+', default=['SetTransformer', 'LSTM', 'GRU'], help='Neural models') parser.add_argument('--load_pretrained', type=str2bool, default=False, help='Whether to load the pretrained model') diff --git a/ontolearn/base_nces.py b/ontolearn/base_nces.py index 3badbbc7..2ae7f416 100644 --- a/ontolearn/base_nces.py +++ b/ontolearn/base_nces.py @@ -30,11 +30,12 @@ from torch.functional import F from torch.nn.utils.rnn import pad_sequence from abc import abstractmethod +import re class BaseNCES: - def __init__(self, knowledge_base_path, quality_func, num_predictions, proj_dim=128, drop_prob=0.1, + def __init__(self, knowledge_base_path, nces2_or_roces, quality_func, num_predictions, proj_dim=128, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, ln=False, learning_rate=1e-4, decay_rate=0.0, clip_value=5.0, batch_size=256, num_workers=4, max_length=48, load_pretrained=True, verbose: int = 0): kb = KnowledgeBase(path=knowledge_base_path) @@ -45,6 +46,10 @@ def __init__(self, knowledge_base_path, quality_func,
num_predictions, proj_dim= self.atomic_concept_names = atomic_concept_names role_names = [rel.iri.get_remainder() for rel in kb.ontology.object_properties_in_signature()] vocab = atomic_concept_names + role_names + ['⊔', '⊓', '∃', '∀', '¬', '⊤', '⊥', '.', ' ', '(', ')'] + if nces2_or_roces: + concrete_role_names = [rel.iri.get_remainder() for rel in kb.ontology.data_properties_in_signature()] + vocab.extend(concrete_role_names) + vocab.extend(['⁻', '≤', '≥', 'True', 'False', 'true', 'false', '{', '}', ':', '[', ']', 'double', 'integer', 'date', 'xsd']) vocab = sorted(vocab) + ['PAD'] self.knowledge_base_path = knowledge_base_path self.kb = kb @@ -76,24 +81,22 @@ def find_optimal_number_of_examples(kb): return min(kb.individuals_count()//2, 1000) return kb.individuals_count() - def collate_batch(self, batch): # pragma: no cover - pos_emb_list = [] - neg_emb_list = [] - target_labels = [] - for pos_emb, neg_emb, label in batch: - if pos_emb.ndim != 2: - pos_emb = pos_emb.reshape(1, -1) - if neg_emb.ndim != 2: - neg_emb = neg_emb.reshape(1, -1) - pos_emb_list.append(pos_emb) - neg_emb_list.append(neg_emb) - target_labels.append(label) - pos_emb_list[0] = F.pad(pos_emb_list[0], (0, 0, 0, self.num_examples - pos_emb_list[0].shape[0]), "constant", 0) - pos_emb_list = pad_sequence(pos_emb_list, batch_first=True, padding_value=0) - neg_emb_list[0] = F.pad(neg_emb_list[0], (0, 0, 0, self.num_examples - neg_emb_list[0].shape[0]), "constant", 0) - neg_emb_list = pad_sequence(neg_emb_list, batch_first=True, padding_value=0) - target_labels = pad_sequence(target_labels, batch_first=True, padding_value=-100) - return pos_emb_list, neg_emb_list, target_labels + def add_data_values(self, data): + print("\nUpdating vocabulary based on training data...\n") + quantified_restriction_values = [str(i) for i in range(1,12)] + vocab = list(self.vocab.keys()) + vocab.extend(quantified_restriction_values) + values = set() + for ce, examples in data: + if '[' in ce: + for val in re.findall(r"\[(.*?)\]", ce): + values.add(val.split(' ')[-1]) + vocab.extend(list(values)) + vocab = sorted(vocab) + self.inv_vocab = np.array(vocab, dtype='object') + self.vocab = {vocab[i]: i for i in range(len(vocab))} + print("Done.\n") + def collate_batch_inference(self, batch): # pragma: no cover pos_emb_list = [] neg_emb_list = [] target_labels = [] for pos_emb, neg_emb, label in batch: if pos_emb.ndim != 2: pos_emb = pos_emb.reshape(1, -1) if neg_emb.ndim != 2: neg_emb = neg_emb.reshape(1, -1) pos_emb_list.append(pos_emb) neg_emb_list.append(neg_emb) target_labels.append(label) pos_emb_list[0] = F.pad(pos_emb_list[0], (0, 0, 0, self.num_examples - pos_emb_list[0].shape[0]), "constant", 0) pos_emb_list = pad_sequence(pos_emb_list, batch_first=True, padding_value=0) neg_emb_list[0] = F.pad(neg_emb_list[0], (0, 0, 0, self.num_examples - neg_emb_list[0].shape[0]), "constant", 0) neg_emb_list = pad_sequence(neg_emb_list, batch_first=True, padding_value=0) target_labels = pad_sequence(target_labels, batch_first=True, padding_value=-100) return pos_emb_list, neg_emb_list, target_labels diff --git a/ontolearn/concept_learner.py b/ontolearn/concept_learner.py index b547e788..4591705e 100644 --- a/ontolearn/concept_learner.py +++ b/ontolearn/concept_learner.py @@ -796,13 +796,13 @@ def train(self, data: Iterable[List[Tuple]], epochs=300, batch_size=256, learnin class NCES(BaseNCES): """Neural Class Expression Synthesis.""" - def __init__(self, knowledge_base_path, + def __init__(self, knowledge_base_path, nces2_or_roces=False, quality_func: Optional[AbstractScorer] = None, num_predictions=5, learner_names=["SetTransformer"], path_of_embeddings="", proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, ln=False, learning_rate=1e-4, decay_rate=0.0, clip_value=5.0, batch_size=256, num_workers=4, max_length=48, load_pretrained=True, sorted_examples=False, verbose: int = 0): - super().__init__(knowledge_base_path, quality_func, num_predictions, proj_dim, drop_prob, + super().__init__(knowledge_base_path, nces2_or_roces, quality_func, num_predictions, proj_dim, drop_prob, num_heads, num_seeds, m, ln, learning_rate, decay_rate, clip_value, batch_size, num_workers, max_length, load_pretrained, verbose) @@ -866,7 +866,7 @@ def get_synthesizer(self, path=None): elif num_loaded_models > 0:
print("Some model weights could not be loaded. Successful ones are: ", loaded_model_names) else: - print("!!!Returning untrained models, could not load pretrained") + print("!!!No pretrained weights, initializing models with random weights") return Models elif len(glob.glob(path+"/*.pt")) == 0: @@ -894,10 +894,10 @@ def get_synthesizer(self, path=None): print("Some model weights could not be loaded. Successful ones are: ", loaded_model_names) return Models else: - print("!!!Returning untrained models, could not load pretrained") + print("!!!No pretrained weights were provided, initializing models with random weights") return Models else: - print("!!!Returning untrained models, could not load pretrained weights. Check the `load_pretrained parameter` or train the models using NCES.train(data).") + print("!!!No pretrained weights were provided, initializing models with random weights.") return Models @@ -1114,14 +1114,14 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_ class NCES2(BaseNCES): """Neural Class Expression Synthesis in ALCHIQ(D).""" - def __init__(self, knowledge_base_path, + def __init__(self, knowledge_base_path, nces2_or_roces=True, quality_func: Optional[AbstractScorer] = None, num_predictions=5, - path_of_trained_models="", proj_dim=128, drop_prob=0.1, + path_of_trained_models=None, proj_dim=128, drop_prob=0.1, num_heads=4, num_seeds=1, m=[32, 64, 128], ln=False, embedding_dim=256, sampling_strategy="nces2", input_dropout=0.0, feature_map_dropout=0.1, kernel_size=4, num_of_output_channels=32, learning_rate=1e-4, decay_rate=0.0, clip_value=5.0, batch_size=256, num_workers=4, max_length=48, load_pretrained=True, verbose: int = 0): - super().__init__(knowledge_base_path, quality_func, num_predictions, proj_dim, drop_prob, + super().__init__(knowledge_base_path, nces2_or_roces, quality_func, num_predictions, proj_dim, drop_prob, num_heads, num_seeds, m, ln, learning_rate, decay_rate, clip_value, batch_size, num_workers, max_length, load_pretrained, verbose) @@ -1138,36 +1138,39 @@ def __init__(self, knowledge_base_path, self.kernel_size = kernel_size self.num_of_output_channels = num_of_output_channels self._maybe_load_pretrained_config() - self.model = self.get_synthesizer() + self.model = self.get_synthesizer(path_of_trained_models) def _maybe_load_pretrained_config(self, path=None): if isinstance(self.m, int): self.m = [self.m] if path: possible_checkpoints = glob.glob(path + "/*.pt") - else: + elif self.path_of_trained_models: possible_checkpoints = glob.glob(self.path_of_trained_models + "/*.pt") if self.load_pretrained and len(possible_checkpoints): stop_outer_loop = False for file_name in possible_checkpoints: for m in self.m: - if m in file_name: + if str(m) in file_name and "emb" in file_name: try: weights = torch.load(file_name, map_location=self.device, weights_only=True) - num_ents, half_emb_dim = weights["emb_ent_real"].weight.data.shape - num_rels, _ = weights["emb_rel_real"].weight.data.shape + #print(weights) + num_ents, half_emb_dim = weights["emb_ent_real.weight"].shape + num_rels, _ = weights["emb_rel_real.weight"].shape self.embedding_dim = 2*half_emb_dim self.num_entities = num_ents self.num_relations = num_rels stop_outer_loop = True + print("Updated number of entities and relation types in embedding model!\n") break - except: + except Exception as e: + print(e) pass if stop_outer_loop: break - def get_synthesizer(self, path=None): + def get_synthesizer(self, path=None, verbose=True): Models = {str(m): {"emb_model": 
ConEx(self.embedding_dim, self.num_entities, self.num_relations, self.input_dropout, self.feature_map_dropout, self.kernel_size, self.num_of_output_channels), @@ -1183,14 +1186,14 @@ def get_synthesizer(self, path=None): else: for file_name in possible_checkpoints: for m in self.m: - if m in file_name: - try: + if str(m) in file_name: + if not "emb" in file_name: weights = torch.load(file_name, map_location=self.device, weights_only=True) model = Models[str(m)]["model"] model.load_state_dict(weights) Models[str(m)]["model"] = model num_loaded += 1 - except Exception: + else: weights = torch.load(file_name, map_location=self.device, weights_only=True) emb_model = Models[str(m)]["emb_model"] emb_model.load_state_dict(weights) @@ -1203,7 +1206,7 @@ def get_synthesizer(self, path=None): print(f"\nLoaded {self.name} weights!\n") return Models else: - print("!!!Returning untrained models, could not load pretrained models") + print("!!!No pretrained weights were provided, initializing models with random weights") return Models elif len(glob.glob(path + "/*.pt")) == 0: @@ -1214,14 +1217,14 @@ def get_synthesizer(self, path=None): num_loaded = 0 for file_name in possible_checkpoints: for m in self.m: - if m in file_name: - try: + if str(m) in file_name: + if not "emb" in file_name: weights = torch.load(file_name, map_location=self.device, weights_only=True) model = Models[str(m)]["model"] model.load_state_dict(weights) Models[str(m)]["model"] = model num_loaded += 1 - except Exception: + else: weights = torch.load(file_name, map_location=self.device, weights_only=True) emb_model = Models[str(m)]["emb_model"] emb_model.load_state_dict(weights) @@ -1230,10 +1233,11 @@ def get_synthesizer(self, path=None): print(f"\nLoaded {self.name} weights!\n") return Models else: - print("!!!Returning untrained models, could not load pretrained") + print("!!!No pretrained weights were found, initializing models with random weights") return Models else: - print(f"!!!Returning untrained models, could not load any pretrained model. 
Check the `load_pretrained parameter` or train the models using {self.name}.train(data).") + if verbose: + print("!!!No pretrained weights were provided, initializing models with random weights") return Models @@ -1421,7 +1425,9 @@ def fit_from_iterable(self, data: Union[List[Tuple[str, Set[OWLNamedIndividual], return predictions_as_owl_class_expressions @staticmethod - def generate_training_data(kb_path, num_lps=1000, beyond_alc=False, storage_dir="./Training_Data"): + def generate_training_data(kb_path, num_lps=1000, beyond_alc=False, storage_dir=None): + if storage_dir is None: + storage_dir = "./Training_Data" lp_gen = LPGen(kb_path=kb_path, max_num_lps=num_lps, beyond_alc=beyond_alc, storage_dir=storage_dir) lp_gen.generate() print("Loading generated data...") @@ -1439,7 +1445,7 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_ num_workers = max(0,os.cpu_count()-1) if storage_path is None: currentDateAndTime = datetime.now() - storage_path = f'NCES-Experiment-{currentDateAndTime.strftime("%H-%M-%S")}' + storage_path = f'{self.name}-Experiment-{currentDateAndTime.strftime("%H-%M-%S")}' if not os.path.exists(storage_path): os.mkdir(storage_path) self.trained_models_path = storage_path+"/trained_models" @@ -1447,27 +1453,29 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_ batch_size = self.batch_size if data is None: data = self.generate_training_data(self.knowledge_base_path, num_lps=num_lps, beyond_alc=True, storage_dir=storage_path) + self.add_data_values(data) # Extend the vocabulary with data values found in the training data + self.model = self.get_synthesizer(verbose=False) trainer = NCESTrainer(self, epochs=epochs, batch_size=batch_size, learning_rate=learning_rate, decay_rate=decay_rate, clip_value=clip_value, num_workers=num_workers, storage_path=storage_path) - trainer.train(data, save_model, optimizer, record_runtime) + trainer.train(data=data, save_model=save_model, optimizer=optimizer, record_runtime=record_runtime) class ROCES(NCES2): """Robust Class Expression Synthesis in Description Logics via Iterative Sampling.""" - def __init__(self, knowledge_base_path, + def __init__(self, knowledge_base_path, nces2_or_roces=True, quality_func: Optional[AbstractScorer] = None, num_predictions=5, k=5, path_of_trained_models="", proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=[32, 64, 128], ln=False, embedding_dim=256, sampling_strategy="p", input_dropout=0.0, feature_map_dropout=0.1, kernel_size=4, num_of_output_channels=32, learning_rate=1e-4, decay_rate=0.0, clip_value=5.0, batch_size=256, num_workers=4, - max_length=48, load_pretrained=True, sorted_examples=False, verbose: int = 0): - super().__init__(knowledge_base_path, + max_length=48, load_pretrained=True, verbose: int = 0): + super().__init__(knowledge_base_path, nces2_or_roces, quality_func, num_predictions, path_of_trained_models, proj_dim, drop_prob, num_heads, num_seeds, m, ln, embedding_dim, sampling_strategy, input_dropout, feature_map_dropout, kernel_size, num_of_output_channels, learning_rate, decay_rate, clip_value, batch_size, - num_workers, max_length, load_pretrained, sorted_examples, verbose) + num_workers, max_length, load_pretrained, verbose) self.name = "ROCES" self.k = k diff --git a/ontolearn/data_struct.py b/ontolearn/data_struct.py index f1db687a..98edd902 100644 --- a/ontolearn/data_struct.py +++ b/ontolearn/data_struct.py @@ -376,7 +376,6 @@ class ROCESDataset(NCESBaseDataset, torch.utils.data.Dataset): def __init__(self,
data, triples_data, k, vocab, inv_vocab, max_length, sampling_strategy="p"): super(ROCESDataset, self).__init__(vocab, inv_vocab, max_length) self.data = data - print("\n\nData type", type(data)) self.triples_data = triples_data self.k = k self.sampling_strategy = sampling_strategy @@ -393,9 +392,7 @@ def __len__(self): return len(self.data) def __getitem__(self, idx): - print(len(self.data[idx])) key, value = self.data[idx] - 1/0 pos = value['positive examples'] neg = value['negative examples'] if self.sampling_strategy == 'p': @@ -407,10 +404,6 @@ def __getitem__(self, idx): k_neg = np.random.choice(range(min(self.k, len(neg)), len(neg)+1, self.k), replace=False, p=prob_neg_set) elif self.sampling_strategy == 'nces2': if random.random() > 0.5: - prob_pos_set = 1.0/(1+np.array(range(min(self.k, len(pos)), len(pos)+1, self.k))) - prob_pos_set = prob_pos_set/prob_pos_set.sum() - prob_neg_set = 1.0/(1+np.array(range(min(self.k, len(neg)), len(neg)+1, self.k))) - prob_neg_set = prob_neg_set/prob_neg_set.sum() k_pos = max(1, 2*len(pos)//3) k_neg = max(1, 2*len(neg)//3) else: diff --git a/ontolearn/nces_trainer.py b/ontolearn/nces_trainer.py index c652854d..31865157 100644 --- a/ontolearn/nces_trainer.py +++ b/ontolearn/nces_trainer.py @@ -211,7 +211,7 @@ def train(self, data, shuffle_examples=False, example_sizes=None, print("#"*50) print() model = copy.deepcopy(self.synthesizer.model[model_name]) - print("{} starts training... \n".format(model["model"].name)) + print("{}: {} starts training... \n".format(self.synthesizer.name, model["model"].name)) print("#"*50, "\n") desc = model["model"].name if device.type == "cuda": @@ -222,21 +222,28 @@ def train(self, data, shuffle_examples=False, example_sizes=None, if self.decay_rate: self.scheduler = ExponentialLR(optim_algo, self.decay_rate) if model["emb_model"] is not None: + # When an embedding model is present, we are training NCES2 or ROCES and need to repeatedly query it for the updated embeddings + train_dataset = ROCESDataset(data, self.synthesizer.triples_data, k=self.synthesizer.k if hasattr(self.synthesizer, 'k') else None, vocab=self.synthesizer.vocab, inv_vocab=self.synthesizer.inv_vocab, + max_length=self.synthesizer.max_length, sampling_strategy=self.synthesizer.sampling_strategy) + train_dataset.load_embeddings(model["emb_model"]) # Load embeddings the first time + train_dataloader = DataLoader(train_dataset, batch_size=self.batch_size, num_workers=self.num_workers, collate_fn=self.collate_batch, shuffle=True) + # Get dataloader for the embedding model self.er_vocab = self.get_er_vocab() triples_dataloader = iter(DataLoader(TriplesDataset(er_vocab=self.er_vocab, num_e=len(self.synthesizer.triples_data.entities)), batch_size=2*self.batch_size, num_workers=self.num_workers, shuffle=True)) else: assert hasattr(self.synthesizer, "instance_embeddings"), "If no embedding model is available, `instance_embeddings` must be an attribute of the synthesizer since you are probably training NCES" - train_dataset = DataLoader(NCESDataset(data, embeddings=self.synthesizer.instance_embeddings, vocab=self.synthesizer.vocab, inv_vocab=self.synthesizer.inv_vocab, + train_dataloader = DataLoader(NCESDataset(data, embeddings=self.synthesizer.instance_embeddings, vocab=self.synthesizer.vocab, inv_vocab=self.synthesizer.inv_vocab, shuffle_examples=shuffle_examples, max_length=self.synthesizer.max_length, example_sizes=example_sizes), batch_size=self.batch_size, num_workers=self.num_workers, collate_fn=self.collate_batch, shuffle=True)
Train_loss = [] Train_acc = defaultdict(list) best_score = 0 best_weights = (None, None) + s_acc, h_acc = 0, 0 if record_runtime: t0 = time.time() - s_acc, h_acc = 0, 0 + Epochs = trange(self.epochs, desc=f'Loss: {np.nan}, Soft Acc: {s_acc}, Hard Acc: {h_acc}', leave=True, colour='green') for e in Epochs: soft_acc, hard_acc = [], [] @@ -245,14 +252,7 @@ def train(self, data, shuffle_examples=False, example_sizes=None, num_batches = len(data) // self.batch_size if len(data) % self.batch_size == 0 else len(data) // self.batch_size + 1 batch_data = trange(num_batches, desc=f'Train: ', leave=False) if model["emb_model"] is not None: - # When there is no embedding_model, then we are training NCES2 or ROCES and need to use slicing and shuffling to construct input batches since we need to repeatedly query the embedding model for the updated embeddings - random.shuffle(data) - train_dataset = ROCESDataset(data, self.synthesizer.triples_data, k=self.synthesizer.k if hasattr(self.synthesizer, 'k') else None, vocab=self.synthesizer.vocab, inv_vocab=self.synthesizer.inv_vocab, - sampling_strategy=self.synthesizer.sampling_strategy, max_length=self.synthesizer.max_length) - # Load currently learned embeddings - train_dataset.load_embeddings(model["emb_model"]) - for _, train_idx in zip(batch_data, range(0, len(data), self.batch_size)): - batch = train_dataset[train_idx:train_idx+self.batch_size] + for _, batch in zip(batch_data, train_dataloader): loss, s_acc, h_acc = self.train_step(batch, model["model"], model["emb_model"], optim_algo, device, triples_dataloader) batch_count += 1 batch_data.set_description('Train: '.format(batch_count, num_batches, loss, s_acc, h_acc)) @@ -260,9 +260,11 @@ def train(self, data, shuffle_examples=False, example_sizes=None, soft_acc.append(s_acc) hard_acc.append(h_acc) train_losses.append(loss) + # Load currently learned embeddings + train_dataset.load_embeddings(model["emb_model"]) else: - # When an embedding model is None, then we are training NCES and the training data is a torch.utils.data.DataLoader object - for _, batch in zip(batch_data, train_dataset): + # When an embedding model is None, then we are training NCES + for _, batch in zip(batch_data, train_dataloader): loss, s_acc, h_acc = self.train_step(batch, model["model"], model["emb_model"], optim_algo, device) batch_count += 1 batch_data.set_description('Train: '.format(batch_count, num_batches, loss, s_acc, h_acc))
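
Note on `BaseNCES.add_data_values`: it extends the vocabulary with the quantified-restriction cardinalities "1" through "11" and with any literal found inside square brackets in the training class expressions, before the models are rebuilt by `get_synthesizer`. A minimal sketch of the extraction step is below; the bracketed class-expression string is an assumed serialization, only the regex mirrors the diff.

```python
import re

# Hypothetical class expression containing a data restriction; the exact
# serialization used by NCES2/ROCES is an assumption here.
ce = "∃ hasAge.xsd:integer[≥ 30]"

values = set()
if '[' in ce:
    for val in re.findall(r"\[(.*?)\]", ce):   # same pattern as add_data_values
        values.add(val.split(' ')[-1])          # keep only the literal, e.g. "30"

print(values)  # {'30'}
```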
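
For completeness, here is a hedged end-to-end sketch of the training flow that `examples/train_nces.py` now follows for the NCES2/ROCES branch. The knowledge base path and storage directory are placeholders; the keyword arguments are the ones introduced in this diff.

```python
from ontolearn.concept_learner import NCES2, ROCES

knowledge_base_path = "./KGs/Family/family.owl"  # placeholder path to an OWL knowledge base

synthesizer = ROCES(knowledge_base_path=knowledge_base_path,
                    path_of_trained_models=None,   # or a directory containing *.pt checkpoints
                    nces2_or_roces=True, k=5, max_length=48, proj_dim=128,
                    drop_prob=0.1, num_heads=4, num_seeds=1, m=32,
                    load_pretrained=False, verbose=True)

# generate_training_data is a static method inherited from NCES2.
training_data = NCES2.generate_training_data(knowledge_base_path, num_lps=1000,
                                             beyond_alc=True, storage_dir="./Training_Data")

# train() first calls add_data_values(training_data) to extend the vocabulary,
# then rebuilds the models via get_synthesizer() before fitting them.
synthesizer.train(training_data, epochs=50, learning_rate=1e-4,
                  num_workers=2, save_model=True)
```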