Set NCES to train from scratch when no pretrained model is available
Jean-KOUAGOU committed Oct 16, 2024
1 parent 5824bbc commit 49427ed
Showing 8 changed files with 217 additions and 152 deletions.
2 changes: 1 addition & 1 deletion examples/concept_learning_cv_evaluation.py
@@ -89,7 +89,7 @@ def dl_concept_learning(args):
nces = NCES(knowledge_base_path=args.kb,
quality_func=F1(),
path_of_embeddings=get_embedding_path("https://files.dice-research.org/projects/NCES/NCES_Ontolearn_Data/NCESData.zip",args.path_of_nces_embeddings, args.kb),
-                pretrained_model_name=["LSTM", "GRU", "SetTransformer"],
+                learner_names=["LSTM", "GRU", "SetTransformer"],
num_predictions=100,
verbose=0)
args.path_of_clip_embeddings = get_embedding_path(
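For downstream users this is a pure keyword rename: the ensemble passed to NCES is now selected with learner_names instead of pretrained_model_name. A minimal sketch of the updated call, with paths and hyperparameters taken from the example above (values are illustrative, not prescriptive):

from ontolearn.concept_learner import NCES
from ontolearn.metrics import F1

# `pretrained_model_name=[...]` becomes `learner_names=[...]`; other arguments are unchanged.
nces = NCES(knowledge_base_path="./NCESData/family/family.owl",
            quality_func=F1(),
            path_of_embeddings="./NCESData/family/embeddings/ConEx_entity_embeddings.csv",
            learner_names=["LSTM", "GRU", "SetTransformer"],
            num_predictions=100,
            verbose=0)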
23 changes: 12 additions & 11 deletions examples/train_nces.py
@@ -21,7 +21,8 @@ def str2bool(v):


parser = argparse.ArgumentParser()
-parser.add_argument('--kbs', type=str, nargs='+', default=['carcinogenesis'], help='Knowledge base name(s)')
+parser.add_argument('--kbs', type=str, nargs='+', default=['family'], help='Knowledge base name(s)')
+parser.add_argument('--path_train_data', type=str, default="./NCES-Experiment-17:42:45/", help='Path to training data')
parser.add_argument('--models', type=str, nargs='+', default=['SetTransformer', 'LSTM', 'GRU'], help='Neural models')
parser.add_argument('--load_pretrained', type=str2bool, default=False, help='Whether to load the pretrained model')
parser.add_argument('--learning_rate', type=float, default=0.001, help='The learning rate')
@@ -31,15 +32,15 @@ def str2bool(v):
for kb in args.kbs:
knowledge_base_path = f"./NCESData/{kb}/{kb}.owl"
path_of_embeddings = f"./NCESData/{kb}/embeddings/ConEx_entity_embeddings.csv"
with open(f"./NCESData/{kb}/training_data/Data.json") as file:
training_data = list(json.load(file).items())

nces = NCES(knowledge_base_path=knowledge_base_path, learner_name="SetTransformer",
try:
with open(args.path_train_data+"/LPs.json") as file:
training_data = list(json.load(file).items())
except FileNotFoundError:
print("Could not find training data. Will generate some data and train.")
training_data = None

nces = NCES(knowledge_base_path=knowledge_base_path, learner_names=args.models,
path_of_embeddings=path_of_embeddings, max_length=48, proj_dim=128, rnn_n_layers=2, drop_prob=0.1,
-                num_heads=4, num_seeds=1, num_inds=32, load_pretrained=args.load_pretrained)
+                num_heads=4, num_seeds=1, num_inds=32, verbose=True, load_pretrained=args.load_pretrained)

-    for model in args.models:
-        nces.learner_name = model
-        nces.pretrained_model_name = model
-        nces.refresh()
-        nces.train(training_data, epochs=args.epochs, learning_rate=args.learning_rate, save_model=True)
+    nces.train(training_data, epochs=args.epochs, learning_rate=args.learning_rate, num_workers=2, save_model=True)
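The updated script now degrades gracefully when no training data is on disk. A condensed sketch of the new flow, assuming the standard NCESData layout (the experiment folder name below is hypothetical):

import json
from ontolearn.concept_learner import NCES

kb = "family"
try:
    # LPs.json comes from a previous run or from NCES.generate_training_data
    with open("./my-nces-experiment/LPs.json") as file:
        training_data = list(json.load(file).items())
except FileNotFoundError:
    training_data = None  # NCES.train will generate learning problems itself

nces = NCES(knowledge_base_path=f"./NCESData/{kb}/{kb}.owl",
            learner_names=["SetTransformer", "LSTM", "GRU"],
            path_of_embeddings=f"./NCESData/{kb}/embeddings/ConEx_entity_embeddings.csv",
            verbose=True, load_pretrained=False)
nces.train(training_data, epochs=50, learning_rate=0.001, num_workers=2, save_model=True)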
6 changes: 3 additions & 3 deletions main.py
@@ -105,7 +105,7 @@ def get_default_arguments(description=None):
help="Random initialization method.", choices=["GROW", "FULL", "RAMPED_HALF_HALF"])

# NCES only
parser.add_argument("--learner_name", type=str, default="SetTransformer", help="Learner name.",
parser.add_argument("--learner_names", type=str, nargs="+", default=["SetTransformer"], help="Learner name.",
choices=["SetTransformer", "GRU", "LSTM"])
parser.add_argument("--proj_dim", type=int, default=128, help="Number of projection dimensions.")
parser.add_argument("--rnn_n_layers", type=int, default=2, help="Number of RNN layers (only for LSTM and GRU).")
@@ -122,8 +122,8 @@ def get_default_arguments(description=None):
parser.add_argument("--max_length", type=int, default=48, help="Maximum length")
parser.add_argument("--load_pretrained", type=bool, default=True, help="Load pretrained.")
parser.add_argument("--sorted_examples", type=bool, default=True, help="Sorted examples.")
parser.add_argument("--pretrained_model_name", type=str, default="SetTransformer", help="Pretrained model name",
choices=["SetTransformer", "GRU", "LSTM"])
# parser.add_argument("--pretrained_model_name", type=str, default="SetTransformer", help="Pretrained model name",
# choices=["SetTransformer", "GRU", "LSTM"])

if description is None:
return parser.parse_args()
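Because --learner_names is declared with nargs="+", a single invocation can request several architectures and argparse validates each one against choices. A small self-contained illustration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--learner_names", type=str, nargs="+", default=["SetTransformer"],
                    help="Learner name.", choices=["SetTransformer", "GRU", "LSTM"])

# Equivalent to: python main.py --learner_names SetTransformer GRU
args = parser.parse_args(["--learner_names", "SetTransformer", "GRU"])
print(args.learner_names)  # ['SetTransformer', 'GRU']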
4 changes: 2 additions & 2 deletions ontolearn/base_nces.py
@@ -35,7 +35,7 @@

class BaseNCES:

-    def __init__(self, knowledge_base_path, learner_name, path_of_embeddings, batch_size=256, learning_rate=1e-4,
+    def __init__(self, knowledge_base_path, learner_names, path_of_embeddings, batch_size=256, learning_rate=1e-4,
decay_rate=0.0, clip_value=5.0, num_workers=4):
self.name = "NCES"
kb = KnowledgeBase(path=knowledge_base_path)
@@ -52,7 +52,7 @@ def __init__(self, knowledge_base_path, learner_name, path_of_embeddings, batch_
self.all_individuals = set([ind.str.split("/")[-1] for ind in kb.individuals()])
self.inv_vocab = np.array(vocab, dtype='object')
self.vocab = {vocab[i]: i for i in range(len(vocab))}
-        self.learner_name = learner_name
+        self.learner_names = learner_names
self.num_examples = self.find_optimal_number_of_examples(kb)
self.batch_size = batch_size
self.learning_rate = learning_rate
148 changes: 106 additions & 42 deletions ontolearn/concept_learner.py
@@ -28,6 +28,7 @@
import operator
import random
import time
+from datetime import datetime
from contextlib import contextmanager
from itertools import islice, chain
from typing import Any, Callable, Dict, FrozenSet, Set, List, Tuple, Iterable, Optional, Union
@@ -78,6 +79,9 @@
from owlapy.utils import OrderedOWLObject
from sortedcontainers import SortedSet
import os
+import json
+import glob
+from ontolearn.lp_generator import LPGen

logger = logging.getLogger(__name__)

@@ -1226,7 +1230,7 @@ def load_model(predictor_name, load_pretrained):
pretrained_model_path = self.path_of_embeddings.split("embeddings")[
0] + "trained_models/trained_" + predictor_name + ".pt"
if load_pretrained and os.path.isfile(pretrained_model_path):
-                model.load_state_dict(torch.load(pretrained_model_path, map_location=self.device))
+                model.load_state_dict(torch.load(pretrained_model_path, map_location=self.device, weights_only=True))
model.eval()
print("\n Loaded length predictor!")
return model
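Passing weights_only=True makes torch.load unpickle only tensors and basic containers rather than arbitrary Python objects, which is the safer way to restore a state dict from disk in recent PyTorch releases. A minimal sketch of the pattern with a stand-in module (names are illustrative):

import torch
from torch import nn

model = nn.Linear(4, 2)  # stand-in for the NCES length predictor
torch.save(model.state_dict(), "checkpoint.pt")

# weights_only=True refuses pickled code objects, so a tampered checkpoint
# cannot execute arbitrary code while it is being loaded.
model.load_state_dict(torch.load("checkpoint.pt", map_location="cpu", weights_only=True))
model.eval()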
@@ -1399,11 +1403,10 @@ class NCES(BaseNCES):

def __init__(self, knowledge_base_path,
quality_func: Optional[AbstractScorer] = None, num_predictions=5,
learner_name="SetTransformer", path_of_embeddings="", proj_dim=128, rnn_n_layers=2, drop_prob=0.1,
learner_names=["SetTransformer"], path_of_embeddings="", proj_dim=128, rnn_n_layers=2, drop_prob=0.1,
num_heads=4, num_seeds=1, num_inds=32, ln=False, learning_rate=1e-4, decay_rate=0.0, clip_value=5.0,
-                 batch_size=256, num_workers=4, max_length=48, load_pretrained=True, sorted_examples=False,
-                 pretrained_model_name=None, verbose: int = 0):
-        super().__init__(knowledge_base_path, learner_name, path_of_embeddings, batch_size, learning_rate, decay_rate,
+                 batch_size=256, num_workers=4, max_length=48, load_pretrained=True, sorted_examples=False, verbose: int = 0):
+        super().__init__(knowledge_base_path, learner_names, path_of_embeddings, batch_size, learning_rate, decay_rate,
clip_value, num_workers)
self.quality_func = quality_func
self.num_predictions = num_predictions
@@ -1419,42 +1422,84 @@ def __init__(self, knowledge_base_path,
self.ln = ln
self.load_pretrained = load_pretrained
self.sorted_examples = sorted_examples
-        self.pretrained_model_name = pretrained_model_name
self.verbose = verbose
self.model = self.get_synthesizer()
self.dl_parser = DLSyntaxParser(namespace=self.kb_namespace)
self.best_predictions = None

-    def get_synthesizer(self):
-        def load_model(learner_name, load_pretrained):
-            if learner_name == 'SetTransformer':
-                model = SetTransformer(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length,
-                                       self.input_size, self.proj_dim, self.num_heads, self.num_seeds, self.num_inds,
-                                       self.ln)
-            elif learner_name == 'GRU':
-                model = GRU(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length, self.input_size,
-                            self.proj_dim, self.rnn_n_layers, self.drop_prob)
-            elif learner_name == 'LSTM':
-                model = LSTM(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length, self.input_size,
-                             self.proj_dim, self.rnn_n_layers, self.drop_prob)
-            if load_pretrained:
-                model_path = self.path_of_embeddings.split("embeddings")[
-                    0] + "trained_models/trained_" + learner_name + ".pt"
-                model.load_state_dict(torch.load(model_path, map_location=self.device))
-                model.eval()
-                if self.verbose > 0:
-                    print("\n Loaded synthesizer model!")
-            return model
+    def get_synthesizer(self, path=None):
+        m1 = SetTransformer(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length,
+                            self.input_size, self.proj_dim, self.num_heads, self.num_seeds, self.num_inds,
+                            self.ln)
+        m2 = GRU(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length, self.input_size,
+                 self.proj_dim, self.rnn_n_layers, self.drop_prob)
+
+        m3 = LSTM(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length, self.input_size,
+                  self.proj_dim, self.rnn_n_layers, self.drop_prob)
+        Untrained = []
+        for name in self.learner_names:
+            for m in [m1,m2,m3]:
+                if m.name == name:
+                    Untrained.append(m)
+
+        Models = []
+
+        if self.load_pretrained:
+            if path is None:
+                try:
+                    if len(glob.glob(self.path_of_embeddings.split("embeddings")[0] + "trained_models/*.pt")) == 0:
+                        raise FileNotFoundError
+                    else:
+                        for file_name in glob.glob(self.path_of_embeddings.split("embeddings")[0] + "trained_models/*.pt"):
+                            for m in Untrained:
+                                if m.name in file_name:
+                                    try:
+                                        m.load_state_dict(torch.load(file_name, map_location=self.device, weights_only=True))
+                                        Models.append(m.eval())
+                                    except Exception as e:
+                                        print(e)
+                                        pass
+                except Exception as e:
+                    print(e)
+                    raise RuntimeError
+
+                if Models:
+                    if self.verbose:
+                        print("\n Loaded synthesizer model!")
+                    return Models
+                else:
+                    print("!!!Returning untrained models, could not load pretrained")
+                    return Untrained
+
-        if not self.load_pretrained:
-            return [load_model(self.learner_name, self.load_pretrained)]
-        elif self.load_pretrained and isinstance(self.pretrained_model_name, str):
-            return [load_model(self.pretrained_model_name, self.load_pretrained)]
-        elif self.load_pretrained and isinstance(self.pretrained_model_name, list):
-            return [load_model(name, self.load_pretrained) for name in self.pretrained_model_name]
+            elif len(glob.glob(path+"/*.pt")) == 0:
+                print("No pretrained model found!")
+                raise FileNotFoundError
+            else:
+                for file_name in glob.glob(path+"/*.pt"):
+                    for m in Untrained:
+                        if m.name in file_name:
+                            try:
+                                m.load_state_dict(torch.load(file_name, map_location=self.device, weights_only=True))
+                                Models.append(m.eval())
+                            except Exception as e:
+                                print(e)
+                                pass
+                if Models:
+                    if self.verbose:
+                        print("\n Loaded synthesizer model!")
+                    return Models
+                else:
+                    print("!!!Returning untrained models, could not load pretrained")
+                    return Untrained
+        else:
+            print("!!!Returning untrained models, could not load pretrained. Check the `load_pretrained parameter` or train the models using NCES.train(data).")
+            return Untrained

-    def refresh(self):
-        self.model = self.get_synthesizer()

+    def refresh(self, path=None):
+        if path is not None:
+            self.load_pretrained = True
+        self.model = self.get_synthesizer(path)

def sample_examples(self, pos, neg): # pragma: no cover
assert type(pos[0]) == type(neg[0]), "The two iterables pos and neg must be of same type"
@@ -1507,7 +1552,7 @@ def fit_one(self, pos: Union[Set[OWLNamedIndividual], Set[str]], neg: Union[Set[
Pos = np.random.choice(pos_str, size=(self.num_predictions, len(pos_str)), replace=True)
Neg = np.random.choice(neg_str, size=(self.num_predictions, len(neg_str)), replace=True)

-        assert self.load_pretrained and self.pretrained_model_name, \
+        assert self.load_pretrained and self.learner_names, \
"No pretrained model found. Please first train NCES, see the <<train>> method below"

dataset = NCESDataLoaderInference([("", Pos_str, Neg_str) for (Pos_str, Neg_str) in zip(Pos, Neg)],
@@ -1597,7 +1642,7 @@ def fit_from_iterable(self, dataset: Union[List[Tuple[str, Set[OWLNamedIndividua
- This function returns predictions as owl class expressions, not nodes as in fit
"""
-        assert self.load_pretrained and self.pretrained_model_name, \
+        assert self.load_pretrained and self.learner_names, \
"No pretrained model found. Please first train NCES, refer to the <<train>> method"
dataset = [self.convert_to_list_str_from_iterable(datapoint) for datapoint in dataset]
dataset = NCESDataLoaderInference(dataset, self.instance_embeddings, self.vocab, self.inv_vocab,
@@ -1623,20 +1668,39 @@ def fit_from_iterable(self, dataset: Union[List[Tuple[str, Set[OWLNamedIndividua
print("Predictions: ", predictions_str)
return predictions_as_owl_class_expressions

-    def train(self, data: Iterable[List[Tuple]], epochs=300, batch_size=None, learning_rate=1e-4, decay_rate=0.0,
+    @staticmethod
+    def generate_training_data(kb_path, num_lps=1000, storage_dir="./NCES_Training_Data"):
+        lp_gen = LPGen(kb_path=kb_path, max_num_lps=num_lps, storage_dir=storage_dir)
+        lp_gen.generate()
+        print("Loading generated data...")
+        with open(f"{storage_dir}/LPs.json") as file:
+            lps = list(json.load(file).items())
+        print("Number of learning problems:", len(lps))
+        return lps
+
+    def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_lps=1000, learning_rate=1e-4, decay_rate=0.0,
clip_value=5.0, num_workers=8, save_model=True, storage_path=None, optimizer='Adam', record_runtime=True,
example_sizes=None, shuffle_examples=False):
+        if os.cpu_count() <= num_workers:
+            num_workers = max(0,os.cpu_count()-1)
+        if storage_path is None:
+            currentDateAndTime = datetime.now()
+            storage_path = f'NCES-Experiment-{currentDateAndTime.strftime("%H:%M:%S")}'
+        if not os.path.exists(storage_path):
+            os.mkdir(storage_path)
+        if batch_size is None:
+            batch_size = self.batch_size
+        if data is None:
+            data = self.generate_training_data(self.knowledge_base_path, num_lps=num_lps, storage_dir=storage_path)
train_dataset = NCESDataLoader(data, self.instance_embeddings, self.vocab, self.inv_vocab,
shuffle_examples=shuffle_examples, max_length=self.max_length,
example_sizes=example_sizes)
-        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=self.num_workers,
+        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers,
collate_fn=self.collate_batch, shuffle=True)
-        if storage_path is None:
-            storage_path = self.knowledge_base_path[:self.knowledge_base_path.rfind("/")]
-        elif not os.path.exists(storage_path) and (record_runtime or save_model):
-            os.mkdir(storage_path)

trainer = NCESTrainer(self, epochs=epochs, learning_rate=learning_rate, decay_rate=decay_rate,
clip_value=clip_value, num_workers=num_workers, storage_path=storage_path)
trainer.train(train_dataloader, save_model, optimizer, record_runtime)
+        self.refresh(storage_path+"/trained_models")
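Taken together, these changes implement the commit's goal end to end: get_synthesizer falls back to untrained models when no .pt checkpoint is found, train accepts data=None and generates learning problems via LPGen, and refresh re-loads the checkpoints that training just saved. A hedged sketch of the resulting from-scratch workflow (paths and hyperparameters are illustrative):

from ontolearn.concept_learner import NCES

nces = NCES(knowledge_base_path="./NCESData/family/family.owl",
            learner_names=["SetTransformer"],
            path_of_embeddings="./NCESData/family/embeddings/ConEx_entity_embeddings.csv",
            load_pretrained=False)

# With data=None, train() calls generate_training_data (LPGen), stores LPs.json
# and the trained checkpoints under an auto-created NCES-Experiment-<HH:MM:SS>/
# directory, then refresh() points self.model at the freshly trained models.
nces.train(data=None, epochs=50, num_lps=1000, save_model=True)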