Commit

Merge pull request #1 from Louis-Mozart/retrieval_eval_incomplete

Retrieval eval incomplete

Louis-Mozart authored Dec 5, 2024
2 parents cf7c395 + 01c9d81 commit 59b45c9
Showing 2 changed files with 90 additions and 156 deletions.
154 changes: 38 additions & 116 deletions examples/retrieval_with_cache.py
@@ -1,19 +1,18 @@

import argparse
import pandas as pd
from semantic_caching import run_cache, concept_generator
from ontolearn.semantic_caching import run_cache, concept_generator
from plot_metrics import *
import seaborn as sns

# Candidate cache sizes: 5, 16, 32, 128, 256, 512, 700, 800, 1024; KG: "KGs/Family/family.owl"; ratios: .2, .4, .6, .8
parser = argparse.ArgumentParser()
parser.add_argument('--cache_size_ratios', type=list, default=[1.], help="cache size is proportional to num_concepts, cache size = k * num_concepts")
parser.add_argument('--cache_size_ratios', type=list, default=[.1, .2, .4, .8, 1.], help="cache size is proportional to num_concepts, cache size = k * num_concepts")
parser.add_argument('--path_kg', type=list, default=["KGs/Family/family.owl"])
parser.add_argument('--path_kge', type=list, default=None)
parser.add_argument('--name_reasoner', type=str, default='EBR', choices=["EBR",'HermiT', 'Pellet', 'JFact', 'Openllet'])
parser.add_argument('--eviction_strategy', type=str, default='LRU', choices=['LIFO', 'FIFO', 'LRU', 'MRU', 'RP'])
parser.add_argument('--random_seed_for_RP', type=int, default=10, help="Random seed if the eviction strategy is RP")
parser.add_argument('--cache_type', type=str, default='cold', choices=['hot', 'cold'], help="Type of cache to be used. With cold cache we initialize the cache with NC, NNC")
parser.add_argument('--cache_type', type=str, default='hot', choices=['hot', 'cold'], help="Type of cache to be used. With cold cache we initialize the cache with NC, NNC")
parser.add_argument('--shuffle_concepts', action='store_true', help="If set, we shuffle the concepts for randomness")
args = parser.parse_args()

@@ -24,136 +23,59 @@ def get_cache_size(list_k, path_kg):
return [max(1, int(k * data_size)) for k in list_k]
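
The body of get_cache_size above this return is collapsed in the diff. A plausible reconstruction, assuming concept_generator(path_kg) (imported above) yields the generated ALC concepts, so each ratio k maps to a cache of size k * num_concepts:

def get_cache_size(list_k, path_kg):
    # Assumed reconstruction of the collapsed body
    concepts = list(concept_generator(path_kg))
    data_size = len(concepts)  # number of generated concepts
    return [max(1, int(k * data_size)) for k in list_k]

# e.g. 800 concepts with ratios [.1, .2, .4, .8, 1.] -> [80, 160, 320, 640, 800]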


# results = []
# for path_kg in args.path_kg:
# for cache_size in get_cache_size(args.cache_size_ratios, path_kg):
# for strategy in ['LIFO', 'FIFO', 'LRU', 'MRU', 'RP']:
# result, detailed = run_cache(
# path_kg=path_kg,
# path_kge=args.path_kge,
# cache_size=cache_size,
# name_reasoner=args.name_reasoner,
# eviction=strategy,
# random_seed=args.random_seed_for_RP,
# cache_type=args.cache_type,
# shuffle_concepts=args.shuffle_concepts
# )
# results.append(result)

# data_kg = result['dataset']
# df = pd.DataFrame(results)
# print(df)

# # Save to CSV
# df.to_csv(f'caching_results_{data_kg}/cache_experiments_{args.name_reasoner}_{data_kg}.csv', index=False)


results = []
detailed_results = []
for path_kg in args.path_kg:
for cache_size in get_cache_size(args.cache_size_ratios, path_kg):
for strategy in ['LIFO', 'FIFO', 'LRU', 'MRU', 'RP']:
result, detailed = run_cache(
result, D = run_cache(
path_kg=path_kg,
path_kge=args.path_kge,
cache_size=cache_size,
name_reasoner=args.name_reasoner,
eviction=strategy,
eviction=args.eviction_strategy,
random_seed=args.random_seed_for_RP,
cache_type=args.cache_type,
shuffle_concepts=args.shuffle_concepts
)
results.append(result)

data_kg = result['dataset']
df = pd.DataFrame(results)
print(df)

# Save to CSV
# df.to_csv(f'caching_results_{data_kg}/cache_experiments_{args.name_reasoner}_{data_kg}.csv', index=False)


# name_reasoners = ["EBR",'HermiT','Pellet','JFact','Openllet']
# data_kgs = ["family"]

# for data_kg in data_kgs:

# for name_reasoner in name_reasoners:

# df = pd.read_csv(f'caching_results_{data_kg}/cache_experiments_{name_reasoner}_{data_kg}.csv')
# print(df)


# sns.set_context("talk", font_scale=3.6)

# plot1 = sns.catplot(
# data=df,
# kind="bar",
# x="cache_size",
# y="hit_ratio",
# hue="strategy",
# col="dataset",
# height=10,
# aspect=2
# )
# plt.show()
# plot1.savefig(f'caching_results_{data_kg}/cache_vs_hit_sns_{name_reasoner}_{data_kg}.pdf')

results.append(result)
detailed_results.append(D)

# plot2 = sns.catplot(
# data=df,
# kind="bar",
# x="cache_size",
# y="avg_jaccard",
# hue="strategy",
# col="dataset",
# height=10,
# aspect=2
# )
# plt.show()
# plot2.savefig(f'caching_results_{data_kg}/cache_vs_jaccard_sns_{name_reasoner}_{data_kg}.pdf')
all_detailed_results = [item for sublist in detailed_results for item in sublist]


# plot3 = sns.catplot(
# results = []
# detailed_results = []
# for path_kg in args.path_kg:
# for cache_size in get_cache_size(args.cache_size_ratios, path_kg):
# result, D = run_cache(path_kg=path_kg, path_kge=args.path_kge, cache_size=cache_size, name_reasoner=args.name_reasoner,\
# eviction=args.eviction_strategy, random_seed=args.random_seed_for_RP)
# results.append(result)
# detailed_results.append(D)

# all_detailed_results = [item for sublist in detailed_results for item in sublist]

# results = pd.DataFrame(results)
results = pd.DataFrame(results)
# results.to_csv(f'caching_results/cache_experiments_{args.name_reasoner}.csv')

# plot_scale_factor(results, args.name_reasoner)
# plot_jaccard_vs_cache_size(results, args.name_reasoner)

plot_scale_factor(results, args.name_reasoner)
plot_jaccard_vs_cache_size(results, args.name_reasoner)

# # print(results.to_latex(index=False))

# all_detailed_results = pd.DataFrame(all_detailed_results)
# bar_plot_separate_data(all_detailed_results, cache_size=90, name_reasoner=args.name_reasoner)
# data=df,
# kind="bar",
# x="cache_size",
# y="RT_cache",
# hue="strategy",
# col="dataset",
# height=10,
# aspect=2
# )
# plt.show()
# plot3.savefig(f'caching_results_{data_kg}/cache_vs_RT_sns_{name_reasoner}_{data_kg}.pdf')



# results = []
# detailed_results = []
# for path_kg in args.path_kg:
# for cache_size in get_cache_size(args.cache_size_ratios, path_kg):
# result, D = run_cache(
# path_kg=path_kg,
# path_kge=args.path_kge,
# cache_size=cache_size,
# name_reasoner=args.name_reasoner,
# eviction=args.eviction_strategy,
# random_seed=args.random_seed_for_RP,
# cache_type=args.cache_type,
# shuffle_concepts=args.shuffle_concepts
# )
# results.append(result)
# detailed_results.append(D)

# all_detailed_results = [item for sublist in detailed_results for item in sublist]

# results = pd.DataFrame(results)
# # results.to_csv(f'caching_results/cache_experiments_{args.name_reasoner}.csv')

# plot_scale_factor(results, args.name_reasoner)
# plot_jaccard_vs_cache_size(results, args.name_reasoner)

# # # print(results.to_latex(index=False))

# all_detailed_results = pd.DataFrame(all_detailed_results)
# bar_plot_separate_data(all_detailed_results, cache_size=90, name_reasoner=args.name_reasoner)
all_detailed_results = pd.DataFrame(all_detailed_results)
bar_plot_separate_data(all_detailed_results, cache_size=90, name_reasoner=args.name_reasoner)
# bar_plot_all_data(all_detailed_results, cache_size=90, name_reasoner=args.name_reasoner)
# all_detailed_results.to_csv(f'caching_results/detailed_cache_experiments_{args.name_reasoner}.csv')

92 changes: 52 additions & 40 deletions semantic_caching.py → ontolearn/semantic_caching.py
@@ -165,13 +165,11 @@ def get_shuffled_concepts(path_kg, data_name):
random.shuffle(alc_concepts)
with open(save_file, "wb") as f:
pickle.dump(alc_concepts, f)
print("Generated, shuffled, and saved concepts.")

print("Generated, shuffled, and saved concepts.")
return alc_concepts


def concept_retrieval(retriever_func, c) -> Set[str]:

return {i.str for i in retriever_func.individuals(c)}


@@ -232,7 +230,7 @@ def put(self, key, value):
if self.strategy in ['LRU', 'MRU']:
self.access_times[key] = time.time() # Record access timestamp

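The eviction branch of put() is collapsed in this hunk. A minimal sketch of how LRU/MRU eviction could use the access_times recorded above (field names from the snippet; the logic is an assumption, not the committed code):

if self.is_full():
    # Assumed: LRU evicts the key with the oldest timestamp,
    # MRU the key with the newest timestamp.
    if self.strategy == 'LRU':
        victim = min(self.access_times, key=self.access_times.get)
    elif self.strategy == 'MRU':
        victim = max(self.access_times, key=self.access_times.get)
    self.cache.pop(victim, None)
    self.access_times.pop(victim, None)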
def initialize_cache(self, ontology, func, path_onto, third, All_individuals):
def initialize_cache(self, ontology, func, path_onto, third, All_individuals, handle_restriction_func=None):
"""
Initialize the cache with precomputed results.
:param ontology: The loaded ontology.
@@ -245,16 +243,37 @@ def initialize_cache(self, ontology, func, path_onto, third, All_individuals):
# Fetch object properties and classes from ontology
roles = list(ontology.object_properties())
classes = list(ontology.classes())

for cls in classes:
named_class = OWLClass(cls.iri)
named_class_str = str(cls).split(".")[-1]

# Add named concept
self.put(named_class_str, func(named_class, path_onto, third))
negated_named_class_str = f"¬{named_class_str}"

# Add negated named concept
self.put(negated_named_class_str, All_individuals-self.cache[named_class_str])
negated_class = OWLObjectComplementOf(named_class)

for role in roles:
role_property = OWLObjectProperty(role.iri)
existential_a = OWLObjectSomeValuesFrom(property=role_property, filler=named_class)
self.put(owl_expression_to_dl(existential_a), func(existential_a, path_onto, third))
existential_a = OWLObjectSomeValuesFrom(property=role_property, filler=named_class)

# Add ∃ r.C
if handle_restriction_func is not None:
self.put(owl_expression_to_dl(existential_a), handle_restriction_func(existential_a))
else:
self.put(owl_expression_to_dl(existential_a), func(existential_a, path_onto, third))

# Add ∃ r.(¬C)
existential_negated = OWLObjectSomeValuesFrom(property=role_property, filler=negated_class)
existential_negated_str = owl_expression_to_dl(existential_negated)
if handle_restriction_func is not None:
self.put(existential_negated_str, handle_restriction_func(existential_negated))
else:
self.put(existential_negated_str, func(existential_negated, path_onto, third))

self.initialized = True
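
In other words, a cold cache is seeded with every named concept, its negation, and the existential restrictions over each role. A toy illustration (assumed ontology with one class and one role, not from the diff):

# After initialize_cache on classes {Male} and roles {hasChild}, the cache holds:
#   'Male'                -> func(Male, path_onto, third)
#   '¬Male'               -> All_individuals - cache['Male']
#   '∃ hasChild.Male'     -> handle_restriction_func(∃ hasChild.Male)
#   '∃ hasChild.(¬Male)'  -> handle_restriction_func(∃ hasChild.(¬Male))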

def get_all_items(self):
@@ -265,8 +284,6 @@ def is_full(self):
return len(self.cache) >= self.max_size




def semantic_caching_size(func, cache_size, eviction_strategy, random_seed, cache_type):

'''This function implements the semantic caching algorithm for ALC concepts as presented in the paper'''
@@ -307,42 +324,43 @@ def retrieve_from_cache(expression):
stats['misses'] += 1
return None

def handle_owl_some_values_from():
def handle_owl_some_values_from(owl_expression):
"""
Process the OWLObjectSomeValuesFrom expression locally.
When called, return the retrieval of the OWLObjectSomeValuesFrom expression
based on the algorithm described in the paper.
"""
object_property = owl_expression.get_property()
filler_expression = owl_expression.get_filler()
instances = retrieve_from_cache(owl_expression_to_dl(filler_expression))
if instances:
result = set()
if isinstance(object_property, OWLObjectInverseOf):
r = onto.search_one(iri=object_property.get_inverse_property().str)
if isinstance(owl_expression, OWLObjectSomeValuesFrom):
object_property = owl_expression.get_property()
filler_expression = owl_expression.get_filler()
instances = retrieve_from_cache(owl_expression_to_dl(filler_expression))
if instances is not None:
result = set()
if isinstance(object_property, OWLObjectInverseOf):
r = onto.search_one(iri=object_property.get_inverse_property().str)
else:
r = onto.search_one(iri=object_property.str)
individual_map = {ind: onto.search_one(iri=ind) for ind in All_individuals | instances}
for ind_a in All_individuals:
a = individual_map[ind_a]
for ind_b in instances:
b = individual_map[ind_b]
if isinstance(object_property, OWLObjectInverseOf):
if a in getattr(b, r.name):
result.add(a)
else:
if b in getattr(a, r.name):
result.add(ind_a)
else:
r = onto.search_one(iri=object_property.str)
individual_map = {ind: onto.search_one(iri=ind) for ind in All_individuals | instances}
for ind_a in All_individuals:
a = individual_map[ind_a]
for ind_b in instances:
b = individual_map[ind_b]
if isinstance(object_property, OWLObjectInverseOf):
if a in getattr(b, r.name):
result.add(a)
else:
if b in getattr(a, r.name):
result.add(ind_a)
else:
result = func(*args)
return result
result = func(*args)
return result

start_time = time.time()  # start the timing before the cache initialization

# Cold cache initialization
start_time_initialization = time.time()
if cache_type == 'cold' and not cache.initialized:
cache.initialize_cache(onto, func, path_onto, args[-1], All_individuals)
cache.initialize_cache(onto, func, path_onto, args[-1], All_individuals, handle_restriction_func=handle_owl_some_values_from)
time_initialization = time.time() - start_time_initialization

# start_time = time.time()  # start the timing after the cache initialization
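
The set composition in handle_owl_some_values_from can be checked on a toy example (assumed data, not from the diff): an individual a belongs to ∃ r.C exactly when some cached instance b of the filler C satisfies r(a, b).

# Toy check of the ∃ r.C rule
male = {"bob", "tom"}                          # cached instances of the filler C
has_child = {("anna", "bob"), ("joe", "sue")}  # role assertions r(a, b)
exists_r_c = {a for (a, b) in has_child if b in male}
assert exists_r_c == {"anna"}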
@@ -385,9 +403,9 @@ def handle_owl_some_values_from():
if cached_result_cold is not None:
result = cached_result_cold
else:
result = handle_owl_some_values_from()
result = handle_owl_some_values_from(owl_expression)
else:
result = handle_owl_some_values_from()
result = handle_owl_some_values_from(owl_expression)

elif isinstance(owl_expression, OWLObjectAllValuesFrom):
all_values_expr = owl_expression_to_dl(owl_expression)
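
The ∀ r.C branch is collapsed below. A common way to service it from the same cache is the standard ALC identity; whether the committed code uses it is an assumption:

# Standard ALC identity: ∀ r.C ≡ ¬(∃ r.(¬C)), so the retrieval can be
# computed as All_individuals - retrieval(∃ r.(¬C)).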
@@ -508,8 +526,6 @@ def run_cache(path_kg:str, path_kge:str, cache_size:int, name_reasoner:str, evic

ground_truth = concept_retrieval(symbolic_kb, expr)



jacc = jaccard_similarity(A, ground_truth)
jacc_reas = jaccard_similarity(retrieve_ebr, ground_truth)
Avg_jaccard.append(jacc)
@@ -519,8 +535,6 @@ def run_cache(path_kg:str, path_kge:str, cache_size:int, name_reasoner:str, evic
print(f'Jaccard similarity: {jacc}')
# assert jacc == 1.0
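
# Jaccard similarity here is the standard set measure (assumed definition):
# jaccard_similarity(A, B) = |A ∩ B| / |A ∪ B|; 1.0 means the cached
# retrieval matches the ground-truth retrieval exactly.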



stats = cached_retriever.get_stats()

print('-'*50)
@@ -549,8 +563,6 @@ def run_cache(path_kg:str, path_kge:str, cache_size:int, name_reasoner:str, evic





# def subsumption_based_caching(func, cache_size):
# cache = {} # Dictionary to store cached results
