We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Logic for label prediction
key_map_prediction.py
"""Maintain a label key map in SQLite and predict label mappings from it."""

import difflib
import logging

import pandas as pd
from rdflib import Graph
from rdflib.namespace import OWL, RDF, RDFS, SKOS
from sqlalchemy import create_engine

logger = logging.getLogger(__name__)

# Label predicates whose objects are collected as data keys for each class.
label_choice = [
    SKOS.altLabel,
    SKOS.prefLabel,
    RDFS.label,
]

_KEY_MAP_TABLE = "key_map"
_KEY_MAP_COLUMNS = ["ontology_key", "data_key"]
_PREDICTION_COLUMNS = [
    "Data Label Match",
    "Method Label Match",
    "Data Mapping-DB Match",
    "Method Mapping-DB Match",
    "Data Score",
    "Method Score",
]


def _read_worksheet(path, worksheet):
    """Read one worksheet of an Excel workbook, closing the file afterwards."""
    with open(path, "rb") as workbook:
        return pd.read_excel(workbook, sheet_name=worksheet, engine="openpyxl")


def _merge_into_key_map(new_mappings, key_map_db):
    """Append *new_mappings* to the key-map table and drop duplicate rows.

    The table is appended to, read back, de-duplicated, re-indexed from zero
    and then written back with ``if_exists="replace"``.
    """
    engine = create_engine(f"sqlite:///{key_map_db}", echo=False)
    try:
        new_mappings.to_sql(
            con=engine, name=_KEY_MAP_TABLE, if_exists="append"
        )
        stored = pd.read_sql_table(
            _KEY_MAP_TABLE, con=engine, index_col="index"
        )
        stored = stored.drop_duplicates().reset_index(drop=True)
        stored.to_sql(con=engine, name=_KEY_MAP_TABLE, if_exists="replace")
    finally:
        # Release the pooled SQLite connection even if a step above fails.
        engine.dispose()
    logger.debug("key map now holds %d unique mappings", len(stored))


def _best_match(word, candidates):
    """Return ``(candidate, score)`` for the candidate most similar to *word*.

    Similarity is the case-insensitive ``difflib.SequenceMatcher`` ratio with
    *word* as the first sequence. On ties the first candidate wins.
    *candidates* must be a non-empty Series of strings.
    """
    lowered = word.lower()
    scores = candidates.apply(
        lambda candidate: difflib.SequenceMatcher(
            None, lowered, candidate.lower()
        ).ratio()
    )
    # Positional lookup keeps this correct even if index labels repeat.
    position = int(scores.to_numpy(dtype=float).argmax())
    return candidates.iloc[position], float(scores.iloc[position])


def update_key_map_using_ontology(source, key_map_db):
    """Add the class labels of a Turtle ontology to the key-map database.

    For every subject typed ``owl:Class`` the fragment after the last ``#``
    of its IRI becomes the ``ontology_key``; each object of a predicate in
    ``label_choice`` becomes a ``data_key`` paired with it.

    Args:
        source: Turtle document to parse (anything ``Graph.parse`` accepts).
        key_map_db: Path of the SQLite database file holding ``key_map``.
    """
    g = Graph()
    g.parse(source, format="turtle")

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for concept in g.subjects(RDF.type, OWL.Class):
        onto_concept = str(concept).split("#")[-1]
        for label_predicate in label_choice:
            for label_value in g.objects(concept, label_predicate):
                records.append(
                    {
                        "ontology_key": onto_concept,
                        # Store the lexical value as a plain string.
                        "data_key": str(label_value),
                    }
                )

    key_map_df = pd.DataFrame(records, columns=_KEY_MAP_COLUMNS)
    _merge_into_key_map(key_map_df, key_map_db)


def update_key_map_using_mapping(source, key_map_db, worksheet="sameas"):
    """Add the confirmed mappings of an Excel worksheet to the key map.

    Rows containing any missing value are dropped; of the remaining rows the
    ``"Method Label Match"`` column is stored as ``ontology_key`` and the
    ``"Data Label Match"`` column as ``data_key``.

    Args:
        source: Path of the Excel workbook.
        key_map_db: Path of the SQLite database file holding ``key_map``.
        worksheet: Name of the worksheet to read.
    """
    mappings = _read_worksheet(source, worksheet).dropna()
    key_map_df = mappings[["Method Label Match", "Data Label Match"]].rename(
        columns={
            "Method Label Match": "ontology_key",
            "Data Label Match": "data_key",
        }
    )
    _merge_into_key_map(key_map_df, key_map_db)


def prediction_key_map_based_on_db(
    mapping_path, prediction_path, key_map_db, worksheet="sameas"
):
    """Predict a method label for every data label and write the result CSV.

    Each string in the worksheet's ``"Data Label Choice"`` column is matched
    to the most similar ``data_key`` in the key map. Every ``ontology_key``
    stored for that data key is then matched against the worksheet's
    ``"Method Label Choice"`` column, and the best-scoring pair is kept.
    Rows are sorted by method score, then data score, both descending.

    Args:
        mapping_path: Path of the Excel workbook with the label choices.
        prediction_path: Destination of the CSV file.
        key_map_db: Path of the SQLite database file holding ``key_map``.
        worksheet: Name of the worksheet to read.

    Raises:
        ValueError: If there are data labels to match but the key map or the
            ``"Method Label Choice"`` column is empty.
    """
    engine = create_engine(f"sqlite:///{key_map_db}", echo=False)
    try:
        key_map = pd.read_sql_table(
            _KEY_MAP_TABLE, con=engine, index_col="index"
        )
    finally:
        engine.dispose()

    mapping_choices = _read_worksheet(mapping_path, worksheet)
    data_keys = key_map["data_key"]
    # Loop-invariant, so computed once instead of per candidate.
    method_choices = mapping_choices["Method Label Choice"].dropna()

    # Non-string cells (e.g. NaN for blank rows) are skipped.
    data_names = [
        name
        for name in mapping_choices["Data Label Choice"]
        if isinstance(name, str)
    ]
    if data_names:
        if data_keys.empty:
            raise ValueError("key_map table is empty; nothing to match")
        if method_choices.empty:
            raise ValueError("'Method Label Choice' column has no values")

    # The same ontology key is reached from many data labels; score it once.
    method_match_cache = {}
    records = []
    for data_name in data_names:
        db_data_name, data_score = _best_match(data_name, data_keys)

        # All ontology keys recorded for the best-matching data key.
        ontology_keys = key_map.loc[data_keys == db_data_name, "ontology_key"]

        best = None
        for method_name in ontology_keys:
            if method_name not in method_match_cache:
                method_match_cache[method_name] = _best_match(
                    method_name, method_choices
                )
            method_choice, method_score = method_match_cache[method_name]
            # Strict ">" keeps the first candidate on ties.
            if best is None or method_score > best[2]:
                best = (method_name, method_choice, method_score)

        db_method_name, best_method_choice, best_method_score = best
        records.append(
            {
                "Method Label Match": best_method_choice,
                "Data Label Match": data_name,
                "Method Mapping-DB Match": db_method_name,
                "Data Mapping-DB Match": db_data_name,
                "Method Score": best_method_score,
                "Data Score": data_score,
            }
        )

    # Explicit columns keep an empty result sortable and the CSV header stable.
    result_mapping_df = pd.DataFrame(records, columns=_PREDICTION_COLUMNS)
    result_mapping_df = result_mapping_df.sort_values(
        by=["Method Score", "Data Score"], ascending=False
    )
    result_mapping_df.to_csv(prediction_path)


# Example usage:
# ontology_path = os.path.join("../tests/key_map_generation/stahl_digital_v23.09.2021.ttl")
# key_map_db = os.path.join("../tests/key_map_generation/key_map.db")
# example_mapping = os.path.join("../tests/key_map_generation/mapping.xlsx")
# prediction_path = os.path.join("../tests/key_map_generation/predicted_mapping.csv")
# update_key_map_using_mapping(example_mapping, key_map_db)
# update_key_map_using_ontology(ontology_path, key_map_db)
# prediction_key_map_based_on_db(example_mapping, prediction_path, key_map_db)
The text was updated successfully, but these errors were encountered:
No branches or pull requests
Logic for label prediction
key_map_prediction.py
The text was updated successfully, but these errors were encountered: