From 99e4bb8f4e8e2a2ef2eccf06a6dfe4c80c66bc86 Mon Sep 17 00:00:00 2001 From: Jeanice Angelica Santosa Date: Thu, 28 Apr 2022 12:02:38 -0700 Subject: [PATCH 1/2] inconsistent env_broad_scale #77 --- sample_annotator/non_edge.py | 31 +++++++++++++++++++++++++++++++ sql/env_mapping.sql | 10 ++++++++++ sql/merged_table.sql | 10 ++++++++++ 3 files changed, 51 insertions(+) create mode 100644 sample_annotator/non_edge.py create mode 100644 sql/env_mapping.sql create mode 100644 sql/merged_table.sql diff --git a/sample_annotator/non_edge.py b/sample_annotator/non_edge.py new file mode 100644 index 0000000..8f0df58 --- /dev/null +++ b/sample_annotator/non_edge.py @@ -0,0 +1,31 @@ +import sqlite3 +import pandas as pd +import numpy as np +import re + +def to_csv(): + db = sqlite3.connect('biosample.db') + cursor = db.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") + tables = cursor.fetchall() + for table_name in tables: + table_name = table_name[0] + table = pd.read_sql_query("SELECT * from %s" % table_name, db) + table.to_csv(table_name + '.csv', index_label='index') + cursor.close() + db.close() +to_csv() + +#tried dumping env_mapping into biosample instead +map = pd.read_csv('new_env_mapping.csv') +main = pd.read_csv('harmonized_wide_sel_envs.csv') + +main['env_broad_scale'] = main['env_broad_scale'].str.lower() +main['env_broad_scale'] = main['env_broad_scale'].str.replace('envo:','', regex=True) +merge_table = main.merge(map, left_on = 'env_broad_scale', right_on = 'label', how='left') +merge_table['broad_scale_fixed'] = merge_table['env_broad_scale'].astype(str) + ' ' + '['+merge_table['term_id'].astype(str) +']' + +#still need to change the 'nan[nan]' string values to NaN (np.nan) values +#not sure how +merge_table['broad_scale_fixed'] = merge_table['broad_scale_fixed'].replace('nan[nan]',np.nan) +merge_table['broad_scale_fixed'].value_counts() \ No newline at end of file diff --git a/sql/env_mapping.sql b/sql/env_mapping.sql new file 
mode 100644 index 0000000..8c70d63 --- /dev/null +++ b/sql/env_mapping.sql @@ -0,0 +1,10 @@ +-- SQLITE +CREATE TABLE new_env_mapping AS +SELECT distinct + subject as term_id, + value as label +from + statements s +where + predicate = 'rdfs:label' + and subject like 'ENVO:%'; \ No newline at end of file diff --git a/sql/merged_table.sql b/sql/merged_table.sql new file mode 100644 index 0000000..90f84dc --- /dev/null +++ b/sql/merged_table.sql @@ -0,0 +1,10 @@ +-- SQLite +-- before this I created a new table for the mapping called new_env_mapping and +-- dumped the harmonized table into envo.db +CREATE TABLE merged AS + SELECT * + FROM + harmonized_wide_sel_envs big_table + LEFT JOIN new_env_mapping AS map + ON + big_table.env_broad_scale = map.label; \ No newline at end of file From b4e81220d64f0cde7d2214dd2c0361cb43456f07 Mon Sep 17 00:00:00 2001 From: Jeanice Angelica Santosa Date: Fri, 29 Apr 2022 10:18:49 -0700 Subject: [PATCH 2/2] env_broad with edge cases --- sample_annotator/non_edge.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sample_annotator/non_edge.py b/sample_annotator/non_edge.py index 8f0df58..b3eb0b5 100644 --- a/sample_annotator/non_edge.py +++ b/sample_annotator/non_edge.py @@ -3,6 +3,10 @@ import numpy as np import re +#biosample.db in this case is already in the working directory, +#replace biosample.db with the path of the database. +#in this case, biosample.db contains the env_mapping table that I +#have created. 
def to_csv(): db = sqlite3.connect('biosample.db') cursor = db.cursor() @@ -20,12 +24,20 @@ def to_csv(): map = pd.read_csv('new_env_mapping.csv') main = pd.read_csv('harmonized_wide_sel_envs.csv') +#fixing with some edge cases main['env_broad_scale'] = main['env_broad_scale'].str.lower() main['env_broad_scale'] = main['env_broad_scale'].str.replace('envo:','', regex=True) +main['env_broad_scale'] = main['env_broad_scale'].str.replace('env:','', regex=True) +main['env_broad_scale'] = main ['env_broad_scale'].str.replace('\\(\d+\\)', '', regex=True) +main['env_broad_scale'] = main ['env_broad_scale'].str.replace('\\[\d+\\]', '', regex=True) +main['env_broad_scale'] = main ['env_broad_scale'].str.replace('marine biome ', 'marine biome', regex=True) +main['env_broad_scale'] = main ['env_broad_scale'].str.replace('ocean biome ', 'ocean biome', regex=True) +main['env_broad_scale'] = main['env_broad_scale'].str.replace('marine abyssal zone biome ', 'marine abyssal zone biome', regex=True) +main['env_broad_scale'] = main['env_broad_scale'].str.replace(r"(\d+,?)(\s.+)", r"\1",regex=True) merge_table = main.merge(map, left_on = 'env_broad_scale', right_on = 'label', how='left') merge_table['broad_scale_fixed'] = merge_table['env_broad_scale'].astype(str) + ' ' + '['+merge_table['term_id'].astype(str) +']' +merge_table['broad_scale_fixed'] = merge_table['broad_scale_fixed'].str.replace('(\\[nan\\])','', regex=True) #still need to change the 'nan[nan]' string values to NaN (np.nan) values #not sure how -merge_table['broad_scale_fixed'] = merge_table['broad_scale_fixed'].replace('nan[nan]',np.nan) -merge_table['broad_scale_fixed'].value_counts() \ No newline at end of file +merge_table['broad_scale_fixed'] = merge_table['broad_scale_fixed'].replace('nan[nan]',np.nan) \ No newline at end of file