Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

inconsistent env_broad_scale #77 #78

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions sample_annotator/non_edge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import sqlite3
import pandas as pd
import numpy as np
import re

# biosample.db is expected to be in the current working directory;
# otherwise, replace 'biosample.db' with the path to the database.
# In this case, biosample.db contains the env_mapping table that I
# have created.
def to_csv(db_path='biosample.db', out_dir='.'):
    """Dump every table of a SQLite database to ``<table name>.csv``.

    Args:
        db_path: Path of the SQLite database to read. Defaults to
            ``'biosample.db'`` in the current working directory.
        out_dir: Directory that receives one CSV file per table. Defaults
            to the current working directory.

    Each CSV carries the DataFrame index as a leading column named
    ``index``. Nothing is returned.
    """
    import os
    from contextlib import closing

    # closing() guarantees the connection and cursor are released even when
    # a query or a CSV write raises part-way through the dump.
    with closing(sqlite3.connect(db_path)) as db:
        with closing(db.cursor()) as cursor:
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            table_names = [row[0] for row in cursor.fetchall()]

        for table_name in table_names:
            # Identifiers cannot be bound as SQL parameters, so quote the
            # name to keep tables with spaces, keywords or quotes readable.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            table = pd.read_sql_query("SELECT * FROM %s" % quoted_name, db)
            table.to_csv(
                os.path.join(out_dir, table_name + '.csv'),
                index_label='index',
            )
to_csv()

# Ordered (regex pattern, replacement) clean-ups applied to the lower-cased
# env_broad_scale text. Order matters: 'envo:' must be removed before 'env:'.
_BROAD_SCALE_CLEANUPS = (
    (r'envo:', ''),
    (r'env:', ''),
    (r'\(\d+\)', ''),   # bare numeric id in parentheses, e.g. "(00000447)"
    (r'\[\d+\]', ''),   # bare numeric id in brackets, e.g. "[00000447]"
    (r'marine biome ', 'marine biome'),
    (r'ocean biome ', 'ocean biome'),
    (r'marine abyssal zone biome ', 'marine abyssal zone biome'),
    (r'(\d+,?)(\s.+)', r'\1'),  # drop everything after "<digits> "
)


def _normalize_broad_scale(values):
    """Return *values* lower-cased with the known edge-case clean-ups applied."""
    cleaned = values.str.lower()
    for pattern, replacement in _BROAD_SCALE_CLEANUPS:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    return cleaned


def _annotate_with_term_id(labels, term_ids):
    """Return ``'<label> [<term_id>]'`` per row, handling missing values.

    Rows without a term id keep the bare label; rows without a label
    become a real NaN rather than the text ``'nan'``.
    """
    annotated = labels.astype(str) + ' [' + term_ids.astype(str) + ']'
    # No mapping found: keep the label alone (no "[nan]" suffix and no
    # trailing space left behind by stripping it afterwards).
    annotated = annotated.where(term_ids.notna(), labels)
    # No label at all: a proper missing value. The previous
    # replace('nan[nan]', np.nan) could never match, because the string
    # built for such rows was 'nan [nan]' and had already been cut to 'nan '.
    return annotated.where(labels.notna())


# Tried dumping env_mapping into biosample instead.
# Named env_mapping (not `map`) so the builtin map() is not shadowed.
env_mapping = pd.read_csv('new_env_mapping.csv')
main = pd.read_csv('harmonized_wide_sel_envs.csv')

# Fix the known edge cases before matching against the mapping labels.
main['env_broad_scale'] = _normalize_broad_scale(main['env_broad_scale'])

merge_table = main.merge(
    env_mapping, left_on='env_broad_scale', right_on='label', how='left'
)
merge_table['broad_scale_fixed'] = _annotate_with_term_id(
    merge_table['env_broad_scale'], merge_table['term_id']
)
10 changes: 10 additions & 0 deletions sql/env_mapping.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- SQLite
-- Build the term-id/label lookup table: one row per distinct
-- (subject, value) pair among the 'rdfs:label' statements whose subject
-- matches LIKE 'ENVO:%'.
CREATE TABLE new_env_mapping AS
SELECT DISTINCT
    s.subject AS term_id,
    s.value AS label
FROM statements AS s
WHERE s.predicate = 'rdfs:label'
  AND s.subject LIKE 'ENVO:%';
10 changes: 10 additions & 0 deletions sql/merged_table.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- SQLite
-- Prerequisites: the mapping table new_env_mapping has been created and
-- the harmonized table has been dumped into envo.db.
-- Left-join every harmonized row to the mapping row whose label equals its
-- env_broad_scale; harmonized rows without a match are kept.
CREATE TABLE merged AS
SELECT *
FROM harmonized_wide_sel_envs AS big_table
LEFT JOIN new_env_mapping AS map
    ON big_table.env_broad_scale = map.label;