use defined columns when creating tables
lmcmicu committed Sep 16, 2024
1 parent 7de06b8 commit 35f2464
Showing 7 changed files with 390 additions and 231 deletions.
15 changes: 10 additions & 5 deletions Makefile
@@ -135,14 +135,14 @@ random_test_data: test/generate_random_test_data.py valve valve test/random_test
./$< $$(date +"%s") 100 5 $(word 3,$^) $|

.PHONY: sqlite_random_test
sqlite_random_test: valve random_test_data | build test/output
sqlite_random_test: valve clean_random_data random_test_data | build test/output
@echo "Testing with random data on sqlite ..."
./$< --assume-yes load $(random_test_dir)/table.tsv $(sqlite_random_db)
test/round_trip.sh $(sqlite_random_db) $(random_test_dir)/table.tsv
@echo "Test succeeded!"

.PHONY: pg_random_test
pg_random_test: valve random_test_data | build test/output
pg_random_test: valve clean_random_data random_test_data | build test/output
@echo "Testing with random data on postgresql ..."
./$< --assume-yes load $(random_test_dir)/table.tsv $(pg_connect_string)
test/round_trip.sh $(pg_connect_string) $(random_test_dir)/table.tsv
@@ -185,9 +185,10 @@ penguin_test: valve | test/penguins/src/data

guess_test_dir = test/guess_test_data
guess_test_db = build/valve_guess.db
num_guess_test_rows = 30000

$(guess_test_dir)/table1.tsv: test/generate_random_test_data.py valve $(guess_test_dir)/*.tsv
./$< 0 30000 5 $(guess_test_dir)/table.tsv $(guess_test_dir)
./$< 0 $(num_guess_test_rows) 5 $(guess_test_dir)/table.tsv $(guess_test_dir)

$(guess_test_dir)/ontology:
mkdir -p $@
@@ -196,9 +197,9 @@ $(guess_test_dir)/ontology:
guess_test_data: test/generate_random_test_data.py $(guess_test_dir)/table1.tsv valve confirm_overwrite.sh $(guess_test_dir)/*.tsv | $(guess_test_dir)/ontology
./confirm_overwrite.sh $(guess_test_dir)/ontology
rm -f $(guess_test_dir)/table1.tsv
./$< 0 30000 5 $(guess_test_dir)/table.tsv $(guess_test_dir)
./$< 0 $(num_guess_test_rows) 5 $(guess_test_dir)/table.tsv $(guess_test_dir)
rm -f $(guess_test_dir)/ontology/*.tsv
./$< 0 30000 5 $(guess_test_dir)/table_expected.tsv $|
./$< 0 $(num_guess_test_rows) 5 $(guess_test_dir)/table_expected.tsv $|
rm -f $(guess_test_dir)/ontology/table1.tsv

$(guess_test_db): valve guess_test_data $(guess_test_dir)/*.tsv | build $(guess_test_dir)/ontology
@@ -249,6 +250,10 @@ perf_test: sqlite_perf_test pg_perf_test
clean:
rm -Rf build/valve.db* build/valve_random.db* test/output $(random_test_dir)/ontology valve

.PHONY: clean_random_data
clean_random_data:
rm -Rf $(random_test_dir)/ontology

.PHONY: clean_test_db
clean_test_db:
rm -Rf build/valve.db
32 changes: 16 additions & 16 deletions scripts/export_messages.py
@@ -17,7 +17,7 @@ def get_column_order_and_info_for_postgres(cursor, table):
sorted by priority, and a list of the table's unique keys. I.e., returns a dict of the form:
{"unsorted_columns": [], "sorted_columns": [], "primary_keys": [], "unique_keys": []}. Note that
for tables with primary keys, we sort by primary key first, then by all other columns from left
to right. For tables without primary keys, we sort by row_number.
to right. For tables without primary keys, we sort by row_order.
"""
constraints_query_template = f"""
SELECT kcu.column_name
@@ -35,22 +35,18 @@ def get_column_order_and_info_for_postgres(cursor, table):

cursor.execute(
f"""
SELECT column_name
FROM information_schema.columns
WHERE table_name = '{table}'
ORDER BY ordinal_position
SELECT "column" FROM "column" WHERE "table" = '{table}'
"""
)
unsorted_columns = ["row_number", "row_order"] + [row[0] for row in cursor]

if not primary_keys:
sorted_columns = ["row_number"]
unsorted_columns = [row[0] for row in cursor]
sorted_columns = ["row_order"]
else:
unsorted_columns = []
non_pk_columns = []
for row in cursor:
column_name = row[0]
unsorted_columns.append(column_name)
if column_name not in primary_keys and not column_name == "row_number":
if column_name not in primary_keys:
non_pk_columns.append(column_name)
sorted_columns = primary_keys + non_pk_columns

Expand All @@ -72,23 +68,27 @@ def get_column_order_and_info_for_sqlite(cursor, table):
sorted by priority, and a list of the table's unique keys. I.e., returns a dict of the form:
{"unsorted_columns": [], "sorted_columns": [], "primary_keys": [], "unique_keys": []}. Note that
for tables with primary keys, we sort by primary key first, then by all other columns from left
to right. For tables without primary keys, we sort by row_number.
to right. For tables without primary keys, we sort by row_order.
"""
cursor.execute(
f"""
SELECT "column" FROM "column" WHERE "table" = '{table}'
"""
)
unsorted_columns = ["row_number", "row_order"] + [row[0] for row in cursor]

cursor.execute(f'PRAGMA TABLE_INFO("{table}")')
columns_info = [d[0] for d in cursor.description]
pragma_rows = list(map(lambda r: dict(zip(columns_info, r)), cursor))
primary_keys = dict()
if not any([row["pk"] == 1 for row in pragma_rows]):
sorted_columns = ["row_number"]
unsorted_columns = [p["name"] for p in pragma_rows]
sorted_columns = ["row_order"]
else:
unsorted_columns = []
non_pk_columns = []
for row in pragma_rows:
unsorted_columns.append(row["name"])
if row["pk"] != 0:
primary_keys[row["pk"]] = row["name"]
elif not row["name"] == "row_number":
elif row["name"] not in ("row_number", "row_order"):
non_pk_columns.append(row["name"])
primary_keys = dict(sorted(primary_keys.items()))
sorted_columns = [primary_keys[key] for key in primary_keys] + non_pk_columns
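
The docstrings above describe the export ordering rule: sort by the primary-key columns first, then the remaining columns left to right, and fall back to row_order for tables without a primary key. Below is a minimal sketch of that rule, written in Rust to match the rest of the codebase; the function name and the column lists are made up for illustration and are not part of this commit.

// Illustrative only: the ordering rule described in the docstrings above, i.e. primary-key
// columns first (in key order), then the remaining columns; a table without a primary key
// is sorted by "row_order" alone.
fn sorted_columns(columns: &[&str], primary_keys: &[&str]) -> Vec<String> {
    if primary_keys.is_empty() {
        return vec!["row_order".to_string()];
    }
    let mut sorted: Vec<String> = primary_keys.iter().map(|c| c.to_string()).collect();
    for col in columns {
        if !primary_keys.contains(col) && !["row_number", "row_order"].contains(col) {
            sorted.push(col.to_string());
        }
    }
    sorted
}

fn main() {
    // Made-up column list for illustration:
    let columns = ["row_number", "row_order", "study_name", "sample_number"];
    println!("{:?}", sorted_columns(&columns, &[]));             // ["row_order"]
    println!("{:?}", sorted_columns(&columns, &["study_name"])); // ["study_name", "sample_number"]
}
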
71 changes: 54 additions & 17 deletions src/guess.rs
@@ -1,6 +1,10 @@
//! Implementation of the column configuration guesser

use crate::valve::{Valve, ValveConfig, ValveDatatypeConfig};
use crate::{
toolkit::local_sql_syntax,
valve::{Valve, ValveConfig, ValveDatatypeConfig},
SQL_PARAM,
};
use fix_fn::fix_fn;
use futures::executor::block_on;
use indexmap::IndexMap;
@@ -218,15 +222,18 @@ pub fn guess(
.map(|h| format!(r#""{}""#, h))
.collect::<Vec<_>>()
.join(", ");
format!(
r#"INSERT INTO "table" ("row_number", {}) VALUES ({}, '{}', '{}', NULL, NULL)"#,
column_names, row_number, table, table_tsv
local_sql_syntax(
&valve.pool,
&format!(
r#"INSERT INTO "table" ("row_number", {column_names}) VALUES
({row_number}, {SQL_PARAM}, {SQL_PARAM}, NULL, NULL)"#,
),
)
};
if verbose {
println!("Executing SQL: {}", sql);
}
let query = sqlx_query(&sql);
let query = sqlx_query(&sql).bind(table).bind(table_tsv);
block_on(query.execute(&valve.pool)).expect(&format!("Error executing SQL '{}'", sql));

// Column configuration
@@ -240,49 +247,78 @@ pub fn guess(
.map(|h| format!(r#""{}""#, h))
.collect::<Vec<_>>()
.join(", ");
let mut params = vec![];
let values = vec![
// row_number
format!("{}", row_number),
format!("'{}'", table),
format!("'{}'", sample.normalized),
// table
{
params.push(table);
format!("{}", SQL_PARAM)
},
// column
{
params.push(&sample.normalized);
format!("{}", SQL_PARAM)
},
// label
{
if *label != sample.normalized {
format!("'{}'", label)
params.push(label);
format!("{}", SQL_PARAM)
} else {
"NULL".to_string()
}
},
// nulltype
{
if sample.nulltype != "" {
format!("'{}'", sample.nulltype)
params.push(&sample.nulltype);
format!("{}", SQL_PARAM)
} else {
"NULL".to_string()
}
},
format!("'{}'", sample.datatype),
// datatype
{
params.push(&sample.datatype);
format!("{}", SQL_PARAM)
},
// structure
{
if sample.structure != "" {
format!("'{}'", sample.structure)
params.push(&sample.structure);
format!("{}", SQL_PARAM)
} else {
"NULL".to_string()
}
},
// description
{
if sample.description != "" {
format!("'{}'", sample.description)
params.push(&sample.description);
format!("{}", SQL_PARAM)
} else {
"NULL".to_string()
}
},
]
.join(", ");
let sql = format!(
r#"INSERT INTO "column" ("row_number", {}) VALUES ({})"#,
column_names, values

let sql = local_sql_syntax(
&valve.pool,
&format!(
r#"INSERT INTO "column" ("row_number", {}) VALUES ({})"#,
column_names, values
),
);
if verbose {
println!("Executing SQL: {}", sql);
}
let query = sqlx_query(&sql);
let mut query = sqlx_query(&sql);
for param in &params {
query = query.bind(param);
}
block_on(query.execute(&valve.pool)).expect(&format!("Error executing SQL '{}'", sql));
row_number += 1;
}
@@ -624,9 +660,10 @@ pub fn annotate(
// if it exists in the foreign column. It is automatically a failed match.
continue;
}
// TODO: Here.
let value = {
if foreign.sql_type == "text" {
format!("'{}'", value)
format!("'{}'", value.replace("'", "''"))
} else {
value.to_string()
}
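
The guess.rs changes above stop formatting values directly into the INSERT statements: the SQL is built with a generic SQL_PARAM placeholder, rewritten by toolkit::local_sql_syntax for the connected database, and the values are then bound with sqlx. The following is a self-contained sketch of that placeholder-rewriting idea; the placeholder token, the render_placeholders helper, and the column names are assumptions for illustration, not the actual local_sql_syntax implementation.

// Assumed placeholder token for illustration; the real SQL_PARAM constant is defined in
// the valve crate and may use a different string.
const SQL_PARAM: &str = "VALVEPARAM";

enum DbKind {
    Sqlite,   // sqlx/SQLite expects '?' placeholders
    Postgres, // sqlx/PostgreSQL expects numbered '$1', '$2', ... placeholders
}

// Hypothetical stand-in for toolkit::local_sql_syntax: replace each SQL_PARAM token with
// the placeholder syntax of the target database so the values can be bound separately.
fn render_placeholders(kind: &DbKind, sql: &str) -> String {
    let mut out = String::new();
    let mut rest = sql;
    let mut n = 0;
    while let Some(pos) = rest.find(SQL_PARAM) {
        out.push_str(&rest[..pos]);
        n += 1;
        match kind {
            DbKind::Sqlite => out.push('?'),
            DbKind::Postgres => out.push_str(&format!("${}", n)),
        }
        rest = &rest[pos + SQL_PARAM.len()..];
    }
    out.push_str(rest);
    out
}

fn main() {
    // Column names are illustrative only.
    let template = format!(
        r#"INSERT INTO "table" ("row_number", "table", "path") VALUES (1, {SQL_PARAM}, {SQL_PARAM})"#
    );
    println!("{}", render_placeholders(&DbKind::Sqlite, &template));
    println!("{}", render_placeholders(&DbKind::Postgres, &template));
    // The rewritten statement is then executed as in the diff, e.g.
    // sqlx_query(&sql).bind(table).bind(table_tsv), so values never need to be quoted or
    // escaped by hand.
}

Binding the values separately is what lets the guesser insert table names, paths, labels, and descriptions without hand-quoting them; the value that annotate still interpolates directly now doubles single quotes before being spliced into the query.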