Skip to content

Commit

Permalink
Revamp age csv loader (apache#2044)
Browse files Browse the repository at this point in the history
* Allow 0 as entry_id

- No regression test were impacted by this change.

* Use batch inserts to improve performance

- Changed heap_insert to heap_multi_insert since it is faster than
  calling heap_insert() in a loop. When multiple tuples can be inserted
  on a single page, just a single WAL record covering all of them, and
  only need to lock/unlock the page once.

- BATCH_SIZE is set to 1000, which is the number of tuples to insert in
  a single batch. This number was chosen after some experimentation.

- Change some of the field names to avoid confusion.

* Use sequence for generating ids for edge and vertex

- Sequence is not used if the id_field_exists is true in
  load_labels_from_file function, since the entry id is present in the
  csv.

* Add function to create temporary table for ids

- Created a temporary table and populate it with already generated
  vertex ids. A unique index is created on id column to ensure that
  new ids generated (using entry id from csv) are unique.

* Insert generated ids in the temporary table to enforce uniqueness

- Insert ids in the temporary table and also update index to
  enforce uniqueness.
- If the entry id provided in the CSV is greater than the current
  sequence value, the sequence value is updated to match the entry ID.
  For example:
  Suppose the current sequence value is 1, and the CSV entry ID is 2.
  If we use 2 but not update the sequence to 2, next time the CREATE
  clause is used, 2 will be returned by sequence as an entry id,
  resulting in duplicate.
- Update batch functions

* Add functions to create graph and label automatically

- These functions will check existence of graph and label, and create
  them if they don't exist.

* Add regression tests
  • Loading branch information
MuhammadTahaNaveed committed Aug 19, 2024
1 parent be771e0 commit 750a09e
Show file tree
Hide file tree
Showing 13 changed files with 779 additions and 105 deletions.
144 changes: 108 additions & 36 deletions regress/expected/age_load.out
Original file line number Diff line number Diff line change
Expand Up @@ -19,41 +19,87 @@
\! cp -r regress/age_load/data regress/instance/data/age_load
LOAD 'age';
SET search_path TO ag_catalog;
-- Create a country using CREATE clause
SELECT create_graph('agload_test_graph');
NOTICE: graph "agload_test_graph" has been created
create_graph
--------------

(1 row)

SELECT create_vlabel('agload_test_graph','Country');
NOTICE: VLabel "Country" has been created
create_vlabel
---------------

SELECT * FROM cypher('agload_test_graph', $$CREATE (n:Country {__id__:1}) RETURN n$$) as (n agtype);
n
----------------------------------------------------------------------------------
{"id": 844424930131969, "label": "Country", "properties": {"__id__": 1}}::vertex
(1 row)

--
-- Load countries with id
--
SELECT load_labels_from_file('agload_test_graph', 'Country',
'age_load/countries.csv');
'age_load/countries.csv', true);
load_labels_from_file
-----------------------

(1 row)

SELECT create_vlabel('agload_test_graph','City');
NOTICE: VLabel "City" has been created
create_vlabel
---------------

-- A temporary table should have been created with 54 ids; 1 from CREATE and 53 from file
SELECT COUNT(*)=54 FROM "_agload_test_graph_ag_vertex_ids";
?column?
----------
t
(1 row)

-- Sequence should be equal to max entry id i.e. 248
SELECT currval('agload_test_graph."Country_id_seq"')=248;
?column?
----------
t
(1 row)

-- Should error out on loading the same file again due to duplicate id
SELECT load_labels_from_file('agload_test_graph', 'Country',
'age_load/countries.csv', true);
ERROR: Cannot insert duplicate vertex id: 844424930131970
HINT: Entry id 2 is already used
--
-- Load cities with id
--
-- Should create City label automatically and load cities
SELECT load_labels_from_file('agload_test_graph', 'City',
'age_load/cities.csv');
'age_load/cities.csv', true);
NOTICE: VLabel "City" has been created
load_labels_from_file
-----------------------

(1 row)

-- Temporary table should have 54+72485 rows now
SELECT COUNT(*)=54+72485 FROM "_agload_test_graph_ag_vertex_ids";
?column?
----------
t
(1 row)

-- Sequence should be equal to max entry id i.e. 146941
SELECT currval('agload_test_graph."City_id_seq"')=146941;
?column?
----------
t
(1 row)

-- Should error out on loading the same file again due to duplicate id
SELECT load_labels_from_file('agload_test_graph', 'City',
'age_load/cities.csv', true);
ERROR: Cannot insert duplicate vertex id: 1125899906842777
HINT: Entry id 153 is already used
--
-- Load edges -- Connects cities to countries
--
-- Should error out for using vertex label
SELECT load_edges_from_file('agload_test_graph', 'Country',
'age_load/edges.csv');
ERROR: label "Country" already exists as edge label
SELECT create_elabel('agload_test_graph','has_city');
NOTICE: ELabel "has_city" has been created
create_elabel
Expand All @@ -68,6 +114,17 @@ SELECT load_edges_from_file('agload_test_graph', 'has_city',

(1 row)

-- Sequence should be equal to number of edges loaded i.e. 72485
SELECT currval('agload_test_graph."has_city_id_seq"')=72485;
?column?
----------
t
(1 row)

-- Should error out for using edge label
SELECT load_labels_from_file('agload_test_graph', 'has_city',
'age_load/cities.csv');
ERROR: label "has_city" already exists as vertex label
SELECT table_catalog, table_schema, lower(table_name) as table_name, table_type
FROM information_schema.tables
WHERE table_schema = 'agload_test_graph' ORDER BY table_name ASC;
Expand All @@ -83,7 +140,7 @@ WHERE table_schema = 'agload_test_graph' ORDER BY table_name ASC;
SELECT COUNT(*) FROM agload_test_graph."Country";
count
-------
53
54
(1 row)

SELECT COUNT(*) FROM agload_test_graph."City";
Expand All @@ -101,7 +158,7 @@ SELECT COUNT(*) FROM agload_test_graph."has_city";
SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH(n) RETURN n$$) as (n agtype);
count
-------
72538
72539
(1 row)

SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH (a)-[e]->(b) RETURN e$$) as (n agtype);
Expand All @@ -110,6 +167,17 @@ SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH (a)-[e]->(b) RETURN e$$
72485
(1 row)

--
-- Load countries and cities without id
--
-- Should load countries in Country label without error since it should use sequence now
SELECT load_labels_from_file('agload_test_graph', 'Country',
'age_load/countries.csv', false);
load_labels_from_file
-----------------------

(1 row)

SELECT create_vlabel('agload_test_graph','Country2');
NOTICE: VLabel "Country2" has been created
create_vlabel
Expand Down Expand Up @@ -153,6 +221,7 @@ SELECT COUNT(*) FROM agload_test_graph."City2";
SELECT id FROM agload_test_graph."Country" LIMIT 10;
id
-----------------
844424930131969
844424930131970
844424930131971
844424930131974
Expand All @@ -162,7 +231,6 @@ SELECT id FROM agload_test_graph."Country" LIMIT 10;
844424930131996
844424930132002
844424930132023
844424930132025
(10 rows)

SELECT id FROM agload_test_graph."Country2" LIMIT 10;
Expand All @@ -180,42 +248,57 @@ SELECT id FROM agload_test_graph."Country2" LIMIT 10;
1688849860263946
(10 rows)

-- Should return 2 rows for Country with same properties, but different ids
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country {iso2 : 'BE'})
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
id(n) | n.name | n.iso2
-----------------+-----------+--------
844424930131990 | "Belgium" | "BE"
(1 row)
844424930132223 | "Belgium" | "BE"
(2 rows)

-- Should return 1 row
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country2 {iso2 : 'BE'})
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
id(n) | n.name | n.iso2
------------------+-----------+--------
1688849860263942 | "Belgium" | "BE"
(1 row)

-- Should return 2 rows for Country with same properties, but different ids
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country {iso2 : 'AT'})
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
id(n) | n.name | n.iso2
-----------------+-----------+--------
844424930131983 | "Austria" | "AT"
(1 row)
844424930132221 | "Austria" | "AT"
(2 rows)

-- Should return 1 row
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country2 {iso2 : 'AT'})
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
id(n) | n.name | n.iso2
------------------+-----------+--------
1688849860263940 | "Austria" | "AT"
(1 row)

-- Should return 2 rows for Country with same properties, but different ids
SELECT * FROM cypher('agload_test_graph', $$
MATCH (u:Country {region : "Europe"})
WHERE u.name =~ 'Cro.*'
RETURN u.name, u.region
$$) AS (result_1 agtype, result_2 agtype);
result_1 | result_2
-----------+----------
"Croatia" | "Europe"
RETURN id(u), u.name, u.region
$$) AS ("id(u)" agtype, result_1 agtype, result_2 agtype);
id(u) | result_1 | result_2
-----------------+-----------+----------
844424930132023 | "Croatia" | "Europe"
844424930132226 | "Croatia" | "Europe"
(2 rows)

-- There shouldn't be any duplicates
SELECT * FROM cypher('agload_test_graph', $$return graph_stats('agload_test_graph')$$) as (a agtype);
a
------------------------------------------------------------------------------------------
{"graph": "agload_test_graph", "num_loaded_edges": 72485, "num_loaded_vertices": 145130}
(1 row)

SELECT drop_graph('agload_test_graph', true);
Expand All @@ -236,22 +319,11 @@ NOTICE: graph "agload_test_graph" has been dropped
--
-- Test property type conversion
--
SELECT create_graph('agload_conversion');
NOTICE: graph "agload_conversion" has been created
create_graph
--------------

(1 row)

-- vertex: load as agtype
SELECT create_vlabel('agload_conversion','Person1');
NOTICE: VLabel "Person1" has been created
create_vlabel
---------------

(1 row)

-- Should create graph and label automatically
SELECT load_labels_from_file('agload_conversion', 'Person1', 'age_load/conversion_vertices.csv', true, true);
NOTICE: graph "agload_conversion" has been created
NOTICE: VLabel "Person1" has been created
load_labels_from_file
-----------------------

Expand Down
77 changes: 69 additions & 8 deletions regress/sql/age_load.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,65 @@
LOAD 'age';

SET search_path TO ag_catalog;

-- Create a country using CREATE clause
SELECT create_graph('agload_test_graph');

SELECT create_vlabel('agload_test_graph','Country');
SELECT * FROM cypher('agload_test_graph', $$CREATE (n:Country {__id__:1}) RETURN n$$) as (n agtype);

--
-- Load countries with id
--
SELECT load_labels_from_file('agload_test_graph', 'Country',
'age_load/countries.csv', true);

-- A temporary table should have been created with 54 ids; 1 from CREATE and 53 from file
SELECT COUNT(*)=54 FROM "_agload_test_graph_ag_vertex_ids";

-- Sequence should be equal to max entry id i.e. 248
SELECT currval('agload_test_graph."Country_id_seq"')=248;

-- Should error out on loading the same file again due to duplicate id
SELECT load_labels_from_file('agload_test_graph', 'Country',
'age_load/countries.csv');
'age_load/countries.csv', true);

--
-- Load cities with id
--

-- Should create City label automatically and load cities
SELECT load_labels_from_file('agload_test_graph', 'City',
'age_load/cities.csv', true);

-- Temporary table should have 54+72485 rows now
SELECT COUNT(*)=54+72485 FROM "_agload_test_graph_ag_vertex_ids";

SELECT create_vlabel('agload_test_graph','City');
-- Sequence should be equal to max entry id i.e. 146941
SELECT currval('agload_test_graph."City_id_seq"')=146941;

-- Should error out on loading the same file again due to duplicate id
SELECT load_labels_from_file('agload_test_graph', 'City',
'age_load/cities.csv');
'age_load/cities.csv', true);

--
-- Load edges -- Connects cities to countries
--

-- Should error out for using vertex label
SELECT load_edges_from_file('agload_test_graph', 'Country',
'age_load/edges.csv');

SELECT create_elabel('agload_test_graph','has_city');
SELECT load_edges_from_file('agload_test_graph', 'has_city',
'age_load/edges.csv');

-- Sequence should be equal to number of edges loaded i.e. 72485
SELECT currval('agload_test_graph."has_city_id_seq"')=72485;

-- Should error out for using edge label
SELECT load_labels_from_file('agload_test_graph', 'has_city',
'age_load/cities.csv');

SELECT table_catalog, table_schema, lower(table_name) as table_name, table_type
FROM information_schema.tables
WHERE table_schema = 'agload_test_graph' ORDER BY table_name ASC;
Expand All @@ -48,6 +93,14 @@ SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH(n) RETURN n$$) as (n ag

SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH (a)-[e]->(b) RETURN e$$) as (n agtype);

--
-- Load countries and cities without id
--

-- Should load countries in Country label without error since it should use sequence now
SELECT load_labels_from_file('agload_test_graph', 'Country',
'age_load/countries.csv', false);

SELECT create_vlabel('agload_test_graph','Country2');
SELECT load_labels_from_file('agload_test_graph', 'Country2',
'age_load/countries.csv', false);
Expand All @@ -62,31 +115,39 @@ SELECT COUNT(*) FROM agload_test_graph."City2";
SELECT id FROM agload_test_graph."Country" LIMIT 10;
SELECT id FROM agload_test_graph."Country2" LIMIT 10;

-- Should return 2 rows for Country with same properties, but different ids
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country {iso2 : 'BE'})
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
-- Should return 1 row
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country2 {iso2 : 'BE'})
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);

-- Should return 2 rows for Country with same properties, but different ids
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country {iso2 : 'AT'})
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
-- Should return 1 row
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country2 {iso2 : 'AT'})
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);

-- Should return 2 rows for Country with same properties, but different ids
SELECT * FROM cypher('agload_test_graph', $$
MATCH (u:Country {region : "Europe"})
WHERE u.name =~ 'Cro.*'
RETURN u.name, u.region
$$) AS (result_1 agtype, result_2 agtype);
RETURN id(u), u.name, u.region
$$) AS ("id(u)" agtype, result_1 agtype, result_2 agtype);

-- There shouldn't be any duplicates
SELECT * FROM cypher('agload_test_graph', $$return graph_stats('agload_test_graph')$$) as (a agtype);

SELECT drop_graph('agload_test_graph', true);

--
-- Test property type conversion
--
SELECT create_graph('agload_conversion');

-- vertex: load as agtype
SELECT create_vlabel('agload_conversion','Person1');

-- Should create graph and label automatically
SELECT load_labels_from_file('agload_conversion', 'Person1', 'age_load/conversion_vertices.csv', true, true);
SELECT * FROM cypher('agload_conversion', $$ MATCH (n:Person1) RETURN properties(n) $$) as (a agtype);

Expand Down
Loading

0 comments on commit 750a09e

Please sign in to comment.