From 9ee4ca124f102bf4c2d0e676301103154b53f65d Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 18:45:36 +0000 Subject: [PATCH 01/22] Enable sql-formatter --- .trunk/configs/.sql-formatter.json | 8 ++++++++ .trunk/trunk.yaml | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 .trunk/configs/.sql-formatter.json diff --git a/.trunk/configs/.sql-formatter.json b/.trunk/configs/.sql-formatter.json new file mode 100644 index 00000000..20db85d9 --- /dev/null +++ b/.trunk/configs/.sql-formatter.json @@ -0,0 +1,8 @@ +{ + "language": "postgresql", + "dialect": "postgresql", + "tabWidth": 2, + "keywordCase": "upper", + "linesBetweenQueries": 2, + "denseOperators": false +} diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 4ea4e0b2..d76187cf 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -5,6 +5,14 @@ version: 0.1 cli: version: 1.22.0 +tools: + definitions: + - name: sql-formatter + runtime: node + package: sql-formatter + shims: [sql-formatter] + known_good_version: 7.0.1 + # Trunk provides extensibility via plugins. # (https://docs.trunk.io/plugins) plugins: @@ -34,6 +42,24 @@ lint: - name: lint run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target} + - name: sql-formatter + files: [sql] + description: A SQL formatter + commands: + - output: rewrite + # Force postgresql dialect + run: sql-formatter -l postgresql + success_codes: [0] + formatter: true + stdin: true + # Linter does not support batching. 
+ tools: [sql-formatter] + known_good_version: 7.0.1 + suggest_if: never + version_command: + parse_regex: ${semver} + run: sql-formatter --version + ignore: - linters: [ALL] paths: @@ -45,6 +71,7 @@ lint: - LICENSE.md enabled: + - sql-formatter@15.4.5 - actionlint@1.6.27 - bandit@1.7.8 - black@24.4.2 From 047766a85f086fc0986a6f2b49fee9d73fa219e8 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 18:55:07 +0000 Subject: [PATCH 02/22] Fix formatting --- app/repository/sql/download.sql | 538 ++++++++++++++++------------- app/repository/sql/pipeline.sql | 475 ++++++++++++------------- app/repository/sql/slug_lookup.sql | 38 +- 3 files changed, 554 insertions(+), 497 deletions(-) diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 8a807080..840d387b 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -1,243 +1,319 @@ WITH -deduplicated_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( + deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id AS "family_import_id", + count(*) AS COUNT + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + count(*) > 1 + ) duplicates + LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id AS "family_import_id", + count(*) AS COUNT + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + count(*) = 1 + ) non_duplicates + LEFT JOIN slug ON 
non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + most_recent_family_slugs AS ( SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc -), -unique_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( + deduplicated_family_slugs.family_import_id AS "family_import_id", + deduplicated_family_slugs.created AS "created", + deduplicated_family_slugs.name AS "name" + FROM + deduplicated_family_slugs + UNION ALL SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_family_slugs as ( - SELECT - deduplicated_family_slugs.family_import_id as "family_import_id", - deduplicated_family_slugs.created as "created", - deduplicated_family_slugs.name as "name" - FROM deduplicated_family_slugs - UNION ALL - SELECT - unique_family_slugs.family_import_id as "family_import_id", - unique_family_slugs.created as "created", - unique_family_slugs.name as "name" - FROM unique_family_slugs - order by family_import_id desc, created desc - ), deduplicated_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( + unique_family_slugs.family_import_id AS "family_import_id", + unique_family_slugs.created AS "created", + unique_family_slugs.name AS "name" + FROM 
+ unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC + ), + deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id AS "family_document_import_id", + count(*) AS COUNT + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + count(*) > 1 + ) duplicates + LEFT JOIN slug ON duplicates.family_document_import_id = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id AS "family_document_import_id", + count(*) AS COUNT + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + count(*) = 1 + ) non_duplicates + LEFT JOIN slug ON non_duplicates.family_document_import_id = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + most_recent_doc_slugs AS ( SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by slug.family_document_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc -), -unique_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( + deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + 
deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id AS "family_document_import_id", + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC + ), + event_dates AS ( SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by slug.family_document_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_doc_slugs as ( - SELECT - deduplicated_doc_slugs.family_document_import_id - as "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM deduplicated_doc_slugs - UNION ALL - SELECT - unique_doc_slugs.family_document_import_id as "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM unique_doc_slugs - order by family_document_import_id desc, created desc - ), event_dates as ( - SELECT family_event.family_import_id AS family_import_id, CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) + WHEN COUNT(*) FILTER ( + WHERE + family_event.event_type_name = ( + family_event.valid_metadata -> 'datetime_event_name' ->> 0 + ) + ) > 0 THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata -> 'datetime_event_name' ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) END AS published_date, max(family_event.date::date) last_changed - FROM + 
FROM family_event - GROUP BY + GROUP BY family_import_id -) + ) SELECT -ds.name as "Document ID", -p.title as "Document Title", -fs.name as "Family ID", -f.title as "Family Title", -f.description as "Family Summary", -n1.collection_titles as "Collection Title(s)", -n1.collection_descriptions as "Collection Description(s)", -INITCAP(d.valid_metadata::json#>>'{ - role,0}') as -"Document Role", -d.variant_name as "Document Variant", -p.source_url as "Document Content URL", -INITCAP(d.valid_metadata::json#>>'{ - type,0}') as -"Document Type", -CASE - WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' - ELSE INITCAP(f.family_category::TEXT) -END "Category", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'framework')), ';') -as "Framework", -n2.language as "Language", -o.name as "Source", -fg.geo_isos as "Geography ISOs", -fg.geo_display_values as "Geographies", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'topic')), ';') -as "Topic/Response", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'hazard')), ';') -as "Hazard", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'sector')), ';') -as "Sector", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'keyword')), ';') -as "Keyword", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'instrument')), ';') -as "Instrument", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author')), ';') -as "Author", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author_type')), ';') -as "Author Type", -fp.published_date as "First event in timeline", -fp.last_changed as "Last event in timeline", -n3.event_type_names as "Full timeline of events (types)", -n3.event_dates as "Full timeline of events (dates)", -d.created::date as "Date Added to System", -f.last_modified::date as "Last ModIFied on System", -d.import_id as "Internal Document ID", -f.import_id as "Internal Family ID", 
-n1.collection_import_ids as "Internal Collection ID(s)" -FROM physical_document p -JOIN family_document d -ON p.id = d.physical_document_id -JOIN family f -ON d.family_import_id = f.import_id -FULL JOIN ( - SELECT - family_geography.family_import_id as "family_import_id", - string_agg(geography.value, ';') AS geo_isos, - string_agg(geography.display_value, ';') AS geo_display_values - FROM - geography - INNER JOIN family_geography - ON geography.id = family_geography.geography_id - GROUP BY family_geography.family_import_id -) fg ON fg.family_import_id=f.import_id -join family_corpus fc -on f.import_id = fc.family_import_id -join corpus c -on fc.corpus_import_id = c.import_id -join organisation o -on c.organisation_id = o.id -join family_metadata fm -on fm.family_import_id = f.import_id -FULL JOIN ( - SELECT - collection_family.family_import_id as "family_import_id", - string_agg(collection.import_id, ';') AS collection_import_ids, - string_agg(collection.title, ';') AS collection_titles, - string_agg(collection.description, ';') AS collection_descriptions - FROM - collection - INNER JOIN collection_family - ON collection_family.collection_import_id = collection.import_id - GROUP BY collection_family.family_import_id -) n1 ON n1.family_import_id=f.import_id -left JOIN ( - SELECT - p.id as "id", - string_agg(l.name, ';' ORDER BY l.name) AS language - FROM physical_document p - left join physical_document_language pdl - on pdl.document_id = p.id - left join language l - on l.id = pdl.language_id - GROUP BY p.id -) n2 ON n2.id=d.physical_document_id -FULL JOIN ( - SELECT - family_event.family_import_id, - string_agg(family_event.import_id, ';') AS event_import_ids, - string_agg(family_event.title, ';') AS event_titles, - string_agg(family_event.event_type_name, ';') AS event_type_names, - string_agg(family_event.date::date::text, ';') AS event_dates - FROM family_event - INNER JOIN family ON family.import_id = family_event.family_import_id - GROUP BY 
family_event.family_import_id -) n3 ON n3.family_import_id=f.import_id -LEFT JOIN most_recent_doc_slugs ds -on ds.family_document_import_id = d.import_id -LEFT JOIN most_recent_family_slugs fs on fs.family_import_id = f.import_id -LEFT JOIN event_dates fp on fp.family_import_id = f.import_id -WHERE d.last_modified < '{ingest_cycle_start}' AND fc.corpus_import_id in ({allowed_corpora_ids}) -ORDER BY d.last_modified desc, d.created desc, d.ctid desc, n1.family_import_id + ds.name AS "Document ID", + p.title AS "Document Title", + fs.name AS "Family ID", + f.title AS "Family Title", + f.description AS "Family Summary", + n1.collection_titles AS "Collection Title(s)", + n1.collection_descriptions AS "Collection Description(s)", + INITCAP(d.valid_metadata::json #>> '{ + role,0}') AS "Document Role", + d.variant_name AS "Document Variant", + p.source_url AS "Document Content URL", + INITCAP(d.valid_metadata::json #>> '{ + type,0}') AS "Document Type", + CASE + WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' + ELSE INITCAP(f.family_category::TEXT) + END "Category", + array_to_string( + ARRAY( + SELECT + jsonb_array_elements_text(fm.value -> 'framework') + ), + ';' + ) AS "Framework", + n2.language AS "Language", + o.name AS "Source", + fg.geo_isos AS "Geography ISOs", + fg.geo_display_values AS "Geographies", + array_to_string( + ARRAY( + SELECT + jsonb_array_elements_text(fm.value -> 'topic') + ), + ';' + ) AS "Topic/Response", + array_to_string( + ARRAY( + SELECT + jsonb_array_elements_text(fm.value -> 'hazard') + ), + ';' + ) AS "Hazard", + array_to_string( + ARRAY( + SELECT + jsonb_array_elements_text(fm.value -> 'sector') + ), + ';' + ) AS "Sector", + array_to_string( + ARRAY( + SELECT + jsonb_array_elements_text(fm.value -> 'keyword') + ), + ';' + ) AS "Keyword", + array_to_string( + ARRAY( + SELECT + jsonb_array_elements_text(fm.value -> 'instrument') + ), + ';' + ) AS "Instrument", + array_to_string( + ARRAY( + SELECT + jsonb_array_elements_text(fm.value -> 
'author') + ), + ';' + ) AS "Author", + array_to_string( + ARRAY( + SELECT + jsonb_array_elements_text(fm.value -> 'author_type') + ), + ';' + ) AS "Author Type", + fp.published_date AS "First event in timeline", + fp.last_changed AS "Last event in timeline", + n3.event_type_names AS "Full timeline of events (types)", + n3.event_dates AS "Full timeline of events (dates)", + d.created::date AS "Date Added to System", + f.last_modified::date AS "Last ModIFied on System", + d.import_id AS "Internal Document ID", + f.import_id AS "Internal Family ID", + n1.collection_import_ids AS "Internal Collection ID(s)" +FROM + physical_document p + JOIN family_document d ON p.id = d.physical_document_id + JOIN FAMILY f ON d.family_import_id = f.import_id + FULL JOIN ( + SELECT + family_geography.family_import_id AS "family_import_id", + string_agg(geography.value, ';') AS geo_isos, + string_agg(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN family_geography ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id + ) fg ON fg.family_import_id = f.import_id + JOIN family_corpus fc ON f.import_id = fc.family_import_id + JOIN corpus c ON fc.corpus_import_id = c.import_id + JOIN organisation o ON c.organisation_id = o.id + JOIN family_metadata fm ON fm.family_import_id = f.import_id + FULL JOIN ( + SELECT + collection_family.family_import_id AS "family_import_id", + string_agg(collection.import_id, ';') AS collection_import_ids, + string_agg(collection.title, ';') AS collection_titles, + string_agg(collection.description, ';') AS collection_descriptions + FROM + collection + INNER JOIN collection_family ON collection_family.collection_import_id = collection.import_id + GROUP BY + collection_family.family_import_id + ) n1 ON n1.family_import_id = f.import_id + LEFT JOIN ( + SELECT + p.id AS "id", + string_agg( + l.name, + ';' + ORDER BY + l.name + ) AS language + FROM + physical_document p + LEFT JOIN 
physical_document_language pdl ON pdl.document_id = p.id + LEFT JOIN language l ON l.id = pdl.language_id + GROUP BY + p.id + ) n2 ON n2.id = d.physical_document_id + FULL JOIN ( + SELECT + family_event.family_import_id, + string_agg(family_event.import_id, ';') AS event_import_ids, + string_agg(family_event.title, ';') AS event_titles, + string_agg(family_event.event_type_name, ';') AS event_type_names, + string_agg(family_event.date::date::text, ';') AS event_dates + FROM + family_event + INNER JOIN FAMILY ON FAMILY.import_id = family_event.family_import_id + GROUP BY + family_event.family_import_id + ) n3 ON n3.family_import_id = f.import_id + LEFT JOIN most_recent_doc_slugs ds ON ds.family_document_import_id = d.import_id + LEFT JOIN most_recent_family_slugs fs ON fs.family_import_id = f.import_id + LEFT JOIN event_dates fp ON fp.family_import_id = f.import_id +WHERE + d.last_modified < '{ingest_cycle_start}' + AND fc.corpus_import_id IN ({allowed_corpora_ids}) +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + n1.family_import_id diff --git a/app/repository/sql/pipeline.sql b/app/repository/sql/pipeline.sql index af6023e6..37bb5455 100644 --- a/app/repository/sql/pipeline.sql +++ b/app/repository/sql/pipeline.sql @@ -1,256 +1,233 @@ -WITH deduplicated_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name +WITH + deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count + ( + SELECT + slug.family_import_id AS "family_import_id", + Count(*) AS count FROM - slug + slug WHERE - slug.family_import_id IS NOT NULL + slug.family_import_id IS NOT NULL GROUP BY - slug.family_import_id + slug.family_import_id HAVING - Count(*) > 1 ) duplicates - left join - slug - ON duplicates.family_import_id = slug.family_import_id + Count(*) > 1 + ) 
duplicates + left join slug ON duplicates.family_import_id = slug.family_import_id ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - unique_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id AS "family_import_id", + Count(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + Count(*) = 1 + ) non_duplicates + left join slug ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id AS "family_import_id", + deduplicated_family_slugs.created AS "created", + deduplicated_family_slugs.name AS "name" + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id AS "family_import_id", + unique_family_slugs.created AS "created", + unique_family_slugs.name AS "name" + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC + ), + deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id AS "family_document_import_id", + Count(*) AS count FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_import_id IS NOT NULL - GROUP BY - slug.family_import_id - HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - 
most_recent_family_slugs AS ( SELECT - deduplicated_family_slugs.family_import_id AS "family_import_id", - deduplicated_family_slugs.created AS "created", - deduplicated_family_slugs.name AS "name" - FROM - deduplicated_family_slugs - UNION - ALL SELECT - unique_family_slugs.family_import_id AS "family_import_id", - unique_family_slugs.created AS "created", - unique_family_slugs.name AS "name" - FROM - unique_family_slugs - ORDER BY - family_import_id DESC, - created DESC ), deduplicated_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) > 1 ) duplicates - left join - slug - ON duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - unique_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - most_recent_doc_slugs AS ( - SELECT - deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM - deduplicated_doc_slugs - UNION - ALL SELECT - unique_doc_slugs.family_document_import_id AS "family_document_import_id", - unique_doc_slugs.created, - 
unique_doc_slugs.name - FROM - unique_doc_slugs - ORDER BY - family_document_import_id DESC, - created DESC - ), event_dates AS ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) - END AS published_date - FROM - family_event - GROUP BY - family_import_id - ) SELECT - f.title AS "family_title", - p.title AS "physical_document_title", - f.description AS "family_description", - CASE - WHEN f.family_category IN ('UNFCCC', - 'MCF') THEN Upper(f.family_category::text) - ELSE Initcap(f.family_category::text) - END "family_category", - fp.published_date AS "family_published_date", - d.import_id AS "family_document_import_id", - ds.name AS "family_document_slug", - f.import_id AS "family_import_id", - fs.name AS "family_slug", - p.source_url AS "physical_document_source_url", - d.valid_metadata::json#>>'{type,0}' AS "family_document_type", - o.name AS "organisation_name", - geos.geographies AS "geographies", - c.import_id AS "corpus_import_id", - c.corpus_type_name AS "corpus_type_name", - langs.languages AS "languages", - fm.value AS "family_metadata", - d.valid_metadata AS "family_document_metadata" - FROM - physical_document p - join - family_document d - ON p.id = d.physical_document_id - join - family f - ON d.family_import_id = f.import_id full - join - ( - SELECT - family_geography.family_import_id AS "family_import_id", - string_agg(geography.value, - ';') AS geo_isos, - string_agg(geography.display_value, - ';') AS geo_display_values - FROM - geography - inner join - family_geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) fg - ON 
fg.family_import_id=f.import_id - join - family_corpus fc - ON f.import_id = fc.family_import_id - join - corpus c - ON fc.corpus_import_id = c.import_id - join - organisation o - ON c.organisation_id = o.id - join - family_metadata fm - ON fm.family_import_id = f.import_id - left outer join - ( - SELECT - family_document.import_id AS family_document_import_id, - json_agg(DISTINCT(LANGUAGE.name)) AS languages - FROM - family_document - join - physical_document_language - ON physical_document_language.document_id = family_document.physical_document_id - join - LANGUAGE - ON LANGUAGE.id = physical_document_language.language_id - GROUP BY - family_document.import_id - ) AS langs - ON langs.family_document_import_id = d.import_id - left outer join - ( - SELECT - family_geography.family_import_id AS family_import_id, - json_agg(DISTINCT(geography.value)) AS geographies - FROM - family_geography - join - geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) AS geos - ON geos.family_import_id = f.import_id - left join - most_recent_doc_slugs ds - ON ds.family_document_import_id = d.import_id - left join - most_recent_family_slugs fs - ON fs.family_import_id = f.import_id - left join - event_dates fp - ON fp.family_import_id = f.import_id - WHERE - d.document_status != 'DELETED' - AND fg.family_import_id = f.import_id - ORDER BY - d.last_modified DESC, - d.created DESC, - d.ctid DESC, - f.import_id + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + Count(*) > 1 + ) duplicates + left join slug ON duplicates.family_document_import_id = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id AS 
"family_document_import_id", + Count(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + Count(*) = 1 + ) non_duplicates + left join slug ON non_duplicates.family_document_import_id = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id AS "family_document_import_id", + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC + ), + event_dates AS ( + SELECT + family_event.family_import_id AS family_import_id, + CASE + WHEN COUNT(*) FILTER ( + WHERE + family_event.event_type_name = ( + family_event.valid_metadata -> 'datetime_event_name' ->> 0 + ) + ) > 0 THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata -> 'datetime_event_name' ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date + FROM + family_event + GROUP BY + family_import_id + ) +SELECT + f.title AS "family_title", + p.title AS "physical_document_title", + f.description AS "family_description", + CASE + WHEN f.family_category IN ('UNFCCC', 'MCF') THEN Upper(f.family_category::text) + ELSE Initcap(f.family_category::text) + END "family_category", + fp.published_date AS "family_published_date", + d.import_id AS "family_document_import_id", + ds.name AS "family_document_slug", + f.import_id AS "family_import_id", + fs.name AS "family_slug", + p.source_url AS "physical_document_source_url", + d.valid_metadata::json #>> '{type,0}' AS "family_document_type", + o.name AS "organisation_name", + 
geos.geographies AS "geographies", + c.import_id AS "corpus_import_id", + c.corpus_type_name AS "corpus_type_name", + langs.languages AS "languages", + fm.value AS "family_metadata", + d.valid_metadata AS "family_document_metadata" +FROM + physical_document p + join family_document d ON p.id = d.physical_document_id + join family f ON d.family_import_id = f.import_id + full join ( + SELECT + family_geography.family_import_id AS "family_import_id", + string_agg(geography.value, ';') AS geo_isos, + string_agg(geography.display_value, ';') AS geo_display_values + FROM + geography + inner join family_geography ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id + ) fg ON fg.family_import_id = f.import_id + join family_corpus fc ON f.import_id = fc.family_import_id + join corpus c ON fc.corpus_import_id = c.import_id + join organisation o ON c.organisation_id = o.id + join family_metadata fm ON fm.family_import_id = f.import_id + left outer join ( + SELECT + family_document.import_id AS family_document_import_id, + json_agg(DISTINCT (LANGUAGE.name)) AS languages + FROM + family_document + join physical_document_language ON physical_document_language.document_id = family_document.physical_document_id + join LANGUAGE ON LANGUAGE.id = physical_document_language.language_id + GROUP BY + family_document.import_id + ) AS langs ON langs.family_document_import_id = d.import_id + left outer join ( + SELECT + family_geography.family_import_id AS family_import_id, + json_agg(DISTINCT (geography.value)) AS geographies + FROM + family_geography + join geography ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id + ) AS geos ON geos.family_import_id = f.import_id + left join most_recent_doc_slugs ds ON ds.family_document_import_id = d.import_id + left join most_recent_family_slugs fs ON fs.family_import_id = f.import_id + left join event_dates fp ON fp.family_import_id = f.import_id +WHERE + 
d.document_status != 'DELETED' + AND fg.family_import_id = f.import_id +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + f.import_id diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 9d649067..6ae319c5 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -1,20 +1,24 @@ SELECT - slug.family_document_import_id, slug.family_import_id -FROM slug -LEFT JOIN family ON family.import_id = slug.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}) - + slug.family_document_import_id, + slug.family_import_id +FROM + slug + LEFT JOIN family ON family.import_id = slug.family_import_id + LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id + LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id +WHERE + slug.name = '{slug_name}' + AND corpus.import_id IN ({allowed_corpora_ids}) UNION - SELECT - slug.family_document_import_id, slug.family_import_id -FROM slug -LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id -LEFT JOIN family ON family.import_id = family_document.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}); + slug.family_document_import_id, + slug.family_import_id +FROM + slug + LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id + LEFT JOIN family ON family.import_id = family_document.family_import_id + LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id + LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id +WHERE + slug.name = 
'{slug_name}' + AND corpus.import_id IN ({allowed_corpora_ids}); From 22123c168f29fcda10be805a69f246cc3754ad17 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 18:55:52 +0000 Subject: [PATCH 03/22] Replace with variables & bind --- app/repository/document.py | 21 ++------------------- app/repository/download.py | 29 ++++++++++++----------------- app/repository/sql/download.sql | 4 ++-- app/repository/sql/slug_lookup.sql | 8 ++++---- 4 files changed, 20 insertions(+), 42 deletions(-) diff --git a/app/repository/document.py b/app/repository/document.py index 57579a26..bb8f703d 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -42,22 +42,6 @@ _LOGGER = logging.getLogger(__file__) -def get_slugged_object_from_allowed_corpora_query( - template_query, slug_name: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. - - :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we - should allow the data to be dumped. - :return str: The SQL query to perform on the database session. 
- """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace("{slug_name}", slug_name).replace( # type: ignore - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - def get_slugged_objects( db: Session, slug: str, allowed_corpora: Optional[list[str]] = None ) -> tuple[Optional[str], Optional[str]]: @@ -78,10 +62,9 @@ def get_slugged_objects( query_template = get_query_template( os.path.join("app", "repository", "sql", "slug_lookup.sql") ) - query = get_slugged_object_from_allowed_corpora_query( - query_template, slug, allowed_corpora + query = db.execute( + query_template, {"slug_name": slug, "allowed_corpora_ids": slug} ) - query = db.execute(query) else: query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( Slug.name == slug diff --git a/app/repository/download.py b/app/repository/download.py index 1ed90396..1b928e69 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -12,32 +12,27 @@ _LOGGER = getLogger(__name__) -def create_query( - template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. +def get_whole_database_dump( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + """Get whole database dump and bind variables. :param str ingest_cycle_start: The current ingest cycle date. :param list[str] allowed_corpora_ids: The corpora from which we should allow the data to be dumped. :return str: The SQL query to perform on the database session. 
""" - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace( # type: ignore - "{ingest_cycle_start}", ingest_cycle_start - ).replace( - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - -def get_whole_database_dump( - ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) -): query_template = get_query_template( os.path.join("app", "repository", "sql", "download.sql") ) - query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) with db.connection() as conn: - df = pd.read_sql(query, conn.connection) + df = pd.read_sql( + query_template, + conn.connection, + params={ + "ingest_cycle_start": ingest_cycle_start, + "allowed_corpora_ids": allowed_corpora_ids, + }, + ) return df diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 840d387b..1af29ac4 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -310,8 +310,8 @@ FROM LEFT JOIN most_recent_family_slugs fs ON fs.family_import_id = f.import_id LEFT JOIN event_dates fp ON fp.family_import_id = f.import_id WHERE - d.last_modified < '{ingest_cycle_start}' - AND fc.corpus_import_id IN ({allowed_corpora_ids}) + d.last_modified < ':ingest_cycle_start' + AND fc.corpus_import_id in ':allowed_corpora_ids' ORDER BY d.last_modified DESC, d.created DESC, diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 6ae319c5..210c9dec 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -7,8 +7,8 @@ FROM LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id WHERE - slug.name = '{slug_name}' - AND corpus.import_id IN ({allowed_corpora_ids}) + slug.name = ':slug_name' + AND corpus.import_id IN ':allowed_corpora_ids' UNION SELECT slug.family_document_import_id, @@ -20,5 +20,5 @@ FROM LEFT JOIN family_corpus ON 
family_corpus.family_import_id = family.import_id LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id WHERE - slug.name = '{slug_name}' - AND corpus.import_id IN ({allowed_corpora_ids}); + slug.name = ':slug_name' + AND corpus.import_id IN ':allowed_corpora_ids'; From 5a7e170d4b66929d9b99475a8f3c83e76eada891 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 18:56:24 +0000 Subject: [PATCH 04/22] Update .git-blame-ignore-revs --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 9a66c0e8..cfd5e1f0 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -23,3 +23,6 @@ # Updating the test data file for document passages to be indent=2 44624dcd1fa0835708bd9187a39bb0da8a31cd03 + +# Fix SQL query formatting +047766a85f086fc0986a6f2b49fee9d73fa219e8 From 13492fae2aeb38f5ca2cd8195cb0a8c677b35b90 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Tue, 5 Nov 2024 19:10:35 +0000 Subject: [PATCH 05/22] Fix formatting --- .trunk/configs/.sql-formatter.json | 5 +- app/repository/sql/download.sql | 606 ++++++++++++++--------------- app/repository/sql/pipeline.sql | 436 ++++++++++----------- app/repository/sql/slug_lookup.sql | 34 +- 4 files changed, 542 insertions(+), 539 deletions(-) diff --git a/.trunk/configs/.sql-formatter.json b/.trunk/configs/.sql-formatter.json index 20db85d9..e4046a0f 100644 --- a/.trunk/configs/.sql-formatter.json +++ b/.trunk/configs/.sql-formatter.json @@ -1,8 +1,11 @@ { "language": "postgresql", "dialect": "postgresql", - "tabWidth": 2, + "tabWidth": 4, "keywordCase": "upper", + "dataTypeCase": "upper", + "identifierCase": "lower", + "functionCase": "upper", "linesBetweenQueries": 2, "denseOperators": false } diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 1af29ac4..cf8084b2 100644 --- 
a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -1,319 +1,319 @@ WITH - deduplicated_family_slugs AS ( - SELECT DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( + deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id AS "family_import_id", + COUNT(*) AS COUNT + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) duplicates + LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id AS "family_import_id", + COUNT(*) AS COUNT + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) = 1 + ) non_duplicates + LEFT JOIN slug ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + most_recent_family_slugs AS ( SELECT - slug.family_import_id AS "family_import_id", - count(*) AS COUNT + deduplicated_family_slugs.family_import_id AS "family_import_id", + deduplicated_family_slugs.created AS "created", + deduplicated_family_slugs.name AS "name" FROM - slug - WHERE - slug.family_import_id IS NOT NULL - GROUP BY - slug.family_import_id - HAVING - count(*) > 1 - ) duplicates - LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - unique_family_slugs AS ( - SELECT DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( + deduplicated_family_slugs + UNION ALL SELECT - 
slug.family_import_id AS "family_import_id", - count(*) AS COUNT + unique_family_slugs.family_import_id AS "family_import_id", + unique_family_slugs.created AS "created", + unique_family_slugs.name AS "name" FROM - slug - WHERE - slug.family_import_id IS NOT NULL - GROUP BY - slug.family_import_id - HAVING - count(*) = 1 - ) non_duplicates - LEFT JOIN slug ON non_duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - most_recent_family_slugs AS ( - SELECT - deduplicated_family_slugs.family_import_id AS "family_import_id", - deduplicated_family_slugs.created AS "created", - deduplicated_family_slugs.name AS "name" - FROM - deduplicated_family_slugs - UNION ALL - SELECT - unique_family_slugs.family_import_id AS "family_import_id", - unique_family_slugs.created AS "created", - unique_family_slugs.name AS "name" - FROM - unique_family_slugs - ORDER BY - family_import_id DESC, - created DESC - ), - deduplicated_doc_slugs AS ( - SELECT DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC + ), + deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id AS "family_document_import_id", + COUNT(*) AS COUNT + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) > 1 + ) duplicates + LEFT JOIN slug ON duplicates.family_document_import_id = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id 
AS "family_document_import_id", + COUNT(*) AS COUNT + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) non_duplicates + LEFT JOIN slug ON non_duplicates.family_document_import_id = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + most_recent_doc_slugs AS ( SELECT - slug.family_document_import_id AS "family_document_import_id", - count(*) AS COUNT + deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - count(*) > 1 - ) duplicates - LEFT JOIN slug ON duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - unique_doc_slugs AS ( - SELECT DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( + deduplicated_doc_slugs + UNION ALL SELECT - slug.family_document_import_id AS "family_document_import_id", - count(*) AS COUNT + unique_doc_slugs.family_document_import_id AS "family_document_import_id", + unique_doc_slugs.created, + unique_doc_slugs.name FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC + ), + event_dates AS ( + SELECT + family_event.family_import_id AS family_import_id, + CASE + WHEN COUNT(*) FILTER ( + WHERE + family_event.event_type_name = ( + family_event.valid_metadata -> 'datetime_event_name' ->> 0 + ) + ) > 0 THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata -> 'datetime_event_name' ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS 
published_date, + MAX(family_event.date::date) last_changed + FROM + family_event GROUP BY - slug.family_document_import_id - HAVING - count(*) = 1 - ) non_duplicates - LEFT JOIN slug ON non_duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - most_recent_doc_slugs AS ( - SELECT - deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM - deduplicated_doc_slugs - UNION ALL - SELECT - unique_doc_slugs.family_document_import_id AS "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM - unique_doc_slugs - ORDER BY - family_document_import_id DESC, - created DESC - ), - event_dates AS ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE - family_event.event_type_name = ( - family_event.valid_metadata -> 'datetime_event_name' ->> 0 - ) - ) > 0 THEN MIN( - CASE - WHEN family_event.event_type_name = ( - family_event.valid_metadata -> 'datetime_event_name' ->> 0 - ) THEN family_event.date::TIMESTAMPTZ - END - ) - ELSE MIN(family_event.date::TIMESTAMPTZ) - END AS published_date, - max(family_event.date::date) last_changed - FROM - family_event - GROUP BY - family_import_id - ) + family_import_id + ) SELECT - ds.name AS "Document ID", - p.title AS "Document Title", - fs.name AS "Family ID", - f.title AS "Family Title", - f.description AS "Family Summary", - n1.collection_titles AS "Collection Title(s)", - n1.collection_descriptions AS "Collection Description(s)", - INITCAP(d.valid_metadata::json #>> '{ + ds.name AS "Document ID", + p.title AS "Document Title", + fs.name AS "Family ID", + f.title AS "Family Title", + f.description AS "Family Summary", + n1.collection_titles AS "Collection Title(s)", + n1.collection_descriptions AS "Collection Description(s)", + 
INITCAP(d.valid_metadata::json #>> '{ role,0}') AS "Document Role", - d.variant_name AS "Document Variant", - p.source_url AS "Document Content URL", - INITCAP(d.valid_metadata::json #>> '{ + d.variant_name AS "Document Variant", + p.source_url AS "Document Content URL", + INITCAP(d.valid_metadata::json #>> '{ type,0}') AS "Document Type", - CASE - WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' - ELSE INITCAP(f.family_category::TEXT) - END "Category", - array_to_string( - ARRAY( - SELECT - jsonb_array_elements_text(fm.value -> 'framework') - ), - ';' - ) AS "Framework", - n2.language AS "Language", - o.name AS "Source", - fg.geo_isos AS "Geography ISOs", - fg.geo_display_values AS "Geographies", - array_to_string( - ARRAY( - SELECT - jsonb_array_elements_text(fm.value -> 'topic') - ), - ';' - ) AS "Topic/Response", - array_to_string( - ARRAY( - SELECT - jsonb_array_elements_text(fm.value -> 'hazard') - ), - ';' - ) AS "Hazard", - array_to_string( - ARRAY( - SELECT - jsonb_array_elements_text(fm.value -> 'sector') - ), - ';' - ) AS "Sector", - array_to_string( - ARRAY( - SELECT - jsonb_array_elements_text(fm.value -> 'keyword') - ), - ';' - ) AS "Keyword", - array_to_string( - ARRAY( - SELECT - jsonb_array_elements_text(fm.value -> 'instrument') - ), - ';' - ) AS "Instrument", - array_to_string( - ARRAY( - SELECT - jsonb_array_elements_text(fm.value -> 'author') - ), - ';' - ) AS "Author", - array_to_string( - ARRAY( - SELECT - jsonb_array_elements_text(fm.value -> 'author_type') - ), - ';' - ) AS "Author Type", - fp.published_date AS "First event in timeline", - fp.last_changed AS "Last event in timeline", - n3.event_type_names AS "Full timeline of events (types)", - n3.event_dates AS "Full timeline of events (dates)", - d.created::date AS "Date Added to System", - f.last_modified::date AS "Last ModIFied on System", - d.import_id AS "Internal Document ID", - f.import_id AS "Internal Family ID", - n1.collection_import_ids AS "Internal Collection ID(s)" -FROM - 
physical_document p - JOIN family_document d ON p.id = d.physical_document_id - JOIN FAMILY f ON d.family_import_id = f.import_id - FULL JOIN ( - SELECT - family_geography.family_import_id AS "family_import_id", - string_agg(geography.value, ';') AS geo_isos, - string_agg(geography.display_value, ';') AS geo_display_values - FROM - geography - INNER JOIN family_geography ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) fg ON fg.family_import_id = f.import_id - JOIN family_corpus fc ON f.import_id = fc.family_import_id - JOIN corpus c ON fc.corpus_import_id = c.import_id - JOIN organisation o ON c.organisation_id = o.id - JOIN family_metadata fm ON fm.family_import_id = f.import_id - FULL JOIN ( - SELECT - collection_family.family_import_id AS "family_import_id", - string_agg(collection.import_id, ';') AS collection_import_ids, - string_agg(collection.title, ';') AS collection_titles, - string_agg(collection.description, ';') AS collection_descriptions - FROM - collection - INNER JOIN collection_family ON collection_family.collection_import_id = collection.import_id - GROUP BY - collection_family.family_import_id - ) n1 ON n1.family_import_id = f.import_id - LEFT JOIN ( - SELECT - p.id AS "id", - string_agg( - l.name, + CASE + WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' + ELSE INITCAP(f.family_category::TEXT) + END "Category", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'framework') + ), ';' - ORDER BY - l.name - ) AS language - FROM - physical_document p - LEFT JOIN physical_document_language pdl ON pdl.document_id = p.id - LEFT JOIN language l ON l.id = pdl.language_id - GROUP BY - p.id - ) n2 ON n2.id = d.physical_document_id - FULL JOIN ( - SELECT - family_event.family_import_id, - string_agg(family_event.import_id, ';') AS event_import_ids, - string_agg(family_event.title, ';') AS event_titles, - string_agg(family_event.event_type_name, ';') AS event_type_names, - 
string_agg(family_event.date::date::text, ';') AS event_dates - FROM - family_event - INNER JOIN FAMILY ON FAMILY.import_id = family_event.family_import_id - GROUP BY - family_event.family_import_id - ) n3 ON n3.family_import_id = f.import_id - LEFT JOIN most_recent_doc_slugs ds ON ds.family_document_import_id = d.import_id - LEFT JOIN most_recent_family_slugs fs ON fs.family_import_id = f.import_id - LEFT JOIN event_dates fp ON fp.family_import_id = f.import_id + ) AS "Framework", + n2.language AS "Language", + o.name AS "Source", + fg.geo_isos AS "Geography ISOs", + fg.geo_display_values AS "Geographies", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'topic') + ), + ';' + ) AS "Topic/Response", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'hazard') + ), + ';' + ) AS "Hazard", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'sector') + ), + ';' + ) AS "Sector", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'keyword') + ), + ';' + ) AS "Keyword", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'instrument') + ), + ';' + ) AS "Instrument", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author') + ), + ';' + ) AS "Author", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author_type') + ), + ';' + ) AS "Author Type", + fp.published_date AS "First event in timeline", + fp.last_changed AS "Last event in timeline", + n3.event_type_names AS "Full timeline of events (types)", + n3.event_dates AS "Full timeline of events (dates)", + d.created::date AS "Date Added to System", + f.last_modified::date AS "Last ModIFied on System", + d.import_id AS "Internal Document ID", + f.import_id AS "Internal Family ID", + n1.collection_import_ids AS "Internal Collection ID(s)" +FROM + physical_document p + JOIN family_document d ON p.id = d.physical_document_id + JOIN FAMILY f 
ON d.family_import_id = f.import_id + FULL JOIN ( + SELECT + family_geography.family_import_id AS "family_import_id", + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN family_geography ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id + ) fg ON fg.family_import_id = f.import_id + JOIN family_corpus fc ON f.import_id = fc.family_import_id + JOIN corpus c ON fc.corpus_import_id = c.import_id + JOIN organisation o ON c.organisation_id = o.id + JOIN family_metadata fm ON fm.family_import_id = f.import_id + FULL JOIN ( + SELECT + collection_family.family_import_id AS "family_import_id", + STRING_AGG(collection.import_id, ';') AS collection_import_ids, + STRING_AGG(collection.title, ';') AS collection_titles, + STRING_AGG(collection.description, ';') AS collection_descriptions + FROM + collection + INNER JOIN collection_family ON collection_family.collection_import_id = collection.import_id + GROUP BY + collection_family.family_import_id + ) n1 ON n1.family_import_id = f.import_id + LEFT JOIN ( + SELECT + p.id AS "id", + STRING_AGG( + l.name, + ';' + ORDER BY + l.name + ) AS language + FROM + physical_document p + LEFT JOIN physical_document_language pdl ON pdl.document_id = p.id + LEFT JOIN language l ON l.id = pdl.language_id + GROUP BY + p.id + ) n2 ON n2.id = d.physical_document_id + FULL JOIN ( + SELECT + family_event.family_import_id, + STRING_AGG(family_event.import_id, ';') AS event_import_ids, + STRING_AGG(family_event.title, ';') AS event_titles, + STRING_AGG(family_event.event_type_name, ';') AS event_type_names, + STRING_AGG(family_event.date::date::TEXT, ';') AS event_dates + FROM + family_event + INNER JOIN FAMILY ON FAMILY.import_id = family_event.family_import_id + GROUP BY + family_event.family_import_id + ) n3 ON n3.family_import_id = f.import_id + LEFT JOIN most_recent_doc_slugs ds ON ds.family_document_import_id = 
d.import_id + LEFT JOIN most_recent_family_slugs fs ON fs.family_import_id = f.import_id + LEFT JOIN event_dates fp ON fp.family_import_id = f.import_id WHERE - d.last_modified < ':ingest_cycle_start' - AND fc.corpus_import_id in ':allowed_corpora_ids' + d.last_modified < ':ingest_cycle_start' + AND fc.corpus_import_id IN ':allowed_corpora_ids' ORDER BY - d.last_modified DESC, - d.created DESC, - d.ctid DESC, - n1.family_import_id + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + n1.family_import_id diff --git a/app/repository/sql/pipeline.sql b/app/repository/sql/pipeline.sql index 37bb5455..dad0b329 100644 --- a/app/repository/sql/pipeline.sql +++ b/app/repository/sql/pipeline.sql @@ -1,233 +1,233 @@ WITH - deduplicated_family_slugs AS ( - SELECT DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( + deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id AS "family_import_id", + COUNT(*) AS COUNT + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) duplicates + LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id AS "family_import_id", + COUNT(*) AS COUNT + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) = 1 + ) non_duplicates + LEFT JOIN slug ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id AS "family_import_id", + 
deduplicated_family_slugs.created AS "created", + deduplicated_family_slugs.name AS "name" + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id AS "family_import_id", + unique_family_slugs.created AS "created", + unique_family_slugs.name AS "name" + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC + ), + deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id AS "family_document_import_id", + COUNT(*) AS COUNT + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) > 1 + ) duplicates + LEFT JOIN slug ON duplicates.family_document_import_id = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id AS "family_document_import_id", + COUNT(*) AS COUNT + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) non_duplicates + LEFT JOIN slug ON non_duplicates.family_document_import_id = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + most_recent_doc_slugs AS ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count + deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name FROM - slug - WHERE - slug.family_import_id IS NOT NULL + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id AS "family_document_import_id", + 
unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC + ), + event_dates AS ( + SELECT + family_event.family_import_id AS family_import_id, + CASE + WHEN COUNT(*) FILTER ( + WHERE + family_event.event_type_name = ( + family_event.valid_metadata -> 'datetime_event_name' ->> 0 + ) + ) > 0 THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata -> 'datetime_event_name' ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date + FROM + family_event GROUP BY - slug.family_import_id - HAVING - Count(*) > 1 - ) duplicates - left join slug ON duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - unique_family_slugs AS ( - SELECT DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( + family_import_id + ) +SELECT + f.title AS "family_title", + p.title AS "physical_document_title", + f.description AS "family_description", + CASE + WHEN f.family_category IN ('UNFCCC', 'MCF') THEN UPPER(f.family_category::TEXT) + ELSE INITCAP(f.family_category::TEXT) + END "family_category", + fp.published_date AS "family_published_date", + d.import_id AS "family_document_import_id", + ds.name AS "family_document_slug", + f.import_id AS "family_import_id", + fs.name AS "family_slug", + p.source_url AS "physical_document_source_url", + d.valid_metadata::json #>> '{type,0}' AS "family_document_type", + o.name AS "organisation_name", + geos.geographies AS "geographies", + c.import_id AS "corpus_import_id", + c.corpus_type_name AS "corpus_type_name", + langs.languages AS "languages", + fm.value AS "family_metadata", + d.valid_metadata AS "family_document_metadata" +FROM + physical_document p + JOIN family_document d ON p.id = d.physical_document_id + JOIN FAMILY f ON d.family_import_id = 
f.import_id + FULL JOIN ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count + family_geography.family_import_id AS "family_import_id", + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values FROM - slug - WHERE - slug.family_import_id IS NOT NULL + geography + INNER JOIN family_geography ON geography.id = family_geography.geography_id GROUP BY - slug.family_import_id - HAVING - Count(*) = 1 - ) non_duplicates - left join slug ON non_duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - most_recent_family_slugs AS ( - SELECT - deduplicated_family_slugs.family_import_id AS "family_import_id", - deduplicated_family_slugs.created AS "created", - deduplicated_family_slugs.name AS "name" - FROM - deduplicated_family_slugs - UNION ALL - SELECT - unique_family_slugs.family_import_id AS "family_import_id", - unique_family_slugs.created AS "created", - unique_family_slugs.name AS "name" - FROM - unique_family_slugs - ORDER BY - family_import_id DESC, - created DESC - ), - deduplicated_doc_slugs AS ( - SELECT DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( + family_geography.family_import_id + ) fg ON fg.family_import_id = f.import_id + JOIN family_corpus fc ON f.import_id = fc.family_import_id + JOIN corpus c ON fc.corpus_import_id = c.import_id + JOIN organisation o ON c.organisation_id = o.id + JOIN family_metadata fm ON fm.family_import_id = f.import_id + LEFT OUTER JOIN ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count + family_document.import_id AS family_document_import_id, + JSON_AGG(DISTINCT (language.name)) AS languages FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL + family_document + JOIN physical_document_language ON physical_document_language.document_id = 
family_document.physical_document_id + JOIN language ON language.id = physical_document_language.language_id GROUP BY - slug.family_document_import_id - HAVING - Count(*) > 1 - ) duplicates - left join slug ON duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - unique_doc_slugs AS ( - SELECT DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( + family_document.import_id + ) AS langs ON langs.family_document_import_id = d.import_id + LEFT OUTER JOIN ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count + family_geography.family_import_id AS family_import_id, + JSON_AGG(DISTINCT (geography.value)) AS geographies FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL + family_geography + JOIN geography ON geography.id = family_geography.geography_id GROUP BY - slug.family_document_import_id - HAVING - Count(*) = 1 - ) non_duplicates - left join slug ON non_duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - most_recent_doc_slugs AS ( - SELECT - deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM - deduplicated_doc_slugs - UNION ALL - SELECT - unique_doc_slugs.family_document_import_id AS "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM - unique_doc_slugs - ORDER BY - family_document_import_id DESC, - created DESC - ), - event_dates AS ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE - family_event.event_type_name = ( - family_event.valid_metadata -> 'datetime_event_name' ->> 0 - ) - ) > 0 THEN MIN( - CASE - WHEN family_event.event_type_name = 
( - family_event.valid_metadata -> 'datetime_event_name' ->> 0 - ) THEN family_event.date::TIMESTAMPTZ - END - ) - ELSE MIN(family_event.date::TIMESTAMPTZ) - END AS published_date - FROM - family_event - GROUP BY - family_import_id - ) -SELECT - f.title AS "family_title", - p.title AS "physical_document_title", - f.description AS "family_description", - CASE - WHEN f.family_category IN ('UNFCCC', 'MCF') THEN Upper(f.family_category::text) - ELSE Initcap(f.family_category::text) - END "family_category", - fp.published_date AS "family_published_date", - d.import_id AS "family_document_import_id", - ds.name AS "family_document_slug", - f.import_id AS "family_import_id", - fs.name AS "family_slug", - p.source_url AS "physical_document_source_url", - d.valid_metadata::json #>> '{type,0}' AS "family_document_type", - o.name AS "organisation_name", - geos.geographies AS "geographies", - c.import_id AS "corpus_import_id", - c.corpus_type_name AS "corpus_type_name", - langs.languages AS "languages", - fm.value AS "family_metadata", - d.valid_metadata AS "family_document_metadata" -FROM - physical_document p - join family_document d ON p.id = d.physical_document_id - join family f ON d.family_import_id = f.import_id - full join ( - SELECT - family_geography.family_import_id AS "family_import_id", - string_agg(geography.value, ';') AS geo_isos, - string_agg(geography.display_value, ';') AS geo_display_values - FROM - geography - inner join family_geography ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) fg ON fg.family_import_id = f.import_id - join family_corpus fc ON f.import_id = fc.family_import_id - join corpus c ON fc.corpus_import_id = c.import_id - join organisation o ON c.organisation_id = o.id - join family_metadata fm ON fm.family_import_id = f.import_id - left outer join ( - SELECT - family_document.import_id AS family_document_import_id, - json_agg(DISTINCT (LANGUAGE.name)) AS languages - FROM - family_document - 
join physical_document_language ON physical_document_language.document_id = family_document.physical_document_id - join LANGUAGE ON LANGUAGE.id = physical_document_language.language_id - GROUP BY - family_document.import_id - ) AS langs ON langs.family_document_import_id = d.import_id - left outer join ( - SELECT - family_geography.family_import_id AS family_import_id, - json_agg(DISTINCT (geography.value)) AS geographies - FROM - family_geography - join geography ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) AS geos ON geos.family_import_id = f.import_id - left join most_recent_doc_slugs ds ON ds.family_document_import_id = d.import_id - left join most_recent_family_slugs fs ON fs.family_import_id = f.import_id - left join event_dates fp ON fp.family_import_id = f.import_id + family_geography.family_import_id + ) AS geos ON geos.family_import_id = f.import_id + LEFT JOIN most_recent_doc_slugs ds ON ds.family_document_import_id = d.import_id + LEFT JOIN most_recent_family_slugs fs ON fs.family_import_id = f.import_id + LEFT JOIN event_dates fp ON fp.family_import_id = f.import_id WHERE - d.document_status != 'DELETED' - AND fg.family_import_id = f.import_id + d.document_status != 'DELETED' + AND fg.family_import_id = f.import_id ORDER BY - d.last_modified DESC, - d.created DESC, - d.ctid DESC, - f.import_id + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + f.import_id diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 210c9dec..60a7ea60 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -1,24 +1,24 @@ SELECT - slug.family_document_import_id, - slug.family_import_id + slug.family_document_import_id, + slug.family_import_id FROM - slug - LEFT JOIN family ON family.import_id = slug.family_import_id - LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id - LEFT JOIN corpus ON corpus.import_id = 
family_corpus.corpus_import_id + slug + LEFT JOIN public.family f ON f.import_id = slug.family_import_id + LEFT JOIN family_corpus ON family_corpus.family_import_id = f.import_id + LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id WHERE - slug.name = ':slug_name' - AND corpus.import_id IN ':allowed_corpora_ids' + slug.name = ':slug_name' + AND corpus.import_id IN ':allowed_corpora_ids' UNION SELECT - slug.family_document_import_id, - slug.family_import_id + slug.family_document_import_id, + slug.family_import_id FROM - slug - LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id - LEFT JOIN family ON family.import_id = family_document.family_import_id - LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id - LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id + slug + LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id + LEFT JOIN public.family f ON public.family.import_id = family_document.family_import_id + LEFT JOIN family_corpus ON family_corpus.family_import_id = f.import_id + LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id WHERE - slug.name = ':slug_name' - AND corpus.import_id IN ':allowed_corpora_ids'; + slug.name = ':slug_name' + AND corpus.import_id IN ':allowed_corpora_ids'; From 8d54a4e6424e0b263529dbecba95147161a45ab4 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:31:38 +0000 Subject: [PATCH 06/22] Remove sqlformatter & enable sqlfluff --- .trunk/configs/.sql-formatter.json | 11 ----------- .trunk/configs/.sqlfluff | 22 ++++++++++++++++++++++ .trunk/trunk.yaml | 29 ++--------------------------- 3 files changed, 24 insertions(+), 38 deletions(-) delete mode 100644 .trunk/configs/.sql-formatter.json create mode 100644 .trunk/configs/.sqlfluff diff --git a/.trunk/configs/.sql-formatter.json b/.trunk/configs/.sql-formatter.json deleted 
file mode 100644 index e4046a0f..00000000 --- a/.trunk/configs/.sql-formatter.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "language": "postgresql", - "dialect": "postgresql", - "tabWidth": 4, - "keywordCase": "upper", - "dataTypeCase": "upper", - "identifierCase": "lower", - "functionCase": "upper", - "linesBetweenQueries": 2, - "denseOperators": false -} diff --git a/.trunk/configs/.sqlfluff b/.trunk/configs/.sqlfluff new file mode 100644 index 00000000..62191d86 --- /dev/null +++ b/.trunk/configs/.sqlfluff @@ -0,0 +1,22 @@ +[sqlfluff] +dialect = postgres +exclude_rules = LT02, LT09 + +[sqlfluff:indentation] +indented_ctes = True + +[sqlfluff:rules:references.special_chars] +allow_space_in_identifier = True +additional_allowed_characters = ["/"] + +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.identifiers] +extended_capitalisation_policy = lower + +[sqlfluff:rules:capitalisation.functions] +extended_capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.types] +extended_capitalisation_policy = upper diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index d76187cf..bc682e43 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -5,14 +5,6 @@ version: 0.1 cli: version: 1.22.0 -tools: - definitions: - - name: sql-formatter - runtime: node - package: sql-formatter - shims: [sql-formatter] - known_good_version: 7.0.1 - # Trunk provides extensibility via plugins. # (https://docs.trunk.io/plugins) plugins: @@ -42,24 +34,6 @@ lint: - name: lint run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target} - - name: sql-formatter - files: [sql] - description: A SQL formatter - commands: - - output: rewrite - # Force postgresql dialect - run: sql-formatter -l postgresql - success_codes: [0] - formatter: true - stdin: true - # Linter does not support batching. 
- tools: [sql-formatter] - known_good_version: 7.0.1 - suggest_if: never - version_command: - parse_regex: ${semver} - run: sql-formatter --version - ignore: - linters: [ALL] paths: @@ -71,7 +45,8 @@ lint: - LICENSE.md enabled: - - sql-formatter@15.4.5 + - sqlfluff@3.2.5: + commands: [lint] - actionlint@1.6.27 - bandit@1.7.8 - black@24.4.2 From ab3476708920c5760f058ec40d14d008f94f5bad Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:34:08 +0000 Subject: [PATCH 07/22] Fix formatting --- app/repository/sql/download.sql | 230 ++++++++------ app/repository/sql/pipeline.sql | 478 ++++++++++++++++------------- app/repository/sql/slug_lookup.sql | 16 +- 3 files changed, 404 insertions(+), 320 deletions(-) diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index cf8084b2..29da480f 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -1,5 +1,4 @@ -WITH - deduplicated_family_slugs AS ( +WITH deduplicated_family_slugs AS ( SELECT DISTINCT ON (slug.family_import_id) slug.family_import_id, slug.created, @@ -7,8 +6,8 @@ WITH FROM ( SELECT - slug.family_import_id AS "family_import_id", - COUNT(*) AS COUNT + slug.family_import_id, + COUNT(*) AS count FROM slug WHERE @@ -17,14 +16,15 @@ WITH slug.family_import_id HAVING COUNT(*) > 1 - ) duplicates - LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id + ) AS duplicates + LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id ORDER BY slug.family_import_id DESC, slug.created DESC, slug.ctid DESC ), - unique_family_slugs AS ( + +unique_family_slugs AS ( SELECT DISTINCT ON (slug.family_import_id) slug.family_import_id, slug.created, @@ -32,8 +32,8 @@ WITH FROM ( SELECT - slug.family_import_id AS "family_import_id", - COUNT(*) AS COUNT + slug.family_import_id, + COUNT(*) AS count FROM slug WHERE @@ -42,32 +42,36 @@ WITH slug.family_import_id HAVING COUNT(*) = 1 - ) non_duplicates 
- LEFT JOIN slug ON non_duplicates.family_import_id = slug.family_import_id + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id ORDER BY slug.family_import_id DESC, slug.created DESC, slug.ctid DESC ), - most_recent_family_slugs AS ( + +most_recent_family_slugs AS ( SELECT - deduplicated_family_slugs.family_import_id AS "family_import_id", - deduplicated_family_slugs.created AS "created", - deduplicated_family_slugs.name AS "name" + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name FROM deduplicated_family_slugs UNION ALL SELECT - unique_family_slugs.family_import_id AS "family_import_id", - unique_family_slugs.created AS "created", - unique_family_slugs.name AS "name" + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name FROM unique_family_slugs ORDER BY family_import_id DESC, created DESC ), - deduplicated_doc_slugs AS ( + +deduplicated_doc_slugs AS ( SELECT DISTINCT ON (slug.family_document_import_id) slug.family_document_import_id, slug.created, @@ -75,8 +79,8 @@ WITH FROM ( SELECT - slug.family_document_import_id AS "family_document_import_id", - COUNT(*) AS COUNT + slug.family_document_import_id, + COUNT(*) AS count FROM slug WHERE @@ -85,14 +89,19 @@ WITH slug.family_document_import_id HAVING COUNT(*) > 1 - ) duplicates - LEFT JOIN slug ON duplicates.family_document_import_id = slug.family_document_import_id + ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id ORDER BY slug.family_document_import_id DESC, slug.created DESC, slug.ctid DESC ), - unique_doc_slugs AS ( + +unique_doc_slugs AS ( SELECT DISTINCT ON (slug.family_document_import_id) slug.family_document_import_id, slug.created, @@ -100,8 +109,8 @@ WITH FROM ( SELECT - slug.family_document_import_id AS "family_document_import_id", - COUNT(*) AS COUNT + slug.family_document_import_id, 
+ COUNT(*) AS count FROM slug WHERE @@ -110,23 +119,28 @@ WITH slug.family_document_import_id HAVING COUNT(*) = 1 - ) non_duplicates - LEFT JOIN slug ON non_duplicates.family_document_import_id = slug.family_document_import_id + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id ORDER BY slug.family_document_import_id DESC, slug.created DESC, slug.ctid DESC ), - most_recent_doc_slugs AS ( + +most_recent_doc_slugs AS ( SELECT - deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", + deduplicated_doc_slugs.family_document_import_id, deduplicated_doc_slugs.created, deduplicated_doc_slugs.name FROM deduplicated_doc_slugs UNION ALL SELECT - unique_doc_slugs.family_document_import_id AS "family_document_import_id", + unique_doc_slugs.family_document_import_id, unique_doc_slugs.created, unique_doc_slugs.name FROM @@ -135,30 +149,65 @@ WITH family_document_import_id DESC, created DESC ), - event_dates AS ( + +event_dates AS ( SELECT - family_event.family_import_id AS family_import_id, + family_event.family_import_id, CASE WHEN COUNT(*) FILTER ( WHERE family_event.event_type_name = ( - family_event.valid_metadata -> 'datetime_event_name' ->> 0 + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 ) ) > 0 THEN MIN( CASE WHEN family_event.event_type_name = ( - family_event.valid_metadata -> 'datetime_event_name' ->> 0 + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 ) THEN family_event.date::TIMESTAMPTZ END ) ELSE MIN(family_event.date::TIMESTAMPTZ) END AS published_date, - MAX(family_event.date::date) last_changed + MAX(family_event.date::DATE) AS last_changed FROM family_event GROUP BY - family_import_id + family_event.family_import_id + ), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + 
family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id + ), + +n1 AS ( + SELECT + collection_family.family_import_id, + STRING_AGG(collection.import_id, ';') AS collection_import_ids, + STRING_AGG(collection.title, ';') AS collection_titles, + STRING_AGG(collection.description, ';') AS collection_descriptions + FROM + collection + INNER JOIN + collection_family + ON collection.import_id = collection_family.collection_import_id + GROUP BY + collection_family.family_import_id ) + SELECT ds.name AS "Document ID", p.title AS "Document Title", @@ -167,16 +216,29 @@ SELECT f.description AS "Family Summary", n1.collection_titles AS "Collection Title(s)", n1.collection_descriptions AS "Collection Description(s)", - INITCAP(d.valid_metadata::json #>> '{ - role,0}') AS "Document Role", d.variant_name AS "Document Variant", p.source_url AS "Document Content URL", - INITCAP(d.valid_metadata::json #>> '{ + n2.language AS "Language", + o.name AS "Source", + fg.geo_isos AS "Geography ISOs", + fg.geo_display_values AS "Geographies", + fp.published_date AS "First event in timeline", + fp.last_changed AS "Last event in timeline", + n3.event_type_names AS "Full timeline of events (types)", + n3.event_dates AS "Full timeline of events (dates)", + d.created::DATE AS "Date Added to System", + f.last_modified::DATE AS "Last Modified on System", + d.import_id AS "Internal Document ID", + f.import_id AS "Internal Family ID", + n1.collection_import_ids AS "Internal Collection ID(s)", + INITCAP(d.valid_metadata::JSON #>> '{ + role,0}') AS "Document Role", + INITCAP(d.valid_metadata::JSON #>> '{ type,0}') AS "Document Type", CASE WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' ELSE INITCAP(f.family_category::TEXT) - END "Category", + END AS "Category", ARRAY_TO_STRING( ARRAY( SELECT @@ -184,10 +246,6 @@ SELECT ), ';' ) AS "Framework", - n2.language AS "Language", - o.name AS "Source", - fg.geo_isos AS "Geography ISOs", - 
fg.geo_display_values AS "Geographies", ARRAY_TO_STRING( ARRAY( SELECT @@ -236,50 +294,20 @@ SELECT JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author_type') ), ';' - ) AS "Author Type", - fp.published_date AS "First event in timeline", - fp.last_changed AS "Last event in timeline", - n3.event_type_names AS "Full timeline of events (types)", - n3.event_dates AS "Full timeline of events (dates)", - d.created::date AS "Date Added to System", - f.last_modified::date AS "Last ModIFied on System", - d.import_id AS "Internal Document ID", - f.import_id AS "Internal Family ID", - n1.collection_import_ids AS "Internal Collection ID(s)" + ) AS "Author Type" FROM - physical_document p - JOIN family_document d ON p.id = d.physical_document_id - JOIN FAMILY f ON d.family_import_id = f.import_id - FULL JOIN ( - SELECT - family_geography.family_import_id AS "family_import_id", - STRING_AGG(geography.value, ';') AS geo_isos, - STRING_AGG(geography.display_value, ';') AS geo_display_values - FROM - geography - INNER JOIN family_geography ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) fg ON fg.family_import_id = f.import_id - JOIN family_corpus fc ON f.import_id = fc.family_import_id - JOIN corpus c ON fc.corpus_import_id = c.import_id - JOIN organisation o ON c.organisation_id = o.id - JOIN family_metadata fm ON fm.family_import_id = f.import_id - FULL JOIN ( - SELECT - collection_family.family_import_id AS "family_import_id", - STRING_AGG(collection.import_id, ';') AS collection_import_ids, - STRING_AGG(collection.title, ';') AS collection_titles, - STRING_AGG(collection.description, ';') AS collection_descriptions - FROM - collection - INNER JOIN collection_family ON collection_family.collection_import_id = collection.import_id - GROUP BY - collection_family.family_import_id - ) n1 ON n1.family_import_id = f.import_id + physical_document AS p + INNER JOIN family_document AS d ON p.id = d.physical_document_id + INNER JOIN family AS f 
ON d.family_import_id = f.import_id + FULL JOIN fg ON f.import_id = fg.family_import_id + INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id + INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id + INNER JOIN organisation AS o ON c.organisation_id = o.id + INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id + FULL JOIN n1 ON f.import_id = n1.family_import_id LEFT JOIN ( SELECT - p.id AS "id", + p.id, STRING_AGG( l.name, ';' @@ -287,28 +315,36 @@ FROM l.name ) AS language FROM - physical_document p - LEFT JOIN physical_document_language pdl ON pdl.document_id = p.id - LEFT JOIN language l ON l.id = pdl.language_id + physical_document AS p + LEFT JOIN + physical_document_language AS pdl + ON p.id = pdl.document_id + LEFT JOIN language AS l ON pdl.language_id = l.id GROUP BY p.id - ) n2 ON n2.id = d.physical_document_id + ) AS n2 ON d.physical_document_id = n2.id FULL JOIN ( SELECT family_event.family_import_id, STRING_AGG(family_event.import_id, ';') AS event_import_ids, STRING_AGG(family_event.title, ';') AS event_titles, STRING_AGG(family_event.event_type_name, ';') AS event_type_names, - STRING_AGG(family_event.date::date::TEXT, ';') AS event_dates + STRING_AGG(family_event.date::DATE::TEXT, ';') AS event_dates FROM family_event - INNER JOIN FAMILY ON FAMILY.import_id = family_event.family_import_id + INNER JOIN + family + ON family_event.family_import_id = family.import_id GROUP BY family_event.family_import_id - ) n3 ON n3.family_import_id = f.import_id - LEFT JOIN most_recent_doc_slugs ds ON ds.family_document_import_id = d.import_id - LEFT JOIN most_recent_family_slugs fs ON fs.family_import_id = f.import_id - LEFT JOIN event_dates fp ON fp.family_import_id = f.import_id + ) AS n3 ON f.import_id = n3.family_import_id + LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id + LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id + LEFT JOIN event_dates AS fp 
ON f.import_id = fp.family_import_id WHERE d.last_modified < ':ingest_cycle_start' AND fc.corpus_import_id IN ':allowed_corpora_ids' @@ -316,4 +352,4 @@ ORDER BY d.last_modified DESC, d.created DESC, d.ctid DESC, - n1.family_import_id + n1.family_import_id ASC diff --git a/app/repository/sql/pipeline.sql b/app/repository/sql/pipeline.sql index dad0b329..7a5d0e40 100644 --- a/app/repository/sql/pipeline.sql +++ b/app/repository/sql/pipeline.sql @@ -1,228 +1,274 @@ -WITH - deduplicated_family_slugs AS ( - SELECT DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( - SELECT - slug.family_import_id AS "family_import_id", - COUNT(*) AS COUNT - FROM - slug - WHERE - slug.family_import_id IS NOT NULL - GROUP BY - slug.family_import_id - HAVING - COUNT(*) > 1 - ) duplicates - LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - unique_family_slugs AS ( - SELECT DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( - SELECT - slug.family_import_id AS "family_import_id", - COUNT(*) AS COUNT - FROM - slug - WHERE - slug.family_import_id IS NOT NULL - GROUP BY - slug.family_import_id - HAVING - COUNT(*) = 1 - ) non_duplicates - LEFT JOIN slug ON non_duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - most_recent_family_slugs AS ( - SELECT - deduplicated_family_slugs.family_import_id AS "family_import_id", - deduplicated_family_slugs.created AS "created", - deduplicated_family_slugs.name AS "name" - FROM - deduplicated_family_slugs - UNION ALL - SELECT - unique_family_slugs.family_import_id AS "family_import_id", - unique_family_slugs.created AS "created", - unique_family_slugs.name AS "name" - FROM - unique_family_slugs - ORDER BY - family_import_id DESC, - created DESC - ), - deduplicated_doc_slugs 
AS ( - SELECT DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( - SELECT - slug.family_document_import_id AS "family_document_import_id", - COUNT(*) AS COUNT - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - COUNT(*) > 1 - ) duplicates - LEFT JOIN slug ON duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - unique_doc_slugs AS ( - SELECT DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( - SELECT - slug.family_document_import_id AS "family_document_import_id", - COUNT(*) AS COUNT - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - COUNT(*) = 1 - ) non_duplicates - LEFT JOIN slug ON non_duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC - ), - most_recent_doc_slugs AS ( - SELECT - deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM - deduplicated_doc_slugs - UNION ALL - SELECT - unique_doc_slugs.family_document_import_id AS "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM - unique_doc_slugs - ORDER BY - family_document_import_id DESC, - created DESC - ), - event_dates AS ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( +WITH deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + 
slug.family_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN + slug + ON duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC +), + +deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + 
slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id, + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id, + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC +), + +event_dates AS ( + SELECT + family_event.family_import_id, + CASE + WHEN + COUNT(*) FILTER ( WHERE - family_event.event_type_name = ( - family_event.valid_metadata -> 'datetime_event_name' ->> 0 - ) - ) > 0 THEN MIN( + family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) + ) > 0 + THEN MIN( CASE WHEN family_event.event_type_name = ( - family_event.valid_metadata -> 'datetime_event_name' ->> 0 + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 ) THEN family_event.date::TIMESTAMPTZ END ) - ELSE MIN(family_event.date::TIMESTAMPTZ) - END AS published_date - FROM - family_event - GROUP BY - family_import_id - ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date + FROM + family_event + GROUP BY + family_event.family_import_id +), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id +), + +geos AS ( + SELECT + family_geography.family_import_id, + JSON_AGG(DISTINCT geography.value) AS geographies + FROM + family_geography + INNER JOIN geography ON family_geography.geography_id = 
geography.id + GROUP BY + family_geography.family_import_id +) + SELECT - f.title AS "family_title", - p.title AS "physical_document_title", - f.description AS "family_description", + f.title AS family_title, + p.title AS physical_document_title, + f.description AS family_description, + fp.published_date AS family_published_date, + d.import_id AS family_document_import_id, + ds.name AS family_document_slug, + f.import_id AS family_import_id, + fs.name AS family_slug, + p.source_url AS physical_document_source_url, + o.name AS organisation_name, + geos.geographies, + c.import_id AS corpus_import_id, + c.corpus_type_name, + langs.languages, + fm.value AS family_metadata, + d.valid_metadata AS family_document_metadata, CASE - WHEN f.family_category IN ('UNFCCC', 'MCF') THEN UPPER(f.family_category::TEXT) + WHEN + f.family_category IN ('UNFCCC', 'MCF') + THEN UPPER(f.family_category::TEXT) ELSE INITCAP(f.family_category::TEXT) - END "family_category", - fp.published_date AS "family_published_date", - d.import_id AS "family_document_import_id", - ds.name AS "family_document_slug", - f.import_id AS "family_import_id", - fs.name AS "family_slug", - p.source_url AS "physical_document_source_url", - d.valid_metadata::json #>> '{type,0}' AS "family_document_type", - o.name AS "organisation_name", - geos.geographies AS "geographies", - c.import_id AS "corpus_import_id", - c.corpus_type_name AS "corpus_type_name", - langs.languages AS "languages", - fm.value AS "family_metadata", - d.valid_metadata AS "family_document_metadata" + END AS family_category, + d.valid_metadata::JSON #>> '{type,0}' AS family_document_type FROM - physical_document p - JOIN family_document d ON p.id = d.physical_document_id - JOIN FAMILY f ON d.family_import_id = f.import_id - FULL JOIN ( - SELECT - family_geography.family_import_id AS "family_import_id", - STRING_AGG(geography.value, ';') AS geo_isos, - STRING_AGG(geography.display_value, ';') AS geo_display_values - FROM - geography - INNER JOIN 
family_geography ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) fg ON fg.family_import_id = f.import_id - JOIN family_corpus fc ON f.import_id = fc.family_import_id - JOIN corpus c ON fc.corpus_import_id = c.import_id - JOIN organisation o ON c.organisation_id = o.id - JOIN family_metadata fm ON fm.family_import_id = f.import_id - LEFT OUTER JOIN ( - SELECT - family_document.import_id AS family_document_import_id, - JSON_AGG(DISTINCT (language.name)) AS languages - FROM - family_document - JOIN physical_document_language ON physical_document_language.document_id = family_document.physical_document_id - JOIN language ON language.id = physical_document_language.language_id - GROUP BY - family_document.import_id - ) AS langs ON langs.family_document_import_id = d.import_id - LEFT OUTER JOIN ( - SELECT - family_geography.family_import_id AS family_import_id, - JSON_AGG(DISTINCT (geography.value)) AS geographies - FROM - family_geography - JOIN geography ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) AS geos ON geos.family_import_id = f.import_id - LEFT JOIN most_recent_doc_slugs ds ON ds.family_document_import_id = d.import_id - LEFT JOIN most_recent_family_slugs fs ON fs.family_import_id = f.import_id - LEFT JOIN event_dates fp ON fp.family_import_id = f.import_id + physical_document AS p +INNER JOIN family_document AS d ON p.id = d.physical_document_id +INNER JOIN family AS f ON d.family_import_id = f.import_id +FULL JOIN fg ON f.import_id = fg.family_import_id +INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id +INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id +INNER JOIN organisation AS o ON c.organisation_id = o.id +INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id +LEFT OUTER JOIN ( + SELECT + family_document.import_id AS family_document_import_id, + JSON_AGG(DISTINCT language.name) AS languages + FROM + 
family_document + INNER JOIN + physical_document_language + ON + family_document.physical_document_id + = physical_document_language.document_id + INNER JOIN + language + ON physical_document_language.language_id = language.id + GROUP BY + family_document.import_id +) AS langs ON d.import_id = langs.family_document_import_id +LEFT OUTER JOIN geos ON f.import_id = geos.family_import_id +LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id +LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id +LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id WHERE d.document_status != 'DELETED' AND fg.family_import_id = f.import_id @@ -230,4 +276,4 @@ ORDER BY d.last_modified DESC, d.created DESC, d.ctid DESC, - f.import_id + f.import_id ASC diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 60a7ea60..2bfd363a 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -3,9 +3,9 @@ SELECT slug.family_import_id FROM slug - LEFT JOIN public.family f ON f.import_id = slug.family_import_id - LEFT JOIN family_corpus ON family_corpus.family_import_id = f.import_id - LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id + LEFT JOIN family ON slug.family_import_id = family.import_id + LEFT JOIN family_corpus ON family.import_id = family_corpus.family_import_id + LEFT JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id WHERE slug.name = ':slug_name' AND corpus.import_id IN ':allowed_corpora_ids' @@ -15,10 +15,12 @@ SELECT slug.family_import_id FROM slug - LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id - LEFT JOIN public.family f ON public.family.import_id = family_document.family_import_id - LEFT JOIN family_corpus ON family_corpus.family_import_id = f.import_id - LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id + LEFT JOIN + family_document + ON 
slug.family_document_import_id = family_document.import_id + LEFT JOIN family ON family_document.import_id = family.import_id + LEFT JOIN family_corpus ON family.import_id = family_corpus.family_import_id + LEFT JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id WHERE slug.name = ':slug_name' AND corpus.import_id IN ':allowed_corpora_ids'; From 27bf6156435335d9c56e2c5a42dc49d54a93481e Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:34:24 +0000 Subject: [PATCH 08/22] Fix formatting --- .git-blame-ignore-revs | 1 + 1 file changed, 1 insertion(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index cfd5e1f0..8c4c0200 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -26,3 +26,4 @@ # Fix SQL query formatting 047766a85f086fc0986a6f2b49fee9d73fa219e8 +ab3476708920c5760f058ec40d14d008f94f5bad From 40ff9cdada5bf3b36b77e50d98cb5aa9a532fd96 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:36:15 +0000 Subject: [PATCH 09/22] Use sqlfluff lint and fix --- .trunk/trunk.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index bc682e43..531b59a2 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -46,7 +46,7 @@ lint: enabled: - sqlfluff@3.2.5: - commands: [lint] + commands: [lint, fix] - actionlint@1.6.27 - bandit@1.7.8 - black@24.4.2 From 6796cb73d7410a7e3aefecbc7c5ef0cbcfabfc57 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:37:14 +0000 Subject: [PATCH 10/22] Bump pyproject --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0a2c8f23..04fde860 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.11" +version = "1.19.12" description = "" 
authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] From 5ca24f2d168eec6702d7de97b16d42f7d2b4cac9 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Wed, 6 Nov 2024 17:04:29 +0000 Subject: [PATCH 11/22] Fix Download query placeholders --- app/repository/download.py | 4 +-- app/repository/helpers.py | 59 ++++++++++++++++++++++++++++++--- app/repository/sql/download.sql | 4 +-- 3 files changed, 58 insertions(+), 9 deletions(-) diff --git a/app/repository/download.py b/app/repository/download.py index 1b928e69..5209d199 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -18,7 +18,7 @@ def get_whole_database_dump( """Get whole database dump and bind variables. :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we + :param list[str] corpora_ids: The corpora from which we should allow the data to be dumped. :return str: The SQL query to perform on the database session. """ @@ -32,7 +32,7 @@ def get_whole_database_dump( conn.connection, params={ "ingest_cycle_start": ingest_cycle_start, - "allowed_corpora_ids": allowed_corpora_ids, + "allowed_corpora_ids": tuple(allowed_corpora_ids), }, ) return df diff --git a/app/repository/helpers.py b/app/repository/helpers.py index e976683b..cdbb9587 100644 --- a/app/repository/helpers.py +++ b/app/repository/helpers.py @@ -1,11 +1,60 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. -""" +"""Helper functions for the repository layer.""" +from datetime import date, timedelta, datetime from functools import lru_cache +from sqlalchemy.orm import Query + + +def render_query(statement, db_session): + """ + Generate an SQL expression string with bound parameters rendered inline + for the given SQLAlchemy statement. 
+ WARNING: This method of escaping is insecure, incomplete, and for debugging + purposes only. Executing SQL statements with inline-rendered user values is + extremely insecure. + Based on http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query + """ + if isinstance(statement, Query): + statement = statement.statement + dialect = db_session.bind.dialect + + class LiteralCompiler(dialect.statement_compiler): + def visit_bindparam( + self, bindparam, within_columns_clause=False, literal_binds=False, **kwargs + ): + return self.render_literal_value(bindparam.value, bindparam.type) + + def render_array_value(self, val, item_type): + if isinstance(val, list): + return "{}".format( + ",".join([self.render_array_value(x, item_type) for x in val]) + ) + return self.render_literal_value(val, item_type) + + def render_literal_value(self, value, type_): + if value is None: + return None + if isinstance(value, int): + return str(value) + if isinstance(value, (str, date, datetime, timedelta)): + return "'{}'".format(str(value).replace("'", "''")) + if isinstance(value, (list)): + return "'{{{}}}'".format( + ",".join( + [self.render_array_value(x, type_.item_type) for x in value] + ) + ) + if isinstance(value, tuple): + return "'{{{}}}'".format( + ",".join( + [self.render_array_value(x, type_.item_type) for x in value] + ) + ) + return super(LiteralCompiler, self).render_literal_value(value, type_) + + return LiteralCompiler(dialect, statement).process(statement) + @lru_cache() def get_query_template(filepath: str) -> str: diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 29da480f..2b3302ee 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -346,8 +346,8 @@ FROM ON f.import_id = fs.family_import_id LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id WHERE - d.last_modified < ':ingest_cycle_start' - AND fc.corpus_import_id IN ':allowed_corpora_ids' + d.last_modified < 
%(ingest_cycle_start)s + AND fc.corpus_import_id IN %(allowed_corpora_ids)s ORDER BY d.last_modified DESC, d.created DESC, From 1a6dc04ff9a408d90991e112987c2df0a90cf13a Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:17:44 +0000 Subject: [PATCH 12/22] Add sql linting --- .trunk/trunk.yaml | 46 +++++++++++++++++++++++++++++- app/repository/document.py | 20 +++++++++++-- app/repository/helpers.py | 2 +- app/repository/sql/slug_lookup.sql | 8 +++--- makefile-docker.defs | 11 +++---- 5 files changed, 73 insertions(+), 14 deletions(-) diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 531b59a2..01ba24cc 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -5,6 +5,13 @@ version: 0.1 cli: version: 1.22.0 +tools: + definitions: + - name: sqlfluff + runtime: python + package: sqlfluff + shims: [sqlfluff] + known_good_version: 1.4.5 # Trunk provides extensibility via plugins. # (https://docs.trunk.io/plugins) plugins: @@ -34,6 +41,43 @@ lint: - name: lint run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target} + - name: sqlfluff + files: [sql, sql-j2, dml, ddl] + tools: [sqlfluff] + description: A dialect-flexible and configurable SQL linter + known_good_version: 1.4.5 + direct_configs: + - .sqlfluff + affects_cache: + - pyproject.toml + suggest_if: config_present + commands: + - name: lint + run: sqlfluff lint ${target} --format json --nofail + output: sarif + success_codes: [0] + read_output_from: stdout + parser: + runtime: python + run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py + - name: fix + version: ">=3.0.0" + run: sqlfluff fix ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + - name: format + run: sqlfluff format ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + 
ignore: - linters: [ALL] paths: @@ -46,7 +90,7 @@ lint: enabled: - sqlfluff@3.2.5: - commands: [lint, fix] + commands: [lint, fix, format] - actionlint@1.6.27 - bandit@1.7.8 - black@24.4.2 diff --git a/app/repository/document.py b/app/repository/document.py index bb8f703d..5fbb151f 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -22,8 +22,9 @@ from db_client.models.dfce.metadata import FamilyMetadata from db_client.models.document.physical_document import PhysicalDocument from db_client.models.organisation.organisation import Organisation -from sqlalchemy import func +from sqlalchemy import ARRAY, bindparam, func, text from sqlalchemy.orm import Session +from sqlalchemy.types import String from app.models.document import ( CollectionOverviewResponse, @@ -35,7 +36,7 @@ LinkableFamily, ) from app.repository.geography import get_geo_subquery -from app.repository.helpers import get_query_template +from app.repository.helpers import get_query_template, render_query from app.repository.lookups import doc_type_from_family_document_metadata from app.service.util import to_cdn_url @@ -62,8 +63,21 @@ def get_slugged_objects( query_template = get_query_template( os.path.join("app", "repository", "sql", "slug_lookup.sql") ) + # query_template = text(query_template).bindparams( + # bindparam("slug_name", value=slug, type_=String), + # bindparam( + # "allowed_corpora_ids", value=allowed_corpora + # # , type_=ARRAY(String) + # ), + # ) + + # # Log the compiled SQL query + # compiled_query = query_template.compile( + # dialect=db.bind.dialect, compile_kwargs={"literal_binds": True} + # ) + # _LOGGER.info("🔍 Compiled SQL Query: %s", render_query(compiled_query, db)) query = db.execute( - query_template, {"slug_name": slug, "allowed_corpora_ids": slug} + query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora} ) else: query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( diff --git a/app/repository/helpers.py 
b/app/repository/helpers.py index cdbb9587..a74f5377 100644 --- a/app/repository/helpers.py +++ b/app/repository/helpers.py @@ -1,6 +1,6 @@ """Helper functions for the repository layer.""" -from datetime import date, timedelta, datetime +from datetime import date, datetime, timedelta from functools import lru_cache from sqlalchemy.orm import Query diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 2bfd363a..ab1b6184 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -7,8 +7,8 @@ FROM LEFT JOIN family_corpus ON family.import_id = family_corpus.family_import_id LEFT JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id WHERE - slug.name = ':slug_name' - AND corpus.import_id IN ':allowed_corpora_ids' + slug.name=:slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) UNION SELECT slug.family_document_import_id, @@ -22,5 +22,5 @@ FROM LEFT JOIN family_corpus ON family.import_id = family_corpus.family_import_id LEFT JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id WHERE - slug.name = ':slug_name' - AND corpus.import_id IN ':allowed_corpora_ids'; + slug.name=:slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) diff --git a/makefile-docker.defs b/makefile-docker.defs index 276a67f0..32d36b71 100644 --- a/makefile-docker.defs +++ b/makefile-docker.defs @@ -104,7 +104,7 @@ vespa_setup: vespa_confirm_cli_installed vespa_healthy vespa_deploy_schema vespa .ONESHELL: test_search: - docker compose \ + SQLALCHEMY_WARN_20=1 docker compose \ -f docker-compose.yml \ -f docker-compose.dev.yml \ run --rm --name search_test \ @@ -114,17 +114,18 @@ test_search: -m 'search' ${ARGS} test_cors: - docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'cors' ${ARGS} + SQLALCHEMY_WARN_20=1 docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'cors' ${ARGS} test_unit: - docker compose -f 
docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/unit ${ARGS} + SQLALCHEMY_WARN_20=1 docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/unit ${ARGS} test_non_search: - docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'not search' ${ARGS} + SQLALCHEMY_WARN_20=1 docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'not search' ${ARGS} test: - docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv ${ARGS} + SQLALCHEMY_WARN_20=1 docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/non_search/routers/documents/test_get_document.py ${ARGS} +#docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/search/vespa/test_whole_database_download.py ${ARGS} # ---------------------------------- # tasks # ---------------------------------- From 6b98ade86cbc99ebad29e5309bb52ef695f756cb Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:45:38 +0000 Subject: [PATCH 13/22] Fixed slug_lookup query --- app/repository/document.py | 23 +++++---------- app/repository/sql/slug_lookup.sql | 47 +++++++++++++++++------------- 2 files changed, 34 insertions(+), 36 deletions(-) diff --git a/app/repository/document.py b/app/repository/document.py index 5fbb151f..a97f84f7 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -22,7 +22,7 @@ from db_client.models.dfce.metadata import FamilyMetadata from db_client.models.document.physical_document import PhysicalDocument from db_client.models.organisation.organisation import Organisation -from sqlalchemy import ARRAY, bindparam, func, text +from sqlalchemy import bindparam, func, text from sqlalchemy.orm import Session from sqlalchemy.types import String @@ -36,7 +36,7 @@ 
LinkableFamily, ) from app.repository.geography import get_geo_subquery -from app.repository.helpers import get_query_template, render_query +from app.repository.helpers import get_query_template from app.repository.lookups import doc_type_from_family_document_metadata from app.service.util import to_cdn_url @@ -59,23 +59,14 @@ def get_slugged_objects( :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or the Family import_id. """ - if allowed_corpora is not None: + if allowed_corpora not in [None, []]: query_template = get_query_template( os.path.join("app", "repository", "sql", "slug_lookup.sql") ) - # query_template = text(query_template).bindparams( - # bindparam("slug_name", value=slug, type_=String), - # bindparam( - # "allowed_corpora_ids", value=allowed_corpora - # # , type_=ARRAY(String) - # ), - # ) - - # # Log the compiled SQL query - # compiled_query = query_template.compile( - # dialect=db.bind.dialect, compile_kwargs={"literal_binds": True} - # ) - # _LOGGER.info("🔍 Compiled SQL Query: %s", render_query(compiled_query, db)) + query_template = text(query_template).bindparams( + bindparam("slug_name", type_=String), + bindparam("allowed_corpora_ids", value=allowed_corpora, expanding=True), + ) query = db.execute( query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora} ) diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index ab1b6184..31606cd8 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -1,26 +1,33 @@ -SELECT +-- First query for family document slugs +SELECT DISTINCT slug.family_document_import_id, slug.family_import_id -FROM - slug - LEFT JOIN family ON slug.family_import_id = family.import_id - LEFT JOIN family_corpus ON family.import_id = family_corpus.family_import_id - LEFT JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id +FROM slug + INNER JOIN family_document + ON slug.family_document_import_id = 
family_document.import_id + INNER JOIN family + ON family_document.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id WHERE - slug.name=:slug_name - AND corpus.import_id = ANY(:allowed_corpora_ids) + slug.name = :slug_name + AND corpus.import_id IN :allowed_corpora_ids + UNION -SELECT - slug.family_document_import_id, + +-- Second query for family slugs +SELECT DISTINCT + NULL AS family_document_import_id, slug.family_import_id -FROM - slug - LEFT JOIN - family_document - ON slug.family_document_import_id = family_document.import_id - LEFT JOIN family ON family_document.import_id = family.import_id - LEFT JOIN family_corpus ON family.import_id = family_corpus.family_import_id - LEFT JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id +FROM slug + INNER JOIN family + ON slug.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id WHERE - slug.name=:slug_name - AND corpus.import_id = ANY(:allowed_corpora_ids) + slug.name = :slug_name + AND corpus.import_id IN :allowed_corpora_ids From 8061866776aee7fab542840a19c3bcf88a004cf8 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 11:15:24 +0000 Subject: [PATCH 14/22] Fix query param binding --- app/repository/document.py | 21 +++++++++++---------- app/repository/sql/slug_lookup.sql | 4 ++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/app/repository/document.py b/app/repository/document.py index a97f84f7..73d19d47 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -1,8 +1,4 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. 
-""" +"""Database helper functions for the documents entity.""" import logging import os @@ -24,7 +20,7 @@ from db_client.models.organisation.organisation import Organisation from sqlalchemy import bindparam, func, text from sqlalchemy.orm import Session -from sqlalchemy.types import String +from sqlalchemy.types import ARRAY, String from app.models.document import ( CollectionOverviewResponse, @@ -60,12 +56,17 @@ def get_slugged_objects( import id or the Family import_id. """ if allowed_corpora not in [None, []]: - query_template = get_query_template( - os.path.join("app", "repository", "sql", "slug_lookup.sql") + query_template = text( + get_query_template( + os.path.join("app", "repository", "sql", "slug_lookup.sql") + ) ) - query_template = text(query_template).bindparams( + + query_template = query_template.bindparams( bindparam("slug_name", type_=String), - bindparam("allowed_corpora_ids", value=allowed_corpora, expanding=True), + bindparam( + "allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String) + ), ) query = db.execute( query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora} diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 31606cd8..09cb2e69 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -13,7 +13,7 @@ FROM slug ON family_corpus.corpus_import_id = corpus.import_id WHERE slug.name = :slug_name - AND corpus.import_id IN :allowed_corpora_ids + AND corpus.import_id = ANY(:allowed_corpora_ids) UNION @@ -30,4 +30,4 @@ FROM slug ON family_corpus.corpus_import_id = corpus.import_id WHERE slug.name = :slug_name - AND corpus.import_id IN :allowed_corpora_ids + AND corpus.import_id = ANY(:allowed_corpora_ids) From d913d522ee3e001be72917f6e8096ec84c730865 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 11:41:47 +0000 Subject: [PATCH 15/22] Update .sqlfluff --- .trunk/configs/.sqlfluff | 
10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.trunk/configs/.sqlfluff b/.trunk/configs/.sqlfluff index 62191d86..de7aacd3 100644 --- a/.trunk/configs/.sqlfluff +++ b/.trunk/configs/.sqlfluff @@ -5,9 +5,17 @@ exclude_rules = LT02, LT09 [sqlfluff:indentation] indented_ctes = True +[sqlfluff:layout:type:colon] +spacing_before = single +spacing_after = single + +[sqlfluff:layout:type:parameter] +spacing_before = touch +spacing_after = any + [sqlfluff:rules:references.special_chars] allow_space_in_identifier = True -additional_allowed_characters = ["/"] +additional_allowed_characters = ["/", "_", "-", "(", ")"] [sqlfluff:rules:capitalisation.keywords] capitalisation_policy = upper From a94f35727d3a5bac6496151cfe450cb351c37813 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 11:42:27 +0000 Subject: [PATCH 16/22] Rename keyword identifier to non keyword identifier --- app/repository/sql/download.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 2b3302ee..c71c2e7c 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -218,7 +218,7 @@ SELECT n1.collection_descriptions AS "Collection Description(s)", d.variant_name AS "Document Variant", p.source_url AS "Document Content URL", - n2.language AS "Language", + language_agg.display_name AS "Language", o.name AS "Source", fg.geo_isos AS "Geography ISOs", fg.geo_display_values AS "Geographies", @@ -313,7 +313,7 @@ FROM ';' ORDER BY l.name - ) AS language + ) AS display_name FROM physical_document AS p LEFT JOIN @@ -322,7 +322,7 @@ FROM LEFT JOIN language AS l ON pdl.language_id = l.id GROUP BY p.id - ) AS n2 ON d.physical_document_id = n2.id + ) AS language_agg ON d.physical_document_id = language_agg.id FULL JOIN ( SELECT family_event.family_import_id, From 70ad7ed73ea692c6ab1be8c672e68f24f18aee29 Mon Sep 17 
00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:32:18 +0000 Subject: [PATCH 17/22] Revert makefile changes --- makefile-docker.defs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/makefile-docker.defs b/makefile-docker.defs index 32d36b71..511c1b4b 100644 --- a/makefile-docker.defs +++ b/makefile-docker.defs @@ -114,18 +114,17 @@ test_search: -m 'search' ${ARGS} test_cors: - SQLALCHEMY_WARN_20=1 docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'cors' ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'cors' ${ARGS} test_unit: - SQLALCHEMY_WARN_20=1 docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/unit ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/unit ${ARGS} test_non_search: - SQLALCHEMY_WARN_20=1 docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'not search' ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'not search' ${ARGS} test: - SQLALCHEMY_WARN_20=1 docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/non_search/routers/documents/test_get_document.py ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests ${ARGS} -#docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests/search/vespa/test_whole_database_download.py ${ARGS} # ---------------------------------- # tasks # ---------------------------------- From a42458665b09026eae1a404b2e086bd932b82361 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:32:44 +0000 Subject: [PATCH 18/22] Fix linting errors --- 
app/repository/sql/download.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index c71c2e7c..15bbf8ac 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -346,8 +346,8 @@ FROM ON f.import_id = fs.family_import_id LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id WHERE - d.last_modified < %(ingest_cycle_start)s - AND fc.corpus_import_id IN %(allowed_corpora_ids)s + d.last_modified < :ingest_cycle_start + AND fc.corpus_import_id = ANY(:allowed_corpora_ids) ORDER BY d.last_modified DESC, d.created DESC, From 6d9a219442439a25e473065e3b3feb52386edf48 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:33:41 +0000 Subject: [PATCH 19/22] Update download query logic based on linting --- app/repository/download.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/app/repository/download.py b/app/repository/download.py index 5209d199..33592cc0 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -5,6 +5,8 @@ import pandas as pd from fastapi import Depends +from sqlalchemy import bindparam, text +from sqlalchemy.types import ARRAY, DATETIME, String from app.clients.db.session import get_db from app.repository.helpers import get_query_template @@ -15,24 +17,31 @@ def get_whole_database_dump( ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) ): - """Get whole database dump and bind variables. + """Get whole database dump and bind variables. :param str ingest_cycle_start: The current ingest cycle date. :param list[str] corpora_ids: The corpora from which we should allow the data to be dumped. - :return str: The SQL query to perform on the database session. 
+ :return pd.DataFrame: A DataFrame containing the results of the SQL + query that gets the whole database dump in our desired format. """ - query_template = get_query_template( - os.path.join("app", "repository", "sql", "download.sql") + query = text( + get_query_template(os.path.join("app", "repository", "sql", "download.sql")) + ).bindparams( + bindparam("ingest_cycle_start", type_=DATETIME), + bindparam( + "allowed_corpora_ids", value=allowed_corpora_ids, type_=ARRAY(String) + ), ) with db.connection() as conn: - df = pd.read_sql( - query_template, - conn.connection, - params={ + result = conn.execute( + query, + { "ingest_cycle_start": ingest_cycle_start, - "allowed_corpora_ids": tuple(allowed_corpora_ids), + "allowed_corpora_ids": allowed_corpora_ids, }, ) + columns = result.keys() + df = pd.DataFrame(result.fetchall(), columns=columns) return df From 25879b8beb2e90d9234ad1c346769514863d2181 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:45:54 +0000 Subject: [PATCH 20/22] Update formatting --- .trunk/trunk.yaml | 4 ++++ app/repository/helpers.py | 15 +++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 24a8e935..31ff5439 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -12,6 +12,7 @@ tools: package: sqlfluff shims: [sqlfluff] known_good_version: 1.4.5 + # Trunk provides extensibility via plugins. 
# (https://docs.trunk.io/plugins) plugins: @@ -34,6 +35,7 @@ lint: disabled: - hadolint - oxipng + definitions: - name: bandit direct_configs: [bandit.yaml] @@ -60,6 +62,7 @@ lint: parser: runtime: python run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py + - name: fix version: ">=3.0.0" run: sqlfluff fix ${target} --disable-progress-bar @@ -69,6 +72,7 @@ lint: success_codes: [0, 1] enabled: false batch: true + - name: format run: sqlfluff format ${target} --disable-progress-bar output: rewrite diff --git a/app/repository/helpers.py b/app/repository/helpers.py index a74f5377..652d07de 100644 --- a/app/repository/helpers.py +++ b/app/repository/helpers.py @@ -7,12 +7,15 @@ def render_query(statement, db_session): - """ - Generate an SQL expression string with bound parameters rendered inline - for the given SQLAlchemy statement. - WARNING: This method of escaping is insecure, incomplete, and for debugging - purposes only. Executing SQL statements with inline-rendered user values is - extremely insecure. + """Generate a string representing a query with bound parameters. + + Generate an SQL expression string with bound parameters rendered + inline inline for the given SQLAlchemy statement. + + WARNING: This method of escaping is insecure, incomplete, and for + debugging purposes only. Executing SQL statements with + inline-rendered user values is extremely insecure. 
+ Based on http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query """ if isinstance(statement, Query): From 7b3b4df72475aeb39463373256f6d5871707db84 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:47:36 +0000 Subject: [PATCH 21/22] Remove debug function --- app/repository/helpers.py | 56 --------------------------------------- 1 file changed, 56 deletions(-) diff --git a/app/repository/helpers.py b/app/repository/helpers.py index 652d07de..958e0b38 100644 --- a/app/repository/helpers.py +++ b/app/repository/helpers.py @@ -1,63 +1,7 @@ """Helper functions for the repository layer.""" -from datetime import date, datetime, timedelta from functools import lru_cache -from sqlalchemy.orm import Query - - -def render_query(statement, db_session): - """Generate a string representing a query with bound parameters. - - Generate an SQL expression string with bound parameters rendered - inline inline for the given SQLAlchemy statement. - - WARNING: This method of escaping is insecure, incomplete, and for - debugging purposes only. Executing SQL statements with - inline-rendered user values is extremely insecure. 
- - Based on http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query - """ - if isinstance(statement, Query): - statement = statement.statement - dialect = db_session.bind.dialect - - class LiteralCompiler(dialect.statement_compiler): - def visit_bindparam( - self, bindparam, within_columns_clause=False, literal_binds=False, **kwargs - ): - return self.render_literal_value(bindparam.value, bindparam.type) - - def render_array_value(self, val, item_type): - if isinstance(val, list): - return "{}".format( - ",".join([self.render_array_value(x, item_type) for x in val]) - ) - return self.render_literal_value(val, item_type) - - def render_literal_value(self, value, type_): - if value is None: - return None - if isinstance(value, int): - return str(value) - if isinstance(value, (str, date, datetime, timedelta)): - return "'{}'".format(str(value).replace("'", "''")) - if isinstance(value, (list)): - return "'{{{}}}'".format( - ",".join( - [self.render_array_value(x, type_.item_type) for x in value] - ) - ) - if isinstance(value, tuple): - return "'{{{}}}'".format( - ",".join( - [self.render_array_value(x, type_.item_type) for x in value] - ) - ) - return super(LiteralCompiler, self).render_literal_value(value, type_) - - return LiteralCompiler(dialect, statement).process(statement) - @lru_cache() def get_query_template(filepath: str) -> str: From e16aedbc0d449f09a1d2128de14ffe6b0a47a2f3 Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:48:37 +0000 Subject: [PATCH 22/22] Revert SQLalchemy 2 --- makefile-docker.defs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/makefile-docker.defs b/makefile-docker.defs index 511c1b4b..b41a9358 100644 --- a/makefile-docker.defs +++ b/makefile-docker.defs @@ -104,7 +104,7 @@ vespa_setup: vespa_confirm_cli_installed vespa_healthy vespa_deploy_schema vespa .ONESHELL: test_search: - SQLALCHEMY_WARN_20=1 docker compose \ + docker compose 
\ -f docker-compose.yml \ -f docker-compose.dev.yml \ run --rm --name search_test \