Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sql linting using SQLFluff #417

Merged
merged 23 commits into from
Nov 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .git-blame-ignore-revs
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,7 @@

# Updating the test data file for document passages to be indent=2
44624dcd1fa0835708bd9187a39bb0da8a31cd03

# Fix SQL query formatting
047766a85f086fc0986a6f2b49fee9d73fa219e8
ab3476708920c5760f058ec40d14d008f94f5bad
30 changes: 30 additions & 0 deletions .trunk/configs/.sqlfluff
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[sqlfluff]
dialect = postgres
exclude_rules = LT02, LT09
katybaulch marked this conversation as resolved.
Show resolved Hide resolved

[sqlfluff:indentation]
indented_ctes = True

[sqlfluff:layout:type:colon]
spacing_before = single
spacing_after = single

[sqlfluff:layout:type:parameter]
spacing_before = touch
spacing_after = any

[sqlfluff:rules:references.special_chars]
allow_space_in_identifier = True
additional_allowed_characters = ["/", "_", "-", "(", ")"]

[sqlfluff:rules:capitalisation.keywords]
capitalisation_policy = upper

[sqlfluff:rules:capitalisation.identifiers]
extended_capitalisation_policy = lower

[sqlfluff:rules:capitalisation.functions]
extended_capitalisation_policy = upper

[sqlfluff:rules:capitalisation.types]
extended_capitalisation_policy = upper
50 changes: 50 additions & 0 deletions .trunk/trunk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ version: 0.1
cli:
version: 1.22.0

tools:
definitions:
- name: sqlfluff
runtime: python
package: sqlfluff
shims: [sqlfluff]
known_good_version: 1.4.5

# Trunk provides extensibility via plugins.
# (https://docs.trunk.io/plugins)
plugins:
Expand All @@ -27,13 +35,53 @@ lint:
disabled:
- hadolint
- oxipng

definitions:
- name: bandit
direct_configs: [bandit.yaml]
commands:
- name: lint
run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target}

- name: sqlfluff
files: [sql, sql-j2, dml, ddl]
tools: [sqlfluff]
description: A dialect-flexible and configurable SQL linter
known_good_version: 1.4.5
direct_configs:
- .sqlfluff
affects_cache:
- pyproject.toml
suggest_if: config_present
commands:
- name: lint
run: sqlfluff lint ${target} --format json --nofail
output: sarif
success_codes: [0]
read_output_from: stdout
parser:
runtime: python
run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py

- name: fix
version: ">=3.0.0"
run: sqlfluff fix ${target} --disable-progress-bar
output: rewrite
formatter: true
in_place: true
success_codes: [0, 1]
enabled: false
batch: true

- name: format
run: sqlfluff format ${target} --disable-progress-bar
output: rewrite
formatter: true
in_place: true
success_codes: [0, 1]
enabled: false
batch: true

ignore:
- linters: [ALL]
paths:
Expand All @@ -45,6 +93,8 @@ lint:
- LICENSE.md

enabled:
- [email protected]:
commands: [lint, fix, format]
- [email protected]
- [email protected]
- [email protected]
Expand Down
45 changes: 17 additions & 28 deletions app/repository/document.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
"""
Functions to support the documents endpoints

old functions (non DFC) are moved to the deprecated_documents.py file.
"""
"""Database helper functions for the documents entity."""

import logging
import os
Expand All @@ -22,8 +18,9 @@
from db_client.models.dfce.metadata import FamilyMetadata
from db_client.models.document.physical_document import PhysicalDocument
from db_client.models.organisation.organisation import Organisation
from sqlalchemy import func
from sqlalchemy import bindparam, func, text
from sqlalchemy.orm import Session
from sqlalchemy.types import ARRAY, String

from app.models.document import (
CollectionOverviewResponse,
Expand All @@ -42,22 +39,6 @@
_LOGGER = logging.getLogger(__file__)


def get_slugged_object_from_allowed_corpora_query(
template_query, slug_name: str, allowed_corpora_ids: list[str]
) -> str:
"""Create download whole database query, replacing variables.

:param str ingest_cycle_start: The current ingest cycle date.
:param list[str] allowed_corpora_ids: The corpora from which we
should allow the data to be dumped.
:return str: The SQL query to perform on the database session.
"""
corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"
return template_query.replace("{slug_name}", slug_name).replace( # type: ignore
"{allowed_corpora_ids}", corpora_ids
) # type: ignore


def get_slugged_objects(
db: Session, slug: str, allowed_corpora: Optional[list[str]] = None
) -> tuple[Optional[str], Optional[str]]:
Expand All @@ -74,14 +55,22 @@ def get_slugged_objects(
:return tuple[Optional[str], Optional[str]]: the FamilyDocument
import id or the Family import_id.
"""
if allowed_corpora is not None:
query_template = get_query_template(
os.path.join("app", "repository", "sql", "slug_lookup.sql")
if allowed_corpora not in [None, []]:
query_template = text(
get_query_template(
katybaulch marked this conversation as resolved.
Show resolved Hide resolved
os.path.join("app", "repository", "sql", "slug_lookup.sql")
)
)

query_template = query_template.bindparams(
bindparam("slug_name", type_=String),
bindparam(
"allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String)
),
)
query = get_slugged_object_from_allowed_corpora_query(
query_template, slug, allowed_corpora
query = db.execute(
query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora}
)
query = db.execute(query)
else:
query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter(
Slug.name == slug
Expand Down
46 changes: 25 additions & 21 deletions app/repository/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,43 @@

import pandas as pd
from fastapi import Depends
from sqlalchemy import bindparam, text
from sqlalchemy.types import ARRAY, DATETIME, String

from app.clients.db.session import get_db
from app.repository.helpers import get_query_template

_LOGGER = getLogger(__name__)


def create_query(
template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str]
) -> str:
"""Create download whole database query, replacing variables.
def get_whole_database_dump(
ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db)
):
"""Get whole database dump and bind variables.

:param str ingest_cycle_start: The current ingest cycle date.
:param list[str] allowed_corpora_ids: The corpora from which we
:param list[str] corpora_ids: The corpora from which we
katybaulch marked this conversation as resolved.
Show resolved Hide resolved
should allow the data to be dumped.
:return str: The SQL query to perform on the database session.
:return pd.DataFrame: A DataFrame containing the results of the SQL
query that gets the whole database dump in our desired format.
"""
corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"
return template_query.replace( # type: ignore
"{ingest_cycle_start}", ingest_cycle_start
).replace(
"{allowed_corpora_ids}", corpora_ids
) # type: ignore


def get_whole_database_dump(
ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db)
):
query_template = get_query_template(
os.path.join("app", "repository", "sql", "download.sql")
query = text(
get_query_template(os.path.join("app", "repository", "sql", "download.sql"))
).bindparams(
bindparam("ingest_cycle_start", type_=DATETIME),
bindparam(
"allowed_corpora_ids", value=allowed_corpora_ids, type_=ARRAY(String)
),
)
query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids)

with db.connection() as conn:
df = pd.read_sql(query, conn.connection)
result = conn.execute(
query,
{
"ingest_cycle_start": ingest_cycle_start,
"allowed_corpora_ids": allowed_corpora_ids,
},
)
columns = result.keys()
df = pd.DataFrame(result.fetchall(), columns=columns)
return df
6 changes: 1 addition & 5 deletions app/repository/helpers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
"""
Functions to support the documents endpoints

old functions (non DFC) are moved to the deprecated_documents.py file.
"""
"""Helper functions for the repository layer."""

from functools import lru_cache

Expand Down
Loading
Loading