Skip to content

Commit

Permalink
Merge pull request #203 from spraakbanken/feat-cli-add-entries-validate
Browse files Browse the repository at this point in the history
add cli command `entries validate`
  • Loading branch information
kod-kristoff authored May 3, 2022
2 parents 9fe05fa + 996b88c commit 371918e
Show file tree
Hide file tree
Showing 9 changed files with 582 additions and 92 deletions.
Empty file added data/.gitkeep
Empty file.
104 changes: 102 additions & 2 deletions karp/cliapp/subapps/entries_subapp.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
import collections.abc
import logging
from pathlib import Path
import sys
from typing import Optional


import json_streams
import json_streams.jsonlib
from sb_json_tools import jt_val
import typer
from tabulate import tabulate

# from tabulate import tabulate
from tqdm import tqdm

from karp.foundation.commands import CommandBus
from karp import lex
from karp.utility import json_schema

# from karp.lex.domain.errors import ResourceAlreadyPublished

Expand Down Expand Up @@ -82,9 +89,102 @@ def export_entries(
unit=" entries",
),
output,
use_stdout_as_default=None,
use_stdout_as_default=True,
)


class Counter(collections.abc.Generator):
def __init__(self, sink) -> None:
self._counter: int = 0
self._sink = sink

@property
def counter(self) -> int:
return self._counter

def send(self, value):
self._counter += 1
self._sink.send(value)

def throw(self, typ=None, val=None, tb=None):
raise StopIteration


@subapp.command("validate")
@cli_error_handler
@cli_timer
def validate_entries(
ctx: typer.Context,
path: Optional[Path] = typer.Argument(None),
config_path: Optional[Path] = typer.Option(
None,
"--config",
"-c",
help="resource config",
),
resource_id_raw: Optional[str] = typer.Option(None, "--resource_id"),
output: Optional[Path] = typer.Option(
None, "--output", "-o", help="file to write to"
),
):
typer.echo(f"reading from {path if path else 'stdin'} ...", err=True)
err_output = None

if not output and path:
output = Path(f"{path}.v6.jsonl")

if not err_output and output:
err_output = Path(f"{output}.errors.jsonl")

if config_path and resource_id_raw:
typer.echo("You can't provide both '--resource_id' and '--config/-c'", err=True)
raise typer.Exit(301)

if config_path:
config = json_streams.jsonlib.load_from_file(config_path)
elif resource_id_raw:
repo = inject_from_ctx(lex.ReadOnlyResourceRepository, ctx=ctx)
resource = repo.get_by_resource_id(resource_id_raw)
if resource:
config = resource.config
else:
typer.echo(f"Can't find resource '{resource_id_raw}'", err=True)
raise typer.Exit(302)
else:
typer.echo("You must provide either '--resource_id' or '--config/-c'", err=True)
raise typer.Exit(code=300)

schema = json_schema.create_entry_json_schema(config["fields"])

error_code = 0

with json_streams.sink_from_file(
err_output, use_stderr_as_default=True
) as error_sink, json_streams.sink_from_file(
output, use_stdout_as_default=True
) as correct_sink:
error_counter = Counter(error_sink)
# error_counter.send(None)
jt_val.processing_validate(
schema,
tqdm(
json_streams.load_from_file(path, use_stdin_as_default=True),
desc="Validating",
unit=" entries",
),
on_ok=correct_sink,
on_error=error_counter,
)
if error_counter.counter > 0:
error_code = 130
print(
f'{error_counter.counter} entries failed validation, see "{err_output}"',
file=sys.stderr,
)

if error_code:
raise typer.Exit(error_code)


def init_app(app):
app.add_typer(subapp, name="entries")
19 changes: 18 additions & 1 deletion karp/lex/application/queries/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,25 @@ def query(self, resource_id: str) -> UniqueId:


class ReadOnlyResourceRepository(abc.ABC):
def get_by_resource_id(
self, resource_id: str, version: Optional[int] = None
) -> Optional[ResourceDto]:
resource = self._get_by_resource_id(resource_id)
if not resource:
return None

if version is not None:
resource = self.get_by_id(resource.entity_id, version=version)
return resource

@abc.abstractmethod
def get_by_id(
self, entity_id: UniqueId, version: Optional[int] = None
) -> Optional[ResourceDto]:
pass

@abc.abstractmethod
def get_by_resource_id(self, resource_id: str, version: Optional[int] = None) -> Optional[ResourceDto]:
def _get_by_resource_id(self, resource_id: str) -> Optional[ResourceDto]:
pass

@abc.abstractmethod
Expand Down
3 changes: 1 addition & 2 deletions karp/lex/domain/value_objects/entry_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ def validate_entry(self, json_obj: dict):
self._compiled_schema(json_obj)
except fastjsonschema.JsonSchemaException as e:
logger.warning(
"Entry not valid",
extra={'entry': json_obj, 'error_message': str(e)}
"Entry not valid", extra={"entry": json_obj, "error_message": str(e)}
)
raise errors.InvalidEntry() from e
144 changes: 82 additions & 62 deletions karp/lex_infrastructure/queries/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,120 +2,140 @@

import sqlalchemy as sa
from sqlalchemy import sql
from karp.foundation.value_objects.unique_id import UniqueId

from karp.lex.application.queries import GetPublishedResources, ResourceDto, GetResources
from karp.lex.application.queries import (
GetPublishedResources,
ResourceDto,
GetResources,
)
from karp.lex.application.queries.resources import ReadOnlyResourceRepository
from karp.lex.domain.entities import resource
from karp.lex_infrastructure.sql.sql_models import ResourceModel
from karp.lex_infrastructure.queries.base import SqlQuery


class SqlGetPublishedResources(
GetPublishedResources,
SqlQuery
):
class SqlGetPublishedResources(GetPublishedResources, SqlQuery):
def query(self) -> Iterable[ResourceDto]:
subq = sql.select(
ResourceModel.entity_id,
sa.func.max(ResourceModel.last_modified).label('maxdate')
).group_by(ResourceModel.entity_id).subquery('t2')
subq = (
sql.select(
ResourceModel.entity_id,
sa.func.max(ResourceModel.last_modified).label("maxdate"),
)
.group_by(ResourceModel.entity_id)
.subquery("t2")
)

stmt = sql.select(ResourceModel).join(
subq,
sa.and_(
ResourceModel.entity_id == subq.c.entity_id,
ResourceModel.last_modified == subq.c.maxdate,
ResourceModel.is_published == True
)
)
return (
_row_to_dto(row)
for row in self._conn.execute(stmt)
ResourceModel.is_published == True,
),
)
return (_row_to_dto(row) for row in self._conn.execute(stmt))


class SqlGetResources(
GetResources,
SqlQuery
):
class SqlGetResources(GetResources, SqlQuery):
def query(self) -> Iterable[ResourceDto]:
subq = sql.select(
ResourceModel.entity_id,
sa.func.max(ResourceModel.last_modified).label('maxdate')
).group_by(ResourceModel.entity_id).subquery('t2')
subq = (
sql.select(
ResourceModel.entity_id,
sa.func.max(ResourceModel.last_modified).label("maxdate"),
)
.group_by(ResourceModel.entity_id)
.subquery("t2")
)

stmt = sql.select(ResourceModel).join(
subq,
sa.and_(
ResourceModel.entity_id == subq.c.entity_id,
ResourceModel.last_modified == subq.c.maxdate,
)
)
return (
_row_to_dto(row)
for row in self._conn.execute(stmt)
),
)
return (_row_to_dto(row) for row in self._conn.execute(stmt))


class SqlReadOnlyResourceRepository(
ReadOnlyResourceRepository,
SqlQuery
):
def get_by_resource_id(
self,
resource_id: str,
version: Optional[int] = None
class SqlReadOnlyResourceRepository(ReadOnlyResourceRepository, SqlQuery):
def get_by_id(
self, entity_id: UniqueId, version: Optional[int] = None
) -> Optional[ResourceDto]:
filters: dict[str, str | int] = {
'resource_id': resource_id
}
filters: dict[str, UniqueId | str | int] = {"entity_id": entity_id}
if version:
filters['version'] = version
stmt = sql.select(
ResourceModel
).filter_by(**filters).order_by(
ResourceModel.last_modified.desc()
filters["version"] = version
stmt = (
sql.select(ResourceModel)
.filter_by(**filters)
.order_by(ResourceModel.last_modified.desc())
)
print(f"stmt={str(stmt)}")
row = self._conn.execute(stmt).first()

return _row_to_dto(row) if row else None

def get_published_resources(self) -> Iterable[ResourceDto]:
subq = sql.select(
ResourceModel.entity_id,
sa.func.max(ResourceModel.last_modified).label('maxdate')
).group_by(ResourceModel.entity_id).subquery('t2')
def _get_by_resource_id(self, resource_id: str) -> Optional[ResourceDto]:
subq = (
sql.select(
ResourceModel.entity_id,
sa.func.max(ResourceModel.last_modified).label("maxdate"),
)
.group_by(ResourceModel.entity_id)
.subquery("t2")
)

stmt = sql.select(ResourceModel).join(
subq,
sa.and_(
ResourceModel.entity_id == subq.c.entity_id,
ResourceModel.last_modified == subq.c.maxdate,
ResourceModel.is_published == True
ResourceModel.resource_id == resource_id,
),
)
stmt = stmt.order_by(ResourceModel.last_modified.desc())
row = self._conn.execute(stmt).first()

return _row_to_dto(row) if row else None

def get_published_resources(self) -> Iterable[ResourceDto]:
subq = (
sql.select(
ResourceModel.entity_id,
sa.func.max(ResourceModel.last_modified).label("maxdate"),
)
.group_by(ResourceModel.entity_id)
.subquery("t2")
)
return (
_row_to_dto(row)
for row in self._conn.execute(stmt)

stmt = sql.select(ResourceModel).join(
subq,
sa.and_(
ResourceModel.entity_id == subq.c.entity_id,
ResourceModel.last_modified == subq.c.maxdate,
ResourceModel.is_published == True,
),
)
return (_row_to_dto(row) for row in self._conn.execute(stmt))

def get_all_resources(self) -> Iterable[ResourceDto]:
subq = sql.select(
ResourceModel.entity_id,
sa.func.max(ResourceModel.last_modified).label('maxdate')
).group_by(ResourceModel.entity_id).subquery('t2')
subq = (
sql.select(
ResourceModel.entity_id,
sa.func.max(ResourceModel.last_modified).label("maxdate"),
)
.group_by(ResourceModel.entity_id)
.subquery("t2")
)

stmt = sql.select(ResourceModel).join(
subq,
sa.and_(
ResourceModel.entity_id == subq.c.entity_id,
ResourceModel.last_modified == subq.c.maxdate,
)
)
return (
_row_to_dto(row)
for row in self._conn.execute(stmt)
),
)
return (_row_to_dto(row) for row in self._conn.execute(stmt))


def _row_to_dto(row_proxy) -> ResourceDto:
Expand Down
Loading

0 comments on commit 371918e

Please sign in to comment.