Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/efficient reporting #2516

Merged
merged 40 commits into from
Feb 29, 2024
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
0ee2a76
Change report interface to support multiple oois being handled
Donnype Feb 14, 2024
54b0e43
Add a query-many endpoint and rewrite the mail report to use it.
Donnype Feb 15, 2024
97bb105
Style
Donnype Feb 15, 2024
f7ef285
Fixes for the benchmark suite
Donnype Feb 15, 2024
32bac52
WIP: add where_in to query
Donnype Feb 15, 2024
b9b6f62
Add where_in functionality to the XTDB queries
Donnype Feb 15, 2024
e700349
Be able to match input sources back to their output.
Donnype Feb 15, 2024
f3be1fa
Fix benchmarking
Donnype Feb 15, 2024
ecab36d
Merge branch 'feature/efficient-reporting' of github.com:minvws/nl-ka…
Donnype Feb 16, 2024
66c8d31
Merge branch 'main' into feature/efficient-reporting
Donnype Feb 16, 2024
c9ca5ae
Add comments to code
Donnype Feb 16, 2024
d9e65a8
Style
Donnype Feb 16, 2024
012dccb
Fix unit test
Donnype Feb 16, 2024
c50dda3
Add collect_data method to web_system_report
Donnype Feb 16, 2024
9117c8d
Add collect_data method to name_server_report
Donnype Feb 16, 2024
53bbbbc
Add collect_data method to ipv6 report
Donnype Feb 16, 2024
748fcb7
Style
Donnype Feb 16, 2024
302123e
Merge branch 'main' into feature/efficient-reporting-for-all-reports
Donnype Feb 20, 2024
124f314
Merge remote-tracking branch 'origin/feature/efficient-reporting-for-…
Donnype Feb 20, 2024
44e7745
Merge branch 'feature/efficient-reporting-for-all-reports' into featu…
Donnype Feb 20, 2024
9f12f53
Merge branch 'main' into feature/efficient-reporting
Donnype Feb 20, 2024
8e7d92c
Style fix
Donnype Feb 20, 2024
9445e6f
Fix typing and an import
Donnype Feb 20, 2024
d0f65f6
PR feedback
Donnype Feb 22, 2024
31dcd48
PR comments
Donnype Feb 23, 2024
3799be7
Merge branch 'main' into feature/efficient-reporting
ammar92 Feb 27, 2024
344d02a
Update definitions.py
ammar92 Feb 27, 2024
fc60890
- Revert `any` call since mypy cries in terror for some unknown reason.
Donnype Feb 27, 2024
7bd0850
Fix unit test for ipv6 report
Donnype Feb 27, 2024
39da260
Fix integration tests
Donnype Feb 27, 2024
acf00fe
Style
Donnype Feb 27, 2024
4e9a9f1
Merge branch 'main' into feature/efficient-reporting
Donnype Feb 27, 2024
60f8e41
Add missing Error type while catching exceptions parsing query result…
Donnype Feb 27, 2024
955592b
Merge branch 'feature/efficient-reporting' of github.com:minvws/nl-ka…
Donnype Feb 27, 2024
87d59eb
Fix findings endpoint
Donnype Feb 28, 2024
d9f9c53
Remove old code and fix related bugs in the collect_data() method
Donnype Feb 28, 2024
07bc876
Fix bug ignoring web checks
Donnype Feb 28, 2024
f56176a
Fix: finding_types field default should be a dict, not a list
Donnype Feb 28, 2024
647501b
Merge branch 'main' into feature/efficient-reporting
Donnype Feb 29, 2024
7d2a04d
Fix merge conflict test issue
Donnype Feb 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bytes/bytes/api/root.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ def validation_exception_handler(_: Request, exc: RequestValidationError | Valid


@router.get("/", include_in_schema=False)
def health() -> RedirectResponse:
def root() -> RedirectResponse:
return RedirectResponse(url="/health")


@router.get("/health", response_model=ServiceHealth)
def root() -> ServiceHealth:
def health() -> ServiceHealth:
bytes_health = ServiceHealth(service="bytes", healthy=True, version=__version__)
return bytes_health

Expand Down
50 changes: 50 additions & 0 deletions octopoes/octopoes/api/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from octopoes.version import __version__
from octopoes.xtdb.client import XTDBSession
from octopoes.xtdb.exceptions import XTDBException
from octopoes.xtdb.query import A
from octopoes.xtdb.query import Query as XTDBQuery

logger = getLogger(__name__)
Expand Down Expand Up @@ -153,6 +154,55 @@ def query(
return octopoes.ooi_repository.query(xtdb_query, valid_time)


@router.get("/query-many", tags=["Objects"])
def query_many(
path: str,
sources: list[Reference] = Query(None),
octopoes: OctopoesService = Depends(octopoes_service),
valid_time: datetime = Depends(extract_valid_time),
offset: int = DEFAULT_OFFSET,
limit: int = DEFAULT_LIMIT,
):
if not sources:
return []

object_path = ObjectPath.parse(path)
if not object_path.segments:
raise HTTPException(status_code=400, detail="No path components provided.")

"""
How does this work and why do we do this?

We want to fetch all results but be able to tie these back to the source that was used for a result.
If we query "Network.hostname" for a list of Networks ids, how do we know which hostname lives on which network?
The answer is to add the network id to the "select" statement, so the result is of the form

[(network_id_1, hostname1), (network_id_2, hostname3), ...]

Because you can only select variables in Datalog, "network_id_1" needs to be an Alias. Hence `source_alias`.
We need to tie that to the Network primary_key and add a where-in clause. The example projected on the code:

q = XTDBQuery.from_path(object_path) # Adds "where ?Hostname.network = ?Network

q.find(source_alias).pull(query.result_type) # "select ?network_id, ?Hostname
.where(object_path.segments[0].source_type, primary_key=source_alias) # where ?Network.primary_key = ?network_id
.where_in(object_path.segments[0].source_type, primary_key=sources) # and ?Network.primary_key in ["1", ...]"
"""
Donnype marked this conversation as resolved.
Show resolved Hide resolved

q = XTDBQuery.from_path(object_path)
source_alias = A(object_path.segments[0].source_type, field="primary_key")

return octopoes.ooi_repository.query(
q.find(source_alias)
.pull(q.result_type)
.offset(offset)
.limit(limit)
.where(object_path.segments[0].source_type, primary_key=source_alias)
.where_in(object_path.segments[0].source_type, primary_key=sources),
valid_time,
)


@router.post("/objects/load_bulk", tags=["Objects"])
def load_objects_bulk(
octopoes: OctopoesService = Depends(octopoes_service),
Expand Down
24 changes: 22 additions & 2 deletions octopoes/octopoes/connector/octopoes.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,13 +268,13 @@ def query(
self,
path: str,
valid_time: datetime,
source: Reference | str | None = None,
source: OOI | Reference | str | None = None,
offset: int = DEFAULT_OFFSET,
limit: int = DEFAULT_LIMIT,
) -> list[OOI]:
params = {
"path": path,
"source": source,
"source": source.reference if isinstance(source, OOI) else source,
"valid_time": valid_time,
"offset": offset,
"limit": limit,
Expand All @@ -283,3 +283,23 @@ def query(
TypeAdapter(OOIType).validate_python(ooi)
for ooi in self.session.get(f"/{self.client}/query", params=params).json()
]

def query_many(
self,
path: str,
valid_time: datetime,
sources: list[OOI | Reference | str] = None,
Donnype marked this conversation as resolved.
Show resolved Hide resolved
offset: int = DEFAULT_OFFSET,
limit: int = DEFAULT_LIMIT,
Donnype marked this conversation as resolved.
Show resolved Hide resolved
) -> list[tuple[str, OOIType]]:
params = {
"path": path,
"sources": [str(ooi) for ooi in sources],
"valid_time": valid_time,
"offset": offset,
"limit": limit,
}

result = self.session.get(f"/{self.client}/query-many", params=params).json()

return TypeAdapter(list[tuple[str, OOIType]]).validate_python(result)
26 changes: 21 additions & 5 deletions octopoes/octopoes/repositories/ooi_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,12 +735,28 @@ def list_findings(
}}
"""

res = self.session.client.query(finding_query, valid_time)
findings = [self.deserialize(x[0]) for x in res]
return Paginated(
count=count,
items=findings,
items=self.query(finding_query, valid_time),
)

def query(self, query: Query, valid_time: datetime) -> list[OOI]:
return [self.deserialize(row[0]) for row in self.session.client.query(query, valid_time=valid_time)]
def query(self, query: str | Query, valid_time: datetime) -> list[OOI | tuple]:
results = self.session.client.query(query, valid_time=valid_time)

parsed_results = []
for result in results:
parsed_result = []

for item in result:
try:
parsed_result.append(self.deserialize(item))
except ValueError:
parsed_result.append(item)

if len(parsed_result) == 1:
parsed_results.append(parsed_result[0])
continue

parsed_results.append(tuple(parsed_result))

return parsed_results
75 changes: 63 additions & 12 deletions octopoes/octopoes/xtdb/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from octopoes.models import OOI
from octopoes.models.path import Direction, Path
from octopoes.models.types import get_abstract_types, get_relations, to_concrete
from octopoes.models.types import get_abstract_types, to_concrete


class InvalidField(ValueError):
Expand Down Expand Up @@ -41,6 +41,10 @@ class Aliased:
# https://stackoverflow.com/questions/61257658/python-dataclasses-mocking-the-default-factory-in-a-frozen-dataclass
alias: UUID = field(default_factory=lambda: uuid4())

# Sometimes an Alias refers to a plain field, not a whole model. The current solution is suboptimal
# as you can use aliases freely in Datalog but are now tied to the OOI types too much. TODO!
field: str | None = field(default=None)


Ref = type[OOI] | Aliased
A = Aliased
Expand Down Expand Up @@ -78,6 +82,14 @@ def where(self, ooi_type: Ref, **kwargs) -> "Query":

return self

def where_in(self, ooi_type: Ref, **kwargs) -> "Query":
"""Allows for filtering on multiple values for a specific field. Assumes these values are strings for now."""
Donnype marked this conversation as resolved.
Show resolved Hide resolved

for field_name, values in kwargs.items():
self._where_field_in(ooi_type, field_name, values)

return self

def format(self) -> str:
return self._compile(separator="\n ")

Expand Down Expand Up @@ -119,10 +131,19 @@ def from_path(cls, path: Path) -> "Query":
return query

def pull(self, ooi_type: Ref) -> "Query":
"""By default, we pull the target type. But when using find, count, etc., you have to pull explicitly."""

self._find_clauses.append(f"(pull {self._get_object_alias(ooi_type)} [*])")

return self

def find(self, item: Ref) -> "Query":
"""Add a find clause, so we can select specific fields in a query to be returned as well."""

self._find_clauses.append(self._get_object_alias(item))

return self

def count(self, ooi_type: Ref) -> "Query":
self._find_clauses.append(f"(count {self._get_object_alias(ooi_type)})")

Expand All @@ -146,17 +167,19 @@ def _where_field_is(self, ref: Ref, field_name: str, value: Ref | str | set[str]

abstract_types = get_abstract_types()

if isinstance(value, str):
value = value.replace('"', r"\"")

if ooi_type in abstract_types:
if isinstance(value, str):
value = value.replace('"', r"\"")
self._add_or_statement(ref, field_name, f'"{value}"')
self._add_or_statement_for_abstract_types(ref, field_name, f'"{value}"')
return

if not isinstance(value, type):
Donnype marked this conversation as resolved.
Show resolved Hide resolved
raise InvalidField(f"value '{value}' for abstract class fields should be a string or an OOI Type")

if issubclass(value, OOI):
self._add_or_statement(
self._add_or_statement_for_abstract_types(
ref,
field_name,
self._get_object_alias(
Expand All @@ -166,7 +189,6 @@ def _where_field_is(self, ref: Ref, field_name: str, value: Ref | str | set[str]
return

if isinstance(value, str):
value = value.replace('"', r"\"")
self._add_where_statement(ref, field_name, f'"{value}"')
return

Expand All @@ -176,11 +198,26 @@ def _where_field_is(self, ref: Ref, field_name: str, value: Ref | str | set[str]
if not isinstance(value, Aliased) and not issubclass(value, OOI):
raise InvalidField(f"{value} is not an OOI")

if field_name not in get_relations(ooi_type):
raise InvalidField(f'"{field_name}" is not a relation of {ooi_type.get_object_type()}')

self._add_where_statement(ref, field_name, self._get_object_alias(value))

def _where_field_in(self, ref: Ref, field_name: str, values: list[str]) -> None:
ooi_type = ref.type if isinstance(ref, Aliased) else ref

if field_name not in ooi_type.model_fields:
raise InvalidField(f'"{field_name}" is not a field of {ooi_type.get_object_type()}')

new_values = []
for value in values:
if not isinstance(value, str):
raise InvalidField("Only strings allowed as values for a WHERE IN statement for now.")

value = value.replace('"', r"\"")
new_values.append(f'"{value}"')

self._where_clauses.append(
self._or_statement_for_multiple_values(self._get_object_alias(ref), ooi_type, field_name, new_values)
)

def _add_where_statement(self, ref: Ref, field_name: str, to_alias: str) -> None:
ooi_type = ref.type if isinstance(ref, Aliased) else ref

Expand All @@ -194,27 +231,38 @@ def _add_where_statement(self, ref: Ref, field_name: str, to_alias: str) -> None
)
)

def _add_or_statement(self, ref: Ref, field_name: str, to_alias: str) -> None:
def _add_or_statement_for_abstract_types(self, ref: Ref, field_name: str, to_alias: str) -> None:
ooi_type = ref.type if isinstance(ref, Aliased) else ref

self._where_clauses.append(self._assert_type(ref, ooi_type))
self._where_clauses.append(
self._or_statement(
self._or_statement_for_abstract_types(
self._get_object_alias(ref),
ooi_type.strict_subclasses(),
field_name,
to_alias,
)
)

def _or_statement(self, from_alias: str, concrete_types: list[type[OOI]], field_name: str, to_alias: str) -> str:
def _or_statement_for_abstract_types(
self, from_alias: str, concrete_types: list[type[OOI]], field_name: str, to_alias: str
) -> str:
relationships = [
self._relationship(from_alias, concrete_type.get_object_type(), field_name, to_alias)
for concrete_type in concrete_types
]

return f"(or {' '.join(relationships)} )"

def _or_statement_for_multiple_values(
self, from_alias: str, ooi_type: type[OOI], field_name: str, to_aliases: list[str]
) -> str:
relationships = [
self._relationship(from_alias, ooi_type.get_object_type(), field_name, to_alias) for to_alias in to_aliases
]

return f"(or {' '.join(relationships)} )"

def _relationship(self, from_alias: str, field_type: str, field_name: str, to_alias: str) -> str:
return f"[ {from_alias} :{field_type}/{field_name} {to_alias} ]"

Expand Down Expand Up @@ -258,7 +306,10 @@ def _compile(self, *, separator=" ") -> str:

def _get_object_alias(self, object_type: Ref) -> str:
if isinstance(object_type, Aliased):
return "?" + str(object_type.alias)
base = "?" + str(object_type.alias)

# To have at least a way to separate aliases for types and plain fields in the raw query
return base if not object_type.field else base + "?" + object_type.field

return object_type.get_object_type()

Expand Down
19 changes: 17 additions & 2 deletions octopoes/tests/integration/test_api_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,23 @@ def test_query(octopoes_api_connector: OctopoesAPIConnector, valid_time: datetim
assert len(results) == 1
assert str(results[0].port) == "443"

results = octopoes_api_connector.query(query, valid_time, source=hostnames[0].reference)
results = octopoes_api_connector.query(query, valid_time, source=hostnames[0])
assert len(results) == 0

results = octopoes_api_connector.query(query, valid_time, source=hostnames[1].reference)
results = octopoes_api_connector.query(query, valid_time, source=hostnames[1])
assert len(results) == 1

query = "Hostname.<hostname[is DNSNSRecord]"
assert len(octopoes_api_connector.query(query, valid_time, hostnames[0])) == 1
assert len(octopoes_api_connector.query(query, valid_time, hostnames[1])) == 1
assert len(octopoes_api_connector.query(query, valid_time, hostnames[2])) == 1
assert len(octopoes_api_connector.query(query, valid_time, hostnames[3])) == 0

result = octopoes_api_connector.query_many(
query,
valid_time,
[hostnames[0], hostnames[1], hostnames[2], hostnames[3]],
)
assert len(result) == 3
assert result[0][0] == hostnames[0].reference
assert result[0][1] == dns_ns_records[0]
28 changes: 28 additions & 0 deletions octopoes/tests/integration/test_ooi_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@

import pytest

from octopoes.models.ooi.dns.zone import Hostname
from octopoes.models.ooi.network import Network
from octopoes.models.pagination import Paginated
from octopoes.models.path import Path
from octopoes.repositories.ooi_repository import XTDBOOIRepository
from octopoes.xtdb.query import A, Query

if os.environ.get("CI") != "1":
pytest.skip("Needs XTDB multinode container.", allow_module_level=True)
Expand All @@ -20,3 +23,28 @@ def test_list_oois(xtdb_ooi_repository: XTDBOOIRepository, valid_time: datetime)

# list() does not return any OOI without a scan profile
assert xtdb_ooi_repository.list_oois({Network}, valid_time) == Paginated(count=0, items=[])


def test_complex_query(xtdb_ooi_repository: XTDBOOIRepository, valid_time: datetime):
network = Network(name="testnetwork")
network2 = Network(name="testnetwork2")
xtdb_ooi_repository.save(network, valid_time)
xtdb_ooi_repository.save(network2, valid_time)
xtdb_ooi_repository.save(Hostname(network=network2.reference, name="testhostname"), valid_time)
xtdb_ooi_repository.session.commit()

# router logic
object_path = Path.parse("Network.<network[is Hostname]")
sources = ["Network|testnetwork", "Network|testnetwork2"]
source_pk_alias = A(object_path.segments[0].source_type, field="primary_key")
query = (
Query.from_path(object_path)
.find(source_pk_alias)
.pull(Network)
.where(Network, primary_key=source_pk_alias)
.where_in(Network, primary_key=sources)
)

result = xtdb_ooi_repository.query(query, valid_time)

assert len(result) == 1
Loading
Loading