From 5377ab95b74fbc24f8a3028dd79f2f876f56c447 Mon Sep 17 00:00:00 2001 From: Fabian Engelniederhammer Date: Wed, 20 Dec 2023 17:39:49 +0100 Subject: [PATCH] feat(docs): add more important parts of the arc42 docs #535 --- lapis2-docs/astro.config.mjs | 20 +++++-- .../{introduction.mdx => 01-introduction.mdx} | 8 +-- .../02-architecture-and-constraints.mdx | 16 +++++ ...xt.mdx => 03-system-scope-and-context.mdx} | 21 ++++--- .../04-solution-strategy.mdx | 58 +++++++++++++++++++ .../05-building-block-view.mdx | 11 ++++ .../06-runtime-view.mdx | 21 +++++++ .../{glossary.mdx => 99-glossary.mdx} | 6 +- .../architecture-and-constraints.mdx | 21 ------- lapis2-docs/src/plantuml/.gitignore | 1 + lapis2-docs/src/plantuml/README.md | 10 ++++ .../src/plantuml/building-block-view.puml | 16 +++++ .../src/plantuml/building-block-view.svg | 1 + lapis2-docs/src/plantuml/runtime-view.puml | 25 ++++++++ lapis2-docs/src/plantuml/runtime-view.svg | 1 + lapis2-docs/tests/docs.spec.ts | 3 + 16 files changed, 195 insertions(+), 44 deletions(-) rename lapis2-docs/src/content/docs/architecture-and-dev-docs/{introduction.mdx => 01-introduction.mdx} (92%) create mode 100644 lapis2-docs/src/content/docs/architecture-and-dev-docs/02-architecture-and-constraints.mdx rename lapis2-docs/src/content/docs/architecture-and-dev-docs/{system-scope-and-context.mdx => 03-system-scope-and-context.mdx} (56%) create mode 100644 lapis2-docs/src/content/docs/architecture-and-dev-docs/04-solution-strategy.mdx create mode 100644 lapis2-docs/src/content/docs/architecture-and-dev-docs/05-building-block-view.mdx create mode 100644 lapis2-docs/src/content/docs/architecture-and-dev-docs/06-runtime-view.mdx rename lapis2-docs/src/content/docs/architecture-and-dev-docs/{glossary.mdx => 99-glossary.mdx} (98%) delete mode 100644 lapis2-docs/src/content/docs/architecture-and-dev-docs/architecture-and-constraints.mdx create mode 100644 lapis2-docs/src/plantuml/.gitignore create mode 100644 
lapis2-docs/src/plantuml/README.md create mode 100644 lapis2-docs/src/plantuml/building-block-view.puml create mode 100644 lapis2-docs/src/plantuml/building-block-view.svg create mode 100644 lapis2-docs/src/plantuml/runtime-view.puml create mode 100644 lapis2-docs/src/plantuml/runtime-view.svg diff --git a/lapis2-docs/astro.config.mjs b/lapis2-docs/astro.config.mjs index 078f9afb9..1f0a8e5c2 100644 --- a/lapis2-docs/astro.config.mjs +++ b/lapis2-docs/astro.config.mjs @@ -88,19 +88,31 @@ export default defineConfig({ items: [ { label: 'Introduction and Goals', - link: '/architecture-and-dev-docs/introduction', + link: '/architecture-and-dev-docs/01-introduction', }, { label: 'Architecture and Constraints', - link: '/architecture-and-dev-docs/architecture-and-constraints', + link: '/architecture-and-dev-docs/02-architecture-and-constraints', }, { label: 'System Scope and Context', - link: '/architecture-and-dev-docs/system-scope-and-context', + link: '/architecture-and-dev-docs/03-system-scope-and-context', + }, + { + label: 'Solution Strategy', + link: '/architecture-and-dev-docs/04-solution-strategy', + }, + { + label: 'Building Block View', + link: '/architecture-and-dev-docs/05-building-block-view', + }, + { + label: 'Runtime View', + link: '/architecture-and-dev-docs/06-runtime-view', }, { label: 'Glossary', - link: '/architecture-and-dev-docs/glossary', + link: '/architecture-and-dev-docs/99-glossary', }, ], }, diff --git a/lapis2-docs/src/content/docs/architecture-and-dev-docs/introduction.mdx b/lapis2-docs/src/content/docs/architecture-and-dev-docs/01-introduction.mdx similarity index 92% rename from lapis2-docs/src/content/docs/architecture-and-dev-docs/introduction.mdx rename to lapis2-docs/src/content/docs/architecture-and-dev-docs/01-introduction.mdx index 03af83388..3df3a517e 100644 --- a/lapis2-docs/src/content/docs/architecture-and-dev-docs/introduction.mdx +++ b/lapis2-docs/src/content/docs/architecture-and-dev-docs/01-introduction.mdx @@ -5,7 +5,7 @@ 
description: Introduction to the architecture and the developers docs of LAPIS This document was inspired by the [arc42 template](https://arc42.org/). -It describes LAPIS (**L**ightweight **API** for **S**equences), +It describes LAPIS (**L**ightweight **API** for **S**equences) and SILO, which is a platform to give easy access to genomic sequence data alongside metadata of the sequenced probes. It is used to filter potentially large sequence data and return the result to the user through web access, so that a user can develop their own evaluation of the data. @@ -33,10 +33,8 @@ The following goals have been established for this system: | Requirement | | | --------------------------------------- | -------------------------------------------------------------------------------- | | Create an instance for a given organism | Create an instance of the whole system by giving a configuration for a organism. | -| Load data | Load sequence data that has to be provided in a defined format. | -| Store data | Store data in compressed form. | +| Store data efficiently | Store data in compressed form. | | Provide web access to data | Provide endpoints for custom user queries to the data. | -| Provide statistics | Provide monitoring and statistics of usage of the system. | ## Quality Goals @@ -49,7 +47,7 @@ The following goals have been established for this system: | Performance efficiency | Time behaviour | It is possible to query millions of sequences in less than a second. | | | Scalability | Performance (query response time, memory usage) grows at most linearly with the number of stored sequences. | | | | | -| Maintainability | Reusability | It is possible to use LAPIS with any other database that implements the SILO query language | +| Maintainability | Reusability | It is possible to use LAPIS with any other database that implements the SILO query language. | | | Testability | SILO-LAPIS is well tested on end to end scope. 
The tests serve as examples for users and maintainers. | ## Stakeholders diff --git a/lapis2-docs/src/content/docs/architecture-and-dev-docs/02-architecture-and-constraints.mdx b/lapis2-docs/src/content/docs/architecture-and-dev-docs/02-architecture-and-constraints.mdx new file mode 100644 index 000000000..27bd44057 --- /dev/null +++ b/lapis2-docs/src/content/docs/architecture-and-dev-docs/02-architecture-and-constraints.mdx @@ -0,0 +1,16 @@ +--- +title: Architecture and Constraints +description: Overview of the architecture and constraints of the software. +--- + +We identified the following constraints for our software: + +- Developed under an **open-source** licence. + We chose the tooling such that a broad spectrum of developers can in principle work on the software. +- The software is designed to be **highly configurable** so that it can be used for various organisms. + Configuration files have to be passed to LAPIS and SILO at runtime that determine the nature of the organism such as: + - a reference genome + - which metadata is available on the genomic data +- The system is designed to have the best possible **performance**. + This mostly targets SILO, but also in LAPIS, + we have to keep in mind that we are dealing with potentially large data that we have to serve to the client. 
diff --git a/lapis2-docs/src/content/docs/architecture-and-dev-docs/system-scope-and-context.mdx b/lapis2-docs/src/content/docs/architecture-and-dev-docs/03-system-scope-and-context.mdx similarity index 56% rename from lapis2-docs/src/content/docs/architecture-and-dev-docs/system-scope-and-context.mdx rename to lapis2-docs/src/content/docs/architecture-and-dev-docs/03-system-scope-and-context.mdx index e5297bb54..e296e9ac4 100644 --- a/lapis2-docs/src/content/docs/architecture-and-dev-docs/system-scope-and-context.mdx +++ b/lapis2-docs/src/content/docs/architecture-and-dev-docs/03-system-scope-and-context.mdx @@ -6,22 +6,21 @@ description: Putting LAPIS in the context of its surroundings This chapter specifies the boundaries of SILO-LAPIS and describes the interfaces to other systems and users. - **Configuration and operation**: - LAPIS is provisioned by the maintainer, who provides a configuration file and operates the instance + LAPIS is provisioned by the maintainer, who provides a configuration file and operates the instance. - **Data updates**: - the maintainer can trigger an update upon which LAPIS reads data from the disc and stores it internally - - the maintainer needs to make sure that the data is in the correct format - - providing correct data will most likely involve some preprocessing (e.g. using Nextstrain) + The maintainer can trigger an update upon which SILO reads data from disk and stores it internally. + - Raw data might, for example, be downloaded from GenBank. + - Providing correct data will most likely involve some preprocessing (e.g. using Nextstrain) to compute, for example, aligned sequences and insertions. + - The maintainer needs to make sure that the data is in the correct format.
- LAPIS can be accessed by the end-user through a **REST interface** - LAPIS has endpoints to fetch the following types of data: - - aggregated data - - returns how many sequences fulfill the provided filter criteria - - amino acid mutations, nucleotide mutations + - **Aggregated data**: returns how many sequences fulfill the provided filter criteria + - Amino acid and nucleotide **mutations** - returns a list of mutations (with their proportion and count) which fulfill the provided filter criteria - the proportion is relative to all sequences which fulfill the provided criteria - - details - - returns a list of metadata of the sequences which fulfill the provided criteria - - amino acid sequences, aligned and unaligned nucleotide sequences - - returns the corresponding sequences which fulfill the provided criteria + - **Details**: returns a list of metadata of the sequences which fulfill the provided criteria + - Amino acid, aligned and unaligned nucleotide **sequences**: + returns the corresponding sequences which fulfill the provided criteria - the provided filter criteria can be a compilation of metadata fields, mutations and insertions. - LAPIS offers an OpenAPI specification and a [Swagger UI](../references/open-api-definition) that documents the API. diff --git a/lapis2-docs/src/content/docs/architecture-and-dev-docs/04-solution-strategy.mdx b/lapis2-docs/src/content/docs/architecture-and-dev-docs/04-solution-strategy.mdx new file mode 100644 index 000000000..5f86b2b45 --- /dev/null +++ b/lapis2-docs/src/content/docs/architecture-and-dev-docs/04-solution-strategy.mdx @@ -0,0 +1,58 @@ +--- +title: Solution Strategy +description: How LAPIS and SILO aim to solve the problem +--- + +## Setting Up Your Own Instance + +We want to make it as easy as possible for you to set up your own instance of SILO-LAPIS for an organism of your +choice. 
+We solve this in two aspects: + +- **Configuration:** LAPIS and SILO are highly configurable regarding the data that they process. + The available data and the reference genome can be configured to fit your needs. +- **Deployment:** We provide Docker containers for LAPIS and SILO that are ready to use. + You only need to provide the data and the configuration. + We also provide examples and tutorials to help you get started. + +## Query Performance + +LAPIS and SILO are designed to process queries as fast as possible. +One should be able to search for mutations in millions of samples in a matter of seconds. + +SILO contains an in-memory database that holds the data. +The data is stored column-wise in bitmaps, +since the nature of most queries targets columns. + +Example: A common query is to search for a mutation at a certain position in the genome. +SILO stores each position in the genome as a separate column, +thus the filter becomes trivial (reading the respective precomputed bitmap). +The bitmap is interpreted as the filter result (having a `1` in the positions of the samples that match the filter). + +### Preprocessing + +Precomputing the bitmaps is a time-consuming task. +SILO does this ahead of time in a separate step, the preprocessing. +The preprocessing is a separate part of SILO that builds the in-memory database from the input files +and serializes it to disk. +At runtime, SILO can then load the serialized database from disk. +Having the preprocessing as a separate step has major advantages: + +- The preprocessing can be done on a different machine than the one that runs the queries. +- The startup time of SILO is reduced, since it only needs to load the database from disk. + - Scalability: Thus, it is possible to quickly launch several instances of SILO from the same preprocessing result. + +## Storage Efficiency + +SILO uses [Roaring bitmaps](https://roaringbitmap.org/) to store the data, +since they are designed to be space-efficient. 
+Internally, Roaring bitmaps store data in chunks. +SILO aims to sort sequences such that +similar sequences (i.e. sequences that have similar mutations) are stored in the same chunk. +The goal is to have many bitmaps that are either almost completely empty or almost completely full. +This will result in a very high compression ratio. + +## Easy Access To Data + +SILO offers a rather complex query language to query the data. +LAPIS aims to simplify the usage of SILO by providing a simple REST API. diff --git a/lapis2-docs/src/content/docs/architecture-and-dev-docs/05-building-block-view.mdx b/lapis2-docs/src/content/docs/architecture-and-dev-docs/05-building-block-view.mdx new file mode 100644 index 000000000..486e052a0 --- /dev/null +++ b/lapis2-docs/src/content/docs/architecture-and-dev-docs/05-building-block-view.mdx @@ -0,0 +1,11 @@ +--- +title: Building Block View +description: A view into SILO and LAPIS +--- + +The system consists of two artifacts: + +- LAPIS: A simple REST API. +- SILO: A more detailed view into SILO is depicted below. + +![Building Block View](../../../plantuml/building-block-view.svg) diff --git a/lapis2-docs/src/content/docs/architecture-and-dev-docs/06-runtime-view.mdx b/lapis2-docs/src/content/docs/architecture-and-dev-docs/06-runtime-view.mdx new file mode 100644 index 000000000..bd2aef26f --- /dev/null +++ b/lapis2-docs/src/content/docs/architecture-and-dev-docs/06-runtime-view.mdx @@ -0,0 +1,21 @@ +--- +title: Runtime View +description: Building Blocks And How They Interact At Runtime +--- + +SILO-LAPIS consists of three main components: + +- **LAPIS:** A web service wrapping the SILO API. + - It maps the request to a corresponding SILO query. +- **SILO API:** The query engine exposed as a web service. + - It accepts **SILO queries** and returns the results. A SILO query specifies + - a filter expression that determines which samples should be considered, + - an action that determines what kind of data should be returned (details, aggregated data, etc.).
+ - The SILO API regularly checks for new serialized states of the database (the output of the preprocessing) + and loads them into memory. +- **SILO Preprocessing:** A command line tool that preprocesses the data for SILO. + It builds a database from the input data and serializes it to disk. + - The SILO Preprocessing has to be started by the maintainer of the instance (or e.g. a cronjob). + It is not a continuously running process. + +![Runtime View](../../../plantuml/runtime-view.svg) diff --git a/lapis2-docs/src/content/docs/architecture-and-dev-docs/glossary.mdx b/lapis2-docs/src/content/docs/architecture-and-dev-docs/99-glossary.mdx similarity index 98% rename from lapis2-docs/src/content/docs/architecture-and-dev-docs/glossary.mdx rename to lapis2-docs/src/content/docs/architecture-and-dev-docs/99-glossary.mdx index 96d7ee902..9dfa3f7c1 100644 --- a/lapis2-docs/src/content/docs/architecture-and-dev-docs/glossary.mdx +++ b/lapis2-docs/src/content/docs/architecture-and-dev-docs/99-glossary.mdx @@ -13,7 +13,7 @@ description: Explanation of terms used in the context of LAPIS. | Segment | The genome of an organism may consist of multiple nucleotide sequence pieces. We call those pieces "segments". | | Variant | We follow a very open definition of variants. Every subset of sequences is considered a variant. A variant is specified by lineage/clade names and mutations. A variant does not need to be [monophyletic](https://en.wikipedia.org/wiki/Monophyly). | -### Mutations +## Mutations Mutations can occur either on nucleotide level or on amino acid level. For the nucleotides a single symbol can produce a mutation, whereas for the amino acids, @@ -22,7 +22,7 @@ some nucleotide mutations still produce the same amino acid The following explains the notations for mutations. -#### Amino acid +### Amino Acid Mutations The gene has to be provided for the AA mutation, since AAs only make sense within a gene. 
@@ -34,7 +34,7 @@ The gene has to be provided for the AA mutation, since AAs only make sense withi The origin AA symbol can be omitted, since it is clear from the reference genome. **Example: ORF_1a:1234S** -#### Nucleotide +### Nucleotide Mutations **Example: C1234T**. This translates to diff --git a/lapis2-docs/src/content/docs/architecture-and-dev-docs/architecture-and-constraints.mdx b/lapis2-docs/src/content/docs/architecture-and-dev-docs/architecture-and-constraints.mdx deleted file mode 100644 index 5380c94ae..000000000 --- a/lapis2-docs/src/content/docs/architecture-and-dev-docs/architecture-and-constraints.mdx +++ /dev/null @@ -1,21 +0,0 @@ ---- -title: Architecture and Constraints -description: Overview of the architecture and constraints of the software. ---- - -We identified the following constraints for our software: - -- Developed under an open-source licence. We chose the tooling such that a broad spectrum of developers can in principle - work on the software. -- The software is designed to be highly configurable so that it can be used for various organisms. - Configuration files have to be passed to LAPIS and SILO at runtime that determine the nature of the organism such as: - - a reference genome - - which metadata is available on the genomic data -- Input files for SILO are provided in a specific format: - - metadata and quality control - - sequence data (non-aligned nucleotide sequences, aligned nucleotide sequences, aligned AA sequences) - - a Pango lineage alias map (if available for the organism) -- LAPIS is backed by a database that understands SILO queries. -- The system is designed to have the best possible performance. - This mostly targets SILO, but also in LAPIS, - we have to keep in mind that we are dealing with potentially large data that we have to serve to the client. 
diff --git a/lapis2-docs/src/plantuml/.gitignore b/lapis2-docs/src/plantuml/.gitignore new file mode 100644 index 000000000..2945a1fe8 --- /dev/null +++ b/lapis2-docs/src/plantuml/.gitignore @@ -0,0 +1 @@ +plantuml.jar \ No newline at end of file diff --git a/lapis2-docs/src/plantuml/README.md b/lapis2-docs/src/plantuml/README.md new file mode 100644 index 000000000..015d50c81 --- /dev/null +++ b/lapis2-docs/src/plantuml/README.md @@ -0,0 +1,10 @@ +# Generating the PlantUML diagrams + +Download `plantuml.jar` from the following link and place it in this directory. + +<https://plantuml.com/download> + +Then run: +```bash +java -jar plantuml.jar -tsvg ./*.puml +``` diff --git a/lapis2-docs/src/plantuml/building-block-view.puml b/lapis2-docs/src/plantuml/building-block-view.puml new file mode 100644 index 000000000..8587ed083 --- /dev/null +++ b/lapis2-docs/src/plantuml/building-block-view.puml @@ -0,0 +1,16 @@ +@startuml + +node SILO { + package "SILO Api" { + component "Query Engine" as query + component "Runtime Database" as db + "Web API" -> query + query -> db + } + + package "SILO Preprocessing" { + component "Preprocessing Database" + } +} + +@enduml diff --git a/lapis2-docs/src/plantuml/building-block-view.svg b/lapis2-docs/src/plantuml/building-block-view.svg new file mode 100644 index 000000000..82044f9ba --- /dev/null +++ b/lapis2-docs/src/plantuml/building-block-view.svg @@ -0,0 +1 @@ +SILOSILO ApiSILO PreprocessingQuery EngineRuntime DatabaseWeb APIPreprocessing Database \ No newline at end of file diff --git a/lapis2-docs/src/plantuml/runtime-view.puml b/lapis2-docs/src/plantuml/runtime-view.puml new file mode 100644 index 000000000..89a7fa1b6 --- /dev/null +++ b/lapis2-docs/src/plantuml/runtime-view.puml @@ -0,0 +1,25 @@ +@startuml + +cloud { + file "input files" as input +} +folder "Folder with serialized states" as files { + file "oldstate.silo" + file "newstate.silo" +} +component "SILO Preprocessing" as siloPreprocessing #lightblue +package "SILO-LAPIS runtime" { + component "SILO Api"
as siloApi #lightblue + component LAPIS #lightblue +} +actor User + +User -> LAPIS : simplified query +User <- LAPIS : data +LAPIS -> siloApi : SILO query +LAPIS <- siloApi : data +siloApi <- files : reads newest serialized state +siloPreprocessing --> files : serializes state +input --> siloPreprocessing : reads externally provided input files + +@enduml diff --git a/lapis2-docs/src/plantuml/runtime-view.svg b/lapis2-docs/src/plantuml/runtime-view.svg new file mode 100644 index 000000000..74e28ab1f --- /dev/null +++ b/lapis2-docs/src/plantuml/runtime-view.svg @@ -0,0 +1 @@ +Folder with serialized statesSILO-LAPIS runtimeinput filesoldstate.silonewstate.siloSILO ApiLAPISSILO PreprocessingUsersimplified querydataSILO querydatareads newest serialized stateserializes statereads externally provided input files \ No newline at end of file diff --git a/lapis2-docs/tests/docs.spec.ts b/lapis2-docs/tests/docs.spec.ts index 39285f42c..9bc28c31c 100644 --- a/lapis2-docs/tests/docs.spec.ts +++ b/lapis2-docs/tests/docs.spec.ts @@ -17,6 +17,9 @@ const pages = [ 'Introduction and Goals', 'Architecture and Constraints', 'System Scope and Context', + 'Solution Strategy', + 'Building Block View', + 'Runtime View', 'Glossary', ];