Skip to content

Commit

Permalink
feat(lapis): make it possible to fetch sequences in JSON and NDJSON format
Browse files Browse the repository at this point in the history

resolves #971
  • Loading branch information
fengelniederhammer committed Oct 15, 2024
1 parent c89ef6c commit ba18956
Show file tree
Hide file tree
Showing 24 changed files with 1,026 additions and 487 deletions.
9 changes: 8 additions & 1 deletion lapis-docs/src/content/docs/concepts/response-format.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@ A '200' response code indicates a successful request, while any other code signi
Endpoints typically support returning data in JSON, [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) or
[TSV](https://en.wikipedia.org/wiki/Tab-separated_values), with JSON being the default.
Genomic sequences (such as those from `unalignedNucleotideSequences`, `alignedAminoAcidSequences`, etc.) are provided in the
[FASTA format](https://en.wikipedia.org/wiki/FASTA_format).
[FASTA format](https://en.wikipedia.org/wiki/FASTA_format) by default,
but can also be requested in JSON or [NDJSON](https://github.com/ndjson/ndjson-spec) format.

:::note
NDJSON is particularly useful for downloading large datasets because it allows the data to be processed as a stream.
You can read the response line by line without loading the entire file into memory.
Since every line is a valid JSON object, it is usually easier to handle programmatically than FASTA.
:::

## Example

Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
87 changes: 35 additions & 52 deletions lapis-e2e/test/alignedNucleotideSequence.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,117 +10,100 @@ import {
describe('The /alignedNucleotideSequence endpoint', () => {
it('should return aligned nucleotide sequences for Switzerland', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: { country: 'Switzerland' },
nucleotideSequenceRequest: { country: 'Switzerland', dataFormat: 'JSON' },
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_3259931');
expect(sequences[0]).to.have.length(29903);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_3259931');
expect(result[0].main).to.have.length(29903);
});

it('should return aligned nucleotide sequences for multi segmented sequences', async () => {
const result = await lapisMultiSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: { country: 'Switzerland' },
nucleotideSequenceRequest: { country: 'Switzerland', dataFormat: 'JSON' },
segment: 'M',
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(6);
expect(sequences).to.have.length(6);
expect(primaryKeys[0]).to.equal('>key_5');
expect(sequences[0]).to.equal('TGGG');
expect(result).to.have.length(6);
expect(result[0].primaryKey).to.equal('key_5');
expect(result[0].m).to.equal('TGGG');
});

it('should order ascending by specified fields', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'ascending' }] },
nucleotideSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_1001493');
expect(sequences[0]).to.have.length(29903);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_1001493');
expect(result[0].main).to.have.length(29903);
});

it('should order descending by specified fields', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'descending' }] },
nucleotideSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'descending' }],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_931279');
expect(sequences[0]).to.have.length(29903);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_931279');
expect(result[0].main).to.have.length(29903);
});

it('should apply limit and offset', async () => {
const resultWithLimit = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
limit: 2,
dataFormat: 'JSON',
},
});

const { primaryKeys: primaryKeysWithLimit, sequences: sequencesWithLimit } =
sequenceData(resultWithLimit);

expect(primaryKeysWithLimit).to.have.length(2);
expect(sequencesWithLimit).to.have.length(2);
expect(primaryKeysWithLimit[0]).to.equal('>key_1001493');
expect(sequencesWithLimit[0]).to.have.length(29903);
expect(resultWithLimit).to.have.length(2);
expect(resultWithLimit[0].primaryKey).to.equal('key_1001493');
expect(resultWithLimit[0].main).to.have.length(29903);

const resultWithLimitAndOffset =
await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
limit: 2,
offset: 1,
dataFormat: 'JSON',
},
});

const { primaryKeys: primaryKeysWithLimitAndOffset, sequences: sequencesWithLimitAndOffset } =
sequenceData(resultWithLimitAndOffset);

expect(primaryKeysWithLimitAndOffset).to.have.length(2);
expect(sequencesWithLimitAndOffset).to.have.length(2);
expect(primaryKeysWithLimitAndOffset[0]).to.equal(primaryKeysWithLimit[1]);
expect(sequencesWithLimitAndOffset[0]).to.equal(sequencesWithLimit[1]);
expect(resultWithLimitAndOffset).to.have.length(2);
expect(resultWithLimitAndOffset[0]).to.deep.equal(resultWithLimit[1]);
});

it('should correctly handle nucleotide insertion requests', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: {
nucleotideInsertions: ['ins_25701:CC?', 'ins_5959:?AT'],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(1);
expect(sequences).to.have.length(1);
expect(primaryKeys[0]).to.equal('>key_3578231');
expect(result).to.have.length(1);
expect(result[0].primaryKey).to.equal('key_3578231');
});

it('should correctly handle amino acid insertion requests', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: {
aminoAcidInsertions: ['ins_S:143:T', 'ins_ORF1a:3602:F?P'],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(1);
expect(sequences).to.have.length(1);
expect(primaryKeys[0]).to.equal('>key_3259931');
expect(result).to.have.length(1);
expect(result[0].primaryKey).to.equal('key_3259931');
});

it('should return an empty zstd compressed file', async () => {
Expand Down
77 changes: 30 additions & 47 deletions lapis-e2e/test/aminoAcidSequence.spec.ts
Original file line number Diff line number Diff line change
@@ -1,47 +1,41 @@
import { expect } from 'chai';
import { lapisClient, sequenceData } from './common';
import { lapisClient } from './common';

describe('The /alignedAminoAcidSequence endpoint', () => {
it('should return amino acid sequences for Switzerland', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: { country: 'Switzerland' },
aminoAcidSequenceRequest: { country: 'Switzerland', dataFormat: 'JSON' },
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_3259931');
expect(sequences[0]).to.have.length(1274);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_3259931');
expect(result[0].s).to.have.length(1274);
});

it('should order ascending by specified fields', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'ascending' }] },
aminoAcidSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'ascending' }], dataFormat: 'JSON' },
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_1001493');
expect(sequences[0]).to.have.length(1274);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_1001493');
expect(result[0].s).to.have.length(1274);
});

it('should order descending by specified fields', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'descending' }] },
aminoAcidSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'descending' }],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_931279');
expect(sequences[0]).to.have.length(1274);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_931279');
expect(result[0].s).to.have.length(1274);
});

it('should apply limit and offset', async () => {
Expand All @@ -50,63 +44,52 @@ describe('The /alignedAminoAcidSequence endpoint', () => {
aminoAcidSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
limit: 2,
dataFormat: 'JSON',
},
});

const { primaryKeys: primaryKeysWithLimit, sequences: sequencesWithLimit } =
sequenceData(resultWithLimit);

expect(primaryKeysWithLimit).to.have.length(2);
expect(sequencesWithLimit).to.have.length(2);
expect(primaryKeysWithLimit[0]).to.equal('>key_1001493');
expect(sequencesWithLimit[0]).to.have.length(1274);
expect(resultWithLimit).to.have.length(2);
expect(resultWithLimit[0].primaryKey).to.equal('key_1001493');
expect(resultWithLimit[0].s).to.have.length(1274);

const resultWithLimitAndOffset = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
limit: 2,
offset: 1,
dataFormat: 'JSON',
},
});

const { primaryKeys: primaryKeysWithLimitAndOffset, sequences: sequencesWithLimitAndOffset } =
sequenceData(resultWithLimitAndOffset);

expect(primaryKeysWithLimitAndOffset).to.have.length(2);
expect(sequencesWithLimitAndOffset).to.have.length(2);
expect(primaryKeysWithLimitAndOffset[0]).to.equal(primaryKeysWithLimit[1]);
expect(sequencesWithLimitAndOffset[0]).to.equal(sequencesWithLimit[1]);
expect(resultWithLimitAndOffset).to.have.length(2);
expect(resultWithLimitAndOffset[0]).to.deep.equal(resultWithLimit[1]);
});

it('should correctly handle nucleotide insertion requests', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: {
nucleotideInsertions: ['ins_25701:CC?', 'ins_5959:?AT'],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(1);
expect(sequences).to.have.length(1);
expect(primaryKeys[0]).to.equal('>key_3578231');
expect(sequences[0]).to.have.length(1274);
expect(result).to.have.length(1);
expect(result[0].primaryKey).to.equal('key_3578231');
expect(result[0].s).to.have.length(1274);
});

it('should correctly handle amino acid insertion requests', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: {
aminoAcidInsertions: ['ins_S:143:T', 'ins_ORF1a:3602:F?P'],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(1);
expect(sequences).to.have.length(1);
expect(primaryKeys[0]).to.equal('>key_3259931');
expect(result).to.have.length(1);
expect(result[0].primaryKey).to.equal('key_3259931');
});
});
22 changes: 20 additions & 2 deletions lapis-e2e/test/common.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { expect } from 'chai';
import { basePath, expectIsGzipEncoded, expectIsZstdEncoded } from './common';
import { basePath, expectIsGzipEncoded, expectIsZstdEncoded, sequenceData } from './common';

const routes = [
{ pathSegment: '/aggregated', servesFasta: false, expectedDownloadFilename: 'aggregated.json' },
Expand Down Expand Up @@ -83,7 +83,25 @@ describe('All endpoints', () => {
expect(response.headers.get('lapis-data-version')).to.match(/\d{10}/);
});

if (!route.servesFasta) {
if (route.servesFasta) {
// Requests the route with `dataFormat=fasta` and checks that the parsed body
// yields 100 primary keys and 100 sequences.
// NOTE(review): `get` and `sequenceData` are defined outside this hunk; `get` presumably
// queries the route under test with the given query parameters — confirm.
it('should return sequences in fasta format', async () => {
    const fastaQuery = new URLSearchParams({ dataFormat: 'fasta' });
    const fastaResponse = await get(fastaQuery);
    const fastaBody = await fastaResponse.text();

    const parsed = sequenceData(fastaBody);

    expect(parsed.primaryKeys).to.have.length(100);
    expect(parsed.sequences).to.have.length(100);
});

// Requests the route with `dataFormat=ndjson` and checks that the body holds 100
// non-empty lines, the first of which parses to an object with a `primaryKey` property.
// NOTE(review): `get` is defined outside this hunk; presumably it queries the route
// under test with the given query parameters — confirm.
it('should return sequences in ndjson format', async () => {
    const ndjsonQuery = new URLSearchParams({ dataFormat: 'ndjson' });
    const ndjsonResponse = await get(ndjsonQuery);
    const body = await ndjsonResponse.text();

    // Splitting on '\n' can produce empty strings (e.g. after a trailing newline); drop them.
    const records = body.split('\n').filter((record) => record !== '');

    expect(records).to.have.length(100);
    expect(JSON.parse(records[0])).to.have.property('primaryKey');
});
} else {
it('should return the lapis data version header for CSV data', async () => {
const response = await get(new URLSearchParams({ dataFormat: 'csv' }));

Expand Down
Loading

0 comments on commit ba18956

Please sign in to comment.