Skip to content

Commit

Permalink
feat(lapis): make it possible to fetch sequences in JSON and NDJSON f…
Browse files Browse the repository at this point in the history
…ormat

resolves #971
  • Loading branch information
fengelniederhammer committed Oct 14, 2024
1 parent 0c16c92 commit d6c33cb
Show file tree
Hide file tree
Showing 23 changed files with 1,026 additions and 487 deletions.
9 changes: 8 additions & 1 deletion lapis-docs/src/content/docs/concepts/response-format.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@ A '200' response code indicates a successful request, while any other code signi
Endpoints typically support returning data in JSON, [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) or
[TSV](https://en.wikipedia.org/wiki/Tab-separated_values), with JSON being the default.
Genomic sequences (such as those from `unalignedNucleotideSequences`, `alignedAminoAcidSequences`, etc.) are provided in the
[FASTA format](https://en.wikipedia.org/wiki/FASTA_format).
[FASTA format](https://en.wikipedia.org/wiki/FASTA_format) by default,
but can also be requested in JSON or [NDJSON](https://github.com/ndjson/ndjson-spec) format.

:::note
NDJSON is particularly useful for downloading large datasets, because it can be processed as a stream:
you can read the response line by line without loading the entire file into memory.
Since every line is a valid JSON object, it is also usually easier to handle programmatically than FASTA.
:::

## Example

Expand Down
87 changes: 35 additions & 52 deletions lapis-e2e/test/alignedNucleotideSequence.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,117 +10,100 @@ import {
describe('The /alignedNucleotideSequence endpoint', () => {
it('should return aligned nucleotide sequences for Switzerland', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: { country: 'Switzerland' },
nucleotideSequenceRequest: { country: 'Switzerland', dataFormat: 'JSON' },
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_3259931');
expect(sequences[0]).to.have.length(29903);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_3259931');
expect(result[0].main).to.have.length(29903);
});

it('should return aligned nucleotide sequences for multi segmented sequences', async () => {
const result = await lapisMultiSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: { country: 'Switzerland' },
nucleotideSequenceRequest: { country: 'Switzerland', dataFormat: 'JSON' },
segment: 'M',
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(6);
expect(sequences).to.have.length(6);
expect(primaryKeys[0]).to.equal('>key_5');
expect(sequences[0]).to.equal('TGGG');
expect(result).to.have.length(6);
expect(result[0].primaryKey).to.equal('key_5');
expect(result[0].m).to.equal('TGGG');
});

it('should order ascending by specified fields', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'ascending' }] },
nucleotideSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_1001493');
expect(sequences[0]).to.have.length(29903);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_1001493');
expect(result[0].main).to.have.length(29903);
});

it('should order descending by specified fields', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'descending' }] },
nucleotideSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'descending' }],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_931279');
expect(sequences[0]).to.have.length(29903);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_931279');
expect(result[0].main).to.have.length(29903);
});

it('should apply limit and offset', async () => {
const resultWithLimit = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
limit: 2,
dataFormat: 'JSON',
},
});

const { primaryKeys: primaryKeysWithLimit, sequences: sequencesWithLimit } =
sequenceData(resultWithLimit);

expect(primaryKeysWithLimit).to.have.length(2);
expect(sequencesWithLimit).to.have.length(2);
expect(primaryKeysWithLimit[0]).to.equal('>key_1001493');
expect(sequencesWithLimit[0]).to.have.length(29903);
expect(resultWithLimit).to.have.length(2);
expect(resultWithLimit[0].primaryKey).to.equal('key_1001493');
expect(resultWithLimit[0].main).to.have.length(29903);

const resultWithLimitAndOffset =
await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
limit: 2,
offset: 1,
dataFormat: 'JSON',
},
});

const { primaryKeys: primaryKeysWithLimitAndOffset, sequences: sequencesWithLimitAndOffset } =
sequenceData(resultWithLimitAndOffset);

expect(primaryKeysWithLimitAndOffset).to.have.length(2);
expect(sequencesWithLimitAndOffset).to.have.length(2);
expect(primaryKeysWithLimitAndOffset[0]).to.equal(primaryKeysWithLimit[1]);
expect(sequencesWithLimitAndOffset[0]).to.equal(sequencesWithLimit[1]);
expect(resultWithLimitAndOffset).to.have.length(2);
expect(resultWithLimitAndOffset[0]).to.deep.equal(resultWithLimit[1]);
});

it('should correctly handle nucleotide insertion requests', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: {
nucleotideInsertions: ['ins_25701:CC?', 'ins_5959:?AT'],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(1);
expect(sequences).to.have.length(1);
expect(primaryKeys[0]).to.equal('>key_3578231');
expect(result).to.have.length(1);
expect(result[0].primaryKey).to.equal('key_3578231');
});

it('should correctly handle amino acid insertion requests', async () => {
const result = await lapisSingleSegmentedSequenceController.postAlignedNucleotideSequence({
nucleotideSequenceRequest: {
aminoAcidInsertions: ['ins_S:143:T', 'ins_ORF1a:3602:F?P'],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(1);
expect(sequences).to.have.length(1);
expect(primaryKeys[0]).to.equal('>key_3259931');
expect(result).to.have.length(1);
expect(result[0].primaryKey).to.equal('key_3259931');
});

it('should return an empty zstd compressed file', async () => {
Expand Down
77 changes: 30 additions & 47 deletions lapis-e2e/test/aminoAcidSequence.spec.ts
Original file line number Diff line number Diff line change
@@ -1,47 +1,41 @@
import { expect } from 'chai';
import { lapisClient, sequenceData } from './common';
import { lapisClient } from './common';

describe('The /alignedAminoAcidSequence endpoint', () => {
it('should return amino acid sequences for Switzerland', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: { country: 'Switzerland' },
aminoAcidSequenceRequest: { country: 'Switzerland', dataFormat: 'JSON' },
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_3259931');
expect(sequences[0]).to.have.length(1274);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_3259931');
expect(result[0].s).to.have.length(1274);
});

it('should order ascending by specified fields', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'ascending' }] },
aminoAcidSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'ascending' }], dataFormat: 'JSON' },
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_1001493');
expect(sequences[0]).to.have.length(1274);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_1001493');
expect(result[0].s).to.have.length(1274);
});

it('should order descending by specified fields', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: { orderBy: [{ field: 'primaryKey', type: 'descending' }] },
aminoAcidSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'descending' }],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
expect(primaryKeys[0]).to.equal('>key_931279');
expect(sequences[0]).to.have.length(1274);
expect(result).to.have.length(100);
expect(result[0].primaryKey).to.equal('key_931279');
expect(result[0].s).to.have.length(1274);
});

it('should apply limit and offset', async () => {
Expand All @@ -50,63 +44,52 @@ describe('The /alignedAminoAcidSequence endpoint', () => {
aminoAcidSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
limit: 2,
dataFormat: 'JSON',
},
});

const { primaryKeys: primaryKeysWithLimit, sequences: sequencesWithLimit } =
sequenceData(resultWithLimit);

expect(primaryKeysWithLimit).to.have.length(2);
expect(sequencesWithLimit).to.have.length(2);
expect(primaryKeysWithLimit[0]).to.equal('>key_1001493');
expect(sequencesWithLimit[0]).to.have.length(1274);
expect(resultWithLimit).to.have.length(2);
expect(resultWithLimit[0].primaryKey).to.equal('key_1001493');
expect(resultWithLimit[0].s).to.have.length(1274);

const resultWithLimitAndOffset = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: {
orderBy: [{ field: 'primaryKey', type: 'ascending' }],
limit: 2,
offset: 1,
dataFormat: 'JSON',
},
});

const { primaryKeys: primaryKeysWithLimitAndOffset, sequences: sequencesWithLimitAndOffset } =
sequenceData(resultWithLimitAndOffset);

expect(primaryKeysWithLimitAndOffset).to.have.length(2);
expect(sequencesWithLimitAndOffset).to.have.length(2);
expect(primaryKeysWithLimitAndOffset[0]).to.equal(primaryKeysWithLimit[1]);
expect(sequencesWithLimitAndOffset[0]).to.equal(sequencesWithLimit[1]);
expect(resultWithLimitAndOffset).to.have.length(2);
expect(resultWithLimitAndOffset[0]).to.deep.equal(resultWithLimit[1]);
});

it('should correctly handle nucleotide insertion requests', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: {
nucleotideInsertions: ['ins_25701:CC?', 'ins_5959:?AT'],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(1);
expect(sequences).to.have.length(1);
expect(primaryKeys[0]).to.equal('>key_3578231');
expect(sequences[0]).to.have.length(1274);
expect(result).to.have.length(1);
expect(result[0].primaryKey).to.equal('key_3578231');
expect(result[0].s).to.have.length(1274);
});

it('should correctly handle amino acid insertion requests', async () => {
const result = await lapisClient.postAlignedAminoAcidSequence({
gene: 'S',
aminoAcidSequenceRequest: {
aminoAcidInsertions: ['ins_S:143:T', 'ins_ORF1a:3602:F?P'],
dataFormat: 'JSON',
},
});

const { primaryKeys, sequences } = sequenceData(result);

expect(primaryKeys).to.have.length(1);
expect(sequences).to.have.length(1);
expect(primaryKeys[0]).to.equal('>key_3259931');
expect(result).to.have.length(1);
expect(result[0].primaryKey).to.equal('key_3259931');
});
});
22 changes: 20 additions & 2 deletions lapis-e2e/test/common.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { expect } from 'chai';
import { basePath, expectIsGzipEncoded, expectIsZstdEncoded } from './common';
import { basePath, expectIsGzipEncoded, expectIsZstdEncoded, sequenceData } from './common';

const routes = [
{ pathSegment: '/aggregated', servesFasta: false, expectedDownloadFilename: 'aggregated.json' },
Expand Down Expand Up @@ -83,7 +83,25 @@ describe('All endpoints', () => {
expect(response.headers.get('lapis-data-version')).to.match(/\d{10}/);
});

if (!route.servesFasta) {
if (route.servesFasta) {
it('should return sequences in fasta format', async () => {
const response = await get(new URLSearchParams({ dataFormat: 'fasta' }));

const { primaryKeys, sequences } = sequenceData(await response.text());

expect(primaryKeys).to.have.length(100);
expect(sequences).to.have.length(100);
});

it('should return sequences in ndjson format', async () => {
const response = await get(new URLSearchParams({ dataFormat: 'ndjson' }));

const lines = (await response.text()).split('\n').filter(line => line.length > 0);

expect(lines).to.have.length(100);
expect(JSON.parse(lines[0])).to.have.property('primaryKey');
});
} else {
it('should return the lapis data version header for CSV data', async () => {
const response = await get(new URLSearchParams({ dataFormat: 'csv' }));

Expand Down
Loading

0 comments on commit d6c33cb

Please sign in to comment.