Update to use generic-filehandle2 #107

Merged · 3 commits · Dec 12, 2024
8 changes: 0 additions & 8 deletions CHANGELOG.md
@@ -1,19 +1,11 @@
## [4.0.1](https://github.com/GMOD/bam-js/compare/v4.0.0...v4.0.1) (2024-11-12)



# [4.0.0](https://github.com/GMOD/bam-js/compare/v3.0.3...v4.0.0) (2024-11-12)



## [3.0.3](https://github.com/GMOD/bam-js/compare/v3.0.0...v3.0.3) (2024-11-11)



## [3.0.2](https://github.com/GMOD/bam-js/compare/v3.0.0...v3.0.2) (2024-11-11)



- republish v3.0.1 since it got tagged on a deleted branch

## [3.0.1](https://github.com/GMOD/bam-js/compare/v3.0.0...v3.0.1) (2024-11-11)
18 changes: 7 additions & 11 deletions README.md
@@ -26,10 +26,10 @@ var records = await t.getRecordsForRange('ctgA', 0, 50000)
```

The `bamPath` argument only works on nodejs. In the browser, you should pass
`bamFilehandle` with a generic-filehandle e.g. `RemoteFile`
`bamFilehandle` with a generic-filehandle2 e.g. `RemoteFile`

```typescript
const { RemoteFile } = require('generic-filehandle')
const { RemoteFile } = require('generic-filehandle2')
const bam = new BamFile({
bamFilehandle: new RemoteFile('yourfile.bam'), // or a full http url
baiFilehandle: new RemoteFile('yourfile.bam.bai'), // or a full http url
@@ -76,9 +76,10 @@ The BAM class constructor accepts arguments
yielding

Note: filehandles implement the Filehandle interface from
https://www.npmjs.com/package/generic-filehandle. This module offers the path
and url arguments as convenience methods for supplying the LocalFile and
RemoteFile
https://www.npmjs.com/package/generic-filehandle2.

This module offers the path and url arguments as convenience methods for
supplying the LocalFile and RemoteFile
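As an illustration of this note, a minimal sketch of two equivalent constructions (the `@gmod/bam` import path and the exact option pairing are assumptions based on the README text above, not a verbatim excerpt of the library):

```typescript
import { BamFile } from '@gmod/bam'
import { LocalFile } from 'generic-filehandle2'

// bamPath is shorthand for constructing a LocalFile yourself; RemoteFile
// plays the same role for URLs. This pairing is an assumption based on
// the note above, not library source.
const viaPath = new BamFile({ bamPath: 'yourfile.bam' })
const viaFilehandle = new BamFile({
  bamFilehandle: new LocalFile('yourfile.bam'),
  baiFilehandle: new LocalFile('yourfile.bam.bai'),
})
```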

### async getRecordsForRange(refName, start, end, opts)

@@ -112,7 +113,7 @@ for await (const chunk of file.streamRecordsForRange(
The `getRecordsForRange` simply wraps this process by concatenating chunks into
an array
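A minimal sketch of that wrapping, assuming `streamRecordsForRange` yields arrays of records as shown above (an illustration, not the library's actual internals):

```typescript
import { BamFile } from '@gmod/bam'

// Collect every chunk yielded by streamRecordsForRange into one flat
// array, which is what getRecordsForRange returns.
async function collectRecords(
  bam: BamFile,
  refName: string,
  start: number,
  end: number,
) {
  const records = []
  for await (const chunk of bam.streamRecordsForRange(refName, start, end)) {
    records.push(...chunk)
  }
  return records
}
```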

### async getHeader(opts: {....anything to pass to generic-filehandle opts})
### async getHeader(opts: {....anything to pass to generic-filehandle2 opts})

This obtains the header from `HtsgetFile` or `BamFile`. Retrieves BAM file and
BAI/CSI header if applicable, or API request for refnames from htsget
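A hedged usage sketch mirroring the quick-start example earlier in this README (the `@gmod/bam` import path is assumed to be the published package name):

```typescript
import { BamFile } from '@gmod/bam'

const bam = new BamFile({ bamPath: 'yourfile.bam' })

// the header (and BAI/CSI index header) is fetched once up front; any
// opts given here are forwarded to the underlying filehandle reads
await bam.getHeader()
const records = await bam.getRecordsForRange('ctgA', 0, 50000)
```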
@@ -157,11 +158,6 @@ feature.flags // flags
feature.template_length // TLEN
```

#### Note

The reason that we hide the data behind this ".get" function is that we lazily
decode records on demand, which can reduce memory consumption.

## License

MIT © [Colin Diesh](https://github.com/cmdcolin)
10 changes: 5 additions & 5 deletions package.json
@@ -23,8 +23,8 @@
"test": "vitest",
"lint": "eslint --report-unused-disable-directives --max-warnings 0",
"clean": "rimraf dist esm",
"build:esm": "tsc --target es2018 --outDir esm",
"build:es5": "tsc --target es2015 --module commonjs --outDir dist",
"build:esm": "tsc --outDir esm",
"build:es5": "tsc --module commonjs --outDir dist",
"build": "npm run build:esm && npm run build:es5",
"prebuild": "npm run clean && npm run lint",
"preversion": "npm run lint && npm test run && npm run build",
@@ -39,10 +39,9 @@
],
"dependencies": {
"@gmod/abortable-promise-cache": "^2.0.0",
"@gmod/bgzf-filehandle": "^1.4.4",
"buffer": "^6.0.3",
"@gmod/bgzf-filehandle": "^2.0.0",
"crc": "^4.3.2",
"generic-filehandle": "^3.0.0",
"generic-filehandle2": "^0.0.1",
"long": "^4.0.0",
"quick-lru": "^4.0.0"
},
@@ -52,6 +51,7 @@
"@typescript-eslint/eslint-plugin": "^8.1.0",
"@typescript-eslint/parser": "^8.1.0",
"@vitest/coverage-v8": "^2.0.5",
"buffer": "^6.0.3",
"eslint": "^9.9.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.1.3",
19 changes: 11 additions & 8 deletions src/bai.ts
@@ -34,15 +34,16 @@ export default class BAI extends IndexFile {
}

// fetch and parse the index
async _parse(opts?: BaseOpts) {
const bytes = (await this.filehandle.readFile(opts)) as Buffer
async _parse(_opts?: BaseOpts) {
const bytes = await this.filehandle.readFile()
const dataView = new DataView(bytes.buffer)

// check BAI magic numbers
if (bytes.readUInt32LE(0) !== BAI_MAGIC) {
if (dataView.getUint32(0, true) !== BAI_MAGIC) {
throw new Error('Not a BAI file')
}

const refCount = bytes.readInt32LE(4)
const refCount = dataView.getInt32(4, true)
const depth = 5
const binLimit = ((1 << ((depth + 1) * 3)) - 1) / 7

@@ -57,16 +58,18 @@ export default class BAI extends IndexFile {
linearIndex: LinearIndex
stats?: { lineCount: number }
}>(refCount)

for (let i = 0; i < refCount; i++) {
// the binning index
const binCount = bytes.readInt32LE(curr)

const binCount = dataView.getInt32(curr, true)
let stats

curr += 4
const binIndex: Record<number, Chunk[]> = {}

for (let j = 0; j < binCount; j += 1) {
const bin = bytes.readUInt32LE(curr)
const bin = dataView.getUint32(curr, true)
curr += 4
if (bin === binLimit + 1) {
curr += 4
@@ -75,7 +78,7 @@
} else if (bin > binLimit + 1) {
throw new Error('bai index contains too many bins, please use CSI')
} else {
const chunkCount = bytes.readInt32LE(curr)
const chunkCount = dataView.getInt32(curr, true)
curr += 4
const chunks = new Array<Chunk>(chunkCount)
for (let k = 0; k < chunkCount; k++) {
@@ -90,7 +93,7 @@
}
}

const linearCount = bytes.readInt32LE(curr)
const linearCount = dataView.getInt32(curr, true)
curr += 4
// as we're going through the linear index, figure out the smallest
// virtual offset in the indexes, which tells us where the BAM header
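The recurring change in this file swaps Node `Buffer` accessors (`readInt32LE`, `readUInt32LE`) for a `DataView`, which works on the plain `Uint8Array`s that generic-filehandle2 returns. A standalone sketch of the idiom (the explicit `byteOffset`/`byteLength` arguments are a defensive addition, not part of the diff, which views `bytes.buffer` from offset 0):

```typescript
// Little-endian integer reads over a Uint8Array, no Buffer required.
// Passing byteOffset/byteLength keeps this correct even when `bytes`
// is a subarray view into a larger ArrayBuffer.
function readInt32LE(bytes: Uint8Array, offset: number): number {
  const dv = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
  return dv.getInt32(offset, true) // true = little-endian
}

function readUint32LE(bytes: Uint8Array, offset: number): number {
  const dv = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
  return dv.getUint32(offset, true)
}
```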
63 changes: 22 additions & 41 deletions src/bamFile.ts
@@ -1,7 +1,6 @@
import { Buffer } from 'buffer'
import crc32 from 'crc/crc32'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle'
import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle2'
import AbortablePromiseCache from '@gmod/abortable-promise-cache'
import QuickLRU from 'quick-lru'

@@ -148,23 +147,21 @@ export default class BamFile {
let buffer
if (ret) {
const s = ret + blockLen
const res = await this.bam.read(Buffer.alloc(s), 0, s, 0, opts)
if (!res.bytesRead) {
throw new Error('Error reading header')
}
buffer = res.buffer.subarray(0, Math.min(res.bytesRead, ret))
buffer = await this.bam.read(s, 0)
} else {
buffer = await this.bam.readFile(opts)
}

const uncba = await unzip(buffer)
const dataView = new DataView(uncba.buffer)

if (uncba.readInt32LE(0) !== BAM_MAGIC) {
if (dataView.getInt32(0, true) !== BAM_MAGIC) {
throw new Error('Not a BAM file')
}
const headLen = uncba.readInt32LE(4)
const headLen = dataView.getInt32(4, true)

this.header = uncba.toString('utf8', 8, 8 + headLen)
const decoder = new TextDecoder('utf8')
this.header = decoder.decode(uncba.subarray(8, 8 + headLen))
const { chrToIndex, indexToChr } = await this._readRefSeqs(
headLen + 8,
65535,
@@ -204,30 +201,21 @@
if (start > refSeqBytes) {
return this._readRefSeqs(start, refSeqBytes * 2, opts)
}
const size = refSeqBytes + blockLen
const { bytesRead, buffer } = await this.bam.read(
Buffer.alloc(size),
0,
refSeqBytes,
0,
opts,
)
if (!bytesRead) {
throw new Error('Error reading refseqs from header')
}
const uncba = await unzip(
buffer.subarray(0, Math.min(bytesRead, refSeqBytes)),
)
const nRef = uncba.readInt32LE(start)
// const size = refSeqBytes + blockLen <-- use this?
const buffer = await this.bam.read(refSeqBytes, 0, opts)
const uncba = await unzip(buffer)
const dataView = new DataView(uncba.buffer)
const nRef = dataView.getInt32(start, true)
let p = start + 4
const chrToIndex: Record<string, number> = {}
const indexToChr: { refName: string; length: number }[] = []
const decoder = new TextDecoder('utf8')
for (let i = 0; i < nRef; i += 1) {
const lName = uncba.readInt32LE(p)
const lName = dataView.getInt32(p, true)
const refName = this.renameRefSeq(
uncba.toString('utf8', p + 4, p + 4 + lName - 1),
decoder.decode(uncba.subarray(p + 4, p + 4 + lName - 1)),
)
const lRef = uncba.readInt32LE(p + lName + 4)
const lRef = dataView.getInt32(p + lName + 4, true)

chrToIndex[refName] = i
indexToChr.push({ refName, length: lRef })
@@ -388,15 +376,7 @@
}

async _readRegion(position: number, size: number, opts: BaseOpts = {}) {
const { bytesRead, buffer } = await this.bam.read(
Buffer.alloc(size),
0,
size,
position,
opts,
)

return buffer.subarray(0, Math.min(bytesRead, size))
return this.bam.read(size, position, opts)
}

async _readChunk({ chunk, opts }: { chunk: Chunk; opts: BaseOpts }) {
@@ -415,7 +395,7 @@
}

async readBamFeatures(
ba: Buffer,
ba: Uint8Array,
cpositions: number[],
dpositions: number[],
chunk: Chunk,
@@ -425,8 +405,9 @@
let pos = 0
let last = +Date.now()

const dataView = new DataView(ba.buffer)
while (blockStart + 4 < ba.length) {
const blockSize = ba.readInt32LE(blockStart)
const blockSize = dataView.getInt32(blockStart, true)
const blockEnd = blockStart + 4 + blockSize - 1

// increment position to the current decompressed status
@@ -471,8 +452,8 @@
chunk.minv.dataPosition +
1
: // must be slice, not subarray for buffer polyfill on web
// eslint-disable-next-line @typescript-eslint/no-deprecated
crc32.signed(ba.slice(blockStart, blockEnd)),
// @ts-expect-error
crc32.signed(ba.subarray(blockStart, blockEnd)),
})

sink.push(feature)
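Two other migrations recur in this file: generic-filehandle2's `read(length, position)` returns the requested bytes directly instead of filling a caller-allocated `Buffer`, and `TextDecoder` replaces `Buffer#toString('utf8', ...)`. A hedged before/after sketch based on the hunks above:

```typescript
import { LocalFile } from 'generic-filehandle2'

const fh = new LocalFile('yourfile.bam')

// old (generic-filehandle v3): caller allocates, then trims to bytesRead
//   const { bytesRead, buffer } = await fh.read(Buffer.alloc(size), 0, size, position)
//   const bytes = buffer.subarray(0, bytesRead)

// new (generic-filehandle2, as used throughout this diff): the region
// comes back as a Uint8Array in one call
const bytes = await fh.read(1024, 0)

// decoding a UTF-8 slice without Buffer#toString
const text = new TextDecoder('utf8').decode(bytes.subarray(0, 100))
console.log(text.length)
```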
2 changes: 1 addition & 1 deletion src/chunk.ts
@@ -2,7 +2,7 @@ import VirtualOffset from './virtualOffset'

// little class representing a chunk in the index
export default class Chunk {
public buffer?: Buffer
public buffer?: Uint8Array

constructor(
public minv: VirtualOffset,
41 changes: 22 additions & 19 deletions src/csi.ts
@@ -37,8 +37,9 @@ export default class CSI extends IndexFile {
return []
}

parseAuxData(bytes: Buffer, offset: number) {
const formatFlags = bytes.readInt32LE(offset)
parseAuxData(bytes: Uint8Array, offset: number) {
const dataView = new DataView(bytes.buffer)
const formatFlags = dataView.getUint32(offset, true)
const coordinateType =
formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
const format = (
@@ -48,14 +49,14 @@
throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
}
const columnNumbers = {
ref: bytes.readInt32LE(offset + 4),
start: bytes.readInt32LE(offset + 8),
end: bytes.readInt32LE(offset + 12),
ref: dataView.getInt32(offset + 4, true),
start: dataView.getInt32(offset + 8, true),
end: dataView.getInt32(offset + 12, true),
}
const metaValue = bytes.readInt32LE(offset + 16)
const metaValue = dataView.getInt32(offset + 16, true)
const metaChar = metaValue ? String.fromCharCode(metaValue) : ''
const skipLines = bytes.readInt32LE(offset + 20)
const nameSectionLength = bytes.readInt32LE(offset + 24)
const skipLines = dataView.getInt32(offset + 20, true)
const nameSectionLength = dataView.getInt32(offset + 24, true)

return {
columnNumbers,
Expand All @@ -77,23 +78,25 @@ export default class CSI extends IndexFile {
const buffer = await this.filehandle.readFile(opts)
const bytes = await unzip(buffer)

const dataView = new DataView(bytes.buffer)
let csiVersion
// check TBI magic numbers
if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
const magic = dataView.getUint32(0, true)

if (magic === CSI1_MAGIC) {
csiVersion = 1
} else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
} else if (magic === CSI2_MAGIC) {
csiVersion = 2
} else {
throw new Error('Not a CSI file')
throw new Error(`Not a CSI file ${magic}`)
// TODO: do we need to support big-endian CSI files?
}

this.minShift = bytes.readInt32LE(4)
this.depth = bytes.readInt32LE(8)
this.minShift = dataView.getInt32(4, true)
this.depth = dataView.getInt32(8, true)
this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
const auxLength = bytes.readInt32LE(12)
const auxLength = dataView.getInt32(12, true)
const aux = auxLength >= 30 ? this.parseAuxData(bytes, 16) : undefined
const refCount = bytes.readInt32LE(16 + auxLength)
const refCount = dataView.getInt32(16 + auxLength, true)

type BinIndex = Record<string, Chunk[]>

@@ -106,20 +109,20 @@
}>(refCount)
for (let i = 0; i < refCount; i++) {
// the binning index
const binCount = bytes.readInt32LE(curr)
const binCount = dataView.getInt32(curr, true)
curr += 4
const binIndex: Record<string, Chunk[]> = {}
let stats // < provided by parsing a pseudo-bin, if present
for (let j = 0; j < binCount; j++) {
const bin = bytes.readUInt32LE(curr)
const bin = dataView.getUint32(curr, true)
curr += 4
if (bin > this.maxBinNumber) {
stats = parsePseudoBin(bytes, curr + 28)
curr += 28 + 16
} else {
firstDataLine = findFirstData(firstDataLine, fromBytes(bytes, curr))
curr += 8
const chunkCount = bytes.readInt32LE(curr)
const chunkCount = dataView.getInt32(curr, true)
curr += 4
const chunks = new Array<Chunk>(chunkCount)
for (let k = 0; k < chunkCount; k += 1) {