Update to use generic-filehandle2 (#107)

cmdcolin authored Dec 12, 2024
1 parent 8a312e0 commit d92f513
Showing 16 changed files with 514 additions and 558 deletions.
8 changes: 0 additions & 8 deletions CHANGELOG.md
@@ -1,19 +1,11 @@
## [4.0.1](https://github.com/GMOD/bam-js/compare/v4.0.0...v4.0.1) (2024-11-12)



# [4.0.0](https://github.com/GMOD/bam-js/compare/v3.0.3...v4.0.0) (2024-11-12)



## [3.0.3](https://github.com/GMOD/bam-js/compare/v3.0.0...v3.0.3) (2024-11-11)



## [3.0.2](https://github.com/GMOD/bam-js/compare/v3.0.0...v3.0.2) (2024-11-11)



- republish v3.0.1 since it got tagged on a deleted branch

## [3.0.1](https://github.com/GMOD/bam-js/compare/v3.0.0...v3.0.1) (2024-11-11)
18 changes: 7 additions & 11 deletions README.md
@@ -26,10 +26,10 @@ var records = await t.getRecordsForRange('ctgA', 0, 50000)
```

The `bamPath` argument only works on nodejs. In the browser, you should pass
`bamFilehandle` with a generic-filehandle e.g. `RemoteFile`
`bamFilehandle` with a generic-filehandle2 e.g. `RemoteFile`

```typescript
const { RemoteFile } = require('generic-filehandle')
const { RemoteFile } = require('generic-filehandle2')
const bam = new BamFile({
bamFilehandle: new RemoteFile('yourfile.bam'), // or a full http url
baiFilehandle: new RemoteFile('yourfile.bam.bai'), // or a full http url
@@ -76,9 +76,10 @@ The BAM class constructor accepts arguments
yielding

Note: filehandles implement the Filehandle interface from
https://www.npmjs.com/package/generic-filehandle. This module offers the path
and url arguments as convenience methods for supplying the LocalFile and
RemoteFile
https://www.npmjs.com/package/generic-filehandle2.

This module offers the path and url arguments as convenience methods for
supplying the LocalFile and RemoteFile
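
As a hedged sketch of that convenience (assuming the `@gmod/bam` package name
and that generic-filehandle2 exports `LocalFile` like its predecessor did):

```typescript
import { BamFile } from '@gmod/bam'
import { LocalFile } from 'generic-filehandle2'

// On nodejs these two constructions are intended to be equivalent:
const viaPath = new BamFile({ bamPath: 'yourfile.bam' })
const viaHandle = new BamFile({
  bamFilehandle: new LocalFile('yourfile.bam'),
})
```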

### async getRecordsForRange(refName, start, end, opts)

@@ -112,7 +113,7 @@ for await (const chunk of file.streamRecordsForRange(
The `getRecordsForRange` simply wraps this process by concatenating chunks into
an array
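
For illustration, a minimal sketch of that equivalence, reusing the `file`
handle and range from the streaming snippet above:

```typescript
// Collecting the streamed chunks by hand yields the same records that
// getRecordsForRange would return for the same range.
const records = []
for await (const chunk of file.streamRecordsForRange('ctgA', 0, 50000)) {
  records.push(...chunk)
}
```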

### async getHeader(opts: {....anything to pass to generic-filehandle opts})
### async getHeader(opts: {....anything to pass to generic-filehandle2 opts})

This obtains the header from `HtsgetFile` or `BamFile`. Retrieves BAM file and
BAI/CSI header if applicable, or API request for refnames from htsget
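
A usage sketch (passing an AbortSignal here is an assumption, based on the
generic-filehandle2-style opts the heading refers to):

```typescript
// getHeader reads the BAM header; opts such as a signal are forwarded to
// the underlying filehandle reads.
const controller = new AbortController()
const header = await bam.getHeader({ signal: controller.signal })
```
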
@@ -157,11 +158,6 @@ feature.flags // flags
feature.template_length // TLEN
```
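
A short usage sketch tying these accessors to the earlier range query:

```typescript
const records = await bam.getRecordsForRange('ctgA', 0, 50000)
for (const feature of records) {
  // field accessors as listed above (FLAG and TLEN)
  console.log(feature.flags, feature.template_length)
}
```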

#### Note

The reason that we hide the data behind this ".get" function is that we lazily
decode records on demand, which can reduce memory consumption.

## License

MIT © [Colin Diesh](https://github.com/cmdcolin)
10 changes: 5 additions & 5 deletions package.json
@@ -23,8 +23,8 @@
"test": "vitest",
"lint": "eslint --report-unused-disable-directives --max-warnings 0",
"clean": "rimraf dist esm",
"build:esm": "tsc --target es2018 --outDir esm",
"build:es5": "tsc --target es2015 --module commonjs --outDir dist",
"build:esm": "tsc --outDir esm",
"build:es5": "tsc --module commonjs --outDir dist",
"build": "npm run build:esm && npm run build:es5",
"prebuild": "npm run clean && npm run lint",
"preversion": "npm run lint && npm test run && npm run build",
@@ -39,10 +39,9 @@
],
"dependencies": {
"@gmod/abortable-promise-cache": "^2.0.0",
"@gmod/bgzf-filehandle": "^1.4.4",
"buffer": "^6.0.3",
"@gmod/bgzf-filehandle": "^2.0.0",
"crc": "^4.3.2",
"generic-filehandle": "^3.0.0",
"generic-filehandle2": "^0.0.1",
"long": "^4.0.0",
"quick-lru": "^4.0.0"
},
@@ -52,6 +51,7 @@
"@typescript-eslint/eslint-plugin": "^8.1.0",
"@typescript-eslint/parser": "^8.1.0",
"@vitest/coverage-v8": "^2.0.5",
"buffer": "^6.0.3",
"eslint": "^9.9.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.1.3",
19 changes: 11 additions & 8 deletions src/bai.ts
@@ -34,15 +34,16 @@ export default class BAI extends IndexFile {
}

// fetch and parse the index
async _parse(opts?: BaseOpts) {
const bytes = (await this.filehandle.readFile(opts)) as Buffer
async _parse(_opts?: BaseOpts) {
const bytes = await this.filehandle.readFile()
const dataView = new DataView(bytes.buffer)

// check BAI magic numbers
if (bytes.readUInt32LE(0) !== BAI_MAGIC) {
if (dataView.getUint32(0, true) !== BAI_MAGIC) {
throw new Error('Not a BAI file')
}

const refCount = bytes.readInt32LE(4)
const refCount = dataView.getInt32(4, true)
const depth = 5
const binLimit = ((1 << ((depth + 1) * 3)) - 1) / 7

@@ -57,16 +58,18 @@
linearIndex: LinearIndex
stats?: { lineCount: number }
}>(refCount)

for (let i = 0; i < refCount; i++) {
// the binning index
const binCount = bytes.readInt32LE(curr)

const binCount = dataView.getInt32(curr, true)
let stats

curr += 4
const binIndex: Record<number, Chunk[]> = {}

for (let j = 0; j < binCount; j += 1) {
const bin = bytes.readUInt32LE(curr)
const bin = dataView.getUint32(curr, true)
curr += 4
if (bin === binLimit + 1) {
curr += 4
@@ -75,7 +78,7 @@
} else if (bin > binLimit + 1) {
throw new Error('bai index contains too many bins, please use CSI')
} else {
const chunkCount = bytes.readInt32LE(curr)
const chunkCount = dataView.getInt32(curr, true)
curr += 4
const chunks = new Array<Chunk>(chunkCount)
for (let k = 0; k < chunkCount; k++) {
@@ -90,7 +93,7 @@
}
}

const linearCount = bytes.readInt32LE(curr)
const linearCount = dataView.getInt32(curr, true)
curr += 4
// as we're going through the linear index, figure out the smallest
// virtual offset in the indexes, which tells us where the BAM header
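(Aside: the recurring pattern in this file's hunks replaces Node Buffer reads
such as `readInt32LE` with `DataView` reads. A self-contained illustration:)

```typescript
// "BAI\1" magic bytes, read back as a little-endian uint32.
const bytes = new Uint8Array([0x42, 0x41, 0x49, 0x01])
// Passing byteOffset/byteLength matters when a Uint8Array is a view into a
// larger ArrayBuffer; for a view starting at offset 0 this is equivalent
// to new DataView(bytes.buffer) as used in the hunks above.
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
console.log(view.getUint32(0, true) === 0x01494142) // true
```
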
63 changes: 22 additions & 41 deletions src/bamFile.ts
@@ -1,7 +1,6 @@
import { Buffer } from 'buffer'
import crc32 from 'crc/crc32'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle'
import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle2'
import AbortablePromiseCache from '@gmod/abortable-promise-cache'
import QuickLRU from 'quick-lru'

@@ -148,23 +147,21 @@ export default class BamFile {
let buffer
if (ret) {
const s = ret + blockLen
const res = await this.bam.read(Buffer.alloc(s), 0, s, 0, opts)
if (!res.bytesRead) {
throw new Error('Error reading header')
}
buffer = res.buffer.subarray(0, Math.min(res.bytesRead, ret))
buffer = await this.bam.read(s, 0)
} else {
buffer = await this.bam.readFile(opts)
}

const uncba = await unzip(buffer)
const dataView = new DataView(uncba.buffer)

if (uncba.readInt32LE(0) !== BAM_MAGIC) {
if (dataView.getInt32(0, true) !== BAM_MAGIC) {
throw new Error('Not a BAM file')
}
const headLen = uncba.readInt32LE(4)
const headLen = dataView.getInt32(4, true)

this.header = uncba.toString('utf8', 8, 8 + headLen)
const decoder = new TextDecoder('utf8')
this.header = decoder.decode(uncba.subarray(8, 8 + headLen))
const { chrToIndex, indexToChr } = await this._readRefSeqs(
headLen + 8,
65535,
@@ -204,30 +201,21 @@
if (start > refSeqBytes) {
return this._readRefSeqs(start, refSeqBytes * 2, opts)
}
const size = refSeqBytes + blockLen
const { bytesRead, buffer } = await this.bam.read(
Buffer.alloc(size),
0,
refSeqBytes,
0,
opts,
)
if (!bytesRead) {
throw new Error('Error reading refseqs from header')
}
const uncba = await unzip(
buffer.subarray(0, Math.min(bytesRead, refSeqBytes)),
)
const nRef = uncba.readInt32LE(start)
// const size = refSeqBytes + blockLen <-- use this?
const buffer = await this.bam.read(refSeqBytes, 0, opts)
const uncba = await unzip(buffer)
const dataView = new DataView(uncba.buffer)
const nRef = dataView.getInt32(start, true)
let p = start + 4
const chrToIndex: Record<string, number> = {}
const indexToChr: { refName: string; length: number }[] = []
const decoder = new TextDecoder('utf8')
for (let i = 0; i < nRef; i += 1) {
const lName = uncba.readInt32LE(p)
const lName = dataView.getInt32(p, true)
const refName = this.renameRefSeq(
uncba.toString('utf8', p + 4, p + 4 + lName - 1),
decoder.decode(uncba.subarray(p + 4, p + 4 + lName - 1)),
)
const lRef = uncba.readInt32LE(p + lName + 4)
const lRef = dataView.getInt32(p + lName + 4, true)

chrToIndex[refName] = i
indexToChr.push({ refName, length: lRef })
@@ -388,15 +376,7 @@
}

async _readRegion(position: number, size: number, opts: BaseOpts = {}) {
const { bytesRead, buffer } = await this.bam.read(
Buffer.alloc(size),
0,
size,
position,
opts,
)

return buffer.subarray(0, Math.min(bytesRead, size))
return this.bam.read(size, position, opts)
}

async _readChunk({ chunk, opts }: { chunk: Chunk; opts: BaseOpts }) {
@@ -415,7 +395,7 @@
}

async readBamFeatures(
ba: Buffer,
ba: Uint8Array,
cpositions: number[],
dpositions: number[],
chunk: Chunk,
@@ -425,8 +405,9 @@
let pos = 0
let last = +Date.now()

const dataView = new DataView(ba.buffer)
while (blockStart + 4 < ba.length) {
const blockSize = ba.readInt32LE(blockStart)
const blockSize = dataView.getInt32(blockStart, true)
const blockEnd = blockStart + 4 + blockSize - 1

// increment position to the current decompressed status
@@ -471,8 +452,8 @@
chunk.minv.dataPosition +
1
: // must be slice, not subarray for buffer polyfill on web
// eslint-disable-next-line @typescript-eslint/no-deprecated
crc32.signed(ba.slice(blockStart, blockEnd)),
// @ts-expect-error
crc32.signed(ba.subarray(blockStart, blockEnd)),
})

sink.push(feature)
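(Aside: the `read` calls above reflect the generic-filehandle2 API change,
where the Node-style `(buffer, offset, length, position)` signature from
generic-filehandle v3 becomes `(length, position, opts)` and the bytes are
returned directly. A minimal sketch:)

```typescript
import { LocalFile } from 'generic-filehandle2'

// Read the first kilobyte of a file; the bytes come back as a Uint8Array
// rather than being copied into a caller-supplied Buffer.
const fh = new LocalFile('yourfile.bam')
const firstKb = await fh.read(1024, 0)
```
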
2 changes: 1 addition & 1 deletion src/chunk.ts
@@ -2,7 +2,7 @@ import VirtualOffset from './virtualOffset'

// little class representing a chunk in the index
export default class Chunk {
public buffer?: Buffer
public buffer?: Uint8Array

constructor(
public minv: VirtualOffset,
41 changes: 22 additions & 19 deletions src/csi.ts
@@ -37,8 +37,9 @@ export default class CSI extends IndexFile {
return []
}

parseAuxData(bytes: Buffer, offset: number) {
const formatFlags = bytes.readInt32LE(offset)
parseAuxData(bytes: Uint8Array, offset: number) {
const dataView = new DataView(bytes.buffer)
const formatFlags = dataView.getUint32(offset, true)
const coordinateType =
formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
const format = (
@@ -48,14 +49,14 @@
throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
}
const columnNumbers = {
ref: bytes.readInt32LE(offset + 4),
start: bytes.readInt32LE(offset + 8),
end: bytes.readInt32LE(offset + 12),
ref: dataView.getInt32(offset + 4, true),
start: dataView.getInt32(offset + 8, true),
end: dataView.getInt32(offset + 12, true),
}
const metaValue = bytes.readInt32LE(offset + 16)
const metaValue = dataView.getInt32(offset + 16, true)
const metaChar = metaValue ? String.fromCharCode(metaValue) : ''
const skipLines = bytes.readInt32LE(offset + 20)
const nameSectionLength = bytes.readInt32LE(offset + 24)
const skipLines = dataView.getInt32(offset + 20, true)
const nameSectionLength = dataView.getInt32(offset + 24, true)

return {
columnNumbers,
Expand All @@ -77,23 +78,25 @@ export default class CSI extends IndexFile {
const buffer = await this.filehandle.readFile(opts)
const bytes = await unzip(buffer)

const dataView = new DataView(bytes.buffer)
let csiVersion
// check TBI magic numbers
if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
const magic = dataView.getUint32(0, true)

if (magic === CSI1_MAGIC) {
csiVersion = 1
} else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
} else if (magic === CSI2_MAGIC) {
csiVersion = 2
} else {
throw new Error('Not a CSI file')
throw new Error(`Not a CSI file ${magic}`)
// TODO: do we need to support big-endian CSI files?
}

this.minShift = bytes.readInt32LE(4)
this.depth = bytes.readInt32LE(8)
this.minShift = dataView.getInt32(4, true)
this.depth = dataView.getInt32(8, true)
this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
const auxLength = bytes.readInt32LE(12)
const auxLength = dataView.getInt32(12, true)
const aux = auxLength >= 30 ? this.parseAuxData(bytes, 16) : undefined
const refCount = bytes.readInt32LE(16 + auxLength)
const refCount = dataView.getInt32(16 + auxLength, true)

type BinIndex = Record<string, Chunk[]>

@@ -106,20 +109,20 @@
}>(refCount)
for (let i = 0; i < refCount; i++) {
// the binning index
const binCount = bytes.readInt32LE(curr)
const binCount = dataView.getInt32(curr, true)
curr += 4
const binIndex: Record<string, Chunk[]> = {}
let stats // < provided by parsing a pseudo-bin, if present
for (let j = 0; j < binCount; j++) {
const bin = bytes.readUInt32LE(curr)
const bin = dataView.getUint32(curr, true)
curr += 4
if (bin > this.maxBinNumber) {
stats = parsePseudoBin(bytes, curr + 28)
curr += 28 + 16
} else {
firstDataLine = findFirstData(firstDataLine, fromBytes(bytes, curr))
curr += 8
const chunkCount = bytes.readInt32LE(curr)
const chunkCount = dataView.getInt32(curr, true)
curr += 4
const chunks = new Array<Chunk>(chunkCount)
for (let k = 0; k < chunkCount; k += 1) {
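(Aside: a worked example of the format-flag decoding in `parseAuxData` above,
using the Tabix preset values (0 generic, 1 SAM, 2 VCF), which the `format`
lookup is assumed to follow:)

```typescript
// 0x10002: the 0x10000 bit selects zero-based-half-open coordinates and
// the low nibble selects the VCF preset (Tabix preset value 2).
const formatFlags = 0x10002
const coordinateType =
  formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
const presets: Record<number, string> = { 0: 'generic', 1: 'SAM', 2: 'VCF' }
const format = presets[formatFlags & 0xf]
console.log(coordinateType, format) // zero-based-half-open VCF
```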