Update to use generic-filehandle2 (#107)

cmdcolin authored Dec 12, 2024
1 parent 8a312e0 commit d92f513
Showing 16 changed files with 514 additions and 558 deletions.
8 changes: 0 additions & 8 deletions CHANGELOG.md
@@ -1,19 +1,11 @@
## [4.0.1](https://github.com/GMOD/bam-js/compare/v4.0.0...v4.0.1) (2024-11-12)



# [4.0.0](https://github.com/GMOD/bam-js/compare/v3.0.3...v4.0.0) (2024-11-12)



## [3.0.3](https://github.com/GMOD/bam-js/compare/v3.0.0...v3.0.3) (2024-11-11)



## [3.0.2](https://github.com/GMOD/bam-js/compare/v3.0.0...v3.0.2) (2024-11-11)



- republish v3.0.1 since it got tagged on a deleted branch

## [3.0.1](https://github.com/GMOD/bam-js/compare/v3.0.0...v3.0.1) (2024-11-11)
18 changes: 7 additions & 11 deletions README.md
@@ -26,10 +26,10 @@ var records = await t.getRecordsForRange('ctgA', 0, 50000)
```

The `bamPath` argument only works on nodejs. In the browser, you should pass
`bamFilehandle` with a generic-filehandle e.g. `RemoteFile`
`bamFilehandle` with a generic-filehandle2 e.g. `RemoteFile`

```typescript
const { RemoteFile } = require('generic-filehandle')
const { RemoteFile } = require('generic-filehandle2')
const bam = new BamFile({
bamFilehandle: new RemoteFile('yourfile.bam'), // or a full http url
baiFilehandle: new RemoteFile('yourfile.bam.bai'), // or a full http url
@@ -76,9 +76,10 @@ The BAM class constructor accepts arguments
yielding

Note: filehandles implement the Filehandle interface from
https://www.npmjs.com/package/generic-filehandle. This module offers the path
and url arguments as convenience methods for supplying the LocalFile and
RemoteFile
https://www.npmjs.com/package/generic-filehandle2.

This module offers the path and url arguments as convenience methods for
supplying the LocalFile and RemoteFile
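
As a hedged sketch of that convenience (assuming the `@gmod/bam` package name
and that generic-filehandle2 exports `LocalFile` like its predecessor did):

```typescript
import { BamFile } from '@gmod/bam'
import { LocalFile } from 'generic-filehandle2'

// On nodejs these two constructions are intended to be equivalent:
const viaPath = new BamFile({ bamPath: 'yourfile.bam' })
const viaHandle = new BamFile({
  bamFilehandle: new LocalFile('yourfile.bam'),
})
```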

### async getRecordsForRange(refName, start, end, opts)

@@ -112,7 +113,7 @@ for await (const chunk of file.streamRecordsForRange(
The `getRecordsForRange` simply wraps this process by concatenating chunks into
an array
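
For illustration, a minimal sketch of that equivalence, reusing the `file`
handle and range from the streaming snippet above:

```typescript
// Collecting the streamed chunks by hand yields the same records that
// getRecordsForRange would return for the same range.
const records = []
for await (const chunk of file.streamRecordsForRange('ctgA', 0, 50000)) {
  records.push(...chunk)
}
```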

### async getHeader(opts: {....anything to pass to generic-filehandle opts})
### async getHeader(opts: {....anything to pass to generic-filehandle2 opts})

This obtains the header from `HtsgetFile` or `BamFile`. Retrieves BAM file and
BAI/CSI header if applicable, or API request for refnames from htsget
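
A usage sketch (passing an AbortSignal here is an assumption, based on the
generic-filehandle2-style opts the heading refers to):

```typescript
// getHeader reads the BAM header; opts such as a signal are forwarded to
// the underlying filehandle reads.
const controller = new AbortController()
const header = await bam.getHeader({ signal: controller.signal })
```
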
@@ -157,11 +158,6 @@ feature.flags // flags
feature.template_length // TLEN
```
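
A short usage sketch tying these accessors to the earlier range query:

```typescript
const records = await bam.getRecordsForRange('ctgA', 0, 50000)
for (const feature of records) {
  // field accessors as listed above (FLAG and TLEN)
  console.log(feature.flags, feature.template_length)
}
```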

#### Note

The reason that we hide the data behind this ".get" function is that we lazily
decode records on demand, which can reduce memory consumption.

## License

MIT © [Colin Diesh](https://github.com/cmdcolin)
10 changes: 5 additions & 5 deletions package.json
@@ -23,8 +23,8 @@
"test": "vitest",
"lint": "eslint --report-unused-disable-directives --max-warnings 0",
"clean": "rimraf dist esm",
"build:esm": "tsc --target es2018 --outDir esm",
"build:es5": "tsc --target es2015 --module commonjs --outDir dist",
"build:esm": "tsc --outDir esm",
"build:es5": "tsc --module commonjs --outDir dist",
"build": "npm run build:esm && npm run build:es5",
"prebuild": "npm run clean && npm run lint",
"preversion": "npm run lint && npm test run && npm run build",
@@ -39,10 +39,9 @@
],
"dependencies": {
"@gmod/abortable-promise-cache": "^2.0.0",
"@gmod/bgzf-filehandle": "^1.4.4",
"buffer": "^6.0.3",
"@gmod/bgzf-filehandle": "^2.0.0",
"crc": "^4.3.2",
"generic-filehandle": "^3.0.0",
"generic-filehandle2": "^0.0.1",
"long": "^4.0.0",
"quick-lru": "^4.0.0"
},
@@ -52,6 +51,7 @@
"@typescript-eslint/eslint-plugin": "^8.1.0",
"@typescript-eslint/parser": "^8.1.0",
"@vitest/coverage-v8": "^2.0.5",
"buffer": "^6.0.3",
"eslint": "^9.9.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.1.3",
19 changes: 11 additions & 8 deletions src/bai.ts
@@ -34,15 +34,16 @@ export default class BAI extends IndexFile {
}

// fetch and parse the index
async _parse(opts?: BaseOpts) {
const bytes = (await this.filehandle.readFile(opts)) as Buffer
async _parse(_opts?: BaseOpts) {
const bytes = await this.filehandle.readFile()
const dataView = new DataView(bytes.buffer)

// check BAI magic numbers
if (bytes.readUInt32LE(0) !== BAI_MAGIC) {
if (dataView.getUint32(0, true) !== BAI_MAGIC) {
throw new Error('Not a BAI file')
}

const refCount = bytes.readInt32LE(4)
const refCount = dataView.getInt32(4, true)
const depth = 5
const binLimit = ((1 << ((depth + 1) * 3)) - 1) / 7

@@ -57,16 +58,18 @@
linearIndex: LinearIndex
stats?: { lineCount: number }
}>(refCount)

for (let i = 0; i < refCount; i++) {
// the binning index
const binCount = bytes.readInt32LE(curr)

const binCount = dataView.getInt32(curr, true)
let stats

curr += 4
const binIndex: Record<number, Chunk[]> = {}

for (let j = 0; j < binCount; j += 1) {
const bin = bytes.readUInt32LE(curr)
const bin = dataView.getUint32(curr, true)
curr += 4
if (bin === binLimit + 1) {
curr += 4
@@ -75,7 +78,7 @@
} else if (bin > binLimit + 1) {
throw new Error('bai index contains too many bins, please use CSI')
} else {
const chunkCount = bytes.readInt32LE(curr)
const chunkCount = dataView.getInt32(curr, true)
curr += 4
const chunks = new Array<Chunk>(chunkCount)
for (let k = 0; k < chunkCount; k++) {
@@ -90,7 +93,7 @@
}
}

const linearCount = bytes.readInt32LE(curr)
const linearCount = dataView.getInt32(curr, true)
curr += 4
// as we're going through the linear index, figure out the smallest
// virtual offset in the indexes, which tells us where the BAM header
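(Aside: the recurring pattern in this file's hunks replaces Node Buffer reads
such as `readInt32LE` with `DataView` reads. A self-contained illustration:)

```typescript
// "BAI\1" magic bytes, read back as a little-endian uint32.
const bytes = new Uint8Array([0x42, 0x41, 0x49, 0x01])
// Passing byteOffset/byteLength matters when a Uint8Array is a view into a
// larger ArrayBuffer; for a view starting at offset 0 this is equivalent
// to new DataView(bytes.buffer) as used in the hunks above.
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
console.log(view.getUint32(0, true) === 0x01494142) // true
```
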
63 changes: 22 additions & 41 deletions src/bamFile.ts
@@ -1,7 +1,6 @@
import { Buffer } from 'buffer'
import crc32 from 'crc/crc32'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle'
import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle2'
import AbortablePromiseCache from '@gmod/abortable-promise-cache'
import QuickLRU from 'quick-lru'

@@ -148,23 +147,21 @@ export default class BamFile {
let buffer
if (ret) {
const s = ret + blockLen
const res = await this.bam.read(Buffer.alloc(s), 0, s, 0, opts)
if (!res.bytesRead) {
throw new Error('Error reading header')
}
buffer = res.buffer.subarray(0, Math.min(res.bytesRead, ret))
buffer = await this.bam.read(s, 0)
} else {
buffer = await this.bam.readFile(opts)
}

const uncba = await unzip(buffer)
const dataView = new DataView(uncba.buffer)

if (uncba.readInt32LE(0) !== BAM_MAGIC) {
if (dataView.getInt32(0, true) !== BAM_MAGIC) {
throw new Error('Not a BAM file')
}
const headLen = uncba.readInt32LE(4)
const headLen = dataView.getInt32(4, true)

this.header = uncba.toString('utf8', 8, 8 + headLen)
const decoder = new TextDecoder('utf8')
this.header = decoder.decode(uncba.subarray(8, 8 + headLen))
const { chrToIndex, indexToChr } = await this._readRefSeqs(
headLen + 8,
65535,
@@ -204,30 +201,21 @@
if (start > refSeqBytes) {
return this._readRefSeqs(start, refSeqBytes * 2, opts)
}
const size = refSeqBytes + blockLen
const { bytesRead, buffer } = await this.bam.read(
Buffer.alloc(size),
0,
refSeqBytes,
0,
opts,
)
if (!bytesRead) {
throw new Error('Error reading refseqs from header')
}
const uncba = await unzip(
buffer.subarray(0, Math.min(bytesRead, refSeqBytes)),
)
const nRef = uncba.readInt32LE(start)
// const size = refSeqBytes + blockLen <-- use this?
const buffer = await this.bam.read(refSeqBytes, 0, opts)
const uncba = await unzip(buffer)
const dataView = new DataView(uncba.buffer)
const nRef = dataView.getInt32(start, true)
let p = start + 4
const chrToIndex: Record<string, number> = {}
const indexToChr: { refName: string; length: number }[] = []
const decoder = new TextDecoder('utf8')
for (let i = 0; i < nRef; i += 1) {
const lName = uncba.readInt32LE(p)
const lName = dataView.getInt32(p, true)
const refName = this.renameRefSeq(
uncba.toString('utf8', p + 4, p + 4 + lName - 1),
decoder.decode(uncba.subarray(p + 4, p + 4 + lName - 1)),
)
const lRef = uncba.readInt32LE(p + lName + 4)
const lRef = dataView.getInt32(p + lName + 4, true)

chrToIndex[refName] = i
indexToChr.push({ refName, length: lRef })
@@ -388,15 +376,7 @@
}

async _readRegion(position: number, size: number, opts: BaseOpts = {}) {
const { bytesRead, buffer } = await this.bam.read(
Buffer.alloc(size),
0,
size,
position,
opts,
)

return buffer.subarray(0, Math.min(bytesRead, size))
return this.bam.read(size, position, opts)
}

async _readChunk({ chunk, opts }: { chunk: Chunk; opts: BaseOpts }) {
@@ -415,7 +395,7 @@
}

async readBamFeatures(
ba: Buffer,
ba: Uint8Array,
cpositions: number[],
dpositions: number[],
chunk: Chunk,
@@ -425,8 +405,9 @@
let pos = 0
let last = +Date.now()

const dataView = new DataView(ba.buffer)
while (blockStart + 4 < ba.length) {
const blockSize = ba.readInt32LE(blockStart)
const blockSize = dataView.getInt32(blockStart, true)
const blockEnd = blockStart + 4 + blockSize - 1

// increment position to the current decompressed status
@@ -471,8 +452,8 @@
chunk.minv.dataPosition +
1
: // must be slice, not subarray for buffer polyfill on web
// eslint-disable-next-line @typescript-eslint/no-deprecated
crc32.signed(ba.slice(blockStart, blockEnd)),
// @ts-expect-error
crc32.signed(ba.subarray(blockStart, blockEnd)),
})

sink.push(feature)
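(Aside: the `read` calls above reflect the generic-filehandle2 API change,
where the Node-style `(buffer, offset, length, position)` signature from
generic-filehandle v3 becomes `(length, position, opts)` and the bytes are
returned directly. A minimal sketch:)

```typescript
import { LocalFile } from 'generic-filehandle2'

// Read the first kilobyte of a file; the bytes come back as a Uint8Array
// rather than being copied into a caller-supplied Buffer.
const fh = new LocalFile('yourfile.bam')
const firstKb = await fh.read(1024, 0)
```
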
2 changes: 1 addition & 1 deletion src/chunk.ts
@@ -2,7 +2,7 @@ import VirtualOffset from './virtualOffset'

// little class representing a chunk in the index
export default class Chunk {
public buffer?: Buffer
public buffer?: Uint8Array

constructor(
public minv: VirtualOffset,
41 changes: 22 additions & 19 deletions src/csi.ts
@@ -37,8 +37,9 @@ export default class CSI extends IndexFile {
return []
}

parseAuxData(bytes: Buffer, offset: number) {
const formatFlags = bytes.readInt32LE(offset)
parseAuxData(bytes: Uint8Array, offset: number) {
const dataView = new DataView(bytes.buffer)
const formatFlags = dataView.getUint32(offset, true)
const coordinateType =
formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
const format = (
@@ -48,14 +49,14 @@
throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
}
const columnNumbers = {
ref: bytes.readInt32LE(offset + 4),
start: bytes.readInt32LE(offset + 8),
end: bytes.readInt32LE(offset + 12),
ref: dataView.getInt32(offset + 4, true),
start: dataView.getInt32(offset + 8, true),
end: dataView.getInt32(offset + 12, true),
}
const metaValue = bytes.readInt32LE(offset + 16)
const metaValue = dataView.getInt32(offset + 16, true)
const metaChar = metaValue ? String.fromCharCode(metaValue) : ''
const skipLines = bytes.readInt32LE(offset + 20)
const nameSectionLength = bytes.readInt32LE(offset + 24)
const skipLines = dataView.getInt32(offset + 20, true)
const nameSectionLength = dataView.getInt32(offset + 24, true)

return {
columnNumbers,
Expand All @@ -77,23 +78,25 @@ export default class CSI extends IndexFile {
const buffer = await this.filehandle.readFile(opts)
const bytes = await unzip(buffer)

const dataView = new DataView(bytes.buffer)
let csiVersion
// check TBI magic numbers
if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
const magic = dataView.getUint32(0, true)

if (magic === CSI1_MAGIC) {
csiVersion = 1
} else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
} else if (magic === CSI2_MAGIC) {
csiVersion = 2
} else {
throw new Error('Not a CSI file')
throw new Error(`Not a CSI file ${magic}`)
// TODO: do we need to support big-endian CSI files?
}

this.minShift = bytes.readInt32LE(4)
this.depth = bytes.readInt32LE(8)
this.minShift = dataView.getInt32(4, true)
this.depth = dataView.getInt32(8, true)
this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
const auxLength = bytes.readInt32LE(12)
const auxLength = dataView.getInt32(12, true)
const aux = auxLength >= 30 ? this.parseAuxData(bytes, 16) : undefined
const refCount = bytes.readInt32LE(16 + auxLength)
const refCount = dataView.getInt32(16 + auxLength, true)

type BinIndex = Record<string, Chunk[]>

@@ -106,20 +109,20 @@
}>(refCount)
for (let i = 0; i < refCount; i++) {
// the binning index
const binCount = bytes.readInt32LE(curr)
const binCount = dataView.getInt32(curr, true)
curr += 4
const binIndex: Record<string, Chunk[]> = {}
let stats // < provided by parsing a pseudo-bin, if present
for (let j = 0; j < binCount; j++) {
const bin = bytes.readUInt32LE(curr)
const bin = dataView.getUint32(curr, true)
curr += 4
if (bin > this.maxBinNumber) {
stats = parsePseudoBin(bytes, curr + 28)
curr += 28 + 16
} else {
firstDataLine = findFirstData(firstDataLine, fromBytes(bytes, curr))
curr += 8
const chunkCount = bytes.readInt32LE(curr)
const chunkCount = dataView.getInt32(curr, true)
curr += 4
const chunks = new Array<Chunk>(chunkCount)
for (let k = 0; k < chunkCount; k += 1) {
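(Aside: a worked example of the format-flag decoding in `parseAuxData` above,
using the Tabix preset values (0 generic, 1 SAM, 2 VCF), which the `format`
lookup is assumed to follow:)

```typescript
// 0x10002: the 0x10000 bit selects zero-based-half-open coordinates and
// the low nibble selects the VCF preset (Tabix preset value 2).
const formatFlags = 0x10002
const coordinateType =
  formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
const presets: Record<number, string> = { 0: 'generic', 1: 'SAM', 2: 'VCF' }
const format = presets[formatFlags & 0xf]
console.log(coordinateType, format) // zero-based-half-open VCF
```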