feat: init boox-cli

bent10 · May 8, 2024 · 4c8645e · 4c8645e
1 parent a0cd619
commit 4c8645e
Show file tree

Hide file tree

Showing 15 changed files with 1,000 additions and 8 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -41,11 +41,11 @@
     "prebuild:doc": "npm run train -w boox-site",
     "build:doc": "npm run build -w boox-site",
     "build:examples": "npm run build -w examples-vanilla -w examples-react -w examples-vue -w examples-svelte -w examples-nextjs",
-    "build": "npm run build -w boox",
-    "types": "npm run types -w boox",
-    "lint": "npm run lint -w boox",
-    "test": "npm test -w boox",
-    "coverage": "npm run coverage -w boox",
+    "build": "npm run build -w boox -w boox-cli",
+    "types": "npm run types -w boox -w boox-cli",
+    "lint": "npm run lint -w boox -w boox-cli",
+    "test": "npm test -w boox -w boox-cli",
+    "coverage": "npm run coverage -w boox -w boox-cli",
     "format": "npm run format --workspaces --if-present"
   },
   "devDependencies": {

diff --git a/packages/cli/changelog.md b/packages/cli/changelog.md
diff --git a/packages/cli/package.json b/packages/cli/package.json
@@ -0,0 +1,48 @@
+{
+  "name": "boox-cli",
+  "description": "A command-line interface (CLI) for training and searching Boox datasets.",
+  "version": "0.0.0-development",
+  "publishConfig": {
+    "access": "public"
+  },
+  "author": "Beni Arisandi (https://stilearning.com)",
+  "repository": "https://github.com/bent10/boox",
+  "homepage": "https://stilearning.com/boox",
+  "license": "MIT",
+  "keywords": [
+    "boox",
+    "train",
+    "trainer",
+    "document",
+    "index",
+    "indexing",
+    "nlp"
+  ],
+  "type": "module",
+  "bin": "dist/index.js",
+  "files": [
+    "dist",
+    "changelog.md",
+    "readme.md"
+  ],
+  "scripts": {
+    "start": "vite",
+    "dev": "vite build --watch",
+    "build": "vite build && npm run chmod",
+    "test": "vitest",
+    "bench": "vitest bench",
+    "coverage": "vitest run --coverage",
+    "lint": "tsc --noEmit && eslint . --cache --cache-location ../../node_modules/.eslint",
+    "format": "prettier . --write --cache-location ../../node_modules/.prettier --ignore-path ../../.prettierignore",
+    "chmod": "chmod +x dist/index.js"
+  },
+  "dependencies": {
+    "commander": "^12.0.0",
+    "ora": "^8.0.1",
+    "pako": "^2.1.0",
+    "rcfy": "^2.1.0"
+  },
+  "devDependencies": {
+    "@types/pako": "^2.0.3"
+  }
+}
diff --git a/packages/cli/readme.md b/packages/cli/readme.md
@@ -0,0 +1,131 @@
+# Boox CLI
+
+A command-line interface (CLI) for training and searching [Boox](https://github.com/bent10/boox) datasets.
+
+## Installation
+
+Install `boox-cli` globally using npm or yarn:
+
+```bash
+npm install -g boox-cli
+
+# Or
+
+yarn global add boox-cli
+```
+
+## Usage
+
+### Training
+
+To train a Boox dataset, use the train command:
+
+```bash
+boox-cli train <source> [destination] [options]
+```
+
+- `<source>`: The path to your dataset file (JSON format).
+- `[destination]`: (Optional) The directory where the trained data will be saved. Defaults to the directory that contains the source file.
+
+Options:
+
+- `-i, --id <field>`: The field in your dataset objects that uniquely identifies each document (default: `'id'`).
+- `-f, --features <fields...>`: The fields to index for search (multiple fields can be specified).
+- `-a, --attributes <fields...>`: The fields to include as-is without indexing (multiple fields can be specified).
+- `-d, --deflate`: Compress the trained data with deflate and save it as a `.dat` file instead of the default gzip-compressed `.gz` file (default: `false`).
+- `-c, --cwd <folder>`: The working directory (default: current directory).
+- `-r, --rcname <name>`: The name of the Boox configuration file (default: `'boox'`).
+
+Example:
+
+```bash
+boox-cli train data/products.json -f title description -a price
+```
+
+This command will train a Boox dataset from the `data/products.json` file, indexing the `title` and `description` fields for search and including the `price` field as-is. The trained data will be saved as a compressed `.gz` file.
+
+### Searching
+
+To search a trained Boox dataset, use the `search` command:
+
+```bash
+boox-cli search <source> <query> [options]
+```
+
+- `<source>`: The path to the trained dataset file (`.dat` or `.gz`).
+- `<query>`: The search query string.
+
+Options:
+
+- `-o, --offset <number>`: The offset for pagination (default: `'1'`).
+- `-l, --length <number>`: The number of results per page (default: `'10'`).
+- `-k, --context <field>`: Display the context instead of the paginated results object.
+- `-a, --attrs <fields...>`: Fields to display when `--context` is provided.
+- `-d, --deflate`: Treat the trained data as a deflate-compressed `.dat` file instead of a gzip-compressed `.gz` file (default: `false`).
+- `-c, --cwd <folder>`: The working directory (default: current directory).
+- `-r, --rcname <name>`: The name of the Boox configuration file (default: `'boox'`).
+
+Example:
+
+```bash
+boox-cli search data/products-trained.gz "shoes" -o 2 -l 20
+```
+
+This command will search the `data/products-trained.gz` dataset for documents containing the word `"shoes"`, starting from the second page and displaying 20 results per page.
+
+## Using configuration file
+
+You can create a Boox configuration file in your project's root directory to specify default options for the `boox-cli train` and `boox-cli search` commands. The file can use any of the following names:
+
+- `.booxrc`
+- `.booxrc.json`
+- `.booxrc.{yaml,yml}`
+- `.booxrc.{mjs,cjs,js}`
+- `boox.config.{mjs,cjs,js}`
+
+Before using the example below, make sure to install the required libraries:
+
+```bash
+npm install -D double-metaphone stemmer stopword marked marked-plaintify
+```
+
+Here's an example of a Boox configuration file:
+
+```js
+// boox.config.js
+import { doubleMetaphone } from 'double-metaphone'
+import { Marked } from 'marked'
+import markedPlaintify from 'marked-plaintify'
+import { stemmer } from 'stemmer'
+import { removeStopwords } from 'stopword'
+
+const marked = new Marked({ gfm: true }).use(markedPlaintify())
+const wordRegexp = /\b\w+\b/g
+
+/** @type {() => import('boox').BooxOptions} */
+export default function defineBooxConfig() {
+  return {
+    id: 'customId',
+    features: ['title', 'content', 'tags'],
+    attributes: ['author', 'date'],
+    modelOptions: {
+      normalizer(input) {
+        // Remove Markdown formatting
+        return marked.parse(input)
+      },
+      tokenizer(input) {
+        const tokens = Array.from(input.match(wordRegexp) || [])
+        return removeStopwords(tokens)
+      },
+      stemmer: stemmer,
+      phonetic: doubleMetaphone
+    }
+  }
+}
+```
+
+The `--rcname` flag allows you to customize the name of the configuration file. For example, to use a configuration file named `my-appname.config.js`, you would run the following command:
+
+```bash
+boox-cli train src/dataset.json --rcname my-appname
+```
diff --git a/packages/cli/src/api.ts b/packages/cli/src/api.ts
@@ -0,0 +1,191 @@
+import { access, constants, mkdir, readFile, writeFile } from 'node:fs/promises'
+import { basename, dirname, extname, join, relative, resolve } from 'path'
+import generateBatches from 'batch-me-up'
+import Boox, { type BooxOptions, type Dataset, type SearchOptions } from 'boox'
+import { oraPromise } from 'ora'
+import { deflate, gzip, inflate, ungzip } from 'pako'
+import { loadRc } from 'rcfy'
+import { getDataSize, getElapsedTime } from './utils.js'
+
+/**
+ * Options accepted by the CLI API functions, on top of the Boox options.
+ */
+export interface Options extends BooxOptions {
+  /**
+   * When `true`, the trained state is handled with deflate/inflate (`.dat`
+   * file); otherwise gzip/ungzip (`.gz` file) is used.
+   */
+  isDeflate?: boolean
+  /**
+   * Working directory that source and destination paths are joined onto.
+   * Falls back to `process.cwd()` when omitted.
+   */
+  cwd?: string
+  /**
+   * Name handed to `loadRc` to locate the user configuration file. The
+   * exported functions default it to `'boox'`.
+   */
+  rcname?: string
+}
+
+/**
+ * Pagination options for search results. Both values are strings and are
+ * coerced to numbers with unary `+` before being handed to
+ * `Boox.paginateSearchResults`.
+ */
+export interface PageOptions {
+  /** Pagination offset; `searchDataset` falls back to `1` when omitted. */
+  offset?: string
+  /** Pagination length; `searchDataset` falls back to `10` when omitted. */
+  length?: string
+}
+
+// --- Constants ---
+// Value of the `level` option handed to the pako compressor (deflate or gzip)
+// when the trained state is saved.
+const DEFAULT_COMPRESSION_LEVEL = 6
+
+/**
+ * Trains a Boox dataset and saves the trained data.
+ *
+ * The source file is read and parsed as JSON, every entry is added to a new
+ * Boox instance, and the resulting state is serialized, compressed (gzip by
+ * default, deflate when `isDeflate` is set) and written to
+ * `<source name>-trained.gz` or `<source name>-trained.dat`.
+ *
+ * Values found in the user configuration file take precedence over the
+ * options passed to this function.
+ *
+ * @param src Path to the dataset file, joined onto `cwd`.
+ * @param dest Directory where the trained data will be saved, joined onto
+ * `cwd`. When empty, the file is written next to the source file.
+ * @param options Training options.
+ * @throws Any error raised while loading the configuration, reading or
+ * parsing the source file, training, or writing the output is propagated to
+ * the caller unchanged.
+ */
+export async function trainDataset(
+  src: string,
+  dest: string,
+  { rcname = 'boox', cwd, ...options }: Options = {}
+) {
+  const resolvedCwd = cwd ? resolve(cwd) : process.cwd()
+  // Load user config from (e.g. boox.config.js) file, if present
+  const userConfig: Options = await loadRc(rcname, resolvedCwd)
+  // NOTE(review): the spread order lets config-file values override the
+  // options passed by the caller - confirm this precedence is intended.
+  const {
+    id = 'id',
+    features = ['text'],
+    attributes = [],
+    modelOptions,
+    isDeflate = false
+  } = { ...options, ...userConfig }
+
+  const resolvedSrc = relative(process.cwd(), join(resolvedCwd, src))
+  const outputDir = dest
+    ? relative(process.cwd(), join(resolvedCwd, dest))
+    : dirname(resolvedSrc)
+
+  // Strip only the trailing extension. A plain `String#replace` would hit the
+  // first occurrence of the extension text and, for a source without any
+  // extension, would prepend the suffix instead of appending it.
+  const trainedSuffix = isDeflate ? '-trained.dat' : '-trained.gz'
+  const trainedFile = join(
+    outputDir,
+    `${basename(src, extname(src))}${trainedSuffix}`
+  )
+
+  // Create Boox instance
+  const boox = new Boox<Dataset>({ id, features, attributes, modelOptions })
+
+  // Read dataset from file
+  const datasets = await oraPromise<Dataset[]>(
+    async () => JSON.parse(await readFile(resolvedSrc, 'utf8')),
+    {
+      text: 'Reading data...',
+      successText(data) {
+        return `Reading ${getDataSize(data)} data!`
+      }
+    }
+  )
+
+  const batches = await generateBatches(datasets)
+  const progress = {
+    current: 0,
+    length: datasets.length
+  }
+  const startTime = new Date()
+
+  // Train the model batch by batch; documents are added synchronously, so
+  // plain loops are enough and the spinner text is refreshed per document
+  await oraPromise(
+    async ora => {
+      for (const batch of batches) {
+        for (const dataset of batch) {
+          progress.current++
+          ora.text = `Training ${resolvedSrc} ${progress.current} of ${progress.length} - ${getElapsedTime(startTime)}`
+          ora.render()
+          boox.addDocumentSync(dataset)
+        }
+      }
+    },
+    {
+      text: 'Start training...',
+      successText() {
+        return `Trained ${progress.current} documents in ${getElapsedTime(startTime)}`
+      }
+    }
+  )
+
+  // Compress and save the trained state
+  const compressor = isDeflate ? deflate : gzip
+  const state = JSON.stringify(boox.currentState)
+  const compressedState = compressor(state, {
+    level: DEFAULT_COMPRESSION_LEVEL
+  })
+
+  await oraPromise(
+    async () => {
+      const distDir = dirname(trainedFile)
+
+      // Create the output directory (and any missing parents) when absent
+      try {
+        await access(distDir, constants.F_OK)
+      } catch {
+        await mkdir(distDir, { recursive: true })
+      }
+
+      await writeFile(trainedFile, compressedState)
+    },
+    {
+      text: 'Saving...',
+      successText: `Saved ${getDataSize(state)} state to ${trainedFile}`
+    }
+  )
+}
+
+/**
+ * Searches a trained Boox dataset.
+ *
+ * The trained state is read from disk, decompressed (ungzip by default,
+ * inflate when `isDeflate` is set), parsed as JSON and assigned to a new Boox
+ * instance, which is then queried. Timing and state-size information is
+ * written to the console along the way.
+ *
+ * Values found in the user configuration file take precedence over the
+ * options passed to this function. Search options are loaded separately from
+ * a configuration file named `boox-results`, independent of `rcname`.
+ *
+ * @param src Path to the trained dataset file, joined onto `cwd`.
+ * @param query The search query.
+ * @param options Search options.
+ * @returns The value of `Boox.paginateSearchResults` for the search results,
+ * using `offset` and `length` coerced to numbers.
+ * @throws Any error raised while loading configuration, reading,
+ * decompressing or parsing the state, or searching is propagated to the
+ * caller unchanged.
+ */
+export async function searchDataset(
+  src: string,
+  query: string,
+  {
+    rcname = 'boox',
+    cwd,
+    ...options
+  }: Pick<Options, 'cwd' | 'rcname' | 'isDeflate'> & PageOptions = {}
+) {
+  const resolvedCwd = cwd ? resolve(cwd) : process.cwd()
+  // Load user config from (e.g. boox.config.js) file, if present
+  const userConfig: Options = await loadRc(rcname, resolvedCwd)
+  const {
+    modelOptions,
+    isDeflate = false,
+    offset = 1,
+    length = 10
+  } = { ...options, ...userConfig }
+
+  const resolvedSrc = relative(process.cwd(), join(resolvedCwd, src))
+
+  // Pick the decompressor that matches how the state was saved
+  const decompressor = isDeflate ? inflate : ungzip
+
+  // Read trained state from file and restore it into a new Boox instance
+  console.time('Loading state')
+  const compressedState = await readFile(resolvedSrc)
+  const decompressedState = decompressor(compressedState, { to: 'string' })
+  const boox = new Boox({ modelOptions })
+  const state = JSON.parse(decompressedState)
+  boox.currentState = state
+  console.timeEnd('Loading state')
+
+  console.info('State size:', getDataSize(decompressedState))
+
+  // Load search options from (e.g. boox-results.config.js) file, if present
+  const resultsConfig: SearchOptions = await loadRc(
+    'boox-results',
+    resolvedCwd
+  )
+
+  // Perform the search
+  console.time('Search in')
+  const results = await boox.search(query, resultsConfig)
+  console.timeEnd('Search in')
+  console.log()
+
+  // `offset` and `length` may arrive as strings, hence the numeric coercion
+  return Boox.paginateSearchResults(results, +offset, +length)
+}