Skip to content

Commit

Permalink
feat: init boox-cli
Browse files Browse the repository at this point in the history
  • Loading branch information
bent10 committed May 8, 2024
1 parent a0cd619 commit 4c8645e
Show file tree
Hide file tree
Showing 15 changed files with 1,000 additions and 8 deletions.
311 changes: 308 additions & 3 deletions package-lock.json

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@
"prebuild:doc": "npm run train -w boox-site",
"build:doc": "npm run build -w boox-site",
"build:examples": "npm run build -w examples-vanilla -w examples-react -w examples-vue -w examples-svelte -w examples-nextjs",
"build": "npm run build -w boox",
"types": "npm run types -w boox",
"lint": "npm run lint -w boox",
"test": "npm test -w boox",
"coverage": "npm run coverage -w boox",
"build": "npm run build -w boox -w boox-cli",
"types": "npm run types -w boox -w boox-cli",
"lint": "npm run lint -w boox -w boox-cli",
"test": "npm test -w boox -w boox-cli",
"coverage": "npm run coverage -w boox -w boox-cli",
"format": "npm run format --workspaces --if-present"
},
"devDependencies": {
Expand Down
Empty file added packages/cli/changelog.md
Empty file.
48 changes: 48 additions & 0 deletions packages/cli/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"name": "boox-cli",
"description": "A command-line interface (CLI) for training and searching Boox datasets.",
"version": "0.0.0-development",
"publishConfig": {
"access": "public"
},
"author": "Beni Arisandi (https://stilearning.com)",
"repository": "https://github.com/bent10/boox",
"homepage": "https://stilearning.com/boox",
"license": "MIT",
"keywords": [
"boox",
"train",
"trainer",
"document",
"index",
"indexing",
"nlp"
],
"type": "module",
"bin": "dist/index.js",
"files": [
"dist",
"changelog.md",
"readme.md"
],
"scripts": {
"start": "vite",
"dev": "vite build --watch",
"build": "vite build && npm run chmod",
"test": "vitest",
"bench": "vitest bench",
"coverage": "vitest run --coverage",
"lint": "tsc --noEmit && eslint . --cache --cache-location ../../node_modules/.eslint",
"format": "prettier . --write --cache-location ../../node_modules/.prettier --ignore-path ../../.prettierignore",
"chmod": "chmod +x dist/index.js"
},
"dependencies": {
"commander": "^12.0.0",
"ora": "^8.0.1",
"pako": "^2.1.0",
"rcfy": "^2.1.0"
},
"devDependencies": {
"@types/pako": "^2.0.3"
}
}
131 changes: 131 additions & 0 deletions packages/cli/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Boox CLI

A command-line interface (CLI) for training and searching [Boox](https://github.com/bent10/boox) datasets.

## Installation

Install `boox-cli` globally using npm or yarn:

```bash
npm install -g boox-cli

# Or

yarn global add boox-cli
```

## Usage

### Training

To train a Boox dataset, use the `train` command:

```bash
boox-cli train <source> [destination] [options]
```

- `<source>`: The path to your dataset file (JSON format).
- `[destination]`: (Optional) The path where the trained data will be saved. Defaults to the current directory.

Options:

- `-i, --id <field>`: The field in your dataset objects that uniquely identifies each document (default: `'id'`).
- `-f, --features <fields...>`: The fields to index for search (multiple fields can be specified).
- `-a, --attributes <fields...>`: The fields to include as-is without indexing (multiple fields can be specified).
- `-d, --deflate`: Compress the trained data as `.dat` file (default: `false`).
- `-c, --cwd <folder>`: The working directory (default: current directory).
- `-r, --rcname <name>`: The name of the Boox configuration file (default: `'boox'`).

Example:

```bash
boox-cli train data/products.json -f title description -a price
```

This command will train a Boox dataset from the `data/products.json` file, indexing the `title` and `description` fields for search and including the `price` field as-is. The trained data will be saved as a compressed `.gz` file.

### Searching

To search a trained Boox dataset, use the `search` command:

```bash
boox-cli search <source> <query> [options]
```

- `<source>`: The path to the trained dataset file (`.dat` or `.gz`).
- `<query>`: The search query string.

Options:

- `-o, --offset <number>`: The offset for pagination (default: `'1'`).
- `-l, --length <number>`: The number of results per page (default: `'10'`).
- `-k, --context <field>`: Display the context instead of the paginated results object.
- `-a, --attrs <fields...>`: Fields to display when `--context` is provided.
- `-d, --deflate`: Assume the trained data is deflated as `.dat` file (default: `false`).
- `-c, --cwd <folder>`: The working directory (default: current directory).
- `-r, --rcname <name>`: The name of the Boox configuration file (default: `'boox'`).

Example:

```bash
boox-cli search data/products-trained.gz "shoes" -o 2 -l 20
```

This command will search the `data/products-trained.gz` dataset for documents containing the word `"shoes"`, starting from the second page and displaying 20 results per page.

## Using configuration file

You can create a Boox configuration file in your project's root directory to specify default options for the `boox-cli train` and `boox-cli search` commands. The following file names are recognized:

- `.booxrc`
- `.booxrc.json`
- `.booxrc.{yaml,yml}`
- `.booxrc.{mjs,cjs,js}`
- `boox.config.{mjs,cjs,js}`

Before using the example below, make sure to install the required libraries:

```bash
npm install -D double-metaphone stemmer stopword marked marked-plaintify
```

Here's an example of a Boox configuration file:

```js
// boox.config.js
import { doubleMetaphone } from 'double-metaphone'
import { Marked } from 'marked'
import markedPlaintify from 'marked-plaintify'
import { stemmer } from 'stemmer'
import { removeStopwords } from 'stopword'

const marked = new Marked({ gfm: true }).use(markedPlaintify())
const wordRegexp = /\b\w+\b/g

/** @type {() => import('boox').BooxOptions} */
export default function defineBooxConfig() {
return {
id: 'customId',
features: ['title', 'content', 'tags'],
attributes: ['author', 'date'],
modelOptions: {
normalizer(input) {
// Remove Markdown formatting
return marked.parse(input)
},
tokenizer(input) {
const tokens = Array.from(input.match(wordRegexp) || [])
return removeStopwords(tokens)
},
stemmer: stemmer,
phonetic: doubleMetaphone
}
}
}
```

The `--rcname` flag allows you to customize the name of the configuration file. For example, to use a configuration file named `my-appname.config.js`, you would run the following command:

```bash
boox-cli train src/dataset.json --rcname my-appname
```
191 changes: 191 additions & 0 deletions packages/cli/src/api.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import { access, constants, mkdir, readFile, writeFile } from 'node:fs/promises'
import { basename, dirname, extname, join, relative, resolve } from 'path'
import generateBatches from 'batch-me-up'
import Boox, { type BooxOptions, type Dataset, type SearchOptions } from 'boox'
import { oraPromise } from 'ora'
import { deflate, gzip, inflate, ungzip } from 'pako'
import { loadRc } from 'rcfy'
import { getDataSize, getElapsedTime } from './utils.js'

/**
 * Options accepted by the CLI API functions, layered on top of the core
 * `BooxOptions`.
 */
export interface Options extends BooxOptions {
  /** Working directory; `process.cwd()` is used when omitted. */
  cwd?: string
  /** Name passed to `loadRc` to locate the configuration file. */
  rcname?: string
  /** Use pako `deflate`/`inflate` (`.dat`) instead of `gzip`/`ungzip` (`.gz`). */
  isDeflate?: boolean
}

/**
 * Pagination options for `searchDataset`.
 *
 * Both values may be numeric strings or plain numbers: `searchDataset`
 * defaults them to numbers and coerces whatever it receives with unary `+`,
 * so programmatic callers do not have to stringify their numbers first.
 */
export interface PageOptions {
  /** Pagination offset; `searchDataset` uses `1` when omitted. */
  offset?: string | number
  /** Number of results per page; `searchDataset` uses `10` when omitted. */
  length?: string | number
}

// --- Constants ---
// Passed as the `level` option to the pako compressor (`deflate` or `gzip`)
// when `trainDataset` serializes the trained state.
const DEFAULT_COMPRESSION_LEVEL = 6

/**
 * Trains a Boox dataset and saves the trained, compressed state to disk.
 *
 * The function loads the optional rc file, reads and parses the JSON dataset,
 * adds every document to a fresh Boox instance, then compresses the serialized
 * state with pako (`deflate` → `<name>-trained.dat`, otherwise `gzip` →
 * `<name>-trained.gz`) and writes it out.
 *
 * @param src Path to the dataset file (a JSON array of documents). Relative
 *   paths are resolved against `cwd`; absolute paths are used as-is.
 * @param dest Directory where the trained data will be saved. When empty, the
 *   directory of `src` is used.
 * @param options Training options. Values found in the rc file take
 *   precedence over the values passed here.
 * @throws Any error raised while reading, parsing, training or writing. A
 *   `TypeError` is thrown when the dataset file does not hold a JSON array.
 */
export async function trainDataset(
  src: string,
  dest: string,
  { rcname = 'boox', cwd, ...options }: Options = {}
): Promise<void> {
  const resolvedCwd = cwd ? resolve(cwd) : process.cwd()
  // Load user config from (e.g. boox.config.js) file, if present
  const userConfig: Options = await loadRc(rcname, resolvedCwd)
  const {
    id = 'id',
    features = ['text'],
    attributes = [],
    modelOptions,
    isDeflate = false
  } = { ...options, ...userConfig }

  // `resolve` (unlike `join`) keeps an absolute `src`/`dest` intact instead of
  // appending it to the working directory.
  const resolvedSrc = relative(process.cwd(), resolve(resolvedCwd, src))
  const outputDir = dest
    ? relative(process.cwd(), resolve(resolvedCwd, dest))
    : dirname(resolvedSrc)

  // `basename(path, ext)` strips only a trailing extension. A string
  // `replace` would hit the first occurrence of the extension text, and would
  // prepend the suffix when the file has no extension at all.
  const trainedSuffix = isDeflate ? '-trained.dat' : '-trained.gz'
  const trainedFile = join(
    outputDir,
    `${basename(src, extname(src))}${trainedSuffix}`
  )

  // Create Boox instance
  const boox = new Boox<Dataset>({ id, features, attributes, modelOptions })

  // Read dataset from file
  const datasets = await oraPromise<Dataset[]>(
    async () => {
      const parsed: unknown = JSON.parse(await readFile(resolvedSrc, 'utf8'))

      // Fail early with a clear message rather than deep inside batching.
      if (!Array.isArray(parsed)) {
        throw new TypeError(
          `Expected ${resolvedSrc} to contain a JSON array of documents`
        )
      }

      return parsed as Dataset[]
    },
    {
      text: 'Reading data...',
      successText(data) {
        return `Reading ${getDataSize(data)} data!`
      }
    }
  )

  const batches = await generateBatches(datasets)
  const progress = {
    current: 0,
    length: datasets.length
  }
  const startTime = new Date()

  // Train the model in batches. `addDocumentSync` is synchronous, so plain
  // loops are enough; the spinner text is refreshed after every document.
  await oraPromise(
    async spinner => {
      for (const batch of batches) {
        for (const dataset of batch) {
          progress.current++
          spinner.text = `Training ${resolvedSrc} ${progress.current} of ${progress.length} - ${getElapsedTime(startTime)}`
          spinner.render()
          boox.addDocumentSync(dataset)
        }
      }
    },
    {
      text: 'Start training...',
      successText() {
        return `Trained ${progress.current} documents in ${getElapsedTime(startTime)}`
      }
    }
  )

  // Compress and save the trained state
  const compressor = isDeflate ? deflate : gzip
  const state = JSON.stringify(boox.currentState)
  const compressedState = compressor(state, {
    level: DEFAULT_COMPRESSION_LEVEL
  })

  await oraPromise(
    async () => {
      const distDir = dirname(trainedFile)

      // Create the output directory (and any missing parents) if needed.
      try {
        await access(distDir, constants.F_OK)
      } catch {
        await mkdir(distDir, { recursive: true })
      }

      return await writeFile(trainedFile, compressedState)
    },
    {
      text: 'Saving...',
      successText: `Saved ${getDataSize(state)} state to ${trainedFile}`
    }
  )
}

/**
 * Searches a trained Boox dataset and returns one page of results.
 *
 * The trained state is read from `src`, decompressed with pako (`inflate`
 * when `isDeflate` is set, otherwise `ungzip`), parsed as JSON and assigned to
 * a new Boox instance. Search options are loaded from a separate rc file named
 * `boox-results`, looked up in the same working directory. Timing and state
 * size are printed to the console.
 *
 * @param src Path to the trained dataset file. Relative paths are resolved
 *   against `cwd`; absolute paths are used as-is.
 * @param query The search query.
 * @param options Search options. Values found in the rc file take precedence
 *   over the values passed here; `offset` defaults to `1` and `length` to `10`.
 * @returns The value of `Boox.paginateSearchResults` for the requested page.
 * @throws Any error raised while reading, decompressing, parsing or searching.
 */
export async function searchDataset(
  src: string,
  query: string,
  {
    rcname = 'boox',
    cwd,
    ...options
  }: Pick<Options, 'cwd' | 'rcname' | 'isDeflate'> & PageOptions = {}
) {
  const resolvedCwd = cwd ? resolve(cwd) : process.cwd()
  // Load user config from (e.g. boox.config.js) file, if present
  const userConfig: Options = await loadRc(rcname, resolvedCwd)
  const {
    modelOptions,
    isDeflate = false,
    offset = 1,
    length = 10
  } = { ...options, ...userConfig }

  // `resolve` (unlike `join`) keeps an absolute `src` intact instead of
  // appending it to the working directory.
  const resolvedSrc = relative(process.cwd(), resolve(resolvedCwd, src))

  // Pick the decompressor matching the format the state was saved in
  const decompressor = isDeflate ? inflate : ungzip

  // Read trained state from file
  console.time('Loading state')
  const compressedState = await readFile(resolvedSrc)
  const decompressedState = decompressor(compressedState, { to: 'string' })
  // Create Boox instance and restore the trained state into it
  const boox = new Boox({ modelOptions })
  const state = JSON.parse(decompressedState)
  boox.currentState = state
  console.timeEnd('Loading state')

  console.info('State size:', getDataSize(decompressedState))

  // Perform the search
  // Load user config from (e.g. boox-results.config.js) file, if present
  const resultsConfig: SearchOptions = await loadRc(
    'boox-results',
    resolvedCwd
  )
  console.time('Search in')
  const results = await boox.search(query, resultsConfig)
  console.timeEnd('Search in')
  console.log()

  // Unary `+` turns numeric strings into numbers
  return Boox.paginateSearchResults(results, +offset, +length)
}
Loading

0 comments on commit 4c8645e

Please sign in to comment.