-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
1,000 additions
and
8 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
{ | ||
"name": "boox-cli", | ||
"description": "A command-line interface (CLI) for training and searching Boox datasets.", | ||
"version": "0.0.0-development", | ||
"publishConfig": { | ||
"access": "public" | ||
}, | ||
"author": "Beni Arisandi (https://stilearning.com)", | ||
"repository": "https://github.com/bent10/boox", | ||
"homepage": "https://stilearning.com/boox", | ||
"license": "MIT", | ||
"keywords": [ | ||
"boox", | ||
"train", | ||
"trainer", | ||
"document", | ||
"index", | ||
"indexing", | ||
"nlp" | ||
], | ||
"type": "module", | ||
"bin": "dist/index.js", | ||
"files": [ | ||
"dist", | ||
"changelog.md", | ||
"readme.md" | ||
], | ||
"scripts": { | ||
"start": "vite", | ||
"dev": "vite build --watch", | ||
"build": "vite build && npm run chmod", | ||
"test": "vitest", | ||
"bench": "vitest bench", | ||
"coverage": "vitest run --coverage", | ||
"lint": "tsc --noEmit && eslint . --cache --cache-location ../../node_modules/.eslint", | ||
"format": "prettier . --write --cache-location ../../node_modules/.prettier --ignore-path ../../.prettierignore", | ||
"chmod": "chmod +x dist/index.js" | ||
}, | ||
"dependencies": { | ||
"commander": "^12.0.0", | ||
"ora": "^8.0.1", | ||
"pako": "^2.1.0", | ||
"rcfy": "^2.1.0" | ||
}, | ||
"devDependencies": { | ||
"@types/pako": "^2.0.3" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
# Boox CLI | ||
|
||
A command-line interface (CLI) for training and searching [Boox](https://github.com/bent10/boox) datasets. | ||
|
||
## Installation | ||
|
||
Install `boox-cli` globally using npm or yarn: | ||
|
||
```bash | ||
npm install -g boox-cli | ||
|
||
# Or | ||
|
||
yarn global add boox-cli | ||
``` | ||
|
||
## Usage | ||
|
||
### Training | ||
|
||
To train a Boox dataset, use the `train` command: | ||
|
||
```bash | ||
boox-cli train <source> [destination] [options] | ||
``` | ||
|
||
- `<source>`: The path to your dataset file (JSON format). | ||
- `[destination]`: (Optional) The path where the trained data will be saved. Defaults to the current directory. | ||
|
||
Options: | ||
|
||
- `-i, --id <field>`: The field in your dataset objects that uniquely identifies each document (default: `'id'`). | ||
- `-f, --features <fields...>`: The fields to index for search (multiple fields can be specified). | ||
- `-a, --attributes <fields...>`: The fields to include as-is without indexing (multiple fields can be specified). | ||
- `-d, --deflate`: Compress the trained data with deflate and save it as a `.dat` file instead of the default gzip `.gz` file (default: `false`). | ||
- `-c, --cwd <folder>`: The working directory (default: current directory). | ||
- `-r, --rcname <name>`: The name of the Boox configuration file (default: `'boox'`). | ||
|
||
Example: | ||
|
||
```bash | ||
boox-cli train data/products.json -f title description -a price | ||
``` | ||
|
||
This command will train a Boox dataset from the `data/products.json` file, indexing the `title` and `description` fields for search and including the `price` field as-is. The trained data will be saved as a compressed `.gz` file. | ||
|
||
### Searching | ||
|
||
To search a trained Boox dataset, use the `search` command: | ||
|
||
```bash | ||
boox-cli search <source> <query> [options] | ||
``` | ||
|
||
- `<source>`: The path to the trained dataset file (`.dat` or `.gz`). | ||
- `<query>`: The search query string. | ||
|
||
Options: | ||
|
||
- `-o, --offset <number>`: The offset for pagination (default: `'1'`). | ||
- `-l, --length <number>`: The number of results per page (default: `'10'`). | ||
- `-k, --context <field>`: Display the context instead of the paginated results object. | ||
- `-a, --attrs <fields...>`: Fields to display when `--context` is provided. | ||
- `-d, --deflate`: Treat the trained data as a deflate-compressed `.dat` file instead of a gzip `.gz` file (default: `false`). | ||
- `-c, --cwd <folder>`: The working directory (default: current directory). | ||
- `-r, --rcname <name>`: The name of the Boox configuration file (default: `'boox'`). | ||
|
||
Example: | ||
|
||
```bash | ||
boox-cli search data/products-trained.gz "shoes" -o 2 -l 20 | ||
``` | ||
|
||
This command will search the `data/products-trained.gz` dataset for documents containing the word `"shoes"`, starting from the second page and displaying 20 results per page. | ||
|
||
## Using configuration file | ||
|
||
You can create a Boox configuration file in your project's root directory to specify default options for the `boox-cli train` and `boox-cli search` commands: | ||
|
||
- `.booxrc` | ||
- `.booxrc.json` | ||
- `.booxrc.{yaml,yml}` | ||
- `.booxrc.{mjs,cjs,js}` | ||
- `boox.config.{mjs,cjs,js}` | ||
|
||
Before using the example below, make sure to install the required libraries: | ||
|
||
```bash | ||
npm install -D double-metaphone stemmer stopword marked marked-plaintify | ||
``` | ||
|
||
Here's an example of a Boox configuration file: | ||
|
||
```js | ||
// boox.config.js | ||
import { doubleMetaphone } from 'double-metaphone' | ||
import { Marked } from 'marked' | ||
import markedPlaintify from 'marked-plaintify' | ||
import { stemmer } from 'stemmer' | ||
import { removeStopwords } from 'stopword' | ||
|
||
const marked = new Marked({ gfm: true }).use(markedPlaintify()) | ||
const wordRegexp = /\b\w+\b/g | ||
|
||
/** @type {() => import('boox').BooxOptions} */ | ||
export default function defineBooxConfig() { | ||
return { | ||
id: 'customId', | ||
features: ['title', 'content', 'tags'], | ||
attributes: ['author', 'date'], | ||
modelOptions: { | ||
normalizer(input) { | ||
// Remove Markdown formatting | ||
return marked.parse(input) | ||
}, | ||
tokenizer(input) { | ||
const tokens = Array.from(input.match(wordRegexp) || []) | ||
return removeStopwords(tokens) | ||
}, | ||
stemmer: stemmer, | ||
phonetic: doubleMetaphone | ||
} | ||
} | ||
} | ||
``` | ||
|
||
The `--rcname` flag allows you to customize the name of the configuration file. For example, to use a configuration file named `my-appname.config.js`, you would run the following command: | ||
|
||
```bash | ||
boox-cli train src/dataset.json --rcname my-appname | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
import { access, constants, mkdir, readFile, writeFile } from 'node:fs/promises' | ||
import { basename, dirname, extname, join, relative, resolve } from 'path' | ||
import generateBatches from 'batch-me-up' | ||
import Boox, { type BooxOptions, type Dataset, type SearchOptions } from 'boox' | ||
import { oraPromise } from 'ora' | ||
import { deflate, gzip, inflate, ungzip } from 'pako' | ||
import { loadRc } from 'rcfy' | ||
import { getDataSize, getElapsedTime } from './utils.js' | ||
|
||
/**
 * Options shared by the CLI commands, layered on top of the core Boox
 * options.
 */
export interface Options extends BooxOptions {
  /**
   * When `true`, the trained state is written/read with deflate and uses a
   * `.dat` extension; otherwise gzip and a `.gz` extension are used.
   */
  isDeflate?: boolean
  /**
   * Working directory that `src`/`dest` paths and the configuration file
   * lookup are resolved against. Falls back to `process.cwd()` when omitted.
   */
  cwd?: string
  /**
   * Base name handed to `loadRc` to locate the configuration file.
   * The commands default it to `'boox'`.
   */
  rcname?: string
}
|
||
/**
 * Pagination options for `searchDataset`.
 *
 * Values typically arrive from the command line as strings; numbers are
 * accepted as well because both are coerced with unary `+` before being
 * passed to `Boox.paginateSearchResults`.
 */
export interface PageOptions {
  /** Pagination offset; `searchDataset` falls back to `1` when omitted. */
  offset?: string | number
  /** Number of results per page; `searchDataset` falls back to `10`. */
  length?: string | number
}
|
||
// --- Constants ---

/**
 * Compression level passed as the `level` option to whichever pako
 * compressor (`deflate` or `gzip`) is used when saving the trained state.
 */
const DEFAULT_COMPRESSION_LEVEL = 6
|
||
/**
 * Trains a Boox dataset and saves the trained data.
 *
 * The dataset is read as JSON, every entry is added to a fresh Boox
 * instance, and the resulting state is serialized, compressed (gzip by
 * default, deflate when `isDeflate` is set) and written to
 * `<source name>-trained.gz` or `<source name>-trained.dat`.
 *
 * Values found in the configuration file loaded through `rcname` take
 * precedence over the values passed in `options`.
 *
 * @param src Path to the dataset file, relative to `cwd` or absolute.
 * @param dest Directory where the trained data will be saved, relative to
 *   `cwd` or absolute. When empty, the file is written next to the source.
 * @param options Training options.
 * @throws Any error raised while reading or parsing the dataset, training,
 *   or writing the output is propagated to the caller.
 */
export async function trainDataset(
  src: string,
  dest: string,
  { rcname = 'boox', cwd, ...options }: Options = {}
): Promise<void> {
  const resolvedCwd = cwd ? resolve(cwd) : process.cwd()
  // Load user config from (e.g. boox.config.js) file, if present
  const userConfig: Options = await loadRc(rcname, resolvedCwd)
  const {
    id = 'id',
    features = ['text'],
    attributes = [],
    modelOptions,
    isDeflate = false
  } = { ...options, ...userConfig }

  // `resolve` (unlike `join`) keeps an absolute `src`/`dest` intact instead
  // of gluing it onto the working directory.
  const resolvedSrc = relative(process.cwd(), resolve(resolvedCwd, src))
  const outputDir = dest
    ? relative(process.cwd(), resolve(resolvedCwd, dest))
    : dirname(resolvedSrc)

  // Strip only the trailing extension; this also works when `src` has no
  // extension or when the extension text occurs earlier in the file name.
  const trainedName = `${basename(src, extname(src))}-trained${
    isDeflate ? '.dat' : '.gz'
  }`
  const trainedFile = join(outputDir, trainedName)

  // Create Boox instance
  const boox = new Boox<Dataset>({ id, features, attributes, modelOptions })

  // Read dataset from file
  const datasets = await oraPromise<Dataset[]>(
    async () => JSON.parse(await readFile(resolvedSrc, 'utf8')),
    {
      text: 'Reading data...',
      successText(data) {
        return `Reading ${getDataSize(data)} data!`
      }
    }
  )

  const batches = await generateBatches(datasets)
  const progress = {
    current: 0,
    length: datasets.length
  }
  const startTime = new Date()

  // Train the model in batches. Documents are added synchronously, so plain
  // loops are used; the spinner is re-rendered by hand after each document.
  await oraPromise(
    async spinner => {
      for (const batch of batches) {
        for (const dataset of batch) {
          progress.current++
          spinner.text = `Training ${resolvedSrc} ${progress.current} of ${progress.length} - ${getElapsedTime(startTime)}`
          spinner.render()
          boox.addDocumentSync(dataset)
        }
      }
    },
    {
      text: 'Start training...',
      successText() {
        return `Trained ${progress.current} documents in ${getElapsedTime(startTime)}`
      }
    }
  )

  // Compress and save the trained state
  const compressor = isDeflate ? deflate : gzip
  const state = JSON.stringify(boox.currentState)
  const compressedState = compressor(state, {
    level: DEFAULT_COMPRESSION_LEVEL
  })

  await oraPromise(
    async () => {
      const distDir = dirname(trainedFile)

      // Create the output directory (and any missing parents) on demand
      try {
        await access(distDir, constants.F_OK)
      } catch {
        await mkdir(distDir, { recursive: true })
      }

      return await writeFile(trainedFile, compressedState)
    },
    {
      text: 'Saving...',
      successText: `Saved ${getDataSize(state)} state to ${trainedFile}`
    }
  )
}
|
||
/**
 * Searches a trained Boox dataset.
 *
 * The trained state is read from `src`, decompressed (gzip by default,
 * deflate when `isDeflate` is set) and loaded into a Boox instance before
 * the query is run. Search options are loaded from a `boox-results`
 * configuration file in the working directory, if present. Timing and state
 * size information is printed to the console.
 *
 * Values found in the configuration file loaded through `rcname` take
 * precedence over the values passed in `options`.
 *
 * @param src Path to the trained dataset file, relative to `cwd` or absolute.
 * @param query The search query.
 * @param options Search options.
 * @returns The value produced by `Boox.paginateSearchResults` for the
 *   requested `offset` and `length`.
 * @throws Any error raised while reading, decompressing or parsing the
 *   state, or while searching, is propagated to the caller.
 */
export async function searchDataset(
  src: string,
  query: string,
  {
    rcname = 'boox',
    cwd,
    ...options
  }: Pick<Options, 'cwd' | 'rcname' | 'isDeflate'> & PageOptions = {}
) {
  const resolvedCwd = cwd ? resolve(cwd) : process.cwd()
  // Load user config from (e.g. boox.config.js) file, if present
  const userConfig: Options = await loadRc(rcname, resolvedCwd)
  const {
    modelOptions,
    isDeflate = false,
    offset = 1,
    length = 10
  } = { ...options, ...userConfig }

  // `resolve` (unlike `join`) keeps an absolute `src` intact instead of
  // gluing it onto the working directory.
  const resolvedSrc = relative(process.cwd(), resolve(resolvedCwd, src))

  // Pick the decompressor matching the format used when training
  const decompressor = isDeflate ? inflate : ungzip

  // Read trained state from file and load it into a Boox instance
  console.time('Loading state')
  const compressedState = await readFile(resolvedSrc)
  const decompressedState = decompressor(compressedState, { to: 'string' })
  const boox = new Boox({ modelOptions })
  const state = JSON.parse(decompressedState)
  // set state
  boox.currentState = state
  console.timeEnd('Loading state')

  console.info('State size:', getDataSize(decompressedState))

  // Perform the search
  // Load user config from (e.g. boox-results.config.js) file, if present
  const resultsConfig: SearchOptions = await loadRc(
    'boox-results',
    resolvedCwd
  )
  console.time('Search in')
  const results = await boox.search(query, resultsConfig)
  console.timeEnd('Search in')
  console.log()

  // CLI values arrive as strings; unary plus coerces them to numbers
  return Boox.paginateSearchResults(results, +offset, +length)
}
Oops, something went wrong.