Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Used Japanese tokenization in queries for accurate word searching in … #2

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
259 changes: 181 additions & 78 deletions lunr-repro.tsx
Original file line number Diff line number Diff line change
@@ -1,83 +1,103 @@
// Set up Lunr
// Polyfill for Unicode Property Escapes in RegExp
import rewritePattern from 'regexpu-core';
/**
 * Feature-detect Unicode property escapes (`\p{…}`) by actually compiling one.
 *
 * Reading `RegExp.prototype.unicode` is not a usable probe: the accessor
 * yields `undefined` when invoked on the prototype object itself, so a
 * `!RegExp.prototype.unicode` guard is true on every engine and would replace
 * the native constructor even where nothing needs polyfilling.
 */
const supportsUnicodePropertyEscapes = (() => {
  try {
    return new RegExp('\\p{L}', 'u').test('a');
  } catch {
    return false;
  }
})();

if (!supportsUnicodePropertyEscapes) {
  const NativeRegExp = RegExp;

  /**
   * Drop-in replacement for the global `RegExp` that rewrites `u`-flagged
   * patterns with regexpu-core before handing them to the native constructor.
   * Works both with and without `new`, like the native constructor.
   */
  const patchedRegExp = function (pattern: string | RegExp, flags?: string): RegExp {
    if (typeof flags === 'string' && flags.includes('u')) {
      // An existing RegExp may be passed as the pattern; only its source text
      // can be rewritten.
      const source = typeof pattern === 'string' ? pattern : pattern.source;
      return new NativeRegExp(
        // regexpu-core's signature is (pattern, flags, options); the options
        // object must be the third argument, not the second.
        rewritePattern(source, flags, { unicodePropertyEscape: true }),
        // The rewritten pattern no longer relies on the `u` flag.
        flags.replace('u', ''),
      );
    }
    return new NativeRegExp(pattern, flags);
  };

  // Keep `instanceof RegExp` and inherited statics working.
  patchedRegExp.prototype = NativeRegExp.prototype;
  Object.setPrototypeOf(patchedRegExp, NativeRegExp);

  globalThis.RegExp = patchedRegExp as RegExpConstructor;
}

// Some browser-oriented libraries expect a global `self`. When it is missing,
// alias it to the global object. `globalThis` is used instead of the Node-only
// `global` identifier so this block cannot throw a ReferenceError on hosts
// that define neither `self` nor `global`.
if (Reflect.get(globalThis, 'self') === undefined) {
  Reflect.set(globalThis, 'self', globalThis);
}

// Import Lunr and its Japanese plugin
import lunr from 'lunr';
import { type Index as LunrIndex } from 'lunr';

import enableLunrStemmer from 'lunr-languages/lunr.stemmer.support';
import enableTinyLunrSegmenter from 'lunr-languages/tinyseg';
import enableLunrFr from 'lunr-languages/lunr.fr';
import enableLunrJa from 'lunr-languages/lunr.ja';
// Import 'createRequire' to allow requiring CommonJS modules
import { createRequire } from 'module';
const require = createRequire(import.meta.url);

// The segmenter and the stemmer support must be registered before lunr.ja is
// enabled (cf. https://github.com/MihaiValentin/lunr-languages/issues/42):
enableTinyLunrSegmenter(lunr);
enableLunrStemmer(lunr);
// Load the same lunr-languages modules again through the CommonJS `require`
// created above with `createRequire`.
// NOTE(review): these three module paths are also ES-imported at the top of
// the file (enableLunrStemmer / enableTinyLunrSegmenter / enableLunrJa), so
// each plugin is applied to `lunr` twice — confirm only one path should remain.
const lunrStemmerSupport = require('lunr-languages/lunr.stemmer.support');
const tinyseg = require('lunr-languages/tinyseg');
const lunrJa = require('lunr-languages/lunr.ja');

enableLunrJa(lunr);
// Apply the required plugins to lunr: stemmer support and segmenter first,
// the Japanese language plugin last.
lunrStemmerSupport(lunr);
tinyseg(lunr);
lunrJa(lunr);

// Replace lunr's module-level tokenizer with the Japanese one, so direct
// calls to `lunr.tokenizer(...)` elsewhere in this file use it as well.
lunr.tokenizer = lunr.ja.tokenizer;

// Show search UI
// Load the React DevTools bridge only when browser-like globals (`window` and
// `self`) exist, so plain Node.js runs skip the import.
if (typeof window !== 'undefined' && typeof self !== 'undefined') {
  // Fire-and-forget: the devtools are optional, so a failed load is reported
  // instead of surfacing as an unhandled promise rejection.
  void import('react-devtools-core').catch((error: unknown) => {
    console.error('Failed to load react-devtools-core:', error);
  });
}

// Import UI libraries
import { TextInput } from '@inkjs/ui';
import React, { useMemo, useState } from 'react';
import { render, useInput, useFocus, Box, Text } from 'ink';
import { render, useFocus, Box, Text } from 'ink';

/**
 * Main application.
 *
 * Owns the two pieces of shared state — the indexed documents (keyed by a
 * generated "Document N" name) and the current query string — and wires them
 * into the indexing, query and search components.
 */
const Home: React.FC = () => {
  const [queryString, setQueryString] = useState('');
  const [docs, setDocs] = useState<Record<string, string>>({});

  /** Stores `body` under the next sequential document name. */
  const handleAddDoc = (body: string): void => {
    // Functional update: the new name is derived from the latest state, so
    // rapid successive submissions cannot reuse a name.
    setDocs((current) => ({
      ...current,
      [`Document ${Object.keys(current).length + 1}`]: body,
    }));
  };

  return (
    <Box flexDirection="column">
      <Text>Use tab to move around and press Enter to select or confirm.</Text>
      <Index onAddDoc={handleAddDoc} docsIndexed={Object.keys(docs).length} />
      <Query query={queryString} onQueryChange={setQueryString} />
      <Search query={queryString} docs={docs} />
    </Box>
  );
};

/**
 * Input row for adding a document to the index.
 *
 * @param onAddDoc - Called with the submitted text when Enter is pressed.
 * @param docsIndexed - Number of documents indexed so far. It is used as the
 *   `key` of the text input, so the input remounts (and therefore clears)
 *   each time a document is added.
 */
const Index: React.FC<{
  onAddDoc: (doc: string) => void;
  docsIndexed: number;
}> = ({ onAddDoc, docsIndexed }) => {
  const { isFocused } = useFocus({ autoFocus: true });

  return (
    <Box gap={2} flexDirection="row">
      <Text inverse={isFocused}>Index a document</Text>
      {isFocused ? (
        <TextInput
          key={docsIndexed}
          placeholder="Enter or paste a Japanese string and press Enter…"
          isDisabled={!isFocused}
          onSubmit={(val) => {
            onAddDoc(val);
          }}
        />
      ) : (
        <Text>Press Tab to focus</Text>
      )}
    </Box>
  );
};

const Query: React.FC<{
query: string;
onQueryChange: (query: string) => void;
}> = function ({ query, onQueryChange }) {
// Component to Handle Query Input
const Query = ({ query, onQueryChange }) => {
const { isFocused } = useFocus();
return (
<Box gap={2}>
<Box gap={2} flexDirection="row">
<Text inverse={isFocused}>Search documents</Text>
<TextInput
isDisabled={!isFocused}
Expand All @@ -87,53 +107,136 @@ const Query: React.FC<{
/>
</Box>
);
}
};

/** Everything the search view derives from the current index and query. */
type SearchOutcome = {
  results: lunr.Index.Result[];
  error: string | null;
  tokens: string[];
};

/** Outcome used when there is no index or the query is blank. */
const EMPTY_SEARCH_OUTCOME: SearchOutcome = { results: [], error: null, tokens: [] };

/**
 * Builds a Japanese-aware Lunr index over `docs` and shows the matches for
 * `query`, together with the tokens the query was split into.
 *
 * @param query - Raw query text; surrounding whitespace is ignored.
 * @param docs - Document bodies keyed by document name.
 */
const Search: React.FC<{
  query: string;
  docs: Record<string, string>;
}> = ({ query, docs }) => {
  // Rebuild the index whenever the document set changes.
  const lunrIndex = useMemo<LunrIndex | null>(() => {
    try {
      return lunr(function () {
        // lunr-languages ships no typings for the `ja` plugin.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        this.use((lunr as any).ja);

        this.ref('name');
        this.field('body');

        for (const [name, body] of Object.entries(docs)) {
          this.add({ name, body });
        }
      });
    } catch (error: unknown) {
      console.error('Error initializing Lunr index:', error);
      return null;
    }
  }, [docs]);

  // The outcome is derived data, so it is computed and returned from the memo.
  // Calling a state setter inside `useMemo` would update state during render.
  const { results, error, tokens } = useMemo<SearchOutcome>(() => {
    const trimmedQuery = query.trim();
    if (lunrIndex === null || trimmedQuery === '') {
      return EMPTY_SEARCH_OUTCOME;
    }

    try {
      // Tokenize the query with the module-level lunr tokenizer.
      const queryTokens = lunr.tokenizer(trimmedQuery).map((token) => token.toString());

      // Add one trailing-wildcard term per token via the query builder.
      const matches = lunrIndex.query((q) => {
        for (const token of queryTokens) {
          q.term(token, { wildcard: lunr.Query.wildcard.TRAILING });
        }
      });

      return { results: matches, error: null, tokens: queryTokens };
    } catch (e: unknown) {
      console.error('Error searching:', e);
      return {
        results: [],
        error: e instanceof Error ? e.message : 'Unknown error occurred',
        tokens: [],
      };
    }
  }, [lunrIndex, query]);

  const docCount = Object.keys(docs).length;

  return (
    <Box flexDirection="column" marginTop={1}>
      {error ? (
        <Text color="red">Error searching: {error}</Text>
      ) : query.trim() !== '' ? (
        <Box flexDirection="column">
          <Text>
            {results.length} document{results.length === 1 ? '' : 's'} matched
          </Text>
          <Box>
            <Text bold>Query: </Text>
            <Text>{query}</Text>
          </Box>
          {tokens.length > 0 && (
            <Box>
              <Text bold>Tokens: </Text>
              <Text>{tokens.join(', ')}</Text>
            </Box>
          )}
        </Box>
      ) : (
        <Text>
          {docCount} document{docCount === 1 ? '' : 's'} indexed
        </Text>
      )}
      <Box flexDirection="column" marginTop={1}>
        {results.map((res) => (
          // Fall back to an empty body if a result ref is missing from `docs`.
          <Result key={res.ref} name={res.ref} body={docs[res.ref] ?? ''} />
        ))}
      </Box>
    </Box>
  );
};

/**
 * One search hit. The document name is always shown; while the row has focus
 * it also shows the document body and the tokens that body is split into.
 *
 * @param name - Document name (the Lunr ref).
 * @param body - Full document text.
 */
const Result: React.FC<{ name: string; body: string }> = ({ name, body }) => {
  const { isFocused } = useFocus();

  // Tokenize with the module-level lunr tokenizer; recomputed only when the
  // body changes.
  const docTokens = useMemo(
    () => lunr.tokenizer(body).map((token) => token.toString()),
    [body],
  );

  return (
    <Box flexDirection="column" gap={1}>
      <Box gap={2}>
        <Text inverse={isFocused}>{name}</Text>
        {isFocused && <Text>{body}</Text>}
      </Box>
      {isFocused && docTokens.length > 0 && (
        <Box flexDirection="column" marginLeft={2}>
          <Text bold>Document Tokens:</Text>
          <Text>{docTokens.join(', ')}</Text>
        </Box>
      )}
    </Box>
  );
};

// Entry point: mount the root <Home /> component with Ink's `render`.
render(<Home />);