Skip to content

Commit

Permalink
Added collection indexing
Browse files: browse the repository at this point in the history
  • Loading branch information
stephenwf committed Feb 15, 2024
1 parent 888a54a commit 85bb17b
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 35 deletions.
Binary file modified bun.lockb
Binary file not shown.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"@types/object-hash": "^3.0.4",
"bun-plugin-yaml": "^0.0.1",
"bun-types": "^1.0.1",
"keyword-extractor": "^0.0.28",
"commander": "^11.0.0",
"detect-python-interpreter": "^1.0.0",
"object-hash": "^3.0.0",
Expand Down
2 changes: 1 addition & 1 deletion src/commands/build.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ import { extractRemoteSource } from '../extract/extract-remote-source.ts';
import { loadScripts } from '../util/load-scripts.ts';
import { generate } from './generate.ts';
import { extractFolderCollections } from '../extract/extract-folder-collections.ts';
import { enrichTypesense } from '../enrich/typesense-manifests.ts';
import { enrichTypesense } from '../enrich/typesense-index.ts';
import { extractPlaintext } from '../extract/extract-plaintext.ts';
import { typesensePlaintext } from '../enrich/typesense-plaintext.ts';
// import { pdiiif } from "../enrich/pdiiif.ts";
Expand Down
40 changes: 23 additions & 17 deletions src/enrich/typesense-manifests.ts → src/enrich/typesense-index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,38 +3,35 @@ import { Enrichment } from '../util/enrich';
import { getValue } from '@iiif/helpers';
import { InternationalString } from '@iiif/presentation-3';
import { join } from 'node:path';
import { writeFile } from 'node:fs/promises';

// Schema:
// - label
// - summary
// - thumbnail, index: false
// - .*_topic, facet: true, type: string[], optional: true
// - collections, type: string[], facet: true
// - plaintext content
import { readFile, writeFile } from 'node:fs/promises';
import { existsSync } from 'node:fs';

/**
 * Typesense collection schema used for indexed IIIF resources.
 *
 * Every field name appears exactly once: `summary` is declared optional
 * because not every indexed resource carries one. `enable_nested_fields`
 * is switched on alongside the object-typed `full_label` field.
 */
const schema = {
  name: 'manifests',
  enable_nested_fields: true,
  fields: [
    { name: 'id', type: 'string' },
    { name: 'type', type: 'string', facet: true },
    { name: 'label', type: 'string' },
    { name: 'all_labels', type: 'string[]' },
    { name: 'full_label', type: 'object', optional: true },
    { name: 'summary', type: 'string', optional: true },
    { name: 'collections', type: 'string[]', facet: true, optional: true },
    { name: 'plaintext', type: 'string', optional: true },
    // { name: 'topic_.*', type: 'string[]', facet: true, optional: true },

    // other fields
    { name: 'slug', type: 'string' },
    { name: 'url', type: 'string', optional: true },
    { name: 'totalItems', type: 'int32', optional: true },
    { name: 'thumbnail', type: 'string', index: false, optional: true },
  ],
};

type SingleRecord = {
id: string;
type: string;
slug: string;
label: string;
all_labels: string[];
full_label: InternationalString;
summary: string;
collections: string[];
plaintext?: string;
Expand All @@ -46,7 +43,7 @@ type TopicRecord = Record<`topic_${string}`, string[]>;
export const enrichTypesense: Enrichment<{}, { record: SingleRecord; foundTopics: string[] }> = {
id: 'typesense-manifests',
name: 'Typesense manifest collection',
types: ['Manifest'],
types: ['Manifest', 'Collection'],
invalidate: async () => {
return true;
},
Expand All @@ -58,21 +55,30 @@ export const enrichTypesense: Enrichment<{}, { record: SingleRecord; foundTopics

const extraTopics: TopicRecord = {};

for (const [k, v] of Object.entries(indices)) {
for (const [k, v] of Object.entries(indices || {})) {
extraTopics[`topic_${k}`] = v;
}

let plaintext = '';
const keywordsFile = join(api.files, 'keywords.txt');
if (existsSync(keywordsFile)) {
plaintext = await readFile(keywordsFile, 'utf-8');
}

return {
temp: {
record: {
id: btoa(id),
type: resource.type,
slug: resource.slug,
label: getValue(api.resource.label),
all_labels: Object.entries(api.resource.label as InternationalString).map(([_, v]) => (v || []).join(' ')),
full_label: api.resource.label,
summary: getValue(api.resource.summary),
thumbnail: meta.thumbnail?.id,
url: meta.url,
totalItems: meta.totalItems,
collections: [],
plaintext: '',
plaintext,
...extraTopics,
},
foundTopics: Object.keys(extraTopics),
Expand Down
31 changes: 27 additions & 4 deletions src/extract/extract-plaintext.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import { join } from 'node:path';
import { Extraction } from '../util/extract';
import { mkdirp } from 'mkdirp';
import keywordExtractor from 'keyword-extractor';

export const extractPlaintext: Extraction = {
/**
 * Configuration accepted by the plaintext extraction.
 */
type ExtractPlaintextConfig = {
// When truthy, `collectManifest` also writes the keywords extracted from the
// combined canvas text to `keywords.txt` in the manifest's cached files folder.
keywords: boolean;
};

export const extractPlaintext: Extraction<ExtractPlaintextConfig> = {
id: 'extract-plaintext',
name: 'Extract plaintext',
types: ['Canvas'],
Expand All @@ -12,15 +17,33 @@ export const extractPlaintext: Extraction = {
/**
 * Persists the plaintext collected for each canvas of a manifest.
 *
 * One `<canvasIdx>.txt` file is written per canvas that has text, under
 * `<cacheDir>/<slug>/files/plaintext`. When `config.keywords` is truthy the
 * combined text is reduced to a space-separated keyword list and written to
 * `<cacheDir>/<slug>/files/keywords.txt`.
 *
 * @param manifest - resource being collected; only `slug` is read here
 * @param temp - temp data for the manifest; `temp.canvases` maps a canvas
 *   index to an object that may carry a `plaintext` string
 * @param api - build API; only `api.build.cacheDir` is read here
 * @param config - extraction config, see `ExtractPlaintextConfig`
 */
async collectManifest(manifest, temp, api, config) {
  if (!temp.canvases) {
    return;
  }

  const manifestFiles = join(api.build.cacheDir, manifest.slug, 'files');
  const filesDir = join(manifestFiles, 'plaintext');
  const keywordsFile = join(manifestFiles, 'keywords.txt');

  // Create the output folder once up front rather than once per canvas.
  await mkdirp(filesDir);

  const allText: string[] = [];

  for (const [canvasIdx, canvas] of Object.entries(temp.canvases)) {
    const text = (canvas as any).plaintext;
    if (text) {
      const canvasFile = join(filesDir, `${canvasIdx}.txt`);
      await Bun.write(canvasFile, text);
      allText.push(text);
    }
  }

  // Keyword extraction is opt-in; skip the work when it is disabled or
  // there is no text to extract from.
  if (!config.keywords || allText.length === 0) {
    return;
  }

  const keywords = keywordExtractor
    .extract(allText.join(' '), {
      language: 'en',
      remove_digits: true,
      return_changed_case: true,
      remove_duplicates: true,
    })
    .join(' ');

  // Only write the file when extraction produced at least one keyword.
  if (keywords) {
    await Bun.write(keywordsFile, keywords);
  }
},
async handler(resource, api) {
Expand Down
40 changes: 27 additions & 13 deletions src/extract/extract-thumbnail.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,35 @@
import { Extraction } from "../util/extract.ts";
import { createThumbnailHelper } from "@iiif/helpers";
import { Extraction } from '../util/extract.ts';
import { createThumbnailHelper } from '@iiif/helpers';

/**
 * Configuration accepted by the thumbnail extraction.
 *
 * Every field is optional because the handler supplies a fallback for each
 * one when it is missing.
 */
type ExtractThumbnailConfig = {
  /** Requested thumbnail width; when absent the handler asks for 256x256. */
  width?: number;
  /** Requested thumbnail height; falls back to `width` when absent. */
  height?: number;
  /** Passed as the last argument to `getBestThumbnailAtSize`; defaults to `false`. */
  dereference?: boolean;
};

export const extractThumbnail: Extraction = {
id: "extract-thumbnail",
name: "Extract Thumbnail",
types: ["Manifest"],
id: 'extract-thumbnail',
name: 'Extract Thumbnail',
types: ['Manifest'],
// Decides whether the thumbnail extraction has to run again for a resource.
// A cached `extractThumbnail` value of exactly `false` is what the handler
// stores when no suitable thumbnail was found, so that case is treated as
// already processed; any other falsy value (e.g. nothing cached yet) causes
// a re-run.
invalidate: async (resource, api, config) => {
  const cache = await api.caches.value;
  return !cache.extractThumbnail && cache.extractThumbnail !== false;
},
handler: async (resource, api) => {
handler: async (resource, api, config) => {
const vault = resource.vault;
const helper = createThumbnailHelper(vault);
const thumbnail = await helper.getBestThumbnailAtSize(
api.resource,
{
width: 256,
height: 256,
},
false,
config.width
? {
width: config.width,
height: config.height || config.width,
}
: {
width: 256,
height: 256,
},
config.dereference || false
);

if (thumbnail && thumbnail.best) {
Expand All @@ -27,6 +39,8 @@ export const extractThumbnail: Extraction = {
};
}

return {};
return {
caches: { extractThumbnail: false },
};
},
};

0 comments on commit 85bb17b

Please sign in to comment.