From 71dce22749addb082aaaf27e733dcabb41b4df4a Mon Sep 17 00:00:00 2001 From: Daniel von Atzigen Date: Wed, 9 Oct 2024 07:52:23 +0200 Subject: [PATCH] Integrate new ocr service Add ocr service Add custom logger Add ocr development service Handle ocr error response Use SKIP_PROCESSING on ocr service in development --- .github/workflows/code-quality.yml | 1 + README.md | 10 +- apps/server-asset-sg/.env | 4 +- apps/server-asset-sg/src/app.logger.ts | 149 ++++++++++++++ apps/server-asset-sg/src/app.module.ts | 4 +- .../src/features/files/file-ocr.service.ts | 188 ++++++++++++++++++ .../src/features/files/files.controller.ts | 6 + .../src/features/ocr/ocr.controller.ts | 179 ----------------- apps/server-asset-sg/src/main.ts | 7 +- apps/server-asset-sg/src/utils/sleep.ts | 4 + development/.env | 1 + development/.gitignore | 1 + development/docker-compose.yaml | 17 ++ package-lock.json | 21 +- package.json | 1 + 15 files changed, 399 insertions(+), 194 deletions(-) create mode 100644 apps/server-asset-sg/src/app.logger.ts create mode 100644 apps/server-asset-sg/src/features/files/file-ocr.service.ts delete mode 100644 apps/server-asset-sg/src/features/ocr/ocr.controller.ts create mode 100644 apps/server-asset-sg/src/utils/sleep.ts diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml index affb7f04..a7c6800a 100644 --- a/.github/workflows/code-quality.yml +++ b/.github/workflows/code-quality.yml @@ -182,6 +182,7 @@ jobs: DB_PASSWORD: ${{ env.DB_PASSWORD }} run: | cd development + touch .env.ocr dos2unix ./init/elasticsearch/init.sh chmod +x ./init/elasticsearch/init.sh chmod +r ./init/elasticsearch/mappings/swissgeol_asset_asset.json diff --git a/README.md b/README.md index 17c04e2a..681dc435 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ Be aware that you need to manually insert the `{DB_*}` values beforehand. ```bash cd development -docker compose exec db sh -c 'pg_dump --dbname=postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:5432/{DB_DATABASE} --data-only --exclude-table asset_user --exclude-table _prisma_migrations -n public > /dump.sql' +docker compose exec db sh -c 'pg_dump --dbname=postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:5432/{DB_DATABASE} --data-only --exclude-table asset_user --exclude-table workgroups_on_users --exclude-table _prisma_migrations -n public > /dump.sql' ``` > The export will output warnings related to circular foreign-key constraints. @@ -105,10 +105,14 @@ Ensure to start your database service beforehand. ```bash # Reset the database: npm run prisma -- migrate reset -f -npm run prisma -- migrate deploy -# Import example data: +# Switch to the directory containing the database's `docker-compose.yml`: cd development + +# Remove the initial workgroup as it will collide with the import: +docker compose exec db sh -c 'psql --dbname=postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@localhost:5432/${POSTGRES_DB} -c "DELETE FROM workgroup"' + +# Import example data: docker compose exec db sh -c 'psql --dbname=postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@localhost:5432/${POSTGRES_DB} -v ON_ERROR_STOP=1 -f /dump.sql' ``` diff --git a/apps/server-asset-sg/.env b/apps/server-asset-sg/.env index b4ffa40b..8f6bf031 100644 --- a/apps/server-asset-sg/.env +++ b/apps/server-asset-sg/.env @@ -13,7 +13,5 @@ OAUTH_SHOW_DEBUG_INFO=true OAUTH_TOKEN_ENDPOINT=http://localhost:4011/connect/token OAUTH_AUTHORIZED_GROUPS=assets.swissgeol ANONYMOUS_MODE=false -OCR_URL= -OCR_CALLBACK_URL= - +OCR_SERVICE_URL=http://localhost:5052 diff --git a/apps/server-asset-sg/src/app.logger.ts b/apps/server-asset-sg/src/app.logger.ts new file mode 100644 index 00000000..29733e99 --- /dev/null +++ b/apps/server-asset-sg/src/app.logger.ts @@ -0,0 +1,149 @@ +import { Injectable, LoggerService, LogLevel } from '@nestjs/common'; +import colors from 'colors/safe'; + +@Injectable() +export class AppLogger implements LoggerService { + levels: Set | null = null; + + log(message: unknown, ...optionalParams: unknown[]) { + this.write(levels.log, message, optionalParams); + } + + error(message: unknown, ...optionalParams: unknown[]) { + this.write(levels.error, message, optionalParams); + } + + warn(message: unknown, ...optionalParams: unknown[]) { + this.write(levels.warn, message, optionalParams); + } + + debug?(message: unknown, ...optionalParams: unknown[]) { + this.write(levels.debug, message, optionalParams); + } + + verbose?(message: unknown, ...optionalParams: unknown[]) { + this.write(levels.verbose, message, optionalParams); + } + + fatal?(message: unknown, ...optionalParams: unknown[]) { + this.write(levels.fatal, message, optionalParams); + } + + setLogLevels?(levels: LogLevel[]) { + this.levels = new Set(levels); + } + + private hasLevel(level: LogLevel): boolean { + return this.levels == null || this.levels.has(level); + } + + private write(level: Level, message: unknown, params: unknown[]) { + if (!this.hasLevel(level.key)) { + return; + } + const lastParam = params[params.length - 1]; + let source = 'main'; + if (typeof lastParam === 'string') { + source = lastParam; + params = params.slice(0, params.length - 1); + } + + const now = new Date(); + const nameSpacer = ' '.repeat(MAX_NAME_LENGTH - level.name.length); + const prefix = + colors.reset(` ${now.toISOString()} `) + nameSpacer + level.bgColor(` ${level.name} `) + ' ' + source; + let output = ' ' + level.color(`${message}`); + if (params.length !== 0) { + output += ' ' + stringify(params, level); + } + console.log(`${prefix} ${output}`); + } +} + +const stringify = (value: unknown, level: Level, options: { isNested?: boolean } = {}): string => { + if (Array.isArray(value)) { + return stringifyArray(value, level, options); + } + if (value instanceof Error) { + return JSON.stringify(value.message); + } + if (value != null && typeof value === 'object') { + return stringifyObject(value, level, options); + } + return level.color(JSON.stringify(value)); +}; + +const stringifyArray = (value: unknown[], level: Level, options: { isNested?: boolean } = {}): string => { + let output = ''; + for (const element of value) { + if (output.length > 0) { + output += ', '; + } + output += stringify(element, level, { isNested: true }); + } + return options.isNested ? `[${output}]` : output; +}; + +const stringifyObject = (value: object, level: Level, options: { isNested?: boolean } = {}): string => { + const valueString = value.toString(); + if (valueString !== '[object Object]') { + return level.color(valueString); + } + + let output = ''; + for (const [k, v] of Object.entries(value)) { + if (output.length > 0) { + output += ', '; + } + output += `${k}: ${stringify(v, level, { isNested: true })}`; + } + return options.isNested ? `[${output}]` : output; +}; + +interface Level { + key: LogLevel; + name: string; + color: typeof colors.white; + bgColor: typeof colors.bgWhite; +} + +const levels: Record = { + log: { + key: 'log', + name: 'INFO', + color: colors.green, + bgColor: colors.bgGreen, + }, + error: { + key: 'error', + name: 'ERROR', + color: colors.red, + bgColor: colors.bgRed, + }, + warn: { + key: 'warn', + name: 'WARN', + color: colors.yellow, + bgColor: colors.bgYellow, + }, + debug: { + key: 'debug', + name: 'DEBUG', + color: colors.blue, + bgColor: colors.bgBlue, + }, + verbose: { + key: 'verbose', + name: 'VERBOSE', + color: colors.magenta, + bgColor: colors.bgMagenta, + }, + fatal: { + key: 'fatal', + name: 'FATAL', + color: colors.red, + bgColor: colors.bgRed, + }, +}; + +const MAX_NAME_LENGTH = Math.max(...Object.values(levels).map((it) => it.name.length)); diff --git a/apps/server-asset-sg/src/app.module.ts b/apps/server-asset-sg/src/app.module.ts index 0fc161bb..9fd76b90 100644 --- a/apps/server-asset-sg/src/app.module.ts +++ b/apps/server-asset-sg/src/app.module.ts @@ -23,9 +23,9 @@ import { ContactRepo } from '@/features/contacts/contact.repo'; import { ContactsController } from '@/features/contacts/contacts.controller'; import { FavoriteRepo } from '@/features/favorites/favorite.repo'; import { FavoritesController } from '@/features/favorites/favorites.controller'; +import { FileOcrService } from '@/features/files/file-ocr.service'; import { FileRepo } from '@/features/files/file.repo'; import { FilesController } from '@/features/files/files.controller'; -import { OcrController } from '@/features/ocr/ocr.controller'; import { StudiesController } from '@/features/studies/studies.controller'; import { StudyRepo } from '@/features/studies/study.repo'; import { UserRepo } from '@/features/users/user.repo'; @@ -43,7 +43,6 @@ import { WorkgroupsController } from '@/features/workgroups/workgroups.controlle ContactsController, FavoritesController, FilesController, - OcrController, StudiesController, UsersController, WorkgroupsController, @@ -60,6 +59,7 @@ import { WorkgroupsController } from '@/features/workgroups/workgroups.controlle ContactRepo, FavoriteRepo, FileRepo, + FileOcrService, PrismaService, StudyRepo, UserRepo, diff --git a/apps/server-asset-sg/src/features/files/file-ocr.service.ts b/apps/server-asset-sg/src/features/files/file-ocr.service.ts new file mode 100644 index 00000000..ebaf93d7 --- /dev/null +++ b/apps/server-asset-sg/src/features/files/file-ocr.service.ts @@ -0,0 +1,188 @@ +import { exit } from 'process'; +import { AssetFile } from '@asset-sg/shared'; +import { Injectable, Logger, OnModuleInit } from '@nestjs/common'; +import { OcrState } from '@prisma/client'; +import { PrismaService } from '@/core/prisma.service'; +import { sleep } from '@/utils/sleep'; + +const serviceUrl = process.env.OCR_SERVICE_URL as string; +if (serviceUrl == null || serviceUrl.length == 0) { + console.error("Missing 'OCR_SERVICE_URL' environment variable."); + exit(1); +} + +type OcrFile = Pick; + +const BATCH_SIZE = 10; + +@Injectable() +export class FileOcrService implements OnModuleInit { + private readonly logger = new Logger(FileOcrService.name); + + constructor(private readonly prisma: PrismaService) {} + + onModuleInit() { + this.processRemaining().then(); + } + + async processRemaining(): Promise { + const unprocessedFiles = await this.prisma.file.findMany({ + select: { id: true, name: true, ocrStatus: true }, + where: { ocrStatus: { notIn: ['success', 'error', 'willNotBeProcessed'] } }, + }); + const errorCount = await this.prisma.file.count({ + where: { ocrStatus: 'error' }, + }); + if (errorCount > 0) { + this.logger.log( + "Found files whose OCR failed. Please reset their 'ocrStatus' manually if you want to retry them.", + { count: errorCount } + ); + } + + if (unprocessedFiles.length === 0) { + this.logger.log('No unprocessed files found.'); + return; + } + + this.logger.log('Found unprocessed files.', { count: unprocessedFiles.length }); + + let batch: Array> = []; + const processBatch = async (): Promise => (await Promise.all(batch)).reduce((a, b) => a + b, 0 as number); + + let successCount = 0; + for (const file of unprocessedFiles) { + if (batch.length >= BATCH_SIZE) { + successCount += await processBatch(); + batch = []; + } + batch.push(this.processRemainingFile(file).then((ok) => (ok ? 1 : 0))); + } + successCount += await processBatch(); + + this.logger.log('OCR processing finished.', { + successes: successCount, + failures: unprocessedFiles.length - successCount, + }); + } + + async processRemainingFile(file: OcrFile & { ocrStatus: OcrState }): Promise { + if (file.ocrStatus === 'processing') { + this.logger.log('Ongoing OCR found, will attempt to finish it.', { file: file.name }); + try { + await this.finishProcessing(file); + this.logger.log('Ongoing OCR finished.', { file: file.name }); + return true; + } catch (e) { + this.logger.error(`Failed to finish ongoing OCR, a retry will be attempted.`, { + file: file.name, + error: e, + }); + } + } + await this.updateStatus(file.id, 'created'); + return await this.process(file); + } + + async process(file: OcrFile): Promise { + try { + this.logger.log('Starting OCR.', { file: file.name }); + await this.startProcessing(file); + await this.updateStatus(file.id, 'processing'); + await this.finishProcessing(file); + this.logger.log('OCR finished.', { file: file.name }); + return true; + } catch (e) { + await this.updateStatus(file.id, 'error'); + this.logger.error('OCR failed.', { file: file.name, error: e }); + return false; + } + } + + async finishProcessing(file: OcrFile): Promise { + for (;;) { + await sleep(1000); + const ok = await this.collectResult(file); + if (ok) { + await this.updateStatus(file.id, 'success'); + return; + } + } + } + + private async updateStatus(fileId: number, status: OcrState): Promise { + await this.prisma.file.update({ + select: { id: true }, + data: { ocrStatus: status }, + where: { id: fileId }, + }); + } + + private async startProcessing(file: OcrFile): Promise { + await this.fetch('/', { file: file.name }); + } + + private async collectResult(file: OcrFile): Promise { + interface ApiData { + has_finished: boolean; + data: unknown; + } + + interface ApiError { + has_finished: true; + error: string; + } + + const data = await this.fetch('/collect', { file: file.name }); + if ('error' in data) { + throw new Error(data.error); + } + return data.has_finished; + } + + private async fetch(path: string, body: object): Promise { + const response = await fetch(`${serviceUrl}${path}`, { + method: 'POST', + body: JSON.stringify(body), + headers: { + 'Content-Type': 'application/json', + }, + }); + if (response.status < 200 || response.status > 299) { + throw await makeResponseError(response); + } + if (response.status === 204) { + // "204 - No Content" indicates that the response does not contain any data. + // Parsing the body as JSON would most likely fail. + return undefined as T; + } + return await response.json(); + } +} + +const makeResponseError = async (response: Response): Promise => { + let body = await response.text(); + if (body.length === 0) { + body = ''; + } + let data = parseJSON(body); + if (hasKey(data, 'detail')) { + data = data.detail; + if (hasKey(data, 'message')) { + data = data.message; + } + } + return new Error(`${response.status} ${response.statusText} - ${data ?? body}`); +}; + +const hasKey = (value: unknown, key: K): value is { [k in K]: unknown } => { + return value != null && typeof value == 'object' && key in value; +}; + +const parseJSON = (input: string): unknown | null => { + try { + return JSON.parse(input); + } catch (e) { + return null; + } +}; diff --git a/apps/server-asset-sg/src/features/files/files.controller.ts b/apps/server-asset-sg/src/features/files/files.controller.ts index 20bda4a3..e30e1fa1 100644 --- a/apps/server-asset-sg/src/features/files/files.controller.ts +++ b/apps/server-asset-sg/src/features/files/files.controller.ts @@ -24,6 +24,7 @@ import { authorize } from '@/core/authorize'; import { CurrentUser } from '@/core/decorators/current-user.decorator'; import { PrismaService } from '@/core/prisma.service'; import { AssetEditRepo } from '@/features/asset-edit/asset-edit.repo'; +import { FileOcrService } from '@/features/files/file-ocr.service'; import { FileRepo } from '@/features/files/file.repo'; import { getFile } from '@/utils/file/get-file'; @@ -31,6 +32,7 @@ import { getFile } from '@/utils/file/get-file'; export class FilesController { constructor( private readonly fileRepo: FileRepo, + private readonly fileOcrService: FileOcrService, private readonly assetEditRepo: AssetEditRepo, private readonly prismaService: PrismaService ) {} @@ -116,6 +118,10 @@ export class FilesController { assetId: asset.assetId, user, }); + + // Run OCR on the file in the background. + setTimeout(() => this.fileOcrService.process(record)); + return AssetFile.encode(record); } diff --git a/apps/server-asset-sg/src/features/ocr/ocr.controller.ts b/apps/server-asset-sg/src/features/ocr/ocr.controller.ts deleted file mode 100644 index 2c5af96c..00000000 --- a/apps/server-asset-sg/src/features/ocr/ocr.controller.ts +++ /dev/null @@ -1,179 +0,0 @@ -import { isNotNil, unknownToUnknownError } from '@asset-sg/core'; -import { HttpService } from '@nestjs/axios'; -import { - BadRequestException, - Body, - Controller, - createParamDecorator, - ExecutionContext, - Injectable, - Logger, - Param, - Post, -} from '@nestjs/common'; -import type { AxiosRequestConfig } from 'axios'; -import * as E from 'fp-ts/Either'; -import { pipe } from 'fp-ts/function'; -import * as TE from 'fp-ts/TaskEither'; -import * as D from 'io-ts/Decoder'; -import { catchError, firstValueFrom, map, of } from 'rxjs'; - -import { PrismaService } from '@/core/prisma.service'; -import { getFile } from '@/utils/file/get-file'; -import { putFile } from '@/utils/file/put-file'; - -const BufferBody = createParamDecorator(async (_, context: ExecutionContext) => { - const req = context.switchToHttp().getRequest(); - if (!req.readable) { - throw new BadRequestException('Invalid body'); - } - return await streamToBufferAsync(req); -}); - -const Config = D.struct({ - ocrUrl: D.string, - ocrCallbackUrl: D.string, -}); -type Config = D.TypeOf; - -@Injectable() -@Controller('ocr') -export class OcrController { - private readonly config: Config; - - constructor(private readonly prismaService: PrismaService, private readonly httpService: HttpService) { - this.config = pipe( - Config.decode({ - ocrUrl: process.env.OCR_URL, - ocrCallbackUrl: process.env.OCR_CALLBACK_URL, - }), - E.getOrElseW((e) => { - console.error(D.draw(e)); - process.exit(1); - }) - ); - } - - @Post('ocr-success/:fileId/:filename') - async ocrSuccess(@BufferBody() body: Buffer, @Param('fileId') fileId: string, @Param('filename') filename: string) { - if (!body) { - Logger.warn('ocrSuccess ------------> Empty Body'); - return; - } - Logger.log('OcrService ------------> success, saving pdf, size: ' + body.length); - await pipe( - putFile(filename, body, 'application/pdf'), - TE.chain(() => - TE.tryCatch( - () => - this.prismaService.file.update({ - where: { id: Number(fileId) }, - data: { ocrStatus: 'success', size: body.length, lastModifiedAt: new Date() }, - }), - unknownToUnknownError - ) - ) - )(); - Logger.log(`SUCCESS OCR pdf ${filename} with id ${fileId}`); - } - - @Post('ocr-error/:fileId/:filename') - async ocrError( - @Body() body: { error: string }, - @Param('fileId') fileId: string, - @Param('filename') filename: string - ) { - Logger.log('OcrService ------------> error', `${body.error}`); - await TE.tryCatch( - () => - this.prismaService.file.update({ - where: { id: Number(fileId) }, - data: { ocrStatus: 'error', lastModifiedAt: new Date() }, - }), - unknownToUnknownError - )(); - Logger.warn('OCR Job Error for file: ' + filename); - } - - // @Cron(CronExpression.EVERY_30_SECONDS) - async handleCron() { - Logger.log('cron job running'); - const result = await pipe( - TE.tryCatch( - () => - this.prismaService.file.findFirst({ - where: { ocrStatus: 'waiting' }, - }), - unknownToUnknownError - ), - TE.filterOrElseW(isNotNil, () => ({ _tag: 'nothingToDo' as const })), - TE.bindTo('file'), - TE.bindW('s3File', ({ file }) => getFile(this.prismaService, file.id)), - TE.bindW('buffer', ({ s3File }) => streamToBufferTE(s3File.stream)), - TE.bindW('updateResult', ({ file }) => - TE.tryCatch( - () => - this.prismaService.file.update({ - where: { id: file.id }, - data: { ocrStatus: 'processing', lastModifiedAt: new Date() }, - }), - unknownToUnknownError - ) - ), - TE.chainW(({ buffer, file }) => this.extractText(buffer, file.id, file.name)) - )(); - Logger.log('result', result); - } - - extractText(pdfData: Buffer, fileId: number, filename: string) { - const config: AxiosRequestConfig = { - method: 'POST', - url: `${this.config.ocrUrl}/ocr`, - headers: { - 'Content-Length': pdfData.length, - 'Content-Type': 'application/octet-stream', - 'x-filename': filename, - 'x-callback-success': `${this.config.ocrCallbackUrl}/ocr/ocr-success/${fileId}/${filename}`, - 'x-callback-error': `${this.config.ocrCallbackUrl}/ocr/ocr-error/${fileId}/${filename}`, - }, - data: pdfData, - timeout: 1000 * 60 * 15, // 15 minutes - maxContentLength: 9000000000, - maxBodyLength: 90000000000, - }; - - return TE.tryCatch( - () => - firstValueFrom( - this.httpService.request(config).pipe( - map(() => true), - catchError((e) => { - if (e.code === 'ECONNABORTED' || e.code === 'ECONNRESET' || e.response?.status === 503) - Logger.log('server busy'); - else { - Logger.warn(`server problem: ${e}`); - } - return of(false); - }) - ) - ), - unknownToUnknownError - ); - } -} - -const streamToBufferAsync = (readableStream: NodeJS.ReadableStream) => - new Promise((resolve, reject) => { - const bufs: Array = []; - readableStream.on('data', (d) => { - bufs.push(d); - }); - readableStream.on('end', () => { - const result = Buffer.concat(bufs); - resolve(result); - }); - readableStream.on('error', reject); - }); - -const streamToBufferTE = (readableStream: NodeJS.ReadableStream) => - TE.tryCatch(() => streamToBufferAsync(readableStream), unknownToUnknownError); diff --git a/apps/server-asset-sg/src/main.ts b/apps/server-asset-sg/src/main.ts index 164c4f8a..b79fe2d7 100644 --- a/apps/server-asset-sg/src/main.ts +++ b/apps/server-asset-sg/src/main.ts @@ -2,6 +2,7 @@ import { Logger, ValidationPipe } from '@nestjs/common'; import { NestFactory } from '@nestjs/core'; import { AppModule } from './app.module'; +import { AppLogger } from '@/app.logger'; import { PrismaExceptionFilter } from '@/core/exception-filters/prisma.exception-filter'; export * from 'fp-ts'; @@ -11,12 +12,14 @@ const API_PREFIX = 'api'; const API_PORT = process.env.PORT || 3333; async function bootstrap(): Promise { - const app = await NestFactory.create(AppModule); + const app = await NestFactory.create(AppModule, { + logger: new AppLogger(), + }); app.setGlobalPrefix(API_PREFIX); app.useGlobalPipes(new ValidationPipe({ transform: true, whitelist: true, forbidNonWhitelisted: true })); app.useGlobalFilters(new PrismaExceptionFilter()); await app.listen(API_PORT); - Logger.log(`🚀 application is running on: http://localhost:${API_PORT}/${API_PREFIX}`); + Logger.log('🚀 application is running!', { url: new URL(`http://localhost:${API_PORT}/${API_PREFIX}`) }); } bootstrap().catch((err: unknown) => { diff --git a/apps/server-asset-sg/src/utils/sleep.ts b/apps/server-asset-sg/src/utils/sleep.ts new file mode 100644 index 00000000..5eaa8e91 --- /dev/null +++ b/apps/server-asset-sg/src/utils/sleep.ts @@ -0,0 +1,4 @@ +export const sleep = (millis: number) => + new Promise((resolve) => { + setTimeout(() => resolve(), millis); + }); diff --git a/development/.env b/development/.env index 4c7eae60..f14cb211 100644 --- a/development/.env +++ b/development/.env @@ -4,3 +4,4 @@ DB_USER=postgres DB_PASSWORD=postgres PGADMIN_EMAIL=pg@admin.ch PGADMIN_PASSWORD=pgadmin + diff --git a/development/.gitignore b/development/.gitignore index a770a938..a1a46345 100644 --- a/development/.gitignore +++ b/development/.gitignore @@ -1,6 +1,7 @@ # See http://help.github.com/ignore-files/ for more about ignoring files. volumes +.env.ocr .env.local # DB dump for local development diff --git a/development/docker-compose.yaml b/development/docker-compose.yaml index 8a628e31..6cfc3daf 100644 --- a/development/docker-compose.yaml +++ b/development/docker-compose.yaml @@ -108,3 +108,20 @@ services: - ./init/oidc/oidc-mock-scopes.json:/tmp/config/scopes-config.json:ro - ./init/oidc/oidc-mock-server_options.json:/tmp/config/server_options-config.json:ro - ./volumes/oidc/keys:/tmp/data/keys + + ocr: + container_name: swissgeol-assets-ocr + # TODO change this to `latest` once it is released. + image: ghcr.io/swisstopo/swissgeol-ocr-api:edge + restart: unless-stopped + ports: + - "5052:8000" + environment: + TMP_PATH: /tmp/ocr/ + SKIP_PROCESSING: true + S3_INPUT_BUCKET: asset-sg + S3_INPUT_FOLDER: / + S3_OUTPUT_BUCKET: asset-sg + S3_OUTPUT_FOLDER: / + CONFIDENCE_THRESHOLD: 0.7 + USE_AGGRESSIVE_STRATEGY: false diff --git a/package-lock.json b/package-lock.json index c87f0c76..6a7bb821 100644 --- a/package-lock.json +++ b/package-lock.json @@ -51,6 +51,7 @@ "cache-manager": "^5.4.0", "class-transformer": "^0.5.1", "class-validator": "^0.14.1", + "colors": "^1.4.0", "cron": "^3.1.7", "csv-parse": "^5.3.3", "fp-ts": "^2.13.1", @@ -6834,6 +6835,12 @@ "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==" }, + "node_modules/@nuxtjs/opencollective/node_modules/consola": { + "version": "2.15.3", + "resolved": "https://registry.npmjs.org/consola/-/consola-2.15.3.tgz", + "integrity": "sha512-9vAdYbHj6x2fLKC4+oPH0kFzY/orMZyG2Aj+kNylHxKGJ/Ed4dpNyAQYwJOdqO4zdM7XpVHmyejQDcQHrnuXbw==", + "license": "MIT" + }, "node_modules/@nuxtjs/opencollective/node_modules/has-flag": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", @@ -14181,6 +14188,15 @@ "resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz", "integrity": "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w==" }, + "node_modules/colors": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/colors/-/colors-1.4.0.tgz", + "integrity": "sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA==", + "license": "MIT", + "engines": { + "node": ">=0.1.90" + } + }, "node_modules/columnify": { "version": "1.6.0", "resolved": "https://registry.npmjs.org/columnify/-/columnify-1.6.0.tgz", @@ -14475,11 +14491,6 @@ "node": ">=0.8" } }, - "node_modules/consola": { - "version": "2.15.3", - "resolved": "https://registry.npmjs.org/consola/-/consola-2.15.3.tgz", - "integrity": "sha512-9vAdYbHj6x2fLKC4+oPH0kFzY/orMZyG2Aj+kNylHxKGJ/Ed4dpNyAQYwJOdqO4zdM7XpVHmyejQDcQHrnuXbw==" - }, "node_modules/content-disposition": { "version": "0.5.4", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", diff --git a/package.json b/package.json index ab98b57a..d1ea3d9d 100644 --- a/package.json +++ b/package.json @@ -63,6 +63,7 @@ "cache-manager": "^5.4.0", "class-transformer": "^0.5.1", "class-validator": "^0.14.1", + "colors": "^1.4.0", "cron": "^3.1.7", "csv-parse": "^5.3.3", "fp-ts": "^2.13.1",