From c8e7cc3c1949bfa3748eed57773ed0bfab701ec5 Mon Sep 17 00:00:00 2001 From: Sean Sundberg Date: Tue, 7 Nov 2023 08:10:07 -0600 Subject: [PATCH] Update data extract handling of Discovery responses - Update default similarity-check url - Strip html tags and urls from each discovery passage Signed-off-by: Sean Sundberg --- .../data-extraction/data-extraction.impl.ts | 24 ++++++++---- src/utils/index.ts | 1 + src/utils/p-queue/index.ts | 6 +-- src/utils/p-queue/options.ts | 2 +- src/utils/p-queue/priority-queue.ts | 6 +-- src/utils/strip-urls/index.ts | 1 + src/utils/strip-urls/strip-urls.spec.ts | 37 +++++++++++++++++++ src/utils/strip-urls/strip-urls.ts | 3 ++ 8 files changed, 65 insertions(+), 15 deletions(-) create mode 100644 src/utils/strip-urls/index.ts create mode 100644 src/utils/strip-urls/strip-urls.spec.ts create mode 100644 src/utils/strip-urls/strip-urls.ts diff --git a/src/services/data-extraction/data-extraction.impl.ts b/src/services/data-extraction/data-extraction.impl.ts index 86aac6e..ad51e08 100644 --- a/src/services/data-extraction/data-extraction.impl.ts +++ b/src/services/data-extraction/data-extraction.impl.ts @@ -1,7 +1,7 @@ import * as process from "process"; import {IamAuthenticator, IamTokenManager} from "ibm-cloud-sdk-core"; import DiscoveryV2 = require("ibm-watson/discovery/v2"); -const striptags = require("striptags"); +const stripTags = require("striptags"); import axios from "axios"; import {DataExtractionApi} from "./data-extraction.api"; @@ -10,7 +10,7 @@ import {createDiscoveryV2} from "../../utils/discovery-v2"; import {DataExtractionConfig, DataExtractionCsv} from "./data-extraction.csv"; import {kycCaseSummaryApi, KycCaseSummaryApi} from "../kyc-case-summary"; import {DataExtractionResultModel} from "../../models"; -import {first, GenAiModel, GenerativeResponse} from "../../utils"; +import {first, GenAiModel, GenerativeResponse, stripUrls} from "../../utils"; import PQueue from "../../utils/p-queue"; const concurrency = parseInt(process.env.FIND_PASSAGE_CONCURRENCY || '8') @@ -146,7 +146,7 @@ export class DataExtractionImpl extends DataExtractionCsv { const naturalLanguageQuery = config.question + ' ' + customer; - const passagesPerDocument = true; + const passagesPerDocument: boolean = true; const response: DiscoveryV2.Response = await backends.discovery.query({ projectId: this.backendConfig.discoveryProjectId, naturalLanguageQuery, @@ -159,9 +159,7 @@ export class DataExtractionImpl extends DataExtractionCsv { const organizations = extractEntities(val.enriched_text, 'Organization') @@ -198,7 +206,7 @@ export class DataExtractionImpl extends DataExtractionCsv { - const url = process.env.RELEVANT_PASSAGES_URL || 'https://similarity-check.18xu6cedovu0.us-south.codeengine.appdomain.cloud/api/find_relevant_passage' + const url = process.env.RELEVANT_PASSAGES_URL || 'https://similarity-check.18z7sftfb1j5.us-south.codeengine.appdomain.cloud/api/find_relevant_passage' if (passages.length === 1) { return passages[0] @@ -220,7 +228,7 @@ export class DataExtractionImpl extends DataExtractionCsv { console.error('Error getting relevant passages: ', {err}) - return striptags(passages.join('\n')) + return passages.join('\n') }) console.log('Found relevant passage: ', {relevantPassage}) diff --git a/src/utils/index.ts b/src/utils/index.ts index 3e543bb..b6e3b2b 100644 --- a/src/utils/index.ts +++ b/src/utils/index.ts @@ -5,3 +5,4 @@ export * from './gen-ai-model'; export * from './stream-to-buffer'; export * from './url-to-stream'; export * from './validate-url'; +export * from './strip-urls'; diff --git a/src/utils/p-queue/index.ts b/src/utils/p-queue/index.ts index c677c8c..60909ae 100644 --- a/src/utils/p-queue/index.ts +++ b/src/utils/p-queue/index.ts @@ -1,8 +1,8 @@ import {EventEmitter} from 'eventemitter3'; import pTimeout, {TimeoutError} from '../p-timeout'; -import {Queue, RunFunction} from './queue.js'; -import PriorityQueue from './priority-queue.js'; -import {QueueAddOptions, Options, TaskOptions} from './options.js'; +import {Queue, RunFunction} from './queue'; +import PriorityQueue from './priority-queue'; +import {QueueAddOptions, Options, TaskOptions} from './options'; type Task = | ((options: TaskOptions) => PromiseLike) diff --git a/src/utils/p-queue/options.ts b/src/utils/p-queue/options.ts index af5a196..a5ab739 100644 --- a/src/utils/p-queue/options.ts +++ b/src/utils/p-queue/options.ts @@ -1,4 +1,4 @@ -import {Queue, RunFunction} from './queue.js'; +import {Queue, RunFunction} from './queue'; interface TimeoutOptions { /** diff --git a/src/utils/p-queue/priority-queue.ts b/src/utils/p-queue/priority-queue.ts index 45993bc..db80fad 100644 --- a/src/utils/p-queue/priority-queue.ts +++ b/src/utils/p-queue/priority-queue.ts @@ -1,6 +1,6 @@ -import {Queue, RunFunction} from './queue.js'; -import lowerBound from './lower-bound.js'; -import {QueueAddOptions} from './options.js'; +import {Queue, RunFunction} from './queue'; +import lowerBound from './lower-bound'; +import {QueueAddOptions} from './options'; export interface PriorityQueueOptions extends QueueAddOptions { priority?: number; diff --git a/src/utils/strip-urls/index.ts b/src/utils/strip-urls/index.ts new file mode 100644 index 0000000..407c6e6 --- /dev/null +++ b/src/utils/strip-urls/index.ts @@ -0,0 +1 @@ +export * from './strip-urls' diff --git a/src/utils/strip-urls/strip-urls.spec.ts b/src/utils/strip-urls/strip-urls.spec.ts new file mode 100644 index 0000000..82f1c61 --- /dev/null +++ b/src/utils/strip-urls/strip-urls.spec.ts @@ -0,0 +1,37 @@ +import {stripUrls} from "./strip-urls"; + +describe('strip-urls', () => { + describe('Given stripUrls', () => { + const url = 'https://host.com/path/to/file.html' + + describe('when "{url}" provided', () => { + test('then return ""', () => { + expect(stripUrls(url)).toEqual('') + }) + }) + + describe('when "http://host.com/path/to/file.html" provided', () => { + test('then return ""', () => { + expect(stripUrls('http://host.com/path/to/file.html')).toEqual('') + }) + }) + + describe('when "This is a test {url}." provided', () => { + test('then return "This is a test "', () => { + expect(stripUrls(`This is a test ${url}.`)).toEqual('This is a test ') + }) + }) + + describe('when "This is a test {url})" provided', () => { + test('then return "This is a test "', () => { + expect(stripUrls(`This is a test ${url})`)).toEqual('This is a test ') + }) + }) + + describe('when "This is a test {url} )" provided', () => { + test('then return "This is a test )"', () => { + expect(stripUrls(`This is a test ${url} )`)).toEqual('This is a test )') + }) + }) + }) +}) \ No newline at end of file diff --git a/src/utils/strip-urls/strip-urls.ts b/src/utils/strip-urls/strip-urls.ts new file mode 100644 index 0000000..6004697 --- /dev/null +++ b/src/utils/strip-urls/strip-urls.ts @@ -0,0 +1,3 @@ +export const stripUrls = (text: string): string => { + return text.replace(/https?:\/\/[\n\S]+/g, '');; +}