From 10ecdf6a42cd294e6ec887a752d3ff388bb621f0 Mon Sep 17 00:00:00 2001 From: Sean Sundberg Date: Tue, 7 Nov 2023 22:06:18 -0600 Subject: [PATCH] Replace stripTags module with local utility (#148) Signed-off-by: Sean Sundberg --- config/KYCDataValidationQuestions.csv | 42 +++++++++---------- package-lock.json | 31 ++++++-------- package.json | 7 ++-- .../data-extraction/data-extraction.csv.ts | 8 ++-- .../data-extraction/data-extraction.impl.ts | 20 +++++---- src/utils/index.ts | 1 + src/utils/strip-tags/index.ts | 1 + src/utils/strip-tags/strip-tags.ts | 7 ++++ src/utils/strip-urls/strip-urls.ts | 5 ++- 9 files changed, 66 insertions(+), 56 deletions(-) create mode 100644 src/utils/strip-tags/index.ts create mode 100644 src/utils/strip-tags/strip-tags.ts diff --git a/config/KYCDataValidationQuestions.csv b/config/KYCDataValidationQuestions.csv index d8002b6..f029be6 100644 --- a/config/KYCDataValidationQuestions.csv +++ b/config/KYCDataValidationQuestions.csv @@ -1,21 +1,21 @@ -ID,Question,Source,Model,Token,PoCScope,Company,Prompt,Expected Answer,watsonx Response -1,What is Name and trading name of the organization?,Discovery,google/flan-t5-xxl,20,,,"From below text,What is Name and trading name of the organization #?",, -2,What is the registered address of the company?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,From below text find the registered address of the company #?,"1 St James's Square, London, SW1Y 4PD","1 St James's Square, London, SW1Y 4PD" -3,What is the business/trading address of the company?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the business/trading address of the company #?",, -4,What is identification number of the organization?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,from below text find identification number of the organization #? ,102498,102498 -5,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who are the key controllers and authorized signatories of the company #?",, -6,Names all the active directors of the company.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the names of all active directors of the company # in sequence ?","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela" -7,"What is the status of the organization ex; active, dissolved?",Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, what is the status of the organization # ?",Active,Active -8,What is the year of incorporation?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, What is the year of incorporation of the company #?",1909,1909 -9,Who are the shareholders of the company along with the percentage of ownership?,Discovery,google/flan-t5-xxl,20,,,"from below text, Who are the shareholders of the company # along with the percentage of ownership?",, -10,Who is the ultimate owner of the company?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who is the ultimate owner of the company #?",, -11,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, Who are the key controllers and authorized signatories of the company #?",, -12,What is the industry type/SIC/NICS code of the company?,KYCSummary,google/flan-t5-xxl,20,,,"from below text, What is the industry type/SIC/NICS code of the company #?",, -13,What are the products utilized by the company?,KYCSummary,google/flan-ul2,20,X,,"from below text, What are the products manufactured by the company #?",, -14,What is/are operation location/s or jurisdiction/s?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is/are operation location/s or jurisdiction/s of the comoany #?",, -15,Number of employees of the firm,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the Number of employees of the company #?",, -16,Name of the subsidiary of the company,Discovery,google/flan-t5-xxl,20,,,"from below text, find the Name of the subsidiary of the company #?",, -17,What is the Legal entity Type of the organization ex; publicly traded/limited liability etc.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, What is the Legal entity Type of the organization # ex; publicly traded/limited liability? etc.",, -18,What is the turnover or revenue of the organization?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the turnover or revenue of the organization #?",, -19,Certificate/licence issued by the government.,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the Certificate/licence issued by the government for company #?",, -20,Whats is the next date of confirmation statement?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the next date of confirmation statement for company #?",30/06/24,30/06/24 \ No newline at end of file +ID,Question,Source,Model,Token,PoCScope,Company,Prompt,Expected Answer,watsonx Response +1,What is Name and trading name of the organization?,Discovery,google/flan-t5-xxl,20,,,"From below text,What is Name and trading name of the organization #?",, +2,What is the registered address of the company?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,From below text find the registered address of the company #?,"1 St James's Square, London, SW1Y 4PD","1 St James's Square, London, SW1Y 4PD" +3,What is the business/trading address of the company?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the business / trading Address of the Company #?",, +4,What is identification number of the organization?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,from below text find identification number of the organization #? ,102498,102498 +5,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who are the key controllers and authorized signatories of the company #?",, +6,Names all the active directors of the company.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the names of all active directors of the company # in sequence ?","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela" +7,"What is the status of the organization ex; active, dissolved?",Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, what is the status of the organization # ex: Active or Dissolved ?",Active,Active +8,What is the year of incorporation?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, What is the year of incorporation of the company #?",1909,1909 +9,Who are the shareholders of the company along with the percentage of ownership?,Discovery,google/flan-t5-xxl,20,,,"from below text, Who are the shareholders of the company # along with the percentage of ownership?",, +10,Who is the ultimate owner of the company?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who is the ultimate owner of the company #?",, +11,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, Who are the key controllers and authorized signatories of the company #?",, +12,What is the industry type/SIC/NICS code of the company?,KYCSummary,google/flan-t5-xxl,20,,,"from below text, What is the industry type/SIC/NICS code of the company #?",, +13,What are the products utilized by the company?,KYCSummary,google/flan-ul2,20,X,,"from below text, What are the products manufactured by the company #?",, +14,What is/are operation location/s or jurisdiction/s?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is/are operation location/s or jurisdiction/s of the comoany #?",, +15,Number of employees of the firm,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the Number of employees of the company #?",, +16,Name of the subsidiary of the company,Discovery,google/flan-t5-xxl,20,,,"from below text, find the Name of the subsidiary of the company #?",, +17,What is the Legal entity Type of the organization ex; publicly traded/limited liability etc.,Discovery,google/flan-t5-xxl,30,X,,"from below text, What is the Legal entity Type of the organization # ex; publicly traded or limited liability or Private limited? etc.",, +18,What is the turnover or revenue of the organization?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the turnover or revenue of the organization #?",, +19,Certificate/licence issued by the government.,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the Certificate/licence issued by the government for company #?",, +20,Whats is the next date of confirmation statement?,Discovery,google/flan-t5-xxl,30,X,BP P.L.C,"from below text, find the next date of confirmation statement for company #?",30/06/24,30/06/24 \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 37ca4ed..b0edbda 100644 --- a/package-lock.json +++ b/package-lock.json @@ -34,8 +34,7 @@ "optional-js": "^2.3.0", "reflect-metadata": "^0.1.13", "rxjs": "^7.8.1", - "stream-to-blob": "^2.0.1", - "striptags": "^3.2.0" + "stream-to-blob": "^2.0.1" }, "devDependencies": { "@nestjs/cli": "^10.2.1", @@ -44,10 +43,10 @@ "@types/express": "^4.17.21", "@types/html-to-text": "^9.0.3", "@types/jest": "^29.5.7", + "@types/jsdom": "^21.1.5", "@types/mime": "^3.0.3", "@types/multer": "^1.4.9", "@types/node": "^20.8.10", - "@types/striptags": "^3.1.1", "@types/supertest": "^2.0.15", "@typescript-eslint/eslint-plugin": "^6.10.0", "@typescript-eslint/parser": "^6.10.0", @@ -2764,6 +2763,17 @@ "pretty-format": "^29.0.0" } }, + "node_modules/@types/jsdom": { + "version": "21.1.5", + "resolved": "https://registry.npmjs.org/@types/jsdom/-/jsdom-21.1.5.tgz", + "integrity": "sha512-sBK/3YjS3uuPj+HzZyhB4GGTnFmk0mdyQfhzZ/sqs9ciyG41QJdZZdwcPa6OfW97OTNTwl5tBAsfEOm/dui9pQ==", + "dev": true, + "dependencies": { + "@types/node": "*", + "@types/tough-cookie": "*", + "parse5": "^7.0.0" + } + }, "node_modules/@types/json-schema": { "version": "7.0.12", "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.12.tgz", @@ -2869,16 +2879,6 @@ "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.1.tgz", "integrity": "sha512-Hl219/BT5fLAaz6NDkSuhzasy49dwQS/DSdu4MdggFB8zcXv7vflBI3xp7FEmkmdDkBUI2bPUNeMttp2knYdxw==" }, - "node_modules/@types/striptags": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/@types/striptags/-/striptags-3.1.1.tgz", - "integrity": "sha512-t11pzegWB32MpVjCMXD0LoSxsUQESC7CInDtVoEmnPbLWA5hMRSLBa0U/xCOdj6zFQyHlLn0Qmp76kyp/KyfQw==", - "deprecated": "This is a stub types definition for striptags (https://github.com/ericnorris/striptags). striptags provides its own type definitions, so you don't need @types/striptags installed!", - "dev": true, - "dependencies": { - "striptags": "*" - } - }, "node_modules/@types/superagent": { "version": "4.1.18", "resolved": "https://registry.npmjs.org/@types/superagent/-/superagent-4.1.18.tgz", @@ -10359,11 +10359,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/striptags": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/striptags/-/striptags-3.2.0.tgz", - "integrity": "sha512-g45ZOGzHDMe2bdYMdIvdAfCQkCTDMGBazSw1ypMowwGIee7ZQ5dU0rBJ8Jqgl+jAKIv4dbeE1jscZq9wid1Tkw==" - }, "node_modules/strtok3": { "version": "6.3.0", "resolved": "https://registry.npmjs.org/strtok3/-/strtok3-6.3.0.tgz", diff --git a/package.json b/package.json index 3b1e858..80ab6d9 100644 --- a/package.json +++ b/package.json @@ -12,7 +12,7 @@ "start": "nest start", "start:dev": "nest start --watch", "start:debug": "nest start --debug --watch", - "start:prod": "node dist/main", + "start:prod": "node dist/src/main", "lint": "eslint \"{src,apps,libs,test}/**/*.ts\" --fix", "test": "jest", "test:watch": "jest --watch", @@ -46,8 +46,7 @@ "optional-js": "^2.3.0", "reflect-metadata": "^0.1.13", "rxjs": "^7.8.1", - "stream-to-blob": "^2.0.1", - "striptags": "^3.2.0" + "stream-to-blob": "^2.0.1" }, "devDependencies": { "@nestjs/cli": "^10.2.1", @@ -56,10 +55,10 @@ "@types/express": "^4.17.21", "@types/html-to-text": "^9.0.3", "@types/jest": "^29.5.7", + "@types/jsdom": "^21.1.5", "@types/mime": "^3.0.3", "@types/multer": "^1.4.9", "@types/node": "^20.8.10", - "@types/striptags": "^3.1.1", "@types/supertest": "^2.0.15", "@typescript-eslint/eslint-plugin": "^6.10.0", "@typescript-eslint/parser": "^6.10.0", diff --git a/src/services/data-extraction/data-extraction.csv.ts b/src/services/data-extraction/data-extraction.csv.ts index 80c5a14..1bd8374 100644 --- a/src/services/data-extraction/data-extraction.csv.ts +++ b/src/services/data-extraction/data-extraction.csv.ts @@ -9,11 +9,11 @@ import {first, parseCsv} from "../../utils"; const csvFile: string = `ID,Question,Source,Model,Token,PoCScope,Company,Prompt,Expected Answer,watsonx Response 1,What is Name and trading name of the organization?,Discovery,google/flan-t5-xxl,20,,,"From below text,What is Name and trading name of the organization #?",, 2,What is the registered address of the company?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,From below text find the registered address of the company #?,"1 St James's Square, London, SW1Y 4PD","1 St James's Square, London, SW1Y 4PD" -3,What is the business/trading address of the company?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the business/trading address of the company #?",, +3,What is the business/trading address of the company?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the business / trading Address of the Company #?",, 4,What is identification number of the organization?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,from below text find identification number of the organization #? ,102498,102498 5,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who are the key controllers and authorized signatories of the company #?",, 6,Names all the active directors of the company.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the names of all active directors of the company # in sequence ?","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela" -7,"What is the status of the organization ex; active, dissolved?",Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, what is the status of the organization # ?",Active,Active +7,"What is the status of the organization ex; active, dissolved?",Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, what is the status of the organization # ex: Active or Dissolved ?",Active,Active 8,What is the year of incorporation?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, What is the year of incorporation of the company #?",1909,1909 9,Who are the shareholders of the company along with the percentage of ownership?,Discovery,google/flan-t5-xxl,20,,,"from below text, Who are the shareholders of the company # along with the percentage of ownership?",, 10,Who is the ultimate owner of the company?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who is the ultimate owner of the company #?",, @@ -23,10 +23,10 @@ const csvFile: string = `ID,Question,Source,Model,Token,PoCScope,Company,Prompt, 14,What is/are operation location/s or jurisdiction/s?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is/are operation location/s or jurisdiction/s of the comoany #?",, 15,Number of employees of the firm,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the Number of employees of the company #?",, 16,Name of the subsidiary of the company,Discovery,google/flan-t5-xxl,20,,,"from below text, find the Name of the subsidiary of the company #?",, -17,What is the Legal entity Type of the organization ex; publicly traded/limited liability etc.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, What is the Legal entity Type of the organization # ex; publicly traded/limited liability? etc.",, +17,What is the Legal entity Type of the organization ex; publicly traded/limited liability etc.,Discovery,google/flan-t5-xxl,30,X,,"from below text, What is the Legal entity Type of the organization # ex; publicly traded or limited liability or Private limited? etc.",, 18,What is the turnover or revenue of the organization?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the turnover or revenue of the organization #?",, 19,Certificate/licence issued by the government.,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the Certificate/licence issued by the government for company #?",, -20,Whats is the next date of confirmation statement?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the next date of confirmation statement for company #?",30/06/24,30/06/24` +20,Whats is the next date of confirmation statement?,Discovery,google/flan-t5-xxl,30,X,BP P.L.C,"from below text, find the next date of confirmation statement for company #?",30/06/24,30/06/24` export interface DataExtractionConfig extends DataExtractionQuestionModel { source: string; diff --git a/src/services/data-extraction/data-extraction.impl.ts b/src/services/data-extraction/data-extraction.impl.ts index ad51e08..1eab5ab 100644 --- a/src/services/data-extraction/data-extraction.impl.ts +++ b/src/services/data-extraction/data-extraction.impl.ts @@ -1,7 +1,6 @@ import * as process from "process"; import {IamAuthenticator, IamTokenManager} from "ibm-cloud-sdk-core"; import DiscoveryV2 = require("ibm-watson/discovery/v2"); -const stripTags = require("striptags"); import axios from "axios"; import {DataExtractionApi} from "./data-extraction.api"; @@ -10,7 +9,7 @@ import {createDiscoveryV2} from "../../utils/discovery-v2"; import {DataExtractionConfig, DataExtractionCsv} from "./data-extraction.csv"; import {kycCaseSummaryApi, KycCaseSummaryApi} from "../kyc-case-summary"; import {DataExtractionResultModel} from "../../models"; -import {first, GenAiModel, GenerativeResponse, stripUrls} from "../../utils"; +import {first, GenAiModel, GenerativeResponse, stripUrls, stripTags} from "../../utils"; import PQueue from "../../utils/p-queue"; const concurrency = parseInt(process.env.FIND_PASSAGE_CONCURRENCY || '8') @@ -161,8 +160,6 @@ export class DataExtractionImpl extends DataExtractionCsv { + const originalPassage = passages[index] + if (cleanPassage.length !== originalPassage.length) { + console.log('Passage changed', {originalPassage, cleanPassage}) + } + }) + + return cleanPassages } filterDocuments(result: DiscoveryV2.QueryResponse, subject: string): DiscoveryV2.QueryResult[] { @@ -214,8 +220,6 @@ export class DataExtractionImpl extends DataExtractionCsv { - console.log('Getting relevant passage') - const relevantPassage = await axios .post<{relevant_passage: string} | string>(url, {question, passages}) .then(response => { @@ -231,7 +235,7 @@ export class DataExtractionImpl extends DataExtractionCsv { + const dom: JSDOM = new JSDOM(text); + + return dom.window.document.body.textContent || ''; +} diff --git a/src/utils/strip-urls/strip-urls.ts b/src/utils/strip-urls/strip-urls.ts index 6004697..1d0f6d2 100644 --- a/src/utils/strip-urls/strip-urls.ts +++ b/src/utils/strip-urls/strip-urls.ts @@ -1,3 +1,6 @@ export const stripUrls = (text: string): string => { - return text.replace(/https?:\/\/[\n\S]+/g, '');; + return text + .replace(/https?:\/\/[\n\S]+/g, '') + .replace(/[0-9A-Za-z]+\/[0-9A-Za-z]+\/[0-9A-Za-z\/]+/, '') + .replace(/([ (])\/[0-9A-Za-z\/]+/, '$1') }