Skip to content

Commit

Permalink
Replace stripTags module with local utility (#148)
Browse files Browse the repository at this point in the history
Signed-off-by: Sean Sundberg <[email protected]>
  • Loading branch information
seansund authored Nov 8, 2023
1 parent 6fa4bd1 commit 10ecdf6
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 56 deletions.
42 changes: 21 additions & 21 deletions config/KYCDataValidationQuestions.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
ID,Question,Source,Model,Token,PoCScope,Company,Prompt,Expected Answer,watsonx Response
1,What is Name and trading name of the organization?,Discovery,google/flan-t5-xxl,20,,,"From below text,What is Name and trading name of the organization #?",,
2,What is the registered address of the company?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,From below text find the registered address of the company #?,"1 St James's Square, London, SW1Y 4PD","1 St James's Square, London, SW1Y 4PD"
3,What is the business/trading address of the company?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the business/trading address of the company #?",,
4,What is identification number of the organization?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,from below text find identification number of the organization #? ,102498,102498
5,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who are the key controllers and authorized signatories of the company #?",,
6,Names all the active directors of the company.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the names of all active directors of the company # in sequence ?","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela"
7,"What is the status of the organization ex; active, dissolved?",Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, what is the status of the organization # ?",Active,Active
8,What is the year of incorporation?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, What is the year of incorporation of the company #?",1909,1909
9,Who are the shareholders of the company along with the percentage of ownership?,Discovery,google/flan-t5-xxl,20,,,"from below text, Who are the shareholders of the company # along with the percentage of ownership?",,
10,Who is the ultimate owner of the company?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who is the ultimate owner of the company #?",,
11,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, Who are the key controllers and authorized signatories of the company #?",,
12,What is the industry type/SIC/NICS code of the company?,KYCSummary,google/flan-t5-xxl,20,,,"from below text, What is the industry type/SIC/NICS code of the company #?",,
13,What are the products utilized by the company?,KYCSummary,google/flan-ul2,20,X,,"from below text, What are the products manufactured by the company #?",,
14,What is/are operation location/s or jurisdiction/s?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is/are operation location/s or jurisdiction/s of the comoany #?",,
15,Number of employees of the firm,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the Number of employees of the company #?",,
16,Name of the subsidiary of the company,Discovery,google/flan-t5-xxl,20,,,"from below text, find the Name of the subsidiary of the company #?",,
17,What is the Legal entity Type of the organization ex; publicly traded/limited liability etc.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, What is the Legal entity Type of the organization # ex; publicly traded/limited liability? etc.",,
18,What is the turnover or revenue of the organization?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the turnover or revenue of the organization #?",,
19,Certificate/licence issued by the government.,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the Certificate/licence issued by the government for company #?",,
20,Whats is the next date of confirmation statement?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the next date of confirmation statement for company #?",30/06/24,30/06/24
ID,Question,Source,Model,Token,PoCScope,Company,Prompt,Expected Answer,watsonx Response
1,What is Name and trading name of the organization?,Discovery,google/flan-t5-xxl,20,,,"From below text,What is Name and trading name of the organization #?",,
2,What is the registered address of the company?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,From below text find the registered address of the company #?,"1 St James's Square, London, SW1Y 4PD","1 St James's Square, London, SW1Y 4PD"
3,What is the business/trading address of the company?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the business / trading Address of the Company #?",,
4,What is identification number of the organization?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,from below text find identification number of the organization #? ,102498,102498
5,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who are the key controllers and authorized signatories of the company #?",,
6,Names all the active directors of the company.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the names of all active directors of the company # in sequence ?","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela"
7,"What is the status of the organization ex; active, dissolved?",Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, what is the status of the organization # ex: Active or Dissolved ?",Active,Active
8,What is the year of incorporation?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, What is the year of incorporation of the company #?",1909,1909
9,Who are the shareholders of the company along with the percentage of ownership?,Discovery,google/flan-t5-xxl,20,,,"from below text, Who are the shareholders of the company # along with the percentage of ownership?",,
10,Who is the ultimate owner of the company?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who is the ultimate owner of the company #?",,
11,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, Who are the key controllers and authorized signatories of the company #?",,
12,What is the industry type/SIC/NICS code of the company?,KYCSummary,google/flan-t5-xxl,20,,,"from below text, What is the industry type/SIC/NICS code of the company #?",,
13,What are the products utilized by the company?,KYCSummary,google/flan-ul2,20,X,,"from below text, What are the products manufactured by the company #?",,
14,What is/are operation location/s or jurisdiction/s?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is/are operation location/s or jurisdiction/s of the comoany #?",,
15,Number of employees of the firm,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the Number of employees of the company #?",,
16,Name of the subsidiary of the company,Discovery,google/flan-t5-xxl,20,,,"from below text, find the Name of the subsidiary of the company #?",,
17,What is the Legal entity Type of the organization ex; publicly traded/limited liability etc.,Discovery,google/flan-t5-xxl,30,X,,"from below text, What is the Legal entity Type of the organization # ex; publicly traded or limited liability or Private limited? etc.",,
18,What is the turnover or revenue of the organization?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the turnover or revenue of the organization #?",,
19,Certificate/licence issued by the government.,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the Certificate/licence issued by the government for company #?",,
20,Whats is the next date of confirmation statement?,Discovery,google/flan-t5-xxl,30,X,BP P.L.C,"from below text, find the next date of confirmation statement for company #?",30/06/24,30/06/24
31 changes: 13 additions & 18 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 3 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"start": "nest start",
"start:dev": "nest start --watch",
"start:debug": "nest start --debug --watch",
"start:prod": "node dist/main",
"start:prod": "node dist/src/main",
"lint": "eslint \"{src,apps,libs,test}/**/*.ts\" --fix",
"test": "jest",
"test:watch": "jest --watch",
Expand Down Expand Up @@ -46,8 +46,7 @@
"optional-js": "^2.3.0",
"reflect-metadata": "^0.1.13",
"rxjs": "^7.8.1",
"stream-to-blob": "^2.0.1",
"striptags": "^3.2.0"
"stream-to-blob": "^2.0.1"
},
"devDependencies": {
"@nestjs/cli": "^10.2.1",
Expand All @@ -56,10 +55,10 @@
"@types/express": "^4.17.21",
"@types/html-to-text": "^9.0.3",
"@types/jest": "^29.5.7",
"@types/jsdom": "^21.1.5",
"@types/mime": "^3.0.3",
"@types/multer": "^1.4.9",
"@types/node": "^20.8.10",
"@types/striptags": "^3.1.1",
"@types/supertest": "^2.0.15",
"@typescript-eslint/eslint-plugin": "^6.10.0",
"@typescript-eslint/parser": "^6.10.0",
Expand Down
8 changes: 4 additions & 4 deletions src/services/data-extraction/data-extraction.csv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ import {first, parseCsv} from "../../utils";
const csvFile: string = `ID,Question,Source,Model,Token,PoCScope,Company,Prompt,Expected Answer,watsonx Response
1,What is Name and trading name of the organization?,Discovery,google/flan-t5-xxl,20,,,"From below text,What is Name and trading name of the organization #?",,
2,What is the registered address of the company?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,From below text find the registered address of the company #?,"1 St James's Square, London, SW1Y 4PD","1 St James's Square, London, SW1Y 4PD"
3,What is the business/trading address of the company?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the business/trading address of the company #?",,
3,What is the business/trading address of the company?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the business / trading Address of the Company #?",,
4,What is identification number of the organization?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,from below text find identification number of the organization #? ,102498,102498
5,Who are the key controllers and authorized signatories?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who are the key controllers and authorized signatories of the company #?",,
6,Names all the active directors of the company.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the names of all active directors of the company # in sequence ?","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela","LUND, Helge BLANC, Amanda Jayne DALEY, Pamela"
7,"What is the status of the organization ex; active, dissolved?",Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, what is the status of the organization # ?",Active,Active
7,"What is the status of the organization ex; active, dissolved?",Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, what is the status of the organization # ex: Active or Dissolved ?",Active,Active
8,What is the year of incorporation?,Discovery,google/flan-t5-xxl,20,X,BP P.L.C,"from below text, What is the year of incorporation of the company #?",1909,1909
9,Who are the shareholders of the company along with the percentage of ownership?,Discovery,google/flan-t5-xxl,20,,,"from below text, Who are the shareholders of the company # along with the percentage of ownership?",,
10,Who is the ultimate owner of the company?,KYCSummary,meta-llama/llama-2-70b-chat,30,,,"from below text, Who is the ultimate owner of the company #?",,
Expand All @@ -23,10 +23,10 @@ const csvFile: string = `ID,Question,Source,Model,Token,PoCScope,Company,Prompt,
14,What is/are operation location/s or jurisdiction/s?,Discovery,google/flan-t5-xxl,20,,,"from below text, What is/are operation location/s or jurisdiction/s of the comoany #?",,
15,Number of employees of the firm,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the Number of employees of the company #?",,
16,Name of the subsidiary of the company,Discovery,google/flan-t5-xxl,20,,,"from below text, find the Name of the subsidiary of the company #?",,
17,What is the Legal entity Type of the organization ex; publicly traded/limited liability etc.,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, What is the Legal entity Type of the organization # ex; publicly traded/limited liability? etc.",,
17,What is the Legal entity Type of the organization ex; publicly traded/limited liability etc.,Discovery,google/flan-t5-xxl,30,X,,"from below text, What is the Legal entity Type of the organization # ex; publicly traded or limited liability or Private limited? etc.",,
18,What is the turnover or revenue of the organization?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,,"from below text, find the turnover or revenue of the organization #?",,
19,Certificate/licence issued by the government.,Discovery,google/flan-t5-xxl,20,,,"from below text, What is the Certificate/licence issued by the government for company #?",,
20,Whats is the next date of confirmation statement?,KYCSummary,meta-llama/llama-2-70b-chat,30,X,BP P.L.C,"from below text, find the next date of confirmation statement for company #?",30/06/24,30/06/24`
20,Whats is the next date of confirmation statement?,Discovery,google/flan-t5-xxl,30,X,BP P.L.C,"from below text, find the next date of confirmation statement for company #?",30/06/24,30/06/24`

export interface DataExtractionConfig extends DataExtractionQuestionModel {
source: string;
Expand Down
20 changes: 12 additions & 8 deletions src/services/data-extraction/data-extraction.impl.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import * as process from "process";
import {IamAuthenticator, IamTokenManager} from "ibm-cloud-sdk-core";
import DiscoveryV2 = require("ibm-watson/discovery/v2");
const stripTags = require("striptags");
import axios from "axios";

import {DataExtractionApi} from "./data-extraction.api";
Expand All @@ -10,7 +9,7 @@ import {createDiscoveryV2} from "../../utils/discovery-v2";
import {DataExtractionConfig, DataExtractionCsv} from "./data-extraction.csv";
import {kycCaseSummaryApi, KycCaseSummaryApi} from "../kyc-case-summary";
import {DataExtractionResultModel} from "../../models";
import {first, GenAiModel, GenerativeResponse, stripUrls} from "../../utils";
import {first, GenAiModel, GenerativeResponse, stripUrls, stripTags} from "../../utils";
import PQueue from "../../utils/p-queue";

const concurrency = parseInt(process.env.FIND_PASSAGE_CONCURRENCY || '8')
Expand Down Expand Up @@ -161,8 +160,6 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends, Contex

const passages: string[] = this.handleDiscoveryResponse(response.result, customer, passagesPerDocument)

console.log('Finding relevant passages')

const text: string = await this.findRelevantPassages(naturalLanguageQuery, passages)

console.log('1. Text extracted from Discovery:', {naturalLanguageQuery, text})
Expand All @@ -177,9 +174,18 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends, Contex
? this.handleDiscoveryPassages(result)
: this.handleDiscoveryResult(result, subject);

return passages
const cleanPassages = passages
.map(stripTags)
.map(stripUrls)

cleanPassages.forEach((cleanPassage: string, index: number) => {
const originalPassage = passages[index]
if (cleanPassage.length !== originalPassage.length) {
console.log('Passage changed', {originalPassage, cleanPassage})
}
})

return cleanPassages
}

filterDocuments(result: DiscoveryV2.QueryResponse, subject: string): DiscoveryV2.QueryResult[] {
Expand Down Expand Up @@ -214,8 +220,6 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends, Contex

return await queue
.add(async () => {
console.log('Getting relevant passage')

const relevantPassage = await axios
.post<{relevant_passage: string} | string>(url, {question, passages})
.then(response => {
Expand All @@ -231,7 +235,7 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends, Contex
return passages.join('\n')
})

console.log('Found relevant passage: ', {relevantPassage})
console.log('0. Found relevant passage: ', {relevantPassage})

return relevantPassage
}) as string
Expand Down
1 change: 1 addition & 0 deletions src/utils/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ export * from './stream-to-buffer';
export * from './url-to-stream';
export * from './validate-url';
export * from './strip-urls';
export * from './strip-tags';
1 change: 1 addition & 0 deletions src/utils/strip-tags/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export * from './strip-tags'
7 changes: 7 additions & 0 deletions src/utils/strip-tags/strip-tags.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import {JSDOM} from 'jsdom'

/**
 * Strips markup from a string by handing it to JSDOM and reading back the
 * text content of the resulting document body.
 *
 * @param text - Input that may contain HTML tags.
 * @returns The body's `textContent`, or an empty string when that value is
 *          null or empty.
 */
export const stripTags = (text: string): string => {
  const {window} = new JSDOM(text);
  const bodyText = window.document.body.textContent;

  return bodyText || '';
}
5 changes: 4 additions & 1 deletion src/utils/strip-urls/strip-urls.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
/**
 * Removes URL- and path-like fragments from a passage of text.
 *
 * Three substitutions run in order:
 *  1. every `http://` or `https://` URL (global match; the URL runs until the
 *     next whitespace character other than a newline);
 *  2. the first run of slash-separated alphanumeric segments with at least
 *     two slashes, e.g. `a/b/c`;
 *  3. the first slash-led path that directly follows a space or `(` - the
 *     leading space or parenthesis is kept.
 *
 * NOTE(review): steps 2 and 3 carry no `g` flag, so only the first match of
 * each is removed, and step 2 also matches slash-separated dates such as
 * `30/06/24` - confirm both are intended.
 *
 * @param text - Passage to clean.
 * @returns The passage with the matched fragments removed.
 */
export const stripUrls = (text: string): string => {
  // Single return: a stale early return that applied only the first
  // substitution previously made the two path-stripping steps unreachable.
  return text
    .replace(/https?:\/\/[\n\S]+/g, '')
    .replace(/[0-9A-Za-z]+\/[0-9A-Za-z]+\/[0-9A-Za-z\/]+/, '')
    .replace(/([ (])\/[0-9A-Za-z\/]+/, '$1')
}

0 comments on commit 10ecdf6

Please sign in to comment.