Skip to content

Commit

Permalink
Update data extract logic (#82)
Browse files Browse the repository at this point in the history
- Change filter logic and passage handling
- Update base container image version
- Update token count for data extract watsonx call

Signed-off-by: Sean Sundberg <[email protected]>
  • Loading branch information
seansund authored Oct 6, 2023
1 parent 54a6661 commit 5db3055
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 9 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM registry.access.redhat.com/ubi9/nodejs-18:1-62.1692771036 AS builder
FROM registry.access.redhat.com/ubi9/nodejs-18:1-70.1695740477 AS builder

WORKDIR /opt/app-root/src

Expand All @@ -9,7 +9,7 @@ RUN mkdir -p /opt/app-root/src/node_modules && \
npm ci && \
npm run build

FROM registry.access.redhat.com/ubi9/nodejs-18:1-62.1692771036
FROM registry.access.redhat.com/ubi9/nodejs-18:1-70.1695740477

## Uncomment the below lines to update image security content if any
# USER root
Expand Down
45 changes: 38 additions & 7 deletions src/services/data-extraction/data-extraction.impl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import {DataExtractionConfig, DataExtractionCsv} from "./data-extraction.csv";
import {DataExtractionResultModel} from "../../models";
import {first, GenAiModel, GenerativeResponse} from "../../utils";
import {createDiscoveryV2} from "../../utils/discovery-v2";
import {promises} from "fs";
import {join} from "path";

export interface DataExtractionBackendConfig {
identityUrl: string;
Expand All @@ -22,6 +24,7 @@ export interface DataExtractionBackendConfig {
discoveryApiKey: string;
discoveryVersion: string;
discoveryProjectId: string;
documentCount: number;

kycProjectId: string;
kycCollectionId: string;
Expand All @@ -39,13 +42,14 @@ export const buildDataExtractionBackendConfig = (): DataExtractionBackendConfig
wmlProjectId: process.env.WML_PROJECT_ID || '05ba9d92-734e-4b34-a672-f727a2c26440',

decodingMethod: process.env.DECODING_METHOD || 'greedy',
maxNewTokens: parseInt(process.env.MAX_NEW_TOKENS || '100'),
maxNewTokens: parseInt(process.env.MAX_NEW_TOKENS || '20'),
repetitionPenalty: parseInt(process.env.REPETITION_PENALTY || '1'),

discoveryUrl: process.env.DISCOVERY_URL || 'https://api.us-south.discovery.watson.cloud.ibm.com/instances/0992769e-726a-4ab0-a9d9-4352e204cc87',
discoveryApiKey: process.env.DISCOVERY_API_KEY,
discoveryVersion: process.env.DISCOVERY_VERSION || '2020-08-30',
discoveryProjectId: process.env.DISCOVERY_PROJECT_ID || '303aab25-cb4f-4b28-b8d2-30e23e39a37f',
documentCount: parseInt(process.env.DOCUMENT_COUNT || '5'),

kycProjectId: process.env.KYC_PROJECT_ID || '303aab25-cb4f-4b28-b8d2-30e23e39a37f',
kycCollectionId: process.env.KYC_COLLECTION_ID,
Expand Down Expand Up @@ -103,12 +107,12 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends> implem
async queryDiscovery(customer: string, config: DataExtractionConfig, backends: WatsonBackends): Promise<string> {
const naturalLanguageQuery = config.question + ' ' + customer;

const passagesPerDocument = false;
const passagesPerDocument = true;
const response: DiscoveryV2.Response<DiscoveryV2.QueryResponse> = await backends.discovery.query({
projectId: this.backendConfig.discoveryProjectId,
naturalLanguageQuery,
count: 5,
filter: `enriched_text.entities.type:Organization,enriched_text.entities.text:${customer}`,
count: this.backendConfig.documentCount,
// filter: `enriched_text.entities.type:Organization,enriched_text.entities.text:${customer}`,
passages: {
enabled: true,
per_document: passagesPerDocument,
Expand All @@ -118,7 +122,7 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends> implem

const text = !passagesPerDocument
? this.handleDiscoveryPassages(response.result)
: this.handleDiscoveryResult(response.result);
: this.handleDiscoveryResult(response.result, customer);

console.log('1. Text extracted from Discovery:', {naturalLanguageQuery, text})

Expand All @@ -127,8 +131,16 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends> implem
return text;
}

handleDiscoveryResult(result: DiscoveryV2.QueryResponse): string {
return result.results
/**
 * Narrows a Discovery query response to the documents that mention the subject.
 *
 * A document is kept when at least one of the 'Organization' entities found in
 * its `enriched_text` has text equal to `subject`, compared after lower-casing
 * both sides.
 *
 * @param result the Discovery query response whose `results` are filtered
 * @param subject the organization name to look for
 * @returns the matching query results, in their original order
 */
filterDocuments(result: DiscoveryV2.QueryResponse, subject: string): DiscoveryV2.QueryResult[] {
  const mentionsSubject = (doc: DiscoveryV2.QueryResult): boolean => {
    const organizationNames: string[] = extractEntities(doc.enriched_text, 'Organization')
    const normalizedNames = organizationNames.map(name => name.toLowerCase())

    return normalizedNames.includes(subject.toLowerCase())
  }

  return result.results.filter(mentionsSubject)
}

handleDiscoveryResult(result: DiscoveryV2.QueryResponse, customer: string): string {
return this.filterDocuments(result, customer)
.map(result => result.document_passages
.map(passage => passage.passage_text)
.join(' ')
Expand Down Expand Up @@ -193,3 +205,22 @@ export class DataExtractionImpl extends DataExtractionCsv<WatsonBackends> implem
}

}

/** A single entity annotation as read from a document's enriched text. */
interface Entity {
  model_name: string
  text: string
  type: string
}

/** One enrichment record carrying the entities detected for a document. */
interface EnrichedText {
  entities: Entity[]
}

/**
 * Collects the text of every entity whose type is one of `types`.
 *
 * Entities are returned in encounter order (enrichment by enrichment, entity by
 * entity) and are not de-duplicated.
 *
 * @param enrichedText enrichment records to scan; a missing (null/undefined)
 *   value is treated as "no entities" rather than an error
 * @param types entity types to keep, e.g. 'Organization'
 * @returns the `text` of each matching entity; empty when nothing matches
 */
const extractEntities = (enrichedText: EnrichedText[] | null | undefined, ...types: string[]): string[] => {
  // A document may carry no enrichment at all; report no entities instead of throwing.
  if (!Array.isArray(enrichedText)) {
    return []
  }

  const matches: string[] = []
  for (const enrichment of enrichedText) {
    // Defensive: skip records that lack an `entities` array.
    const entities = enrichment && enrichment.entities
    if (!Array.isArray(entities)) {
      continue
    }

    for (const entity of entities) {
      if (types.includes(entity.type)) {
        matches.push(entity.text)
      }
    }
  }

  return matches
}

0 comments on commit 5db3055

Please sign in to comment.