From 765e9ca923cb7ec86330816fca8260e2a70861e9 Mon Sep 17 00:00:00 2001 From: evans-g-crsj Date: Wed, 16 Oct 2024 12:57:49 -0400 Subject: [PATCH] :sparkles: new admin scripts to add docs --- admin/compare2Docs.mjs | 43 +++++++++++ admin/compareStudiesDupesIfExist.mjs | 62 +++++++++++++++ admin/findClinicalIndicesUsage.mjs | 3 +- admin/utils.mjs | 18 ++++- package-lock.json | 109 +++++++++++++++++++++++++++ package.json | 1 + 6 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 admin/compare2Docs.mjs create mode 100644 admin/compareStudiesDupesIfExist.mjs diff --git a/admin/compare2Docs.mjs b/admin/compare2Docs.mjs new file mode 100644 index 0000000..2e2b728 --- /dev/null +++ b/admin/compare2Docs.mjs @@ -0,0 +1,43 @@ +// example: node admin/compare2Docs.mjs --a:9tCMkJIB0zu1NPYlGUy3 --b:_NCMkJIB0zu1NPYlGUy3 --entity:participant_centric +import { Client } from '@elastic/elasticsearch'; +import { diffString } from 'json-diff'; +import assert from 'node:assert/strict'; + +import { esHost } from '../dist/src/env.js'; +import { ENTITIES } from './releaseStatsUtils.mjs'; + +const args = process.argv.slice(2); +const aArgument = args.find(x => x.startsWith('--a:')) ?? ''; +const a = aArgument.split('--a:')[1]; +const bArgument = args.find(x => x.startsWith('--b:')) ?? ''; +const b = bArgument.split('--b:')[1]; + +const entityArgument = args.find(x => x.startsWith('--entity:')) ?? ''; +const entity = entityArgument.split('--entity:')[1] || 'study_centric'; +assert(!!a && !!b, 'Missing docs values'); +assert(a !== b, 'a and b have the same value. Nothing to compare'); +assert( + Object.values(ENTITIES).some(x => x === entity), + 'Entity invalid', +); + +const client = new Client({ node: esHost }); + +const resp = await client.search({ + index: `${entity}*`, + body: { + query: { + ids: { + values: [a, b], + }, + }, + }, +}); + +const left = resp.body.hits.hits[0]; +const right = resp.body.hits.hits[1]; + +assert(left && right, 'Could not find at least one the 2 docs'); +assert(left._index.startsWith(right._index.split('_centric')[0]), 'Docs seem to be from different entities.'); + +console.log(diffString(left, right, { sort: true })); diff --git a/admin/compareStudiesDupesIfExist.mjs b/admin/compareStudiesDupesIfExist.mjs new file mode 100644 index 0000000..3621dcd --- /dev/null +++ b/admin/compareStudiesDupesIfExist.mjs @@ -0,0 +1,62 @@ +import { Client } from '@elastic/elasticsearch'; +import { diffString } from 'json-diff'; +import assert from 'node:assert/strict'; + +import { esHost } from '../dist/src/env.js'; +import { ENTITIES } from './releaseStatsUtils.mjs'; +import { binomialCoefficient, pairIt } from './utils.mjs'; + +const client = new Client({ node: esHost }); + +const STUDY_SEARCH_SIZE = 50; +const allStudiesSearchResponse = await client.search({ + index: `${ENTITIES.study_centric}`, + track_total_hits: true, + body: { + size: STUDY_SEARCH_SIZE, + }, +}); + +const total = allStudiesSearchResponse.body.hits.total.value; +assert(total > 0, 'No study found'); +assert(total < STUDY_SEARCH_SIZE, 'Not all studies were fetched, increase size in the script if needed'); + +const all = allStudiesSearchResponse.body.hits.hits; + +const dupes = all + .reduce((xs, x) => { + if (all.filter(s => x._index === s._index).length > 1) { + return [...xs, { ...x, studyCode: x._source.study_code }]; + } + return xs; + }, []) + .reduce((xs, x) => { + const studyCode = x.studyCode; + return { + ...xs, + [studyCode]: xs[studyCode] ? [...xs[studyCode], x] : [x], + }; + }, {}); + +const numberComparisons = Object.entries(dupes).reduce((xs, x) => { + const n = x[1].length; + return xs + binomialCoefficient(n, 2); +}, 0); + +const MAX_N_OF_COMPARISONS = 25; // Arbitrary value, need real-world testing. +const willNotExplode = numberComparisons <= MAX_N_OF_COMPARISONS; + +assert(willNotExplode, `Avoiding to compare for there are ${numberComparisons} comparisons to compute`); + +const allPairs = Object.fromEntries(Object.entries(dupes).map(x => [x[0], pairIt(x[1])])); + +Object.entries(allPairs).forEach(([code, pairs]) => { + console.log(`----- Showing diff for duplicates of ${code} (total of ${pairs.length} pairs) -----`); + pairs.forEach((p, index) => { + const left = p[0]; + const right = p[1]; + console.log(`pair #${index + 1} : ${left._id} vs ${right._id}`); + const diff = diffString(left, right, { sort: true }); + console.log(diff ? diff : `no diff found in this docs pair`); + }); +}); diff --git a/admin/findClinicalIndicesUsage.mjs b/admin/findClinicalIndicesUsage.mjs index 141f2a2..7e7b7eb 100644 --- a/admin/findClinicalIndicesUsage.mjs +++ b/admin/findClinicalIndicesUsage.mjs @@ -1,8 +1,9 @@ /** (WIP) Helper script to find unused indices * */ -import assert from 'node:assert/strict'; import { Client } from '@elastic/elasticsearch'; +import assert from 'node:assert/strict'; + import { esHost } from '../dist/src/env.js'; import { cbKeepClinicalPlusTranscriptomicsIndicesOnly, isIndexNameFromTranscriptomics } from './utils.mjs'; diff --git a/admin/utils.mjs b/admin/utils.mjs index aacd31d..c99fe2c 100644 --- a/admin/utils.mjs +++ b/admin/utils.mjs @@ -6,4 +6,20 @@ export const cbKeepClinicalPlusTranscriptomicsIndicesOnly = x => x.index.includes(stem), ); -export const isIndexNameFromTranscriptomics = index => index.includes('gene_exp'); \ No newline at end of file +export const isIndexNameFromTranscriptomics = index => index.includes('gene_exp'); + +//https://labex.io/tutorials/javascript-javascript-programming-fundamentals-28177 +export const binomialCoefficient = (n, k) => { + if (Number.isNaN(n) || Number.isNaN(k)) return NaN; + if (k < 0 || k > n) return 0; + if (k === 0 || k === n) return 1; + if (k === 1 || k === n - 1) return n; + if (n - k < k) k = n - k; + + let res = n; + for (let i = 2; i <= k; i++) res *= (n - i + 1) / i; + return Math.round(res); +}; + +//https://stackoverflow.com/questions/22566379/how-to-get-all-pairs-of-array-javascript +export const pairIt = l => l.map((v, i) => l.slice(i + 1).map(w => [v, w])).flat(); diff --git a/package-lock.json b/package-lock.json index 7b938db..bd84b0b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -42,6 +42,7 @@ "eslint-plugin-prettier": "^3.4.0", "eslint-plugin-simple-import-sort": "^7.0.0", "jest": "^27.0.6", + "json-diff": "^1.0.6", "jsonwebtoken": "^8.5.1", "prettier": "^1.19.1", "supertest": "^6.1.2", @@ -1028,6 +1029,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/@ewoudenberg/difflib": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/@ewoudenberg/difflib/-/difflib-0.1.0.tgz", + "integrity": "sha512-OU5P5mJyD3OoWYMWY+yIgwvgNS9cFAU10f+DDuvtogcWQOoJIsQ4Hy2McSfUfhKjq8L0FuWVb4Rt7kgA+XK86A==", + "dev": true, + "dependencies": { + "heap": ">= 0.2.0" + } + }, "node_modules/@humanwhocodes/config-array": { "version": "0.5.0", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.5.0.tgz", @@ -4111,6 +4121,18 @@ "node": ">=10" } }, + "node_modules/dreamopt": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/dreamopt/-/dreamopt-0.8.0.tgz", + "integrity": "sha512-vyJTp8+mC+G+5dfgsY+r3ckxlz+QMX40VjPQsZc5gxVAxLmi64TBoVkP54A/pRAXMXsbu2GMMBrZPxNv23waMg==", + "dev": true, + "dependencies": { + "wordwrap": ">=0.0.2" + }, + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/ecdsa-sig-formatter": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", @@ -5914,6 +5936,12 @@ "node": ">= 0.4" } }, + "node_modules/heap": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/heap/-/heap-0.2.7.tgz", + "integrity": "sha512-2bsegYkkHO+h/9MGbn6KWcE45cHZgPANo5LXF7EvWdT0yT2EguSVO1nDgU5c8+ZOPwp2vMNa7YFsJhVcDR9Sdg==", + "dev": true + }, "node_modules/hmac-drbg": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/hmac-drbg/-/hmac-drbg-1.0.1.tgz", @@ -8372,6 +8400,32 @@ "node": ">=4" } }, + "node_modules/json-diff": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/json-diff/-/json-diff-1.0.6.tgz", + "integrity": "sha512-tcFIPRdlc35YkYdGxcamJjllUhXWv4n2rK9oJ2RsAzV4FBkuV4ojKEDgcZ+kpKxDmJKv+PFK65+1tVVOnSeEqA==", + "dev": true, + "dependencies": { + "@ewoudenberg/difflib": "0.1.0", + "colors": "^1.4.0", + "dreamopt": "~0.8.0" + }, + "bin": { + "json-diff": "bin/json-diff.js" + }, + "engines": { + "node": "*" + } + }, + "node_modules/json-diff/node_modules/colors": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/colors/-/colors-1.4.0.tgz", + "integrity": "sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA==", + "dev": true, + "engines": { + "node": ">=0.1.90" + } + }, "node_modules/json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", @@ -11392,6 +11446,12 @@ "node": ">=0.10.0" } }, + "node_modules/wordwrap": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz", + "integrity": "sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==", + "dev": true + }, "node_modules/wrap-ansi": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", @@ -12359,6 +12419,15 @@ } } }, + "@ewoudenberg/difflib": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/@ewoudenberg/difflib/-/difflib-0.1.0.tgz", + "integrity": "sha512-OU5P5mJyD3OoWYMWY+yIgwvgNS9cFAU10f+DDuvtogcWQOoJIsQ4Hy2McSfUfhKjq8L0FuWVb4Rt7kgA+XK86A==", + "dev": true, + "requires": { + "heap": ">= 0.2.0" + } + }, "@humanwhocodes/config-array": { "version": "0.5.0", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.5.0.tgz", @@ -14779,6 +14848,15 @@ "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-10.0.0.tgz", "integrity": "sha512-rlBi9d8jpv9Sf1klPjNfFAuWDjKLwTIJJ/VxtoTwIR6hnZxcEOQCZg2oIL3MWBYw5GpUDKOEnND7LXTbIpQ03Q==" }, + "dreamopt": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/dreamopt/-/dreamopt-0.8.0.tgz", + "integrity": "sha512-vyJTp8+mC+G+5dfgsY+r3ckxlz+QMX40VjPQsZc5gxVAxLmi64TBoVkP54A/pRAXMXsbu2GMMBrZPxNv23waMg==", + "dev": true, + "requires": { + "wordwrap": ">=0.0.2" + } + }, "ecdsa-sig-formatter": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", @@ -16101,6 +16179,12 @@ "function-bind": "^1.1.2" } }, + "heap": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/heap/-/heap-0.2.7.tgz", + "integrity": "sha512-2bsegYkkHO+h/9MGbn6KWcE45cHZgPANo5LXF7EvWdT0yT2EguSVO1nDgU5c8+ZOPwp2vMNa7YFsJhVcDR9Sdg==", + "dev": true + }, "hmac-drbg": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/hmac-drbg/-/hmac-drbg-1.0.1.tgz", @@ -17923,6 +18007,25 @@ "integrity": "sha512-OYu7XEzjkCQ3C5Ps3QIZsQfNpqoJyZZA99wd9aWd05NCtC5pWOkShK2mkL6HXQR6/Cy2lbNdPlZBpuQHXE63gA==", "dev": true }, + "json-diff": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/json-diff/-/json-diff-1.0.6.tgz", + "integrity": "sha512-tcFIPRdlc35YkYdGxcamJjllUhXWv4n2rK9oJ2RsAzV4FBkuV4ojKEDgcZ+kpKxDmJKv+PFK65+1tVVOnSeEqA==", + "dev": true, + "requires": { + "@ewoudenberg/difflib": "0.1.0", + "colors": "^1.4.0", + "dreamopt": "~0.8.0" + }, + "dependencies": { + "colors": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/colors/-/colors-1.4.0.tgz", + "integrity": "sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA==", + "dev": true + } + } + }, "json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", @@ -20241,6 +20344,12 @@ "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.5.tgz", "integrity": "sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==" }, + "wordwrap": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz", + "integrity": "sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==", + "dev": true + }, "wrap-ansi": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", diff --git a/package.json b/package.json index 6ee6105..a486444 100644 --- a/package.json +++ b/package.json @@ -68,6 +68,7 @@ "eslint-plugin-prettier": "^3.4.0", "eslint-plugin-simple-import-sort": "^7.0.0", "jest": "^27.0.6", + "json-diff": "^1.0.6", "jsonwebtoken": "^8.5.1", "prettier": "^1.19.1", "supertest": "^6.1.2",