diff --git a/packages/gollm/common/prompts/dataset_enrichment.py b/packages/gollm/common/prompts/dataset_enrichment.py index 3bfae37c5d..81ee793778 100644 --- a/packages/gollm/common/prompts/dataset_enrichment.py +++ b/packages/gollm/common/prompts/dataset_enrichment.py @@ -5,6 +5,8 @@ Focus on providing detailed and informative descriptions of the dataset and its columns. Do not make up information, only use the information provided in the document and dataset stats. Focus on trying to determine which columns represent spatial or temporal data, and provide a detailed description of the data in these columns. +Try to extract information for sourceData, considerationsForUsingTheData, and additionalInformation from the document. Do not make up information. If you cannot find the information, leave a field empty. + Use the following dataset statistics as a reference: ---START DATASET--- diff --git a/packages/gollm/schemas/dataset_enrichment.json b/packages/gollm/schemas/dataset_enrichment.json index 3528122609..889b90f084 100644 --- a/packages/gollm/schemas/dataset_enrichment.json +++ b/packages/gollm/schemas/dataset_enrichment.json @@ -25,15 +25,15 @@ "type": ["object", "null"], "properties": { "personalAndSensitiveInformation": { - "type": "string", + "type": ["string", "null"], "description": "Statement of whether the dataset contains other data that might be considered sensitive (e.g., data that reveals racial or ethnic origins, financial or health data, etc.)." }, "discussionOfBiases": { - "type": "string", + "type": ["string", "null"], "description": "Discussion of any biases present in the data (e.g., missing data, imbalanced classes, label noise, etc.)." }, "otherKnownLimitations": { - "type": "string", + "type": ["string", "null"], "description": "Any other limitations in the data." } }, @@ -48,26 +48,26 @@ "type": ["object", "null"], "properties": { "datasetCurators": { - "type": "array", + "type": ["array", "null"], "description": "The people involved in collecting the dataset and their affiliation(s).", "items": { "type": "string" } }, "licensingInformation": { - "type": "string", + "type": ["string", "null"], "description": "The license and link to the license webpage if available." }, "citationInformation": { - "type": "string", + "type": ["string", "null"], "description": "The BibTex-formatted reference for the dataset." }, "datasetHomepage": { - "type": "string", + "type": ["string", "null"], "description": "The homepage of the dataset." }, "additionalLinks": { - "type": "array", + "type": ["array", "null"], "description": "Additional links related to the dataset.", "items": { "type": "string" diff --git a/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/document/DocumentAsset.java b/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/document/DocumentAsset.java index 4070f58f4f..8e78ef1df1 100644 --- a/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/document/DocumentAsset.java +++ b/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/document/DocumentAsset.java @@ -28,7 +28,6 @@ import software.uncharted.terarium.hmiserver.annotations.TSModel; import software.uncharted.terarium.hmiserver.annotations.TSOptional; import software.uncharted.terarium.hmiserver.models.TerariumAsset; -import software.uncharted.terarium.hmiserver.models.TerariumAssetEmbeddingType; import software.uncharted.terarium.hmiserver.models.dataservice.Grounding; @EqualsAndHashCode(callSuper = true) @@ -94,9 +93,10 @@ public List getExtractions() { if ( this.extractions.size() == 0 && this.fileNames.size() > 0 && - (this.fileNames.get(0).endsWith(".txt") || this.fileNames.get(0).endsWith(".md")) + (this.fileNames.get(0).endsWith(".txt") || this.fileNames.get(0).endsWith(".md")) && + this.text != null ) { - extractions = List.of(new ExtractedDocumentPage().setPageNumber(1).setText(text)); + extractions = List.of(new ExtractedDocumentPage().setPageNumber(1).setText(this.text)); } return this.extractions; } diff --git a/packages/server/src/main/java/software/uncharted/terarium/hmiserver/service/tasks/EnrichDatasetResponseHandler.java b/packages/server/src/main/java/software/uncharted/terarium/hmiserver/service/tasks/EnrichDatasetResponseHandler.java index 82b2762b83..2d80c7f628 100644 --- a/packages/server/src/main/java/software/uncharted/terarium/hmiserver/service/tasks/EnrichDatasetResponseHandler.java +++ b/packages/server/src/main/java/software/uncharted/terarium/hmiserver/service/tasks/EnrichDatasetResponseHandler.java @@ -70,6 +70,31 @@ public static class Properties { Boolean overwrite; } + private void removeNullNodes(ObjectNode objectNode) { + Iterator> fields = objectNode.fields(); + List keysToRemove = new ArrayList<>(); + + while (fields.hasNext()) { + Map.Entry entry = fields.next(); + JsonNode value = entry.getValue(); + if (value.isNull()) { + keysToRemove.add(entry.getKey()); + } else if (value.isObject()) { + removeNullNodes((ObjectNode) value); + } else if (value.isArray()) { + for (JsonNode arrayItem : value) { + if (arrayItem.isObject()) { + removeNullNodes((ObjectNode) arrayItem); + } + } + } + } + + for (String key : keysToRemove) { + objectNode.remove(key); + } + } + @Override public TaskResponse onSuccess(final TaskResponse resp) { try { @@ -88,24 +113,9 @@ public TaskResponse onSuccess(final TaskResponse resp) { // Update the dataset with the new card ((ObjectNode) dataset.getMetadata()).set("dataCard", response.response.card); - // Remove fields from the datacard that are null or empty + // Remove fields from the datacard that are null final ObjectNode dataCard = (ObjectNode) dataset.getMetadata().get("dataCard"); - Iterator> fields = dataCard.fields(); - List keysToRemove = new ArrayList<>(); - - while (fields.hasNext()) { - Map.Entry entry = fields.next(); - if ( - entry.getValue().isNull() || - entry.getValue().asText().equalsIgnoreCase("none") || - entry.getValue().asText().equalsIgnoreCase("null") - ) { - keysToRemove.add(entry.getKey()); - } - } - for (String key : keysToRemove) { - dataCard.remove(key); - } + removeNullNodes(dataCard); ((ObjectNode) dataset.getMetadata()).put( "description", @@ -125,10 +135,6 @@ public TaskResponse onSuccess(final TaskResponse resp) { metadata.put("description", description); metadata.put("unit", unit); - // Based on the name, description, fetch the best grounding available and add it to the metadata - // final DKG grounding = dkgService.knnSearchEpiDKG(0, 1, 1, name + " " + description, null) - // metadata.put("grounding", grounding); - dataset .getColumns() .stream()