Skip to content

Commit

Permalink
fix issue where LLM makes up information when enriching datasets (#6087)
Browse files Browse the repository at this point in the history
  • Loading branch information
dgauldie authored Jan 16, 2025
1 parent 08c4174 commit c968eb1
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 32 deletions.
2 changes: 2 additions & 0 deletions packages/gollm/common/prompts/dataset_enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
Focus on providing detailed and informative descriptions of the dataset and its columns. Do not make up information, only use the information provided in the document and dataset stats.
Focus on trying to determine which columns represent spatial or temporal data, and provide a detailed description of the data in these columns.
Try to extract information for sourceData, considerationsForUsingTheData, and additionalInformation from the document. Do not make up information. If you cannot find the information for one of these fields, leave that field empty.
Use the following dataset statistics as a reference:
---START DATASET---
Expand Down
16 changes: 8 additions & 8 deletions packages/gollm/schemas/dataset_enrichment.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@
"type": ["object", "null"],
"properties": {
"personalAndSensitiveInformation": {
"type": "string",
"type": ["string", "null"],
"description": "Statement of whether the dataset contains other data that might be considered sensitive (e.g., data that reveals racial or ethnic origins, financial or health data, etc.)."
},
"discussionOfBiases": {
"type": "string",
"type": ["string", "null"],
"description": "Discussion of any biases present in the data (e.g., missing data, imbalanced classes, label noise, etc.)."
},
"otherKnownLimitations": {
"type": "string",
"type": ["string", "null"],
"description": "Any other limitations in the data."
}
},
Expand All @@ -48,26 +48,26 @@
"type": ["object", "null"],
"properties": {
"datasetCurators": {
"type": "array",
"type": ["array", "null"],
"description": "The people involved in collecting the dataset and their affiliation(s).",
"items": {
"type": "string"
}
},
"licensingInformation": {
"type": "string",
"type": ["string", "null"],
"description": "The license and link to the license webpage if available."
},
"citationInformation": {
"type": "string",
"type": ["string", "null"],
"description": "The BibTex-formatted reference for the dataset."
},
"datasetHomepage": {
"type": "string",
"type": ["string", "null"],
"description": "The homepage of the dataset."
},
"additionalLinks": {
"type": "array",
"type": ["array", "null"],
"description": "Additional links related to the dataset.",
"items": {
"type": "string"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import software.uncharted.terarium.hmiserver.annotations.TSModel;
import software.uncharted.terarium.hmiserver.annotations.TSOptional;
import software.uncharted.terarium.hmiserver.models.TerariumAsset;
import software.uncharted.terarium.hmiserver.models.TerariumAssetEmbeddingType;
import software.uncharted.terarium.hmiserver.models.dataservice.Grounding;

@EqualsAndHashCode(callSuper = true)
Expand Down Expand Up @@ -94,9 +93,10 @@ public List<ExtractedDocumentPage> getExtractions() {
if (
this.extractions.size() == 0 &&
this.fileNames.size() > 0 &&
(this.fileNames.get(0).endsWith(".txt") || this.fileNames.get(0).endsWith(".md"))
(this.fileNames.get(0).endsWith(".txt") || this.fileNames.get(0).endsWith(".md")) &&
this.text != null
) {
extractions = List.of(new ExtractedDocumentPage().setPageNumber(1).setText(text));
extractions = List.of(new ExtractedDocumentPage().setPageNumber(1).setText(this.text));
}
return this.extractions;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,31 @@ public static class Properties {
Boolean overwrite;
}

/**
 * Recursively strips explicit JSON {@code null} fields from the given object node.
 * Object-valued fields are cleaned in place; for array-valued fields, each object
 * element is cleaned in place (array elements themselves are never removed).
 * Keys are collected first and removed afterwards so the field iterator is never
 * invalidated mid-traversal.
 */
private void removeNullNodes(ObjectNode objectNode) {
	final List<String> nullKeys = new ArrayList<>();

	objectNode
		.fields()
		.forEachRemaining(entry -> {
			final JsonNode child = entry.getValue();
			if (child.isNull()) {
				// Defer removal until iteration is complete.
				nullKeys.add(entry.getKey());
			} else if (child.isObject()) {
				removeNullNodes((ObjectNode) child);
			} else if (child.isArray()) {
				// Clean nested objects inside arrays; non-object elements are left as-is.
				for (final JsonNode element : child) {
					if (element.isObject()) {
						removeNullNodes((ObjectNode) element);
					}
				}
			}
		});

	nullKeys.forEach(objectNode::remove);
}

@Override
public TaskResponse onSuccess(final TaskResponse resp) {
try {
Expand All @@ -88,24 +113,9 @@ public TaskResponse onSuccess(final TaskResponse resp) {
// Update the dataset with the new card
((ObjectNode) dataset.getMetadata()).set("dataCard", response.response.card);

// Remove fields from the datacard that are null or empty
// Remove fields from the datacard that are null
final ObjectNode dataCard = (ObjectNode) dataset.getMetadata().get("dataCard");
Iterator<Map.Entry<String, JsonNode>> fields = dataCard.fields();
List<String> keysToRemove = new ArrayList<>();

while (fields.hasNext()) {
Map.Entry<String, JsonNode> entry = fields.next();
if (
entry.getValue().isNull() ||
entry.getValue().asText().equalsIgnoreCase("none") ||
entry.getValue().asText().equalsIgnoreCase("null")
) {
keysToRemove.add(entry.getKey());
}
}
for (String key : keysToRemove) {
dataCard.remove(key);
}
removeNullNodes(dataCard);

((ObjectNode) dataset.getMetadata()).put(
"description",
Expand All @@ -125,10 +135,6 @@ public TaskResponse onSuccess(final TaskResponse resp) {
metadata.put("description", description);
metadata.put("unit", unit);

// Based on the name, description, fetch the best grounding available and add it to the metadata
// final DKG grounding = dkgService.knnSearchEpiDKG(0, 1, 1, name + " " + description, null)
// metadata.put("grounding", grounding);

dataset
.getColumns()
.stream()
Expand Down

0 comments on commit c968eb1

Please sign in to comment.