From e1611bb6311b7e0bf1237d6c3d71207ce040116f Mon Sep 17 00:00:00 2001
From: Phil Downey
Date: Fri, 12 Jan 2024 18:00:38 +1100
Subject: [PATCH] deep search update to include links back to deepsearch

---
 .../DS4SD/fn_search/fn_search_collection.py | 93 +++++++++++++++++--
 1 file changed, 86 insertions(+), 7 deletions(-)

diff --git a/openad/user_toolkits/DS4SD/fn_search/fn_search_collection.py b/openad/user_toolkits/DS4SD/fn_search/fn_search_collection.py
index 191ff7f1..770a2510 100644
--- a/openad/user_toolkits/DS4SD/fn_search/fn_search_collection.py
+++ b/openad/user_toolkits/DS4SD/fn_search/fn_search_collection.py
@@ -4,17 +4,24 @@
 # search collection 'patent-uspto' for '\"smiles#ccc(coc(=o)cs)(c(=o)c(=o)cs)c(=o)c(=o)cs\"' show (data)
 
 import re
+import base64
+import json
+import urllib.parse
+import os
+import sys
 from copy import deepcopy
 import readline
 import numpy as np
-from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource
+from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource, ElasticProjectDataCollectionSource
+from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union
 from deepsearch.cps.queries import DataQuery
 from openad.helpers.output import output_text, output_table, output_error
 from openad.app.global_var_lib import GLOBAL_SETTINGS
 from openad.plugins.style_parser import style
+from openad.helpers.credentials import load_credentials
+
+DEFAULT_URL = "https://sds.app.accelerate.science/"
 
-import os
-import sys
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, parent_dir)
@@ -50,6 +57,12 @@ def search_collection(inputs: dict, cmd_pointer):
     from tqdm import tqdm
     import pandas as pd
 
+    cred_file = load_credentials(os.path.expanduser(f"{cmd_pointer.home_dir}/deepsearch_api.cred"))
+
+    if cred_file["host"].strip() == "":
+        host = DEFAULT_URL
+    else:
+        host = cred_file["host"]
     search_query = ""
     val_index_key = "pubchem"
     page_size = 50
@@ -214,6 +227,10 @@ def search_collection(inputs: dict, cmd_pointer):
         while x < 20:
             x = x + 1
         result = {}
+        if "_id" in row and GLOBAL_SETTINGS["display"] == "notebook":
+            # result["ds_url"] = generate_url(host, data_collection, row["_id"])
+            result["DS_URL"] = make_clickable(generate_url(host, data_collection, row["_id"]), "Deep Search Web Link")
+
         if "description" in row["_source"]:
             if "title" in row["_source"]["description"]:
                 result["Title"] = row["_source"]["description"]["title"]
@@ -252,15 +269,15 @@ def search_collection(inputs: dict, cmd_pointer):
         if "identifiers" in row["_source"]:
             for ref in row["_source"]["identifiers"]:
                 if ref["type"] == "arxivid":
-                    result["arxivid"] = f'https://arxiv.org/abs/{ref["value"]}'
+                    result["arxivid"] = make_clickable(f'https://arxiv.org/abs/{ref["value"]}', "ARXIVID Link")
                 if ref["type"] == "doi":
-                    result["doi"] = f'https://doi.org/{ref["value"]}'
+                    result["doi"] = make_clickable(f'https://doi.org/{ref["value"]}', "DOI Link")
 
         if edit_distance > 0:
             for field in row.get("highlight", {}).keys():
                 for snippet in row["highlight"][field]:
                     result["Report"] = str(row["_source"]["file-info"]["filename"])
-                    result["Field"] = field
+                    result["Field"] = field.split(".")[0]
 
         if "attributes" in row["_source"]:
             for attribute in row["_source"]["attributes"]:
@@ -291,7 +308,7 @@ def search_collection(inputs: dict, cmd_pointer):
 
     if GLOBAL_SETTINGS["display"] == "notebook":
         if "return_as_data" not in inputs:
-            df = df.style.format(hyperlinks="html")
+            df = df.style
             df = df.set_properties(**{"text-align": "left"})
 
     elif GLOBAL_SETTINGS["display"] == "terminal":
@@ -316,3 +333,65 @@ def confirm_prompt(question: str) -> bool:
     reply = input(f"{question} (y/n): ").casefold()
     readline.remove_history_item(readline.get_current_history_length() - 1)
     return reply == "y"
+
+
+def make_clickable(url, name):
+    if GLOBAL_SETTINGS["display"] == "notebook":
+        return f'<a href="{url}" target="_blank"> {name} </a>'
+    else:
+        return url
+
+
+def generate_url(host, data_source, document_hash, item_index=None):
+    if isinstance(data_source, ElasticProjectDataCollectionSource):
+        proj_key = data_source.proj_key
+        index_key = data_source.index_key
+        select_coords = {
+            "privateCollection": index_key,
+        }
+        url = f"{host}/projects/{proj_key}/library/private/{index_key}"
+    elif isinstance(data_source, ElasticDataCollectionSource):
+        # TODO: remove hardcoding of community project
+        proj_key = "1234567890abcdefghijklmnopqrstvwyz123456"
+        index_key = data_source.index_key
+        select_coords = {
+            "collections": [index_key],
+        }
+        url = f"{host}/projects/{proj_key}/library/public"
+
+    hash_expr = f'file-info.document-hash: "{document_hash}"'
+    search_query = {
+        **select_coords,
+        "type": "Document",
+        "expression": hash_expr,
+        "filters": [],
+        "select": [
+            "_name",
+            "description.collection",
+            "prov",
+            "description.title",
+            "description.publication_date",
+            "description.url_refs",
+        ],
+        "itemIndex": 0,
+        "pageSize": 10,
+        "searchAfterHistory": [],
+        "viewType": "snippets",
+        "recordSelection": {
+            "record": {
+                "id": document_hash,
+            },
+        },
+    }
+    if item_index is not None:
+        search_query["recordSelection"]["itemIndex"] = item_index
+
+    encoded_query = urllib.parse.quote(
+        base64.b64encode(urllib.parse.quote(json.dumps(search_query, separators=(",", ":"))).encode("utf8")).decode(
+            "utf8"
+        )
+    )
+
+    url = f"{url}?search={encoded_query}"
+
+    return url
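
Note on the ?search= parameter (not part of the patch): generate_url() embeds the JSON query by percent-encoding it, base64-encoding the result, then percent-encoding once more. A minimal round-trip sketch that reproduces and reverses that encoding; the document hash and project key below are placeholders:

    import base64
    import json
    import urllib.parse

    def encode_query(search_query: dict) -> str:
        # Same nesting as generate_url(): JSON -> percent-encode -> base64 -> percent-encode
        return urllib.parse.quote(
            base64.b64encode(
                urllib.parse.quote(json.dumps(search_query, separators=(",", ":"))).encode("utf8")
            ).decode("utf8")
        )

    def decode_query(encoded: str) -> dict:
        # Reverse the steps to recover the query dict carried by a Deep Search link
        return json.loads(
            urllib.parse.unquote(base64.b64decode(urllib.parse.unquote(encoded)).decode("utf8"))
        )

    query = {"type": "Document", "expression": 'file-info.document-hash: "0123abcd"'}  # placeholder hash
    encoded = encode_query(query)
    assert decode_query(encoded) == query
    print(f"https://sds.app.accelerate.science/projects/<proj_key>/library/public?search={encoded}")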
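
Note on the notebook rendering (not part of the patch): replacing df.style.format(hyperlinks="html") with a plain Styler works because the cells now already contain anchor markup produced by make_clickable(), and the pandas Styler does not escape cell HTML by default. A self-contained sketch of that behaviour, with a made-up DataFrame and link target:

    import pandas as pd

    def make_clickable(url, name):
        # Same idea as the helper added in this patch: wrap the URL in an anchor tag
        return f'<a href="{url}" target="_blank"> {name} </a>'

    # Made-up row; in the real code the URL comes from generate_url()
    df = pd.DataFrame({"DS_URL": [make_clickable("https://sds.app.accelerate.science/", "Deep Search Web Link")]})
    styled = df.style.set_properties(**{"text-align": "left"})
    # In a Jupyter notebook the Styler renders the anchor as a clickable link
    print(styled.to_html())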