deep search update to include links back to deepsearch
Phil Downey committed Jan 12, 2024
1 parent 3817faa commit e1611bb
Showing 1 changed file with 86 additions and 7 deletions.
openad/user_toolkits/DS4SD/fn_search/fn_search_collection.py: 93 changes (86 additions & 7 deletions)
@@ -4,17 +4,24 @@
# search collection 'patent-uspto' for '\"smiles#ccc(coc(=o)cs)(c(=o)c(=o)cs)c(=o)c(=o)cs\"' show (data)

import re
import base64
import json
import urllib.parse
import os
import sys
from copy import deepcopy
import readline
import numpy as np
- from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource
+ from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource, ElasticProjectDataCollectionSource
from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union
from deepsearch.cps.queries import DataQuery
from openad.helpers.output import output_text, output_table, output_error
from openad.app.global_var_lib import GLOBAL_SETTINGS
from openad.plugins.style_parser import style
from openad.helpers.credentials import load_credentials

DEFAULT_URL = "https://sds.app.accelerate.science/"

import os
import sys

parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)
@@ -50,6 +57,12 @@ def search_collection(inputs: dict, cmd_pointer):
    from tqdm import tqdm
    import pandas as pd

    # Resolve the Deep Search host from the credentials file, falling back to the public SaaS URL
    cred_file = load_credentials(os.path.expanduser(f"{cmd_pointer.home_dir}/deepsearch_api.cred"))

    if cred_file["host"].strip() == "":
        host = DEFAULT_URL
    else:
        host = cred_file["host"]
    search_query = ""
    val_index_key = "pubchem"
    page_size = 50
@@ -214,6 +227,10 @@ def search_collection(inputs: dict, cmd_pointer):
    while x < 20:
        x = x + 1
    result = {}
    # Add a clickable Deep Search link column when rendering in a notebook
    if "_id" in row and GLOBAL_SETTINGS["display"] == "notebook":
        # result["ds_url"] = generate_url(host, data_collection, row["_id"])
        result["DS_URL"] = make_clickable(generate_url(host, data_collection, row["_id"]), "Deep Search Web Link")

    if "description" in row["_source"]:
        if "title" in row["_source"]["description"]:
            result["Title"] = row["_source"]["description"]["title"]
@@ -252,15 +269,15 @@ def search_collection(inputs: dict, cmd_pointer):
if "identifiers" in row["_source"]:
for ref in row["_source"]["identifiers"]:
if ref["type"] == "arxivid":
result["arxivid"] = f'https://arxiv.org/abs/{ref["value"]}'
result["arxivid"] = make_clickable(f'https://arxiv.org/abs/{ref["value"]}', "ARXIVID Link")
if ref["type"] == "doi":
result["doi"] = f'https://doi.org/{ref["value"]}'
result["doi"] = make_clickable(f'https://doi.org/{ref["value"]}', "DOI Link")

if edit_distance > 0:
for field in row.get("highlight", {}).keys():
for snippet in row["highlight"][field]:
result["Report"] = str(row["_source"]["file-info"]["filename"])
result["Field"] = field
result["Field"] = field.split(".")[0]

if "attributes" in row["_source"]:
for attribute in row["_source"]["attributes"]:
@@ -291,7 +308,7 @@

    if GLOBAL_SETTINGS["display"] == "notebook":
        if "return_as_data" not in inputs:
-           df = df.style.format(hyperlinks="html")
+           df = df.style
            df = df.set_properties(**{"text-align": "left"})

    elif GLOBAL_SETTINGS["display"] == "terminal":
@@ -316,3 +333,65 @@ def confirm_prompt(question: str) -> bool:
    reply = input(f"{question} (y/n): ").casefold()
    readline.remove_history_item(readline.get_current_history_length() - 1)
    return reply == "y"


def make_clickable(url, name):
    """Return an HTML anchor for notebook display; otherwise return the plain URL."""
    if GLOBAL_SETTINGS["display"] == "notebook":
        return f'<a href="{url}" target="_blank"> {name} </a>'
    else:
        return url


def generate_url(host, data_source, document_hash, item_index=None):
    """Build a Deep Search web-app URL that deep-links to a document in a collection."""
    if isinstance(data_source, ElasticProjectDataCollectionSource):
        proj_key = data_source.proj_key
        index_key = data_source.index_key
        select_coords = {
            "privateCollection": index_key,
        }
        url = f"{host}/projects/{proj_key}/library/private/{index_key}"
    elif isinstance(data_source, ElasticDataCollectionSource):
        # TODO: remove hardcoding of community project
        proj_key = "1234567890abcdefghijklmnopqrstvwyz123456"
        index_key = data_source.index_key
        select_coords = {
            "collections": [index_key],
        }
        url = f"{host}/projects/{proj_key}/library/public"
    else:
        # Guard: without this, select_coords and url would be unbound below
        raise ValueError(f"Unsupported data source type: {type(data_source).__name__}")

    hash_expr = f'file-info.document-hash: "{document_hash}"'
    search_query = {
        **select_coords,
        "type": "Document",
        "expression": hash_expr,
        "filters": [],
        "select": [
            "_name",
            "description.collection",
            "prov",
            "description.title",
            "description.publication_date",
            "description.url_refs",
        ],
        "itemIndex": 0,
        "pageSize": 10,
        "searchAfterHistory": [],
        "viewType": "snippets",
        "recordSelection": {
            "record": {
                "id": document_hash,
            },
        },
    }
    if item_index is not None:
        search_query["recordSelection"]["itemIndex"] = item_index

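    # Encode the query for the URL: percent-encode the JSON, base64-encode it,
    # then percent-encode the base64 string for the "search" query parameter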
    json_query = urllib.parse.quote(json.dumps(search_query, separators=(",", ":")))
    encoded_query = urllib.parse.quote(base64.b64encode(json_query.encode("utf8")).decode("utf8"))

    url = f"{url}?search={encoded_query}"

    return url
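
For illustration, here is a minimal sketch of how the two new helpers combine end to end. The host, collection, and document hash below are placeholders, and the ElasticDataCollectionSource(elastic_id=..., index_key=...) constructor follows the deepsearch-toolkit examples rather than anything shown in this diff:

from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource

# Placeholder inputs, for illustration only
host = "https://sds.app.accelerate.science"
collection = ElasticDataCollectionSource(elastic_id="default", index_key="patent-uspto")
doc_hash = "0123456789abcdef"  # hypothetical document hash

url = generate_url(host, collection, doc_hash)
link = make_clickable(url, "Deep Search Web Link")
# In a notebook (GLOBAL_SETTINGS["display"] == "notebook"), link is an HTML
# anchor; in the terminal it is the raw URL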
