Skip to content

Commit

Permalink
crawler update; gui adjustments
Browse files Browse the repository at this point in the history
  • Loading branch information
BobHanson committed Nov 18, 2024
1 parent 5b20585 commit b56b511
Show file tree
Hide file tree
Showing 7 changed files with 176 additions and 75 deletions.
1 change: 1 addition & 0 deletions docs/examples/assets/FAIRSpec-config.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,6 @@ IFD = {
jmeReturn: null,
smarts: null, // just for reference
canvas: null,
contentHeader: null,
cache: {}
}
29 changes: 18 additions & 11 deletions docs/examples/assets/FAIRSpec-gui.js
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@
var sample = IFD.collections[aidID].samples[id];
var specids = IFD.getSpectrumIDsForSample(aidID, id);
var structureIDs = IFD.getStructureIDsForSpectra(aidID,specids);
var s = getHeader("Sample " + id);
var s = getHeader("Sample/s", "Sample " + id);
s += showCompoundStructures(aidID,structureIDs, false);
var smiles = IFD.getSmilesForStructureID(aidID, structureIDs[0]);
s += showCompoundSpectra(aidID,specids,smiles,true);
Expand Down Expand Up @@ -501,7 +501,7 @@
var sampleID = spec.properties && spec.properties.originating_sample_id;
var sid = (IFD.byID ? id : spec.id);
var s = "<table padding=3><tr><td valign=top>"
+ getHeader("Spectrum " + sid) + "<h3>"
+ getHeader("Spectrum/a ", "Spectrum " + sid) + "<h3>"
+ (sampleID ? "&nbsp;&nbsp;&nbsp; sample " + sampleID : "")
+ "</h3></td>";
var title = getObjectProperty(spec, "expt_title");
Expand All @@ -515,7 +515,8 @@
return s;
}

var getHeader = function(name, description) {
var getHeader = function(types, name, description) {
IFD.contentHeader = types;
var key = removeSpace(name) + "_" + ++divId
IFD.headers.push([key,name]);
return "<a name=\"" + key + "\"><h3>" + name + "</h3></a>"
Expand All @@ -530,7 +531,7 @@
var props = cmpd.properties;
var params = cmpd.attributes;
var label = cmpd.label || cmpd.id;
var s = getHeader(label.startsWith("Compound") ? label : "Compound " + label, cmpd.description);
var s = getHeader("Compound/s", label.startsWith("Compound") ? label : "Compound " + label, cmpd.description);
s += "<table>" + addPropertyRows("",props, null, false) + "</table>"
s += "<table>" + addPropertyRows("",params, null, false) + "</table>"

Expand Down Expand Up @@ -584,7 +585,7 @@
s += "<td rowspan=2 valign=\"top\">";
if (showID) {
var h = (id.indexOf("Structure") == 0 ? removeUnderline(sid) : "Structure " + sid);
s += "<span class=structurehead>"+ (IFD.resultsMode == MODE_STRUCTURES ? getHeader(h) : h) + "</span><br>";
s += "<span class=structurehead>"+ (IFD.resultsMode == MODE_STRUCTURES ? getHeader("Structure/s", h) : h) + "</span><br>";
}
v = IFD.getStructureVisual(reps);
if (v){
Expand Down Expand Up @@ -636,8 +637,13 @@

var loadContents = function(hasContent) {
clearJQ("#contents");
var s = "<table>";
for (var i = 0; i < IFD.headers.length; i++) {
if (!hasContent)
return;
var n = IFD.headers.length;
var type = IFD.contentHeader.split("/");
type = (n == 1 ? type[0] : type[0].substring(0, type[0].length + 1 - type[1].length) + type[1]);
var s = "<b>" + n + " " + type + "</b><br><table>";
for (var i = 0; i < n; i++) {
var h = IFD.headers[i];
var key = h[0];
var val = h[1];
Expand Down Expand Up @@ -706,7 +712,7 @@
if (r.data) {
if (r.data.indexOf(";base64") == 0) {
var imgTag = "<img id=img" + (++divId) + " onload=IFD.checkImage(" + divId + ")" + " src=\"" + "data:" + r.mediaType + r.data + "\"</img>";
s += addPathForRep(aidID, r.ref, -1, imgTag);
s += addPathForRep(aidID, r.ref, -1, imgTag, null);
} else {
if (r.data.length > 30) {
s += anchorHide(shead, r.data);
Expand All @@ -716,7 +722,7 @@
}
}
} else {
s += " " + addPathForRep(aidID, r.ref, r.len, null);
s += " " + addPathForRep(aidID, r.ref, r.len, null, r.mediaType);
}
s = "<tr><td>" + shead + s + "</td></tr>";
return s;
Expand Down Expand Up @@ -817,15 +823,16 @@
}
}

var addPathForRep = function(aidID, ref, len, value) {
var addPathForRep = function(aidID, ref, len, value, mediaType) {
var shortName = ref.localName || shortFileName(ref.localPath);
var url = ref.url || ref.doi || (ref.localPath ? fileFor(aidID, ref.localPath) : null);
mediaType = null;// nah. Doesn't really add anything || (mediaType = "");
if (value) {
s = "<a target=_blank href=\"" + url + "\">" + value + "</a>"
} else if (shortName.endsWith(".png")) {
s = "<img id=img" + (++divId) + " onload=IFD.checkImage(" + divId + ")" + " src=\"" + url +"\">";
} else {
s = "<a target=_blank href=\"" + url + "\">" + shortName + "</a>" + " (" + getSizeString(len) + ")";
s = "<a target=_blank href=\"" + url + "\">" + shortName + "</a>" + " (" + getSizeString(len) + (mediaType ? " " + mediaType : "") + ")";
}
return s;
}
Expand Down
64 changes: 64 additions & 0 deletions docs/examples/v5-icl-repository-DOI-crawl/devnotes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
Notes by Bob Hanson regarding the processing of
10.14469/hpc/10386

For the most part, the crawling was a straightforward
task, looking for <relatedIdentifiers> that had either
a URL or DOI relatedIdentifierType. URL types were digital
items; DOI types with relationType="HasPart" were followed
to "child" records. So, for instance we have:

10386 (the main DOI):
<relatedIdentifier relatedIdentifierType="DOI" relationType="HasPart">10.14469/hpc/11652</relatedIdentifier>
<relatedIdentifier relatedIdentifierType="DOI" relationType="HasPart">10.14469/hpc/11349</relatedIdentifier>
<relatedIdentifier relatedIdentifierType="DOI" relationType="HasPart">10.14469/hpc/11405</relatedIdentifier>

and

11652 (a compound DOI):
<relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://data.hpc.imperial.ac.uk/resolve/?doi=11652&amp;file=1</relatedIdentifier>
<relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://data.hpc.imperial.ac.uk/resolve/?doi=11652&amp;file=2</relatedIdentifier>
<relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://data.hpc.imperial.ac.uk/resolve/?doi=11652&amp;file=3</relatedIdentifier>

Since the DataCite metadata has no more than this information about the URL parts,
we decided to pull the headers of the files using the HTTPS HEAD method.
These headers provided mediaType, length, and local filename.

Determination of the spectroscopy type was not definitive. Some of the DOI entries had
<subject subjectScheme="IFD.xxx">, where xxx was "IR", "XRAY", or "comp". But "NMR" was not
listed and had to be taken as a default, possibly leading to issues.

Most problematic was that these types were subject designations for *collections* of
data -- at the DOI level, not the URL level.

For example, 14469 tells us in its title that it involves NMR and IR prediction and

<title>Compound 16: bis(4-(ethoxycarbonyl)-1-phenyl-1H-pyrazol-5-olate)magnesium. NMR and IR prediction, G = -2107.704571</title>



includes a mix of data types:

Filename Size Type Description
NMR Spectra.mnova 751KB chemical/x-mnova NMR Spectrum
NMR Spectra.mnpub 0 chemical/x-mnpub Mestrenova signature file for NMR Spectra.mnova
IR Spectrum.a2r 125KB application/octet-stream IR Spectrum
Mass Spectrum.pdf 46KB application/pdf Mass Spectrum
HPLC Scalemic.pdf 46KB application/pdf HPLC Scalemic (57:43 er)
HPLC Scalemic Report.pdf 12KB application/pdf HPLC Scalemic (57:43 er) Report
HPLC (R,R)-Enantiomer.pdf 41KB application/pdf HPLC (R,R)-Enantiomer (>99:1 er)
HPLC (R,R)-Enantiomer Report.pdf 11KB application/pdf HPLC (R,R)-Enantiomer (>99:1 er) Report

Clearly the media type "application/pdf" is not particularly useful here.

And the <subjects> element for this record includes only:

<subject subjectScheme="Gibbs_Energy" schemeURI="https://doi.org/10.1351/goldbook.G02629" valueURI="http://gaussian.com/thermo/">-2107.704571</subject>
<subject subjectScheme="IFD.comp">Gaussian computation</subject>
<subject subjectScheme="inchi" schemeURI="http://www.inchi-trust.org/">InChI=1S/2C12H12N2O3.2C2H6O.Mg/c2*1-2-17-12(16)10-8-13-14(11(10)15)9-6-4-3-5-7-9;2*1-2-3;/h2*3-8,15H,2H2,1H3;2*3H,2H2,1H3;/q;;;;+2/p-2</subject>
<subject subjectScheme="inchikey" schemeURI="http://www.inchi-trust.org/">FCXMPNFUGCYHCR-UHFFFAOYSA-L</subject>

despite the fact that we have NMR, IR, MS, and HPLC data.




Loading

0 comments on commit b56b511

Please sign in to comment.