Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[HotFix] HAL: Hotfix for identification failures #3077

Merged
merged 2 commits into from
Jul 12, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 61 additions & 53 deletions HAL Archives Ouvertes.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@
"translatorID": "f20f91fe-d875-47e7-9656-0abb928be472",
"label": "HAL Archives Ouvertes",
"creator": "Sebastian Karcher",
"target": "^https?://hal\\.archives-ouvertes\\.fr",
"target": "^https://(hal\\.archives-ouvertes\\.fr|hal\\.science)\\b",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2022-12-12 19:29:40"
"lastUpdated": "2023-07-12 08:47:33"
}

/*
***** BEGIN LICENSE BLOCK *****
HAL translator
Copyright © 2012-2014 Sebastian Karcher
Copyright © 2012-2014 Sebastian Karcher and contributors

This file is part of Zotero.

Expand All @@ -36,51 +36,59 @@
*/

function detectWeb(doc, url) {
if (url.search(/\/search\/index\//)!=-1) return "multiple";
if (url.search(/\index\.php\?halsid=|\.fr\/[a-z]+-\d+/)!=-1) return findItemType(doc, url);
if (/\/search\/index\//.test(url)) return "multiple";
if (/\/hal-\d+/.test(url)) return findItemType(doc, url);
return false;
}

function findItemType(doc, url){
var itemType = text(doc, 'div.label');
//Z.debug(itemType)
function findItemType(doc, url) {
var itemType = text(doc, '.typdoc')
// do some preliminary cleaning
.split("(")[0].trim() // discard parenthesized text
.split(", ")[0].trim() // simplify "Pré-publication, Document de travail" and " Preprints, Working Papers, ..."
.toLowerCase();
var typeMap = {
"Books": "book",
"Ouvrage (y compris édition critique et traduction)": "book",
"Book sections": "bookSection",
"Chapitre d'ouvrage": "bookSection",
"Conference papers": "conferencePaper",
"Communication dans un congrès": "conferencePaper",
"Directions of work or proceedings": "book",
"Direction d'ouvrage, Proceedings": "book",
"Journal articles": "journalArticle",
"Article dans des revues": "journalArticle",
"Lectures": "presentation",
"Cours": "presentation",
"Other publications": "book", //this could also be report, not sure here but bibtex guesses book
"Autre publication": "book", //this could also be report, not sure here but bibtex guesses book
"Patents": "patent",
"Brevet": "patent",
"Preprints, Working Papers, ...": "manuscript",
"Pré-publication, Document de travail": "manuscript",
"Reports": "report",
"Rapport": "report",
"Theses": "thesis",
"Thèse": "thesis",
"Poster communications": "presentation",
"Poster de conférence": "presentation"
}
/* eslint-disable quote-props */
"books": "book",
"ouvrages": "book",
"book sections": "bookSection",
"chapitre d'ouvrage": "bookSection",
"conference papers": "conferencePaper",
"communication dans un congrès": "conferencePaper",
"directions of work or proceedings": "book",
"direction d'ouvrage": "book",
"journal articles": "journalArticle",
"article dans une revue": "journalArticle",
"lectures": "presentation",
"cours": "presentation",
"other publications": "book", // this could also be report, not sure here but bibtex guesses book
"autre publication scientifique": "book", // this could also be report, not sure here but bibtex guesses book
"patents": "patent",
"brevet": "patent",
"preprints": "preprint",
"pré-publication": "preprint",
"reports": "report",
"rapport": "report",
"scientific blog post": "blogPost",
"article de blog scientifique": "blogPost",
"theses": "thesis",
"thèse": "thesis",
"poster communications": "presentation",
"poster de conférence": "presentation",
/* eslint-enable quote-props */
};
if (typeMap[itemType]) return typeMap[itemType];
else if (url.indexOf("medihal-")!=-1) return "artwork";
else if (url.includes("medihal-")) return "artwork";
else return "journalArticle";
}

function doWeb(doc, url) {
var articles = new Array();
var articles = [];
if (detectWeb(doc, url) == "multiple") {
var items = {};
var titles = doc.evaluate('//strong/a[@data-original-title="Display the resource" or @data-original-title="Voir la ressource"]', doc, null, XPathResult.ANY_TYPE, null);
var title;
while (title = titles.iterateNext()) {
while ((title = titles.iterateNext())/* assignment */) {
items[title.href] = title.textContent;
}
Zotero.selectItems(items, function (items) {
Expand All @@ -90,34 +98,33 @@ function doWeb(doc, url) {
for (var i in items) {
articles.push(i);
}
Zotero.Utilities.processDocuments(articles, scrape)
Zotero.Utilities.processDocuments(articles, scrape);
return true;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't actually check the return value of the selectItems callback (anymore?), so the consistent-return fix here is just to change the first return true; to return;.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right about the callback return value being discarded. But because this is now already merged, I'm going to focus on substantive improvements next (for example, asyncify, use OAI for metadata, with speedup of saving multiple, etc.)

});
} else {
//work on PDF pages
if (url.search(/\/document$/) != -1 ) {
var articleURL = url.replace(/\/document$/, "")
//Z.debug(articleURL)
ZU.processDocuments(articleURL, scrape);
}
else scrape(doc, url);
}
else if (/\/document$/.test(url)) { // work on PDF pages
var articleURL = url.replace(/\/document$/, "");
// Z.debug(articleURL)
ZU.processDocuments(articleURL, scrape);
}
else scrape(doc, url);
}

function scrape(doc, url) {
var bibtexUrl = url.replace(/#.+|\/$/, "") + "/bibtex";
var abstract = ZU.xpathText(doc, '//div[@class="abstract-content"]');
var pdfUrl = ZU.xpathText(doc, '//meta[@name="citation_pdf_url"]/@content');
//Z.debug("pdfURL " + pdfUrl)
var abstract = text(doc, '.abstract-content');
var pdfUrl = attr(doc, "#viewer-detailed a[download]", "href");
// Z.debug("pdfURL " + pdfUrl)
ZU.doGet(bibtexUrl, function (bibtex) {
//Z.debug(bibtex)
// Z.debug(bibtex)
var translator = Zotero.loadTranslator("import");
translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
translator.setString(bibtex);
translator.setHandler("itemDone", function (obj, item) {
if (abstract){
item.abstractNote=abstract.replace(/(Abstract|Résumé)\s*:/, "");
if (abstract) {
item.abstractNote = abstract.replace(/^(Abstract|Résumé)\s*:/, "");
}
if (pdfUrl){
if (pdfUrl) {
item.attachments = [{
url: pdfUrl,
title: "HAL PDF Full Text",
Expand All @@ -141,8 +148,9 @@ function scrape(doc, url) {
item.complete();
});
translator.translate();
})
});
}

/** BEGIN TEST CASES **/
var testCases = [
{
Expand Down