Skip to content

Commit

Permalink
[HotFix] HAL: Fix abstract, PDF link, and some more type identification
Browse files Browse the repository at this point in the history
- Update the selector for the abstract (the element now carries several class
  names, so the previous XPath, which matched @class exactly, failed).
- Update the selector for PDF link.
- Cosmetic fix: further fixes to the type identification (affecting the
  Connector icon, not the actual item's metadata).
  • Loading branch information
zoe-translates committed Jul 12, 2023
1 parent 1a1feea commit 4e6d3d7
Showing 1 changed file with 32 additions and 31 deletions.
63 changes: 32 additions & 31 deletions HAL Archives Ouvertes.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2023-07-12 07:30:59"
"lastUpdated": "2023-07-12 08:47:33"
}

/*
Expand Down Expand Up @@ -45,35 +45,36 @@ function findItemType(doc, url) {
var itemType = text(doc, '.typdoc')
// do some preliminary cleaning
.split("(")[0].trim() // discard parenthesized text
.split(", ")[0].trim(); // simplify "Pré-publication, Document de travail" and " Preprints, Working Papers, ..."
.split(", ")[0].trim() // simplify "Pré-publication, Document de travail" and " Preprints, Working Papers, ..."
.toLowerCase();
var typeMap = {
/* eslint-disable quote-props */
"Books": "book",
"Ouvrages": "book",
"Book sections": "bookSection",
"Chapitre d'ouvrage": "bookSection",
"Conference papers": "conferencePaper",
"Communication dans un congrès": "conferencePaper",
"Directions of work or proceedings": "book",
"Direction d'ouvrage, Proceedings": "book",
"Journal articles": "journalArticle",
"Article dans une revue": "journalArticle",
"Lectures": "presentation",
"Cours": "presentation",
"Other publications": "book", // this could also be report, not sure here but bibtex guesses book
"Autre publication scientifique": "book", // this could also be report, not sure here but bibtex guesses book
"Patents": "patent",
"Brevet": "patent",
"Preprints": "preprint",
"Pré-publication": "preprint",
"Reports": "report",
"Rapport": "report",
"Scientific blog post": "blogPost",
"Article de blog scientifique": "blogPost",
"Theses": "thesis",
"Thèse": "thesis",
"Poster communications": "presentation",
"Poster de conférence": "presentation",
"books": "book",
"ouvrages": "book",
"book sections": "bookSection",
"chapitre d'ouvrage": "bookSection",
"conference papers": "conferencePaper",
"communication dans un congrès": "conferencePaper",
"directions of work or proceedings": "book",
"direction d'ouvrage": "book",
"journal articles": "journalArticle",
"article dans une revue": "journalArticle",
"lectures": "presentation",
"cours": "presentation",
"other publications": "book", // this could also be report, not sure here but bibtex guesses book
"autre publication scientifique": "book", // this could also be report, not sure here but bibtex guesses book
"patents": "patent",
"brevet": "patent",
"preprints": "preprint",
"pré-publication": "preprint",
"reports": "report",
"rapport": "report",
"scientific blog post": "blogPost",
"article de blog scientifique": "blogPost",
"theses": "thesis",
"thèse": "thesis",
"poster communications": "presentation",
"poster de conférence": "presentation",
/* eslint-enable quote-props */
};
if (typeMap[itemType]) return typeMap[itemType];
Expand Down Expand Up @@ -111,8 +112,8 @@ function doWeb(doc, url) {

function scrape(doc, url) {
var bibtexUrl = url.replace(/#.+|\/$/, "") + "/bibtex";
var abstract = ZU.xpathText(doc, '//div[@class="abstract-content"]');
var pdfUrl = ZU.xpathText(doc, '//meta[@name="citation_pdf_url"]/@content');
var abstract = text(doc, '.abstract-content');
var pdfUrl = attr(doc, "#viewer-detailed a[download]", "href");
// Z.debug("pdfURL " + pdfUrl)
ZU.doGet(bibtexUrl, function (bibtex) {
// Z.debug(bibtex)
Expand All @@ -121,7 +122,7 @@ function scrape(doc, url) {
translator.setString(bibtex);
translator.setHandler("itemDone", function (obj, item) {
if (abstract) {
item.abstractNote = abstract.replace(/(Abstract|Résumé)\s*:/, "");
item.abstractNote = abstract.replace(/^(Abstract|Résumé)\s*:/, "");
}
if (pdfUrl) {
item.attachments = [{
Expand Down

0 comments on commit 4e6d3d7

Please sign in to comment.