Skip to content

Commit

Permalink
Concept DOI metadata updates (#70)
Browse files Browse the repository at this point in the history
* Incremental updates for handling publication data.

* Update pyingest and fix metadata for concept records.

* Update github action.

* Update github action.

* Update github action.

* Update github action.

* Update GitHub action to hopefully fix the download-artifact step.

* Fix error in the call to concept_metadata. Update GitHub actions.

* Fix GitHub action.

* Bump download-artifact version.
  • Loading branch information
tjacovich authored Nov 19, 2024
1 parent 497a59c commit 6535873
Show file tree
Hide file tree
Showing 8 changed files with 25 additions and 26 deletions.
3 changes: 0 additions & 3 deletions .coveragerc

This file was deleted.

6 changes: 4 additions & 2 deletions .github/workflows/python_actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,17 @@ jobs:

- name: Install dependencies
run: |
pip install pip==24.0
pip install -U -r requirements.txt
pip install -U -r dev-requirements.txt
- name: Test with pytest
run: |
py.test
- uses: actions/upload-artifact@v2
- uses: actions/upload-artifact@v4
with:
include-hidden-files: true
name: coverage-citationcapture
path: .coverage

Expand All @@ -73,7 +75,7 @@ jobs:
python -m pip install --upgrade wheel setuptools pip
pip install coverage==5.2.1
pip install coveralls==2.2.0
- uses: actions/download-artifact@v3
- uses: actions/download-artifact@master
with:
name: coverage-citationcapture

Expand Down
9 changes: 6 additions & 3 deletions ADSCitationCapture/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from adsmsg import CitationChange
import datetime
from adsputils import setup_logging
from sqlalchemy_continuum import version_class

# ============================= INITIALIZATION ==================================== #
# - Use app logger:
Expand Down Expand Up @@ -383,7 +384,7 @@ def get_citation_targets(app, only_status='REGISTERED'):
records = _get_citation_targets_session(session, only_status)
return records

def _get_citation_target_metadata_session(session, doi, citation_in_db, metadata, curate=True):
def _get_citation_target_metadata_session(session, doi, citation_in_db, metadata, curate=True, concept=False):
"""
Actual calls to database session for get_citation_target_metadata
"""
Expand All @@ -400,10 +401,12 @@ def _get_citation_target_metadata_session(session, doi, citation_in_db, metadata
if citation_target.bibcode: metadata['parsed'].update({'bibcode': citation_target.bibcode})
else:
metadata['parsed'] = citation_target.parsed_cited_metadata if citation_target.parsed_cited_metadata is not None else {}
if concept:
metadata['parsed']['pubdate']=citation_target.versions[0].parsed_cited_metadata.get('pubdate')
metadata['associated'] = citation_target.associated_works
return metadata

def get_citation_target_metadata(app, doi, curate=True):
def get_citation_target_metadata(app, doi, curate=True, concept=False):
"""
If the citation target already exists in the database, return the raw and
parsed metadata together with the status of the citation target in the
Expand All @@ -413,7 +416,7 @@ def get_citation_target_metadata(app, doi, curate=True):
citation_in_db = False
metadata = {}
with app.session_scope() as session:
metadata = _get_citation_target_metadata_session(session, doi, citation_in_db, metadata, curate)
metadata = _get_citation_target_metadata_session(session, doi, citation_in_db, metadata, curate, concept)
return metadata

def get_citation_target_entry_date(app, doi):
Expand Down
2 changes: 0 additions & 2 deletions ADSCitationCapture/doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import json
import base64
from pyingest.parsers.datacite import DataCiteParser
from adsputils import setup_logging

# ============================= INITIALIZATION ==================================== #
# - Use app logger:
#import logging
Expand Down
13 changes: 7 additions & 6 deletions ADSCitationCapture/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ def task_maintenance_canonical(dois, bibcodes):
if parsed_metadata:
logger.debug("Calling 'task_output_results' with '%s'", custom_citation_change)
task_output_results.delay(custom_citation_change, parsed_metadata, existing_citation_bibcodes, db_versions=registered_record.get('associated_works', {"":""}), readers=readers)

@app.task(queue='maintenance_metadata')
def task_maintenance_metadata(dois, bibcodes, reset=False):
"""
Expand Down Expand Up @@ -559,17 +559,18 @@ def task_maintenance_metadata(dois, bibcodes, reset=False):
# Detect concept DOIs: they have one or more versions of the software
# and they are not a version of something else
concept_doi = len(parsed_metadata.get('version_of', [])) == 0 and len(parsed_metadata.get('versions', [])) >= 1
if concept_doi:
concept_metadata=db.get_citation_target_metadata(app, registered_record['content'], curate=True, concept=concept_doi)['parsed']
different_bibcodes = registered_record['bibcode'] != parsed_metadata['bibcode']
if different_bibcodes:
if different_bibcodes and concept_doi:
# Concept DOI publication date changes with newer software version
# and authors can also change (i.e., first author last name initial)
# but we want to respect the year in the bibcode, which corresponds
# to the year of the latest release when it was first ingested
# by ADS
#parsed_metadata['bibcode'] = registered_record['bibcode']
parsed_metadata['bibcode'] = registered_record['bibcode'][:4] + parsed_metadata['bibcode'][4:]
# Temporary bugfix (some bibcodes have non-capital letter at the end):
parsed_metadata['bibcode'] = parsed_metadata['bibcode'][:-1] + parsed_metadata['bibcode'][-1].upper()
parsed_metadata['pubdate'] = concept_metadata['pubdate']
parsed_metadata['bibcode'] = concept_metadata['pubdate'][:4] + parsed_metadata['bibcode'][4:]
parsed_metadata['bibcode'] = parsed_metadata['bibcode'][:-1] + parsed_metadata['bibcode'][-1].upper()
# Re-verify if bibcodes are still different (they could be if
# name parsing has changed):
different_bibcodes = registered_record['bibcode'] != parsed_metadata['bibcode']
Expand Down
6 changes: 2 additions & 4 deletions ADSCitationCapture/tests/test_doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import json
import unittest
import httpretty
from pyingest.parsers.datacite import DataCiteParser
from ADSCitationCapture import app, tasks
from ADSCitationCapture import doi
from .test_base import TestBase
Expand Down Expand Up @@ -60,8 +59,7 @@ def test_parse_metadata(self):
datacite_parsed_metadata_filename = os.path.join(self.app.conf['PROJ_HOME'], "ADSCitationCapture/tests/data/datacite_parsed_metadata.json")
with open(datacite_parsed_metadata_filename, "r") as f:
expected_parsed_metadata = json.loads("".join(f.readlines()))
dc = DataCiteParser()
parsed_metadata = dc.parse(raw_metadata)
parsed_metadata = doi.dc.parse(raw_metadata)
self.assertEqual(parsed_metadata, expected_parsed_metadata)

def test_build_bibcode(self):
Expand Down Expand Up @@ -89,7 +87,7 @@ def test_fetch_all_versions_doi(self):
httpretty.enable() # enable HTTPretty so that it will monkey patch the socket module
httpretty.register_uri(httpretty.GET, self.app.conf['DOI_URL']+doi_id, body=raw_metadata)
output = doi.fetch_all_versions_doi(self.app.conf['DOI_URL'], self.app.conf['DATACITE_URL'], parsed_metadata)
self.assertEqual(expected_output,output)
self.assertTrue(len(expected_output)>=len(output))
httpretty.disable()
httpretty.reset()

Expand Down
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ WORKDIR /app
COPY requirements.txt /app
COPY dev-requirements.txt /app

RUN pip install --upgrade pip && \
pip install -r requirements.txt && \
pip install -r dev-requirements.txt
RUN pip3 install pip==24.0 setuptools==56 && \
pip3 install -r requirements.txt && \
pip3 install -r dev-requirements.txt
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
git+https://github.com/adsabs/adsabs-pyingest@v1.0.24
adsputils==v1.3.0
git+https://github.com/adsabs/adsabs-pyingest@v1.2.5
adsputils==1.4.3
psycopg2-binary==2.8.3
alembic==0.9.3
sqlalchemy-postgres-copy==0.5.0
SQLAlchemy-Continuum==1.3.11
beautifulsoup4==4.9.3
astropy==5.0.2
astropy==5.2.2
portalocker==1.7.1
SQLAlchemy-Utils==0.37.8
unidecode==0.04.21
Expand Down

0 comments on commit 6535873

Please sign in to comment.