Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sanitize incoming DOIs from classic and bugfixes #51

Open
wants to merge 37 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
ab0c9ce
bugfix: stopped curated metadata being loaded in maintenance_metadata…
tjacovich Apr 13, 2022
7fffd78
More updates to maintenance metadata.
tjacovich Apr 13, 2022
011850e
more updates.
tjacovich Apr 13, 2022
89d89cf
Added more commenting to maintenance task. Updated how alternate bibc…
tjacovich Apr 13, 2022
70d8f68
Added functionality to sanitize citation_change content for DOI targets.
tjacovich Apr 13, 2022
317c96d
Added functionality to sanitize citation_change content for DOI targets.
tjacovich Apr 13, 2022
6182e4a
Added sanitization to task_maintenance_reevaluate
tjacovich Apr 13, 2022
88c6df4
More work on sanitization
tjacovich Apr 14, 2022
9b858c1
working on modifying maintenance reevaluate to update citation conten…
tjacovich Apr 14, 2022
7d7412d
Changed behavior to mark citations to sanitized records as sanitized …
tjacovich Apr 15, 2022
fdbcfb1
Added db function to revert citations from sanitized to discarded.
tjacovich Apr 15, 2022
d50dbd7
Added sanitization at process_new_citation back to codebase. Added se…
tjacovich Apr 15, 2022
6d6c77b
Merged Manual_Curation Bugfix into doi_sanitization updates.
tjacovich Apr 25, 2022
c276bc0
Updated task_maintenance_curation to preserve original bibcode entry …
tjacovich Apr 25, 2022
ed5fdb3
Added function to correct lowercase alternate bibcodes in parsed_meta…
tjacovich Apr 25, 2022
9819b5f
updated populate_bibcode_column so that it won't break when run on a …
tjacovich Apr 25, 2022
d61185d
Added catch for potentially broken entries when correcting alternate …
tjacovich Apr 25, 2022
7b9a673
updated json nargs.
tjacovich Apr 25, 2022
a258cbd
Added unit tests for doi sanitization.
tjacovich Apr 27, 2022
11bce8e
Fixed alembic revisions to preserve ENUM column values on downgrade.
tjacovich May 2, 2022
0e86df6
Removed alt_bibcode bugfix from branch.
tjacovich May 2, 2022
941615b
Removed secondary functions related to alt bibcode bugfix.
tjacovich May 2, 2022
2dfec1a
Merged PR #53 into branch.
tjacovich May 3, 2022
7bd0568
Merged divergent alembic heads for lowercase alt bibcodes fix and doi…
tjacovich May 3, 2022
9353224
Merged changes from Associated Works (ADSCC PR #48) into branch.
tjacovich May 16, 2022
d7d2871
Merged alembic heads between doi sanitization and master branch.
tjacovich May 16, 2022
caaecd1
Added bugfix for datetime is only year issue.
tjacovich May 18, 2022
40f0e1f
Merge branch 'master' into doi_sanitization
tjacovich May 19, 2022
8c622da
minor tweak to tasks.py
tjacovich May 20, 2022
38064ec
Merge branch 'master' into doi_sanitization
tjacovich May 24, 2022
6128e8a
minor tweak to forward.py that keeps 'ASSOCIATED' property from being…
tjacovich May 24, 2022
734cdad
Merged current master into branch
tjacovich Aug 24, 2022
f2ab156
Added latest version to test_base mock_data for 10.5281/zenodo.4475376
tjacovich Aug 24, 2022
fd32c43
Added raw_content column to citation model. Modified recreate_previou…
tjacovich Aug 24, 2022
9d0b96c
Modified alembic upgrade to populate raw_content. Modified tasks to s…
tjacovich Aug 25, 2022
1015ee2
fixed bug that caused citation capture to try and parse raw data for …
tjacovich Aug 25, 2022
822fd96
Updated test_doi expected output.
tjacovich Sep 20, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 103 additions & 6 deletions ADSCitationCapture/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from psycopg2 import IntegrityError
from dateutil.tz import tzutc
from ADSCitationCapture.models import Citation, CitationTarget, Event
from adsmsg import CitationChange, CitationChangeContentType
from ADSCitationCapture import doi
from adsmsg import CitationChange
from adsputils import setup_logging

# ============================= INITIALIZATION ==================================== #
Expand All @@ -18,7 +18,6 @@
level=config.get('LOGGING_LEVEL', 'INFO'),
attach_stdout=config.get('LOG_STDOUT', False))


# =============================== FUNCTIONS ======================================= #
def store_event(app, data):
"""
Expand Down Expand Up @@ -73,6 +72,10 @@ def _update_citation_target_metadata_session(session, content, raw_metadata, par
raw_metadata = raw_metadata.decode('utf-8')
except UnicodeEncodeError:
pass

if status == 'SANITIZED':
#reset status but otherwise leave the citation target alone
citation_target.status = status
if citation_target.raw_cited_metadata != raw_metadata or citation_target.parsed_cited_metadata != parsed_metadata or \
(status is not None and citation_target.status != status) or citation_target.curated_metadata != curated_metadata or \
citation_target.bibcode != bibcode or citation_target.associated_works != associated:
Expand Down Expand Up @@ -122,7 +125,7 @@ def update_citation_target_curator_message(app, content, msg):
msg_updated = _update_citation_target_curator_message_session(session, content, msg)
return msg_updated

def store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status):
def store_citation(app, citation_change, raw_content, content_type, raw_metadata, parsed_metadata, status):
"""
Stores a new citation in the DB
"""
Expand All @@ -132,6 +135,7 @@ def store_citation(app, citation_change, content_type, raw_metadata, parsed_meta
citation.citing = citation_change.citing
citation.cited = citation_change.cited
citation.content = citation_change.content
citation.raw_content = raw_content
citation.resolved = citation_change.resolved
citation.timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc())
citation.status = status
Expand Down Expand Up @@ -322,15 +326,31 @@ def get_citations_by_bibcode(app, bibcode):
citations = get_citations(app, dummy_citation_change)
return citations

def get_citations(app, citation_change, status='REGISTERED'):
    """
    Return all the citations (citing bibcodes) to a given content.

    Only citations whose status equals `status` are returned. With the
    default value ('REGISTERED') citations in any other status, such as
    DELETED or DISCARDED, are ignored.

    `citation_change` only needs to expose a `content` attribute, which is
    the value the citations are matched against.
    """
    with app.session_scope() as session:
        matching_citations = session.query(Citation).filter_by(content=citation_change.content, status=status).all()
        citation_bibcodes = [r.citing for r in matching_citations]
    return citation_bibcodes

def get_citation_data(app, citing_bibcode, content):
    """
    Get the data for a given citation.

    Looks up the first citation matching `content` and `citing_bibcode` and
    copies its citing, cited, content, resolved, timestamp and status values
    into a new Citation object that is never added to the session
    (raw_content is not copied).

    Returns the copy, or None when no matching citation exists.
    """
    with app.session_scope() as session:
        stored_citation = session.query(Citation).filter_by(content=content, citing=citing_bibcode).first()
        if stored_citation is None:
            # Make the "not found" outcome explicit for callers
            return None
        citation = Citation()
        citation.citing = stored_citation.citing
        citation.cited = stored_citation.cited
        citation.content = stored_citation.content
        citation.resolved = stored_citation.resolved
        citation.timestamp = stored_citation.timestamp
        citation.status = stored_citation.status
        return citation

def generate_modified_metadata(parsed_metadata, curated_entry):
"""
modify parsed_metadata with any curated metadata. return results.
Expand Down Expand Up @@ -385,6 +405,43 @@ def update_citation(app, citation_change):
logger.info("Ignoring citation update (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString())
return updated

def citation_data_to_citation_change(citation_data, previously_discarded_record):
    """
    Build a CitationChange message out of the data of a stored citation.

    The content type is looked up on CitationChangeContentType using the
    lower-cased 'content_type' value of `previously_discarded_record`; every
    other field (content, citing, cited, resolved, timestamp) is copied from
    `citation_data`.
    """
    change = CitationChange()
    content_type_name = previously_discarded_record['content_type'].lower()
    change.content_type = getattr(CitationChangeContentType, content_type_name)
    change.content = citation_data.content
    change.citing = citation_data.citing
    change.cited = citation_data.cited
    change.resolved = citation_data.resolved
    # The message timestamp is filled in place from the stored datetime
    change.timestamp.FromDatetime(citation_data.timestamp)
    return change

def update_citation_content(app, citation_change, raw_content):
    """
    Update the content of a stored citation.

    The citation is looked up by `citation_change.citing` and by
    `raw_content` (the content value it is currently stored under). When it
    is found, and only if the received timestamp is newer than the stored
    one, `raw_content` is saved in its own column and `content` is
    overwritten with `citation_change.content`.

    Returns True if the citation was updated, False otherwise.
    """
    updated = False
    with app.session_scope() as session:
        citation = session.query(Citation).with_for_update().filter_by(citing=citation_change.citing, content=raw_content).first()
        if citation is None:
            logger.info("Unable to update citation (citing '%s', content '%s' and timestamp '%s')", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString())
            return updated
        # citation_change.timestamp is a message timestamp (it exposes
        # ToDatetime/ToJsonString), not a datetime: convert it and consider it
        # as UTC, as done when the citation is stored, so that it can be
        # compared with the stored value.
        change_timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc())
        if citation.timestamp < change_timestamp:
            # citing is part of the lookup key and must not change
            citation.raw_content = raw_content
            citation.content = citation_change.content
            session.add(citation)
            session.commit()
            updated = True
            logger.info("Updated citation (citing '%s', content '%s' and timestamp '%s')", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString())
        else:
            logger.info("Ignoring citation update (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString())
    return updated

def mark_citation_as_deleted(app, citation_change):
"""
Update status to DELETED for a given citation
Expand Down Expand Up @@ -419,6 +476,46 @@ def mark_all_discarded_citations_as_registered(app, content):
session.add(citation)
session.commit()

def mark_sanitized_citation(app, citing, content, raw_content, status='SANITIZED'):
    """
    Update the status (SANITIZED by default) and content of a single
    discarded citation.

    The DISCARDED citation is located by `citing` and by `raw_content` (the
    content value it is currently stored under). Its `content` is replaced
    by `content` and the previous value is kept in `raw_content`.

    If no matching DISCARDED citation exists, a warning is logged and
    nothing is changed.
    """
    with app.session_scope() as session:
        citation = session.query(Citation).with_for_update().filter_by(status='DISCARDED', citing=citing, content=raw_content).first()
        if citation is None:
            # first() returns None when there is no match
            logger.warning("Unable to mark citation as '%s' (citing '%s', content '%s') because no discarded citation was found", status, citing, raw_content)
            return
        citation.status = status
        citation.content = content
        citation.raw_content = raw_content
        session.add(citation)
        session.commit()

def mark_all_discarded_citations_as_sanitized(app, content):
    """
    Update status to SANITIZED for all discarded citations of a given content.

    Every citation with status DISCARDED and the given `content` is selected
    for update and switched to SANITIZED; the changes are committed together
    once all of them have been modified.
    """
    with app.session_scope() as session:
        citations = session.query(Citation).with_for_update().filter_by(status='DISCARDED', content=content).all()
        for citation in citations:
            citation.status = 'SANITIZED'
            session.add(citation)
        session.commit()

def mark_all_sanitized_citations_as_discarded(app, content):
    """
    Update status to DISCARDED for all sanitized citations of a given content.

    Every citation with status SANITIZED and the given `content` is selected
    for update and reverted to DISCARDED; the changes are committed together
    once all of them have been modified.
    """
    with app.session_scope() as session:
        citations = session.query(Citation).with_for_update().filter_by(status='SANITIZED', content=content).all()
        for citation in citations:
            # Revert the status: the selected rows are already SANITIZED
            citation.status = 'DISCARDED'
            session.add(citation)
        session.commit()

def populate_bibcode_column(main_session):
"""
Pulls all citation targets from DB and populates the bibcode column using parsed metadata
Expand Down
4 changes: 3 additions & 1 deletion ADSCitationCapture/delta_computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,10 @@ def _reconstruct_previous_expanded_raw_data(self):
# Reconstruct expanded raw table from the official citation table
drop_reconstructed_previous_expanded_table = "DROP TABLE IF EXISTS {0}.{1};"
self._execute_sql(drop_reconstructed_previous_expanded_table, self.previous_schema_name, self.recreated_previous_expanded_table_name)
reconstruct_previous_expanded_table = "CREATE TABLE {0}.{1} AS SELECT id, citing, cited, CASE WHEN citation_target.content_type = 'DOI' THEN true ELSE false END AS doi, CASE WHEN citation_target.content_type = 'PID' THEN true ELSE false END AS pid, CASE WHEN citation_target.content_type = 'URL' THEN true ELSE false END AS url, citation.content, citation.resolved, citation.timestamp FROM citation INNER JOIN citation_target ON citation.content = citation_target.content WHERE citation.status != 'DELETED';"
reconstruct_previous_expanded_table = "CREATE TABLE {0}.{1} AS SELECT id, citing, cited, CASE WHEN citation_target.content_type = 'DOI' THEN true ELSE false END AS doi, CASE WHEN citation_target.content_type = 'PID' THEN true ELSE false END AS pid, CASE WHEN citation_target.content_type = 'URL' THEN true ELSE false END AS url, citation.raw_content, citation.resolved, citation.timestamp FROM citation INNER JOIN citation_target ON citation.content = citation_target.content WHERE citation.status != 'DELETED';"
self._execute_sql(reconstruct_previous_expanded_table, self.previous_schema_name, self.recreated_previous_expanded_table_name)
rename_raw_content_column_previous_expanded_table = "ALTER TABLE {0}.{1} RENAME COLUMN raw_content TO content"
self._execute_sql(rename_raw_content_column_previous_expanded_table, self.previous_schema_name, self.recreated_previous_expanded_table_name)

def _find_not_processed_records_from_previous_run(self):
"""
Expand Down
24 changes: 24 additions & 0 deletions ADSCitationCapture/doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,30 @@ def parse_metadata(raw_metadata):
"""
return _parse_metadata_zenodo_doi(raw_metadata)

def sanitize_zenodo_doi(doi):
    """
    Public entry point to sanitize the content of an imported citation_change
    when it is a zenodo doi. The work is delegated to _sanitize_zendo_doi and
    its result is returned unchanged.
    """
    sanitized_doi = _sanitize_zendo_doi(doi)
    return sanitized_doi

def _sanitize_zendo_doi(doi):
doi_root = '10.5281'
zenodo_doi_reset = re.compile(r"10.\d{4,9}/zenodo\.([0-9]*)", re.IGNORECASE)
zenodo_doi_reset_slash = re.compile(r"10.\d{4,9}/zenodo/([0-9]*)", re.IGNORECASE)
try:
#splits apart any conjoined dois and takes the first full one.
spl_doi = doi_root + doi.split(doi_root)[1]
return re.search(zenodo_doi_reset, spl_doi).group(0)
except Exception as e:
logger.error("Attempt to parse content: {} failed with error: {}. Trying again with alternate regex.".format(doi, e))
try:
spl_doi = doi_root + doi.split(doi_root)[1]
split = re.search(zenodo_doi_reset_slash, spl_doi).group(0).split('/')
return doi_root + "/" + "zenodo." + split[2]
except Exception as e:
logger.error("Attempt to parse content: {} failed with error: {}.".format(doi, e))
return None

def renormalize_author_names(authors):
"""
A wrapper function dc.author_names._normalize that allows CitationCapture
Expand Down
5 changes: 3 additions & 2 deletions ADSCitationCapture/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@

citation_content_type = ENUM('DOI', 'PID', 'URL', name='citation_content_type')
citation_change_type = ENUM('NEW', 'DELETED', 'UPDATED', name='citation_change_type')
citation_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', name='citation_status_type')
target_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', name='target_status_type')
citation_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', 'SANITIZED', name='citation_status_type')
target_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', 'SANITIZED', name='target_status_type')


class RawCitation(Base):
Expand Down Expand Up @@ -53,6 +53,7 @@ class Citation(Base):
)
__versioned__ = {} # Must be added to all models that are to be versioned
id = Column(Integer, primary_key=True)
raw_content = Column(Text())
content = Column(Text(), ForeignKey('public.citation_target.content'))
citing = Column(Text()) # Bibcode of the article that is citing a target
cited = Column(Text()) # Probably not necessary to keep
Expand Down
Loading