Skip to content

Commit

Permalink
Add tind.io harvester for AgEcon with MODS
Browse files Browse the repository at this point in the history
  • Loading branch information
aaxelb committed May 25, 2017
1 parent 9363699 commit c607117
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 269 deletions.
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
'com.peerj.xml = share.transformers.com_peerj_xml:PeerJXMLTransformer',
'com.researchregistry = share.transformers.com_researchregistry:RRTransformer',
'com.springer = share.transformers.com_springer:SpringerTransformer',
'edu.ageconsearch = share.transformers.edu_ageconsearch:AgeconTransformer',
'edu.gwu = share.transformers.edu_gwu:GWScholarSpaceTransformer',
'edu.harvarddataverse = share.transformers.edu_harvarddataverse:HarvardTransformer',
'gov.clinicaltrials = share.transformers.gov_clinicaltrials:ClinicalTrialsTransformer',
Expand Down Expand Up @@ -57,7 +56,6 @@
'com.peerj = share.harvesters.com_peerj:PeerJHarvester',
'com.researchregistry = share.harvesters.com_researchregistry:ResearchRegistryHarvester',
'com.springer = share.harvesters.com_springer:SpringerHarvester',
'edu.ageconsearch = share.harvesters.edu_ageconsearch:AgEconHarvester',
'edu.gwu = share.harvesters.edu_gwu:GWScholarSpaceHarvester',
'edu.harvarddataverse = share.harvesters.edu_harvarddataverse:HarvardDataverseHarvester',
'gov.clinicaltrials = share.harvesters.gov_clinicaltrials:ClinicalTrialsHarvester',
Expand All @@ -67,6 +65,7 @@
'gov.scitech = share.harvesters.gov_scitech:SciTechHarvester',
'gov.usgs = share.harvesters.gov_usgs:USGSHarvester',
'io.osf = share.harvesters.io_osf:OSFHarvester',
'io.tind = share.harvesters.io_tind:TindHarvester',
'oai = share.harvesters.oai:OAIHarvester',
'org.arxiv = share.harvesters.org_arxiv:ArxivHarvester',
'org.biorxiv = share.harvesters.org_biorxiv:BiorxivHarvester',
Expand Down
117 changes: 0 additions & 117 deletions share/harvesters/edu_ageconsearch.py

This file was deleted.

80 changes: 80 additions & 0 deletions share/harvesters/io_tind.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import logging

from furl import furl
from lxml import etree
import pendulum

from share.harvest import BaseHarvester

# Pass the module's real name (not the string literal '__name__') so this
# logger sits in the package's logging hierarchy and inherits its config.
logger = logging.getLogger(__name__)


class TindHarvester(BaseHarvester):
    """Harvest MODS records from a TIND search endpoint, one page at a time.

    Expected harvester kwargs:
        collection: collection name to harvest
        page_size: records per request
        format_code:
            'xo': MODS XML
            'xd': Dublin Core-ish XML
            'xm': MARC XML
            'hm': MARC
            'hb': HTML

    NOTE(review): ``do_harvest`` only looks for MODS elements in the
    response, so 'xo' is presumably the only format code that produces
    records -- confirm before configuring another one.

    API Query Parameters:
        c (collection to search; taken from the ``collection`` kwarg)
        dt (type of date filter: 'm' for date modified)
        d1d (start of date range day)
        d1m (start of date range month)
        d1y (start of date range year)
        d2d (end of date range day)
        d2m (end of date range month)
        d2y (end of date range year)
        sc (split by collection: 0 or 1)
        sf (sort field: e.g. 'latest first')
        so (sort order: 'a' for ascending, 'd' for descending)
        rg (page size)
        jrec (offset)
        of (format code, see above)
    """
    VERSION = 1

    namespaces = {
        'mods': 'http://www.loc.gov/mods/v3',
    }

    def do_harvest(self, start_date: pendulum.Pendulum, end_date: pendulum.Pendulum):
        """Yield ``(record_id, record_xml)`` for records modified in the range.

        Pages through the search results ``page_size`` records at a time,
        advancing the ``jrec`` offset after each page, and stops at the first
        page that contains no MODS records.

        Args:
            start_date: start of the date-modified filter (day/month/year are sent).
            end_date: end of the date-modified filter (day/month/year are sent).

        Yields:
            Tuples of the record's ``mods:recordIdentifier`` text and the
            record serialized as a ``str`` of XML.

        Raises:
            ValueError: if a response cannot be parsed into any XML tree.
            Whatever ``raise_for_status()`` raises for an unsuccessful response.
        """
        page_size = self.kwargs['page_size']
        offset = 1
        url = furl(self.config.base_url)
        url.args.update({
            'c': self.kwargs['collection'],
            'of': self.kwargs['format_code'],
            'rg': page_size,
            'dt': 'm',
            'd1d': start_date.day,
            'd1m': start_date.month,
            'd1y': start_date.year,
            'd2d': end_date.day,
            'd2m': end_date.month,
            'd2y': end_date.year,
            'sc': 0,  # Splitting by collection screws up the page size
            'sf': 'latest first',
            'so': 'd',
        })

        # A single recovering parser is reused for every page instead of
        # building a new one per request.
        parser = etree.XMLParser(recover=True)
        identifier_path = 'mods:recordInfo/mods:recordIdentifier'

        while True:
            logger.debug('Making request to %s', url.url)
            resp = self.requests.get(url.url)
            resp.raise_for_status()

            parsed = etree.fromstring(resp.content, parser=parser)
            if parsed is None:
                # A recovering parser can give back no root element at all;
                # fail with a clear message rather than an AttributeError.
                raise ValueError('Unparseable response from {}'.format(url.url))

            records = parsed.xpath('/modsCollection/mods:mods', namespaces=self.namespaces)
            if not records:
                break

            for record in records:
                identifiers = record.xpath(identifier_path, namespaces=self.namespaces)
                if not identifiers or not identifiers[0].text:
                    # One malformed record should not abort the whole harvest.
                    logger.warning('Skipping record without a recordIdentifier from %s', url.url)
                    continue
                yield (identifiers[0].text, etree.tostring(record, encoding=str))

            offset += page_size
            url.args['jrec'] = offset
16 changes: 10 additions & 6 deletions share/sources/edu.ageconsearch/source.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
configs:
- base_url: http://ageconsearch.umn.edu/browse-date
- base_url: http://ageconsearch.tind.io/search
disabled: false
earliest_date: null
harvester: edu.ageconsearch
harvester_kwargs: {}
label: edu.ageconsearch
harvester: io.tind
harvester_kwargs:
collection: AgEcon Search
page_size: 100
format_code: xo
label: edu.ageconsearch.tind
rate_limit_allowance: 1
rate_limit_period: 2
transformer: edu.ageconsearch
transformer_kwargs: {}
transformer: mods
transformer_kwargs:
emitted_type: Preprint
home_page: http://ageconsearch.umn.edu/
long_title: AgEcon Search
name: edu.ageconsearch
Expand Down
138 changes: 0 additions & 138 deletions share/transformers/edu_ageconsearch.py

This file was deleted.

Loading

0 comments on commit c607117

Please sign in to comment.