diff --git a/setup.py b/setup.py
index 93493cf24..e22065345 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,6 @@
             'com.peerj = share.harvesters.com_peerj:PeerJHarvester',
             'com.researchregistry = share.harvesters.com_researchregistry:ResearchRegistryHarvester',
             'com.springer = share.harvesters.com_springer:SpringerHarvester',
-            'edu.ageconsearch = share.harvesters.edu_ageconsearch:AgEconHarvester',
             'edu.gwu = share.harvesters.edu_gwu:GWScholarSpaceHarvester',
             'edu.harvarddataverse = share.harvesters.edu_harvarddataverse:HarvardDataverseHarvester',
             'gov.clinicaltrials = share.harvesters.gov_clinicaltrials:ClinicalTrialsHarvester',
@@ -67,6 +66,7 @@
             'gov.scitech = share.harvesters.gov_scitech:SciTechHarvester',
             'gov.usgs = share.harvesters.gov_usgs:USGSHarvester',
             'io.osf = share.harvesters.io_osf:OSFHarvester',
+            'io.tind = share.harvesters.io_tind:TindHarvester',
             'oai = share.harvesters.oai:OAIHarvester',
             'org.arxiv = share.harvesters.org_arxiv:ArxivHarvester',
             'org.biorxiv = share.harvesters.org_biorxiv:BiorxivHarvester',
diff --git a/share/harvesters/edu_ageconsearch.py b/share/harvesters/edu_ageconsearch.py
deleted file mode 100644
index 4f340e5f9..000000000
--- a/share/harvesters/edu_ageconsearch.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import logging
-import dateutil
-
-from bs4 import BeautifulSoup
-from furl import furl
-import pendulum
-
-from share.harvest import BaseHarvester
-
-logger = logging.getLogger('__name__')
-
-
-class AgEconHarvester(BaseHarvester):
-    """
-    Query Parameters:
-        month (MM)
-        year (YYYY)
-        order (oldestFirst or None)
-        starts_with (YYYY-MM-DD) they don't always have a day
-        top (page number)
-
-    Returns:
-        Page with nearest date
-        20 records/page
-    """
-    VERSION = 1
-
-    fields = {
-        'title': 'title',
-        'other titles': 'other_titles',
-        'authors': 'authors',
-        'editors': 'editors',
-        'editors (email)': 'editors_email',
-        'authors (email)': 'authors_email',
-        'keywords': 'keywords',
-        'jel codes': 'jel_codes',
-        'issue date': 'issue_date',
-        'series/report no.': 'series_report_number',
-        'abstract': 'abstract',
-        'uri': 'uri',
-        'institution/association': 'institution_association',
-        'identifiers': 'identifiers',
-        'total pages': 'total_pages',
-        'from page': 'from_page',
-        'to page': 'to_page',
-        'notes': 'notes',
-        'collections:': 'collections',
-    }
-
-    # Request page with nearest date
-    def do_harvest(self, start_date: pendulum.Pendulum, end_date: pendulum.Pendulum):
-        return self.fetch_records(start_date, end_date)
-
-    # Fetch the list of work urls on a single result page and return results within date range
-    def fetch_records(self, start_date, end_date):
-        logger.info('Harvesting %s - %s', start_date, end_date)
-        logger.debug('Fetching page %s', self.config.base_url)
-
-        url = furl(self.config.base_url)
-        url.args['starts_with'] = start_date
-        r = self.requests.get(url.url)
-
-        r.raise_for_status()
-        within_date_range = True
-        while within_date_range:
-            document = BeautifulSoup(r.text, 'html.parser')
-            results = document.select('a[href^="/handle/"]')[1:]
-            for result in results:
-                url = 'http://ageconsearch.umn.edu{}'.format(result.attrs['href'])
-                work = self.fetch_work(url)
-                date_status = self.check_record_date(work['issue_date'], start_date, end_date)
-
-                # if date is > start_date continue and skip
-                if date_status == 'after':
-                    continue
-                elif date_status == 'before':
-                    within_date_range = False
-                    return
-                yield work['primary_identifier'], work
-
-            r = self.requests.get('http://ageconsearch.umn.edu/{}'.format(document.find('a', string='Next page').attrs['href']))
-
-    def check_record_date(self, issue_date, start_date, end_date):
-        date_object = dateutil.parser.parse(issue_date, default=pendulum.create(2016, 1, 1))
-
-        if date_object < start_date.start_of('day'):
-            return 'before'
-        if date_object > end_date.end_of('day'):
-            return 'after'
-
-        return 'within'
-
-    # Pull data out of html
-    def fetch_work(self, url):
-        r = self.requests.get(url)
-        r.raise_for_status()
-        soup = BeautifulSoup(r.text, 'lxml')
-        data = {}
-
-        data['primary_identifier'] = soup.find('code').text
-        display_table = soup.find(class_='itemDisplayTable').find_all('tr')
-
-        for row in display_table:
-            label = row.find(class_='metadataFieldLabel').text.replace(':\xa0', '').lower()
-            value_object = row.find(class_='metadataFieldValue')
-            if value_object.string:
-                value = value_object.string
-            else:
-                contents = []
-                for content in value_object.contents:
-                    contents.append(content.string or content)
-                # Feels a little hacky
-                value = [val for val in contents if val != BeautifulSoup('<br/>', 'lxml').br]
-
-            data[self.fields[label]] = value
-
-        return data
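Review note: the deleted harvester screen-scraped DSpace browse pages and leaned on `dateutil`'s `default` argument to tolerate partial issue dates ("they don't always have a day"). A minimal standalone sketch of that parsing behavior, with made-up sample values:

```python
import dateutil.parser
import pendulum

# Date parts missing from the input are taken from the default, so
# year-only or year-month values still compare cleanly against the
# harvest window. The sample strings below are illustrative only.
default = pendulum.create(2016, 1, 1)
for raw in ('2017-06-15', '2017-06', '2017'):
    print(raw, '->', dateutil.parser.parse(raw, default=default))
# 2017-06-15 -> 2017-06-15T00:00:00+00:00
# 2017-06    -> 2017-06-01T00:00:00+00:00
# 2017       -> 2017-01-01T00:00:00+00:00
```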
diff --git a/share/harvesters/io_tind.py b/share/harvesters/io_tind.py
new file mode 100644
index 000000000..3ee93cb6a
--- /dev/null
+++ b/share/harvesters/io_tind.py
@@ -0,0 +1,80 @@
+import logging
+
+from furl import furl
+from lxml import etree
+import pendulum
+
+from share.harvest import BaseHarvester
+
+logger = logging.getLogger(__name__)
+
+
+class TindHarvester(BaseHarvester):
+    """
+    Expected harvester kwargs:
+        collection: collection name to harvest
+        page_size: records per request
+        format_code:
+            'xo': MODS XML
+            'xd': Dublin Core-ish XML
+            'xm': MARC XML
+            'hm': MARC
+            'hb': HTML
+
+    API Query Parameters:
+        dt (type of date filter: 'm' for date modified)
+        d1d (start of date range day)
+        d1m (start of date range month)
+        d1y (start of date range year)
+        d2d (end of date range day)
+        d2m (end of date range month)
+        d2y (end of date range year)
+        sc (split by collection: 0 or 1)
+        sf (sort field: e.g. 'latest first')
+        so (sort order: 'a' for ascending, 'd' for descending)
+        rg (page size)
+        jrec (offset)
+        of (format code, see above)
+    """
+    VERSION = 1
+
+    namespaces = {
+        'mods': 'http://www.loc.gov/mods/v3',
+    }
+
+    def do_harvest(self, start_date: pendulum.Pendulum, end_date: pendulum.Pendulum):
+        page_size = self.kwargs['page_size']
+        offset = 1
+        url = furl(self.config.base_url)
+        url.args.update({
+            'c': self.kwargs['collection'],
+            'of': self.kwargs['format_code'],
+            'rg': page_size,
+            'dt': 'm',
+            'd1d': start_date.day,
+            'd1m': start_date.month,
+            'd1y': start_date.year,
+            'd2d': end_date.day,
+            'd2m': end_date.month,
+            'd2y': end_date.year,
+            'sc': 0,  # Splitting by collection screws up the page size
+            'sf': 'latest first',
+            'so': 'd',
+        })
+
+        while True:
+            logger.debug('Making request to %s', url.url)
+            resp = self.requests.get(url.url)
+            resp.raise_for_status()
+
+            parsed = etree.fromstring(resp.content, parser=etree.XMLParser(recover=True))
+            records = parsed.xpath('/modsCollection/mods:mods', namespaces=self.namespaces)
+            if not records:
+                break
+
+            for record in records:
+                id = record.xpath('mods:recordInfo/mods:recordIdentifier', namespaces=self.namespaces)[0].text
+                yield (id, etree.tostring(record, encoding=str))
+
+            offset += page_size
+            url.args['jrec'] = offset
diff --git a/share/sources/edu.ageconsearch/source.yaml b/share/sources/edu.ageconsearch/source.yaml
index 7d38637f1..6c4da4b98 100644
--- a/share/sources/edu.ageconsearch/source.yaml
+++ b/share/sources/edu.ageconsearch/source.yaml
@@ -1,12 +1,23 @@
 configs:
-- base_url: http://ageconsearch.umn.edu/browse-date
+- base_url: http://ageconsearch.tind.io/search
   disabled: false
   earliest_date: null
-  harvester: edu.ageconsearch
-  harvester_kwargs: {}
-  label: edu.ageconsearch
+  harvester: io.tind
+  harvester_kwargs:
+    collection: AgEcon Search
+    page_size: 100
+    format_code: xo
+  label: edu.ageconsearch.tind
   rate_limit_allowance: 1
   rate_limit_period: 2
+  transformer: mods
+  transformer_kwargs:
+    emitted_type: Preprint
+- base_url: http://ageconsearch.umn.edu/browse-date
+  disabled: true
+  earliest_date: null
+  harvester: null
+  label: edu.ageconsearch
   transformer: edu.ageconsearch
   transformer_kwargs: {}
 home_page: http://ageconsearch.umn.edu/
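Review note: with the config above, one date-filtered search request replaces the old browse-page walk. A sketch of the first-page URL `do_harvest` builds from the `edu.ageconsearch.tind` kwargs (paging via `jrec` omitted; parameter meanings are taken from the harvester docstring; the harvest window is made up):

```python
from furl import furl
import pendulum

start = pendulum.create(2017, 7, 1)  # illustrative harvest window
end = pendulum.create(2017, 7, 2)

url = furl('http://ageconsearch.tind.io/search')
url.args.update({
    'c': 'AgEcon Search',  # collection kwarg
    'of': 'xo',            # format_code kwarg: MODS XML
    'rg': 100,             # page_size kwarg
    'dt': 'm',             # filter on date modified
    'd1d': start.day, 'd1m': start.month, 'd1y': start.year,
    'd2d': end.day, 'd2m': end.month, 'd2y': end.year,
    'sc': 0,
    'sf': 'latest first',
    'so': 'd',
})
print(url.url)
# e.g. http://ageconsearch.tind.io/search?c=AgEcon+Search&of=xo&rg=100&dt=m&d1d=1&d1m=7&d1y=2017&...
```

Each subsequent request bumps `jrec` by `rg` until a page comes back with no `mods:mods` records.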
diff --git a/share/transformers/mods.py b/share/transformers/mods.py
index dfc04709f..038d5e6a9 100644
--- a/share/transformers/mods.py
+++ b/share/transformers/mods.py
@@ -313,7 +313,11 @@ class MODSCreativeWork(Parser):
         )
     )
 
-    date_updated = tools.ParseDate(tools.Try(ctx.header.datestamp))
+    date_updated = tools.OneOf(
+        tools.ParseDate(ctx.header.datestamp),
+        tools.ParseDate(ctx['mods:recordInfo']['mods:recordChangeDate']),
+        tools.Static(None)
+    )
 
     # TODO (in regulator) handle date ranges, uncertain dates ('1904-1941', '1890?', '1980-', '19uu', etc.)
     date_published = tools.OneOf(
@@ -492,7 +496,11 @@ def do_transform(self, data):
 
     def unwrap_data(self, data):
         unwrapped_data = xmltodict.parse(data, process_namespaces=True, namespaces=self.kwargs.get('namespaces', self.NAMESPACES))
-        return {
-            **unwrapped_data['record'].get('metadata', {}).get('mods:mods', {}),
-            'header': unwrapped_data['record']['header'],
-        }
+        if 'record' in unwrapped_data:
+            return {
+                **unwrapped_data['record'].get('metadata', {}).get('mods:mods', {}),
+                'header': unwrapped_data['record']['header'],
+            }
+        elif 'mods:mods' in unwrapped_data:
+            return unwrapped_data['mods:mods']
+        raise ValueError('Unrecognized MODS wrapper!\n{}'.format(data))
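Review note: `unwrap_data` now accepts both wrapper shapes: the OAI-PMH `record`/`header` envelope and the bare `mods:mods` root that `TindHarvester` yields. A minimal sketch of how the two shapes parse under `xmltodict` (the sample XML is made up; the namespace map mirrors the transformer's URI-to-prefix convention):

```python
import xmltodict

NAMESPACES = {'http://www.loc.gov/mods/v3': 'mods'}

# OAI-PMH style: metadata nested inside a 'record' envelope with a header.
oai = (
    '<record>'
    '<header><identifier>oai:example:1</identifier><datestamp>2017-07-01</datestamp></header>'
    '<metadata><mods xmlns="http://www.loc.gov/mods/v3">'
    '<titleInfo><title>Example</title></titleInfo></mods></metadata>'
    '</record>'
)

# TIND style: a bare mods root, as yielded record-by-record by TindHarvester.
tind = ('<mods xmlns="http://www.loc.gov/mods/v3">'
        '<titleInfo><title>Example</title></titleInfo></mods>')

for doc in (oai, tind):
    parsed = xmltodict.parse(doc, process_namespaces=True, namespaces=NAMESPACES)
    print(list(parsed))  # the top-level key decides which branch unwrap_data takes
# ['record']
# ['mods:mods']
```

The `record` branch keeps `ctx.header.datestamp` available for OAI sources, while TIND records fall through to `mods:recordInfo`/`mods:recordChangeDate` in the new `date_updated` chain.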