-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Python3 support #1
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,14 +2,23 @@ | |
|
||
import json | ||
import uuid | ||
from urlparse import urlparse | ||
|
||
import requests | ||
from dateutil.parser import parse | ||
from simplejson.scanner import JSONDecodeError | ||
|
||
try: | ||
from urllib.parse import urlparse | ||
except ImportError: | ||
from urlparse import urlparse | ||
|
||
try: | ||
from json import JSONDecodeError | ||
except ImportError: | ||
from simplejson.scanner import JSONDecodeError | ||
|
||
|
||
Comment on lines
+7
to
+18
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Try the Python 3 import first and fall back to the Python 2 one. |
||
|
||
from ckan import model | ||
from ckan.lib.munge import munge_title_to_name, munge_tag | ||
from ckan.lib.munge import munge_tag | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Removed an unused import (munge_title_to_name). |
||
from ckan.plugins.core import implements | ||
import ckan.plugins.toolkit as toolkit | ||
from ckanext.harvest.interfaces import IHarvester | ||
|
@@ -152,7 +161,6 @@ def _build_package_dict(self, context, harvest_object): | |
'owner_org': local_org, | ||
'resources': [], | ||
} | ||
|
||
# Add tags | ||
package_dict['tags'] = \ | ||
[{'name': munge_tag(t)} | ||
|
@@ -205,11 +213,35 @@ def _build_package_dict(self, context, harvest_object): | |
'url': DOWNLOAD_ENDPOINT_TEMPLATE.format( | ||
domain=urlparse(harvest_object.source.url).hostname, | ||
resource_id=res['resource']['id']), | ||
'format': 'CSV' | ||
'format': 'CSV', | ||
'name': res['resource']['name'] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Setting the resource name avoids all the data files being called "Unnamed Resource". |
||
}] | ||
|
||
return package_dict | ||
|
||
|
||
def _set_config(self, config_str):
    '''
    Load the harvest source configuration and resolve the API endpoint.

    The defaults are applied first and the JSON object in ``config_str``
    (if any) is merged on top, so a configuration that omits
    ``base_api_endpoint`` still produces a usable value.

    :param config_str: JSON-encoded configuration of the harvest source;
        may be empty or None, in which case only the defaults are used.
    '''
    self.config = {'base_api_endpoint': BASE_API_ENDPOINT}
    if config_str:
        # Merge rather than replace: replacing dropped the default and
        # left self.base_api_endpoint unset for configs without the key.
        self.config.update(json.loads(config_str))

    # Always assign the attribute; the request code reads
    # self.base_api_endpoint unconditionally.
    self.base_api_endpoint = self.config['base_api_endpoint']

    log.debug('Using config: %r', self.config)
|
||
def validate_config(self, config):
    '''
    Validate the JSON configuration string of a harvest source.

    An empty configuration is accepted as-is. Otherwise the configuration
    must decode to a JSON object, and ``base_api_endpoint``, when present,
    must be a URL with both a scheme and a network location.

    :param config: JSON-encoded configuration string (may be empty/None).
    :returns: the unchanged ``config`` value when it is valid.
    :raises ValueError: if the JSON is malformed, is not an object, or
        ``base_api_endpoint`` is not a valid URL.
    '''
    if not config:
        return config

    config_obj = json.loads(config)
    # Valid JSON such as `5` or `[1]` would otherwise surface as a
    # TypeError (or slip through) instead of a validation error.
    if not isinstance(config_obj, dict):
        raise ValueError('Configuration must be a JSON object')

    if 'base_api_endpoint' in config_obj:
        try:
            parsed = urlparse(config_obj['base_api_endpoint'])
        except (AttributeError, TypeError):
            # urlparse fails this way on non-string values (numbers, lists).
            raise ValueError('base_api_endpoint must be a valid URL')
        if not parsed.scheme or not parsed.netloc:
            raise ValueError('base_api_endpoint must be a valid URL')
    return config
|
||
Comment on lines
+221
to
+244
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Adds configuration for base_api_endpoint (we need to use the .eu version). _set_config is called by the gather stage; validate_config is called when the source is created. |
||
def process_package(self, package, harvest_object): | ||
''' | ||
Subclasses can override this method to perform additional processing on | ||
|
@@ -237,7 +269,7 @@ def _request_datasets_from_socrata(domain, limit=100, offset=0): | |
api_request_url = \ | ||
'{0}?domains={1}&search_context={1}' \ | ||
'&only=datasets&limit={2}&offset={3}' \ | ||
.format(BASE_API_ENDPOINT, domain, limit, offset) | ||
.format(self.base_api_endpoint, domain, limit, offset) | ||
log.debug('Requesting {}'.format(api_request_url)) | ||
api_response = requests.get(api_request_url) | ||
|
||
|
@@ -266,8 +298,9 @@ def _page_datasets(domain, batch_number): | |
_request_datasets_from_socrata(domain, batch_number, | ||
current_offset) | ||
if datasets is None or len(datasets) == 0: | ||
raise StopIteration | ||
break | ||
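There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I found that raising StopIteration was just stopping the whole process: since PEP 479 (Python 3.7+), a StopIteration raised inside a generator is turned into a RuntimeError, so `break` is used to end the generator instead. |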
||
current_offset = current_offset + batch_number | ||
log.debug(f'Continued with {current_offset}-{batch_number}') | ||
for dataset in datasets: | ||
yield dataset | ||
|
||
|
@@ -292,7 +325,7 @@ def _make_harvest_objs(datasets): | |
|
||
log.debug('In SocrataHarvester gather_stage (%s)', | ||
harvest_job.source.url) | ||
|
||
self._set_config(harvest_job.source.config) | ||
domain = urlparse(harvest_job.source.url).hostname | ||
|
||
object_ids, guids = _make_harvest_objs(_page_datasets(domain, 100)) | ||
|
@@ -396,7 +429,7 @@ def import_stage(self, harvest_object): | |
|
||
else: | ||
# We need to explicitly provide a package ID | ||
package_dict['id'] = unicode(uuid.uuid4()) | ||
package_dict['id'] = str(uuid.uuid4()) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Changed for Python 3 compatibility (`unicode` does not exist in Python 3). |
||
|
||
harvest_object.package_id = package_dict['id'] | ||
harvest_object.add() | ||
|
@@ -420,3 +453,5 @@ def import_stage(self, harvest_object): | |
return False | ||
|
||
return True | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adds information about the config object