From 67824003e02b27574de9b6f49313043248d5bcff Mon Sep 17 00:00:00 2001 From: Paul Norman Date: Wed, 25 Mar 2020 22:01:01 -0700 Subject: [PATCH] Move external shapefiles to tables in the DB This adds a script that loads files into the DB based on a YAML file listing the data sources. The script can be run while rendering is going on, as it swaps old tables with new ones in a transaction. Loading is done by using ogr2ogr to load into a temporary schema, clustering, then the swap in transaction. The status of the tables is tracked in the `external_data` table, which lists the last modified date of each table. This allows the loading script to use conditional GETs and only download and update for sources which have changed. --- .travis.yml | 2 - DOCKER.md | 2 +- INSTALL.md | 21 +- external-data.yml | 81 +++++++ project.mml | 47 ++-- scripts/docker-startup.sh | 2 +- scripts/get-external-data.py | 258 ++++++++++++++++++++ scripts/get-shapefiles.py | 451 ----------------------------------- 8 files changed, 376 insertions(+), 488 deletions(-) create mode 100644 external-data.yml create mode 100755 scripts/get-external-data.py delete mode 100755 scripts/get-shapefiles.py diff --git a/.travis.yml b/.travis.yml index 15397f385c..b07eae786c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,8 +21,6 @@ env: install: - npm install carto@$CARTO - pip3 install --user colormath - - mkdir -p data/simplified-water-polygons-split-3857 data/ne_110m_admin_0_boundary_lines_land data/water-polygons-split-3857 - - touch data/simplified-water-polygons-split-3857/simplified_water_polygons.shp data/ne_110m_admin_0_boundary_lines_land/ne_110m_admin_0_boundary_lines_land.shp data/water-polygons-split-3857/water_polygons.shp - createdb -w -E utf8 -U postgres gis && psql -Xq -d gis -U postgres -w -c "CREATE EXTENSION postgis; CREATE EXTENSION hstore;" script: # We're using pipes in the checks, so fail if any part fails diff --git a/DOCKER.md b/DOCKER.md index f2028be24e..0fc7418cf6 100644 --- 
a/DOCKER.md +++ b/DOCKER.md @@ -61,7 +61,7 @@ Depending on your machine and the size of the extract the import can take a whil ## Test rendering After you have the necessary data available you can start Kosmtik to produce a test rendering. For that you run `docker-compose up kosmtik` in the openstreetmap-carto directory. This starts a container with Kosmtik and also starts the PostgreSQL database container if it is not already running. The Kosmtik container is built the first time you run that command if it not exists. -At startup of the container the script `scripts/docker-startup.sh` is invoked which downloads necessary shapefiles with `scripts/get-shapefiles.py` (if they are not already present) and indexes them. It afterwards runs Kosmtik. If you have to customize anything, you can do so in the script. The Kosmtik config file can be found in `.kosmtik-config.yml`. +At startup of the container the script `scripts/docker-startup.sh` is invoked which downloads necessary shapefiles with `scripts/get-external-data.py` (if they are not already present). It afterwards runs Kosmtik. If you have to customize anything, you can do so in the script. The Kosmtik config file can be found in `.kosmtik-config.yml`. If you want to have a [local configuration](https://github.com/kosmtik/kosmtik#local-config) for our `project.mml` you can place a `localconfig.js` or `localconfig.json` file into the openstreetmap-carto directory. The shapefile data that is downloaded is owned by the user with UID 1000. If you have another default user id on your system, consider changing the line `USER 1000` in the file `Dockerfile`. diff --git a/INSTALL.md b/INSTALL.md index 83d68b4376..f9f4754151 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -36,28 +36,14 @@ psql -d gis -f indexes.sql ## Scripted download Some features are rendered using preprocessed shapefiles. -To obtain them you can run the following script. 
+To download them and import them into the database you can run the following script: ``` -scripts/get-shapefiles.py +scripts/get-external-data.py ``` -This script downloads necessary files, generates and populates the *data* directory with all needed shapefiles, including indexing them through *shapeindex*. +This script downloads the necessary files, unpacks them into the *data* directory and loads them into the database with *ogr2ogr*. -## Manual download - -You can also download them manually at the following paths: - -* [`simplified_water_polygons.shp`](https://osmdata.openstreetmap.de/download/simplified-water-polygons-split-3857.zip) (updated daily) -* [`ne_110m_admin_0_boundary_lines_land.shp`](http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/110m/cultural/ne_110m_admin_0_boundary_lines_land.zip) -* [`water_polygons.shp`](https://osmdata.openstreetmap.de/download/water-polygons-split-3857.zip) (updated daily) -* [`icesheet_polygons.shp`](https://osmdata.openstreetmap.de/download/antarctica-icesheet-polygons-3857.zip) -* [`icesheet_outlines.shp`](https://osmdata.openstreetmap.de/download/antarctica-icesheet-outlines-3857.zip) - -The repeated www.naturalearthdata.com in the Natural Earth shapefiles is correct. - -Put these shapefiles at `path/to/openstreetmap-carto/data`. - ## Fonts The stylesheet uses Noto, an openly licensed font family from Google with support for multiple scripts. The stylesheet uses Noto's "Sans" style where available. If not available, this stylesheet uses another appropriate style of the Noto family. The "UI" version is used where available, with its vertical metrics which fit better with Latin text. 
@@ -116,8 +102,7 @@ To display any map a database containing OpenStreetMap data and some utilities a * [PostgreSQL](https://www.postgresql.org/) * [PostGIS](https://postgis.net/) * [osm2pgsql](https://github.com/openstreetmap/osm2pgsql#installing) to [import your data](https://switch2osm.org/loading-osm-data/) into a PostGIS database -* `curl` and `unzip` for downloading and decompressing files -* shapeindex (a companion utility to Mapnik found in the `mapnik-utils` package) for indexing downloaded shapefiles +* `ogr2ogr` for loading shapefiles into the database ### Optional development dependencies diff --git a/external-data.yml b/external-data.yml new file mode 100644 index 0000000000..5c231e885f --- /dev/null +++ b/external-data.yml @@ -0,0 +1,81 @@ +settings: + temp_schema: loading + schema: public + data_dir: data + database: gis + metadata_table: external_data +sources: + simplified_water_polygons: + # The type of file this source is + type: shp + # Where to get it + url: https://osmdata.openstreetmap.de/download/simplified-water-polygons-split-3857.zip + # The location within the archive + file: simplified-water-polygons-split-3857/simplified_water_polygons.shp + archive: + format: zip + # Files to extract from the archive + files: + - simplified-water-polygons-split-3857/simplified_water_polygons.cpg + - simplified-water-polygons-split-3857/simplified_water_polygons.dbf + - simplified-water-polygons-split-3857/simplified_water_polygons.prj + - simplified-water-polygons-split-3857/simplified_water_polygons.shp + - simplified-water-polygons-split-3857/simplified_water_polygons.shx + water_polygons: + type: shp + url: https://osmdata.openstreetmap.de/download/water-polygons-split-3857.zip + file: water-polygons-split-3857/water_polygons.shp + archive: + format: zip + files: + - water-polygons-split-3857/water_polygons.cpg + - water-polygons-split-3857/water_polygons.dbf + - water-polygons-split-3857/water_polygons.prj + - 
water-polygons-split-3857/water_polygons.shp + - water-polygons-split-3857/water_polygons.shx + icesheet_polygons: + type: shp + url: https://osmdata.openstreetmap.de/download/antarctica-icesheet-polygons-3857.zip + file: antarctica-icesheet-polygons-3857/icesheet_polygons.shp + archive: + format: zip + files: + - antarctica-icesheet-polygons-3857/icesheet_polygons.cpg + - antarctica-icesheet-polygons-3857/icesheet_polygons.dbf + - antarctica-icesheet-polygons-3857/icesheet_polygons.prj + - antarctica-icesheet-polygons-3857/icesheet_polygons.shp + - antarctica-icesheet-polygons-3857/icesheet_polygons.shx + icesheet_outlines: + type: shp + url: https://osmdata.openstreetmap.de/download/antarctica-icesheet-outlines-3857.zip + file: antarctica-icesheet-outlines-3857/icesheet_outlines.shp + ogropts: + - "-explodecollections" + archive: + format: zip + files: + - antarctica-icesheet-outlines-3857/icesheet_outlines.cpg + - antarctica-icesheet-outlines-3857/icesheet_outlines.dbf + - antarctica-icesheet-outlines-3857/icesheet_outlines.prj + - antarctica-icesheet-outlines-3857/icesheet_outlines.shp + - antarctica-icesheet-outlines-3857/icesheet_outlines.shx + + ne_110m_admin_0_boundary_lines_land: + type: shp + url: http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/110m/cultural/ne_110m_admin_0_boundary_lines_land.zip + file: ne_110m_admin_0_boundary_lines_land.shp + ogropts: &ne_opts + - "--config" + - "SHAPE_ENCODING" + - "WINDOWS-1252" + - "-explodecollections" + # needs reprojecting + - '-t_srs' + - 'EPSG:3857' + archive: + format: zip + files: + - ne_110m_admin_0_boundary_lines_land.dbf + - ne_110m_admin_0_boundary_lines_land.prj + - ne_110m_admin_0_boundary_lines_land.shp + - ne_110m_admin_0_boundary_lines_land.shx diff --git a/project.mml b/project.mml index 912708c3aa..3b6895d2e3 100644 --- a/project.mml +++ b/project.mml @@ -24,10 +24,6 @@ _parts: extent: *world srs-name: "900913" srs: "+proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 
+x_0=0.0 +y_0=0.0 +k=1.0 +units=m +nadgrids=@null +wktext +no_defs +over" - extents84: &extents84 - extent: *world - srs-name: "WGS84" - srs: "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" osm2pgsql: &osm2pgsql type: "postgis" dbname: "gis" @@ -157,8 +153,12 @@ Layer: geometry: polygon <<: *extents Datasource: - file: data/antarctica-icesheet-polygons-3857/icesheet_polygons.shp - type: shape + <<: *osm2pgsql + table: |- + (SELECT + way + FROM icesheet_polygons + ) AS icesheet_polygons properties: minzoom: 5 - id: water-lines-casing @@ -255,16 +255,24 @@ Layer: geometry: polygon <<: *extents Datasource: - file: data/simplified-water-polygons-split-3857/simplified_water_polygons.shp - type: shape + <<: *osm2pgsql + table: |- + (SELECT + way + FROM simplified_water_polygons + ) AS ocean_lz properties: maxzoom: 9 - id: ocean geometry: polygon <<: *extents Datasource: - file: data/water-polygons-split-3857/water_polygons.shp - type: shape + <<: *osm2pgsql + table: |- + (SELECT + way + FROM water_polygons + ) AS ocean properties: minzoom: 10 - id: landcover-area-symbols @@ -301,8 +309,13 @@ Layer: geometry: linestring <<: *extents Datasource: - file: data/antarctica-icesheet-outlines-3857/icesheet_outlines.shp - type: shape + <<: *osm2pgsql + table: |- + (SELECT + way, + ice_edge + FROM icesheet_outlines + ) AS icesheet_outlines properties: minzoom: 5 - id: marinas-area @@ -1027,10 +1040,14 @@ Layer: minzoom: 11 - id: necountries geometry: linestring - <<: *extents84 + <<: *extents Datasource: - file: data/ne_110m_admin_0_boundary_lines_land/ne_110m_admin_0_boundary_lines_land.shp - type: shape + <<: *osm2pgsql + table: |- + (SELECT + way + FROM ne_110m_admin_0_boundary_lines_land + ) AS necountries properties: minzoom: 1 maxzoom: 3 diff --git a/scripts/docker-startup.sh b/scripts/docker-startup.sh index a3557e97bd..61641f6f1b 100644 --- a/scripts/docker-startup.sh +++ b/scripts/docker-startup.sh @@ -54,7 +54,7 @@ EOF kosmtik) # Downloading needed shapefiles - 
python3 scripts/get-shapefiles.py -n + scripts/get-external-data.py # Creating default Kosmtik settings file if [ ! -e ".kosmtik-config.yml" ]; then
diff --git a/scripts/get-external-data.py b/scripts/get-external-data.py
new file mode 100755
index 0000000000..082173bb53
--- /dev/null
+++ b/scripts/get-external-data.py
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+'''This script is designed to load quasi-static data into a PostGIS database
+for rendering maps. It differs from the usual scripts to do this in that it is
+designed to take its configuration from a file rather than be a series of shell
+commands.
+
+Some implicit assumptions are
+- Time spent querying (rendering) the data is more valuable than the one-time
+  cost of loading it
+- The script will not be running multiple times in parallel. This is not
+  normally likely because the script is likely to be called daily or less,
+  not minutely.
+- Usage patterns will be similar to typical map rendering
+'''
+
+import yaml
+import os
+import re
+import argparse
+import shutil
+
+# modules for getting data
+import zipfile
+import requests
+import io
+
+# modules for converting and postgres loading
+import subprocess
+import psycopg2
+
+import logging
+
+
+def database_setup(conn, temp_schema, schema, metadata_table):
+    '''Create the loading schema and the metadata table if they do not exist.
+
+    conn is an open psycopg2 connection; the statements are committed before
+    returning. The metadata table is created in schema with a name column
+    (the primary key) and a last_modified column, both text.
+    '''
+    with conn.cursor() as cur:
+        cur.execute('''CREATE SCHEMA IF NOT EXISTS {temp_schema};'''
+                    .format(temp_schema=temp_schema))
+        cur.execute(('''CREATE TABLE IF NOT EXISTS "{schema}"."{metadata_table}"'''
+                     ''' (name text primary key, last_modified text);''')
+                    .format(schema=schema, metadata_table=metadata_table))
+    conn.commit()
+
+
+class Table:
+    '''A single external table and the SQL used to load and publish it.
+
+    The table is loaded into temp_schema, prepared there and finally moved
+    into schema. Identifiers are interpolated into the SQL text as-is, so the
+    caller has to make sure name contains nothing that needs escaping.
+    '''
+
+    def __init__(self, name, conn, temp_schema, schema, metadata_table):
+        self._name = name
+        self._conn = conn
+        self._temp_schema = temp_schema
+        self._dst_schema = schema
+        self._metadata_table = metadata_table
+
+    def clean_temp(self):
+        '''Clean up the temporary schema in preparation for loading.'''
+        with self._conn.cursor() as cur:
+            cur.execute('''DROP TABLE IF EXISTS "{temp_schema}"."{name}"'''
+                        .format(name=self._name, temp_schema=self._temp_schema))
+        self._conn.commit()
+
+    def last_modified(self):
+        '''Return the last modified date from the metadata table.
+
+        Returns None when the table has no row in the metadata table yet.
+        '''
+        with self._conn.cursor() as cur:
+            cur.execute('''SELECT last_modified FROM "{schema}"."{metadata_table}" WHERE name = %s'''
+                        .format(schema=self._dst_schema, metadata_table=self._metadata_table), [self._name])
+            results = cur.fetchone()
+        if results is None:
+            return None
+        return results[0]
+
+    def index(self):
+        '''Prepare the newly loaded table in the temporary schema.
+
+        Drops the ogc_fid column and rows without a geometry, clusters the
+        table on the envelope of way, creates the GiST index on way and
+        finishes with VACUUM ANALYZE.
+        '''
+        with self._conn.cursor() as cur:
+            # Disable autovacuum while manipulating the table, since it'll get clustered towards the end.
+            cur.execute('''ALTER TABLE "{temp_schema}"."{name}" SET ( autovacuum_enabled = FALSE );'''
+                        .format(name=self._name, temp_schema=self._temp_schema))
+            # ogr creates a ogc_fid column we don't need
+            cur.execute('''ALTER TABLE "{temp_schema}"."{name}" DROP COLUMN ogc_fid;'''
+                        .format(name=self._name, temp_schema=self._temp_schema))
+
+            # Null geometries are useless for rendering
+            cur.execute('''DELETE FROM "{temp_schema}"."{name}" WHERE way IS NULL;'''
+                        .format(name=self._name, temp_schema=self._temp_schema))
+            cur.execute('''ALTER TABLE "{temp_schema}"."{name}" ALTER COLUMN way SET NOT NULL;'''
+                        .format(name=self._name, temp_schema=self._temp_schema))
+            # sorting static tables helps performance and reduces size from the column drop above
+            cur.execute(('''CREATE INDEX "{name}_order" ON "{temp_schema}"."{name}" '''
+                         '''(ST_Envelope(way));'''
+                         '''CLUSTER "{temp_schema}"."{name}" '''
+                         '''USING "{name}_order";'''
+                         '''DROP INDEX "{temp_schema}"."{name}_order";'''
+                         '''CREATE INDEX ON "{temp_schema}"."{name}" '''
+                         '''USING GIST (way) WITH (fillfactor=100);''')
+                        .format(name=self._name, temp_schema=self._temp_schema))
+            # Reset autovacuum. The table is static, so this doesn't really
+            # matter since it'll never need a vacuum.
+            cur.execute('''ALTER TABLE "{temp_schema}"."{name}" RESET ( autovacuum_enabled );'''
+                        .format(name=self._name, temp_schema=self._temp_schema))
+        self._conn.commit()
+
+        # VACUUM can't be run in transaction, so autocommit needs to be turned on
+        old_autocommit = self._conn.autocommit
+        try:
+            self._conn.autocommit = True
+            with self._conn.cursor() as cur:
+                cur.execute('''VACUUM ANALYZE "{temp_schema}"."{name}";'''
+                            .format(name=self._name, temp_schema=self._temp_schema))
+        finally:
+            self._conn.autocommit = old_autocommit
+
+    def replace(self, new_last_modified):
+        '''Swap the loaded table into the destination schema.
+
+        The old table is dropped, the new one is moved over from the temporary
+        schema and the metadata row is set to new_last_modified. Everything is
+        committed together.
+        '''
+        with self._conn.cursor() as cur:
+            cur.execute('''BEGIN;''')
+            cur.execute(('''DROP TABLE IF EXISTS "{schema}"."{name}";'''
+                         '''ALTER TABLE "{temp_schema}"."{name}" SET SCHEMA "{schema}";''')
+                        .format(name=self._name, temp_schema=self._temp_schema, schema=self._dst_schema))
+
+            # We checked if the metadata table had this table way up above
+            cur.execute('''SELECT 1 FROM "{schema}"."{metadata_table}" WHERE name = %s'''
+                        .format(schema=self._dst_schema, metadata_table=self._metadata_table),
+                        [self._name])
+            if cur.rowcount == 0:
+                cur.execute(('''INSERT INTO "{schema}"."{metadata_table}" '''
+                             '''(name, last_modified) VALUES (%s, %s)''')
+                            .format(schema=self._dst_schema, metadata_table=self._metadata_table),
+                            [self._name, new_last_modified])
+            else:
+                cur.execute('''UPDATE "{schema}"."{metadata_table}" SET last_modified = %s WHERE name = %s'''
+                            .format(schema=self._dst_schema, metadata_table=self._metadata_table),
+                            [new_last_modified, self._name])
+        self._conn.commit()
+
+
+def _ogr_connection_string(database, host, port, user):
+    '''Build the "PG:" connection string handed to ogr2ogr.
+
+    Options that are None are left out instead of being written as the text
+    "None".
+    '''
+    options = [("dbname", database), ("port", port), ("user", user), ("host", host)]
+    return "PG:" + " ".join("{}={}".format(key, value)
+                            for key, value in options if value is not None)
+
+
+def main():
+    '''Parse the command line, then check and load every configured source.'''
+    # parse options
+    parser = argparse.ArgumentParser(description="Load external data into a database")
+
+    parser.add_argument("-f", "--force", action="store_true", help="Download new data, even if not required")
+
+    parser.add_argument("-c", "--config", action="store", default="external-data.yml",
+                        help="Name of configuration file (default external-data.yml)")
+    parser.add_argument("-D", "--data", action="store", help="Override data download directory")
+
+    parser.add_argument("-d", "--database", action="store", help="Override database name to connect to")
+    parser.add_argument("-H", "--host", action="store",
+                        help="Override database server host or socket directory")
+    parser.add_argument("-p", "--port", action="store", help="Override database server port")
+    parser.add_argument("-U", "--username", action="store", help="Override database user name")
+    parser.add_argument("-v", "--verbose", action="store_true", help="Be more verbose. Overrides -q")
+    parser.add_argument("-q", "--quiet", action="store_true", help="Only report serious problems")
+
+    opts = parser.parse_args()
+
+    if opts.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    elif opts.quiet:
+        logging.basicConfig(level=logging.WARNING)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    # The file only has to stay open while it is being parsed
+    with open(opts.config) as config_file:
+        config = yaml.safe_load(config_file)
+    settings = config["settings"]
+
+    data_dir = opts.data or settings["data_dir"]
+    os.makedirs(data_dir, exist_ok=True)
+
+    # If the DB options are unspecified both on the command line and in the
+    # config file, libpq will pick what to use with the None
+    database = opts.database or settings.get("database")
+    host = opts.host or settings.get("host")
+    port = opts.port or settings.get("port")
+    user = opts.username or settings.get("username")
+    with requests.Session() as s, \
+            psycopg2.connect(database=database,
+                             host=host, port=port,
+                             user=user) as conn:
+
+        s.headers.update({'User-Agent': 'get-external-data.py/osm-carto'})
+
+        # DB setup
+        database_setup(conn, settings["temp_schema"],
+                       settings["schema"],
+                       settings["metadata_table"])
+
+        for name, source in config["sources"].items():
+            logging.info("Checking table {}".format(name))
+            # Don't attempt to handle strange names
+            # Even if there was code to escape them properly here, you don't want
+            # them in a style with all the quoting headaches
+            if not re.match('''^[a-zA-Z0-9_]+$''', name):
+                raise RuntimeError("Only ASCII alphanumeric table names are supported")
+
+            workingdir = os.path.join(data_dir, name)
+            # Clean up anything left over from an aborted run
+            shutil.rmtree(workingdir, ignore_errors=True)
+
+            os.makedirs(workingdir, exist_ok=True)
+
+            this_table = Table(name, conn,
+                               settings["temp_schema"],
+                               settings["schema"],
+                               settings["metadata_table"])
+            this_table.clean_temp()
+
+            # Only make the GET conditional when a previous load was recorded
+            headers = {}
+            if not opts.force:
+                last_modified = this_table.last_modified()
+                if last_modified is not None:
+                    headers['If-Modified-Since'] = last_modified
+
+            download = s.get(source["url"], headers=headers)
+            download.raise_for_status()
+
+            if download.status_code == 200:
+                new_last_modified = download.headers.get("Last-Modified")
+                if "archive" in source and source["archive"]["format"] == "zip":
+                    with zipfile.ZipFile(io.BytesIO(download.content)) as archive:
+                        for member in source["archive"]["files"]:
+                            archive.extract(member, workingdir)
+
+                ogrpg = _ogr_connection_string(database, host, port, user)
+
+                ogrcommand = ["ogr2ogr",
+                              '-f', 'PostgreSQL',
+                              '-lco', 'GEOMETRY_NAME=way',
+                              '-lco', 'SPATIAL_INDEX=FALSE',
+                              '-lco', 'EXTRACT_SCHEMA_FROM_LAYER_NAME=YES',
+                              '-nln', "{}.{}".format(settings["temp_schema"], name)]
+
+                if "ogropts" in source:
+                    ogrcommand += source["ogropts"]
+
+                ogrcommand += [ogrpg, os.path.join(workingdir, source["file"])]
+
+                logging.debug("running {}".format(subprocess.list2cmdline(ogrcommand)))
+
+                # ogr2ogr can raise errors here, so they need to be caught
+                try:
+                    subprocess.check_output(ogrcommand, stderr=subprocess.PIPE, universal_newlines=True)
+                except subprocess.CalledProcessError as e:
+                    # stderr is captured separately from stdout, so log both
+                    logging.critical("ogr2ogr returned {} with layer {}".format(e.returncode, name))
+                    logging.critical("Command line was {}".format(subprocess.list2cmdline(e.cmd)))
+                    logging.critical("Output was\n{}".format(e.output))
+                    logging.critical("Error was\n{}".format(e.stderr))
+                    raise RuntimeError("ogr2ogr error when loading table {}".format(name)) from e
+
+                this_table.index()
+                this_table.replace(new_last_modified)
+            else:
+                logging.info("Table {} did not require updating".format(name))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/get-shapefiles.py b/scripts/get-shapefiles.py deleted file mode 100755 index 2dba3ca35e..0000000000 --- a/scripts/get-shapefiles.py +++ /dev/null @@ -1,451 +0,0 @@ -#!/usr/bin/env python3 - -# This script generates and populates the 'data' directory with all needed -# shapefiles. - -import os -import errno -import tarfile -import zipfile -import subprocess -import distutils.spawn -import argparse -import sys -import tempfile -import logging -import time -import email.utils -import atexit -import time - -import urllib.request as urllib2 -import urllib.parse as urlparse - -start = time.time() -data_dir = 'data' -settings = { - # Keys 1, 2, 3, ... set the arg short-options and the related process - # ordering. Use > 0 to allow processing. 
- 1: { - 'directory': 'simplified-water-polygons-split-3857', - 'url': 'https://osmdata.openstreetmap.de/download/simplified-water-polygons-split-3857.zip', # noqa - 'type': 'zip', - 'shp_basename': ['simplified_water_polygons'], - 'long_opt': '--simplified-water' - }, - - 2: { - 'directory': 'ne_110m_admin_0_boundary_lines_land', - 'url': 'http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/110m/cultural/ne_110m_admin_0_boundary_lines_land.zip', # noqa - 'type': 'zip_dir', - 'shp_basename': ['ne_110m_admin_0_boundary_lines_land'], - 'long_opt': '--ne-admin' - }, - - 3: { - 'directory': 'water-polygons-split-3857', - 'url': 'https://osmdata.openstreetmap.de/download/water-polygons-split-3857.zip', # noqa - 'type': 'zip', - 'shp_basename': ['water_polygons'], - 'long_opt': '--water-polygons' - }, - - 4: { - 'directory': 'antarctica-icesheet-polygons-3857', - 'url': 'https://osmdata.openstreetmap.de/download/antarctica-icesheet-polygons-3857.zip', # noqa - 'type': 'zip', - 'shp_basename': ['icesheet_polygons'], - 'long_opt': '--icesheet-polygons' - }, - - 5: { - 'directory': 'antarctica-icesheet-outlines-3857', - 'url': 'https://osmdata.openstreetmap.de/download/antarctica-icesheet-outlines-3857.zip', # noqa - 'type': 'zip', - 'shp_basename': ['icesheet_outlines'], - 'long_opt': '--icesheet-outlines' - } -} - -u_prompt = True - - -def exit_handler(dir_path): - # Removing empty directory - try: - os.rmdir(dir_path) - except Exception: - pass - - -def download_file( - url, - desc=None, - option_force_update=False, - option_no_curl=False): - global u_prompt - try: - scheme, netloc, path, query, fragment = urlparse.urlsplit(url) - file_name = os.path.basename(path) - if not file_name: - file_name = 'downloaded.file' - if desc: - file_name = os.path.join(desc, file_name) - - org_file_modified = None - org_file_size = None - if os.path.exists(file_name): - org_file_modified = time.localtime((os.path.getmtime(file_name))) - org_file_size = 
int(os.path.getsize(file_name)) - - curl_used = 0 - if not option_no_curl and distutils.spawn.find_executable("curl"): - curl_used = 1 - sys.stdout.flush() - if os.path.exists(file_name) and not option_force_update: - if subprocess.call( - ["curl", "-R", "-z", file_name, "-L", "-o", file_name, url], - stderr=subprocess.STDOUT) != 0: - sys.exit("\n\n 'curl' error: download failed.\n") - curl_used = 2 - else: - if subprocess.call( - ["curl", "-R", "-L", "-o", file_name, url], - stderr=subprocess.STDOUT) != 0: - sys.exit("\n\n 'curl' error: download failed.\n") - sys.stdout.flush() - - u = urllib2.urlopen(url) - meta = u.info() - - # Compare dates and sizes - local_file_modified = None - local_file_size = None - if os.path.exists(file_name): - local_file_modified = time.localtime((os.path.getmtime(file_name))) - local_file_size = int(os.path.getsize(file_name)) - meta_func = meta.getheaders if hasattr( - meta, 'getheaders') else meta.get_all - host_file_modified = email.utils.parsedate( - meta_func("last-modified")[0]) - meta_length = meta_func("Content-Length") - host_file_size = None - if meta_length: - host_file_size = int(meta_length[0]) - - # Do a file check control after using curl (which looks like not - # including it internally) - if curl_used == 2 and (host_file_size != local_file_size): - print( - " Warning: file size differs. 
Downloading the file again.") - curl_used = 0 - if curl_used > 0: - u.close() - if (not option_force_update and local_file_size is not None and - (org_file_modified == local_file_modified) and - (org_file_size == local_file_size)): - print(" No newer file to download.") - return file_name, 0 - else: - return file_name, 1 - - if (not option_force_update and os.path.exists(file_name) and - (host_file_modified <= local_file_modified) and - (host_file_size == local_file_size)): - print(" No newer file to download.", end="") - if u_prompt: - print(" (Use -u to force downloading file)", end="") - u_prompt = False - print() - u.close() - return file_name, 0 - - with open(file_name, 'wb') as f: - print(" Bytes: {0:10}".format(host_file_size)) - - file_size_dl = 0 - block_sz = 65536 - while True: - buffer = u.read(block_sz) - if not buffer: - if file_size_dl != host_file_size: - sys.exit("\n\n Error: download with invalid size.\n") - break - - file_size_dl += len(buffer) - f.write(buffer) - - status = "{0:18}".format(file_size_dl) - if host_file_size: - status += " [{0:3.0f}%]".format( - file_size_dl * - 100 / - host_file_size) - status += chr(13) - print(status, end="") - f.close() - u.close() - os.utime( - file_name, - (time.mktime(host_file_modified), - time.mktime(host_file_modified))) - print() - - return file_name, 2 - except urllib2.HTTPError as e: - sys.exit( - "\n\n Error: download failed. 
(error code: " + - str(e.code) + - ", error reason: " + e.reason + ")\n") - except Exception as e: - sys.exit("\n\n Error: download failed.\n" + str(e) + "\n") - - -def main(): - - # Option handling - - parser = argparse.ArgumentParser( - epilog="This script generates and populates the '" + data_dir + - "' directory with all needed shapefiles, including indexing " + - " them through shapeindex.") - parser.add_argument( - '-c', "--check", dest='option_check_mode', action='store_true', - help="check whether the '" + data_dir + "' directory already exists") - parser.add_argument( - "-d", "--directory", dest="data_dir", - help="set the name of the data directory (default: '" + - data_dir + "')", - default=data_dir, metavar="") - parser.add_argument( - '-e', "--no-extract", dest='option_no_extract', action='store_true', - help="do not populate target directories with the expansion " + - "of downloaded data") - parser.add_argument( - '-f', "--force", dest='option_force', action='store_true', - help="force continuing even if project.mml does not exist") - parser.add_argument( - '-l', "--no-curl", dest='option_no_curl', action='store_true', - help="do not use 'curl' even if available") - parser.add_argument( - '-n', "--no-download", dest='option_no_download', action='store_true', - help="do not download archive if already existing locally") - parser.add_argument( - '-p', "--pause", dest='option_pause_mode', action='store_true', - help="pause before starting") - parser.add_argument( - '-r', "--remove", dest='option_remove', action='store_true', - help="remove each downloaded archive after its expansion") - parser.add_argument( - '-s', "--no-shape", dest='option_no_shape', action='store_true', - help="do not run shapeindex") - parser.add_argument( - '-u', "--update", dest='option_force_update', action='store_true', - help="force performing an update operation even if not needed " + - "(e.g., downloading, expanding, indexing)") - for element in sorted(settings): - 
parser.add_argument( - settings[element]['long_opt'], - dest='option_filter', action='append_const', const=element, - help="only process " + settings[element]['directory']) - - args = parser.parse_args() - - # Initial checks - - if not args.option_no_shape and ( - not distutils.spawn.find_executable("shapeindex")): - sys.exit( - """\n Error: you need shapeindex (or shapeindex is not in the - PATH). Otherwise, use '-s' option to skip shapeindex - (indexing shapes is suggested for performance improvement).\n""") - - if args.option_force: - os.chdir(os.path.join(os.path.dirname(__file__))) - else: - os.chdir(os.path.join(os.path.dirname(__file__), '..')) - if not os.path.isfile("project.mml"): - sys.exit( - """\n Error: project.mml not found. - Are you sure you are in the correct folder? - Otherwise, use '-f' option to go on creating or updating the '""" + - args.data_dir + """' directory - placed in the same path of this script.\n""") - - if os.path.isfile(args.data_dir): - sys.exit( - """\n Error: existing file named '""" + - args.data_dir + - """'\n""") - if args.option_check_mode: - if os.path.isdir(args.data_dir): - sys.exit( - """\n A directory named '""" + args.data_dir + - """' already exists. - Please consider renaming it. 
- Otherwise, remove '-c' option to allow updating.\n""") - - if args.option_pause_mode: - print( - "\nThis script generates and populates the '" + args.data_dir + - "' directory with all needed shapefiles.\n") - try: - input( - "Press Enter to continue " + - "(remove '-p' option to avoid this message)...") - except Exception: - pass - - print("\nStarting " + os.path.basename(__file__) + "...") - - # Processing - - for element in sorted(settings): - - if (not args.option_filter or - (args.option_filter and - element in args.option_filter)) and element > 0: - - dir_name = settings[element]['directory'] - dir_path = os.path.join(args.data_dir, dir_name) - path_name = os.path.join( - args.data_dir, - settings[element]['url'].rsplit('/', 1)[-1]) - - # Creating directory - try: - os.makedirs(dir_path) - atexit.register(exit_handler, dir_path) - except Exception: - pass - - # Downloading - download_type = -1 - if not args.option_no_download or not os.path.isfile(path_name): - print(str(element) + "-1. Downloading '" + dir_name + "'...") - archive_file_name, download_type = download_file( - settings[element]['url'], args.data_dir, - args.option_force_update, args.option_no_curl) - - # Expanding - if ((not args.option_no_extract and download_type > 0) or - args.option_force_update): - sys.stdout.flush() - print() - print( - str(element) + "-2. 
Expanding '" + dir_name + "'...", - end="") - sys.stdout.flush() - if settings[element]['type'] == 'tgz': - tar = tarfile.open(path_name) - try: - tar.extractall(args.data_dir) - except Exception: - sys.exit(" Failed (try with -u option).\n") - tar.close() - elif settings[element]['type'] == 'zip': - zip = zipfile.ZipFile(path_name) - try: - zip.extractall(args.data_dir) - except Exception: - sys.exit(" Failed (try with -u option).\n") - zip.close() - elif settings[element]['type'] == 'zip_dir': - zip = zipfile.ZipFile(path_name) - try: - zip.extractall(dir_path) - except Exception: - sys.exit(" Failed (try with -u option).\n") - zip.close() - else: - sys.exit( - "\n\nInternal error: unmanaged 'type'='" + - settings[element]['type'] + "'.\n") - sys.stdout.flush() - print(" Done.\n") - - # Removing archive - if args.option_remove: - try: - os.remove(path_name) - except OSError: - sys.exit("\n\n\nCannot remove '" + path_name + "'\n") - - # Indexing - for item, shp_basename in enumerate( - settings[element]['shp_basename']): - shp_file_name = os.path.join(dir_path, shp_basename + ".shp") - index_file_name = os.path.join( - dir_path, shp_basename + ".index") - shp_file_modified = None - if os.path.exists(shp_file_name): - shp_file_modified = time.localtime( - (os.path.getmtime(shp_file_name))) - index_file_modified = None - if os.path.exists(index_file_name): - index_file_modified = time.localtime( - (os.path.getmtime(index_file_name))) - if (not args.option_no_shape and shp_file_modified is None - and index_file_modified is not None): - try: - os.remove(index_file_name) - except OSError: - sys.exit( - "\n\n\nCannot remove '" + - index_file_name + - "'\n") - if shp_file_modified is None: - sys.exit("\n\n\nMissing '" + shp_file_name + "'\n") - if (args.option_force_update or index_file_modified is None or - (shp_file_modified is not None and index_file_modified is not None and - (shp_file_modified > index_file_modified))): - if args.option_no_shape and 
index_file_modified is not None: - if len(settings[element]['shp_basename']) == 1: - print( - str(element) + - "-3" + - ". Removing old index '" + - index_file_name + - "'...") - else: - print(str(element) + "-3-" + str(item + 1) + - ". Removing old index '" + index_file_name + "'...") - sys.stdout.flush() - try: - os.remove(index_file_name) - except OSError: - sys.exit( - "\n\n\nCannot remove old index '" + - index_file_name + - "'\n") - pass - print() - if not args.option_no_shape: - if len(settings[element]['shp_basename']) == 1: - print(str(element) + "-3" + ". Indexing '" + - shp_file_name + "'...") - else: - print(str(element) + "-3-" + str(item + 1) + - ". Indexing '" + shp_file_name + "'...") - sys.stdout.flush() - if (subprocess.call(["shapeindex", "--shape_files", - shp_file_name], - stderr=subprocess.STDOUT) != 0): - sys.exit( - "\n Indexing error: shapeindex failed.\n") - sys.stdout.flush() - print() - - # Finishing - if time.time()-start < 2: - print ("...script completed.\n") - else: - print ("...script completed in %.1f seconds.\n" % (time.time()-start)) - -if __name__ == '__main__': - try: - main() - except KeyboardInterrupt: - sys.exit("\n\n\nInterrupted: you pressed Ctrl+C!\n") - except Exception as e: - sys.exit("\n Error. " + str(e) + "\n")