Skip to content

Commit

Permalink
ARROW-5831: [Release] Add Python program to download binary artifacts…
Browse files Browse the repository at this point in the history
… in parallel, allow abort/resume

This script uses only the Python standard library and curl. It runs 8 downloads in parallel by default. Because Bintray returns sha256 checksums, the script computes the checksum of any file already present locally and skips re-downloading files whose checksums match, so that an interrupted download can be resumed.

Closes #5550 from wesm/parallel-rc-binary-verification and squashes the following commits:

ff207e6 <Wes McKinney> More robust python3 checking
1d78b9f <Wes McKinney> Add Python-based parallel bintray artifact download script that can resume

Authored-by: Wes McKinney <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
  • Loading branch information
wesm committed Oct 1, 2019
1 parent 3b262f6 commit 8231fcb
Show file tree
Hide file tree
Showing 2 changed files with 164 additions and 39 deletions.
161 changes: 161 additions & 0 deletions dev/release/download_rc_binaries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#!/usr/bin/env python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys

try:
import argparse
import concurrent.futures as cf
import functools
import hashlib
import json
import os
import subprocess
import urllib.request
except ImportError:
if sys.version_info.major < 3:
raise Exception("Please use Python 3 to run this script")
raise


# Base URL of the Bintray REST API; used to request package file listings.
BINTRAY_API_ROOT = "https://bintray.com/api/v1"
# Base URL from which the artifact files themselves are downloaded.
BINTRAY_DL_ROOT = "https://dl.bintray.com"
# Repository path used by default when constructing a Bintray client.
BINTRAY_REPO = 'apache/arrow'
# Number of simultaneous downloads when the caller does not specify one.
DEFAULT_PARALLEL_DOWNLOADS = 8


class Bintray:
    """
    Small client for listing and downloading the files of a Bintray package.

    Parameters
    ----------
    repo : str, default BINTRAY_REPO
        Repository path that is inserted into the API and download URLs.
    """

    def __init__(self, repo=BINTRAY_REPO):
        self.repo = repo

    def get_file_list(self, package, version):
        """
        Fetch the file listing of one package version from the Bintray API.

        Parameters
        ----------
        package : str
            Package name inside the repository
        version : str
            Version name of the package

        Returns
        -------
        The decoded JSON response. ``download_files`` expects it to be a
        list of dicts carrying at least the keys 'path' and 'sha256'.
        """
        # URLs always use forward slashes; os.path.join would produce
        # backslashes on Windows.
        url = '/'.join([BINTRAY_API_ROOT, 'packages', self.repo, package,
                        'versions', version, 'files'])
        # Use the response as a context manager so the connection is closed
        # deterministically instead of being leaked.
        with urllib.request.urlopen(url) as response:
            return json.loads(response.read())

    def download_files(self, files, dest=None, num_parallel=None):
        """
        Download files from Bintray in parallel. If file already exists, will
        overwrite if the checksum does not match what Bintray says it should be

        Parameters
        ----------
        files : List[Dict]
            File listing from Bintray
        dest : str, default None
            Defaults to current working directory
        num_parallel : int, default 8
            Number of files to download in parallel. If set to None, uses
            default
        """
        if dest is None:
            dest = os.getcwd()
        if num_parallel is None:
            num_parallel = DEFAULT_PARALLEL_DOWNLOADS

        if num_parallel == 1:
            # Serial path: no worker pool is created.
            for path in files:
                self._download_file(dest, path)
        else:
            parallel_map_terminate_early(
                functools.partial(self._download_file, dest),
                files,
                num_parallel
            )

    @staticmethod
    def _sha256_of_file(path, chunk_size=1024 * 1024):
        """Return the hex sha256 digest of a file, reading it in chunks."""
        digest = hashlib.sha256()
        with open(path, 'rb') as f:
            # Stream the file so large artifacts are never held in memory
            # all at once.
            for chunk in iter(lambda: f.read(chunk_size), b''):
                digest.update(chunk)
        return digest.hexdigest()

    def _download_file(self, dest, info):
        """
        Download a single file with curl unless a matching local copy exists.

        Parameters
        ----------
        dest : str
            Root directory under which the file's relative path is recreated
        info : Dict
            One entry of the file listing; the keys 'path' and 'sha256' are
            read

        Raises
        ------
        Exception
            If curl exits with a non-zero return code
        """
        relpath = info['path']

        base, filename = os.path.split(relpath)

        dest_dir = os.path.join(dest, base)
        os.makedirs(dest_dir, exist_ok=True)

        dest_path = os.path.join(dest_dir, filename)

        if os.path.exists(dest_path):
            # A local file with the expected checksum is complete, which is
            # what makes an interrupted run resumable.
            if self._sha256_of_file(dest_path) == info['sha256']:
                print('Local file {} sha256 matches, skipping'
                      .format(dest_path))
                return
            else:
                print('Local file sha256 does not match, overwriting')

        print("Downloading {} to {}".format(relpath, dest_path))

        # Build the URL with '/' explicitly rather than the OS path separator.
        bintray_abspath = '/'.join([BINTRAY_DL_ROOT, self.repo, relpath])

        cmd = [
            'curl', '--fail', '--location',
            '--output', dest_path, bintray_abspath
        ]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        if proc.returncode != 0:
            raise Exception("Downloading {} failed\nstdout: {}\nstderr: {}"
                            .format(relpath, stdout, stderr))


def parallel_map_terminate_early(f, iterable, num_parallel):
    """
    Apply ``f`` to every item of ``iterable`` in a pool of worker processes.

    As soon as any call finishes with an exception, cancellation is requested
    for all submitted tasks and that exception is re-raised. Results of the
    calls are discarded.

    Parameters
    ----------
    f : callable
        Called with one item at a time
    iterable : iterable
        Items to process
    num_parallel : int
        Number of worker processes in the pool
    """
    with cf.ProcessPoolExecutor(num_parallel) as pool:
        futures = [pool.submit(f, item) for item in iterable]

        for finished in cf.as_completed(futures):
            error = finished.exception()
            if error is None:
                continue
            # Stop scheduling further work before propagating the failure.
            for pending in futures:
                pending.cancel()
            raise error


# Package types to fetch; each one is looked up on Bintray as "<type>-rc".
ARROW_PACKAGE_TYPES = ['centos', 'debian', 'python', 'ubuntu']


def download_rc_binaries(version, rc_number, dest=None, num_parallel=None):
    """
    Download the release candidate files of every Arrow package type.

    Parameters
    ----------
    version : str
        The version number
    rc_number : int
        The release candidate number
    dest : str, default None
        Output folder, passed through to ``Bintray.download_files``
    num_parallel : int, default None
        Number of concurrent downloads, passed through to
        ``Bintray.download_files``
    """
    client = Bintray()
    rc_version = '{0}-rc{1}'.format(version, rc_number)

    package_names = [kind + '-rc' for kind in ARROW_PACKAGE_TYPES]
    for package_name in package_names:
        # List, then download, one package at a time.
        listing = client.get_file_list(package_name, rc_version)
        client.download_files(listing, dest=dest, num_parallel=num_parallel)


if __name__ == '__main__':
    # Command-line entry point: parse the version and release candidate
    # number, then download the files of every package type.
    parser = argparse.ArgumentParser(
        description='Download release candidate binaries'
    )
    parser.add_argument('version', type=str, help='The version number')
    parser.add_argument('rc_number', type=int,
                        help='The release candidate number, e.g. 0, 1, etc')
    parser.add_argument('--dest', type=str, default=os.getcwd(),
                        help='The output folder for the downloaded files')
    # Reuse the module-level default instead of repeating the literal 8, so
    # the command line and the library default cannot drift apart.
    parser.add_argument('--num_parallel', type=int,
                        default=DEFAULT_PARALLEL_DOWNLOADS,
                        help='The number of concurrent downloads to do')
    args = parser.parse_args()

    download_rc_binaries(args.version, args.rc_number, dest=args.dest,
                         num_parallel=args.num_parallel)
42 changes: 3 additions & 39 deletions dev/release/verify-release-candidate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -99,49 +99,13 @@ fetch_archive() {
shasum -a 512 -c ${dist_name}.tar.gz.sha512
}

# Send a request to the Bintray REST API and pass the response through jq.
#
# Arguments:
#   $1    HTTP method handed to curl --request
#   $2    API path appended to https://bintray.com/api/v1
#   rest  forwarded unchanged to curl
# The request line is echoed to stderr, keeping stdout free for the JSON.
bintray() {
  local command=$1
  shift
  local path=$1
  shift
  local url="https://bintray.com/api/v1${path}"
  echo "${command} ${url}" 1>&2
  # Quote the expansions so they are not word-split or glob-expanded.
  curl \
    --fail \
    --request "${command}" \
    "${url}" \
    "$@" | \
    jq .
}

# Download every file of the "<target>-rc" package for the current release
# candidate into the current directory, recreating each file's relative path.
#
# Arguments:
#   $1  package type used to form the "<target>-rc" package name
# Reads VERSION, RC_NUMBER and BINTRAY_REPOSITORY from the environment.
download_bintray_files() {
  local target=$1

  local version_name="${VERSION}-rc${RC_NUMBER}"

  local file
  bintray \
    GET "/packages/${BINTRAY_REPOSITORY}/${target}-rc/versions/${version_name}/files" | \
    jq -r ".[].path" | \
    # IFS= and -r keep each path exactly as listed (no whitespace trimming,
    # no backslash interpretation).
    while IFS= read -r file; do
      # Quote every use of the path so names with spaces or glob characters
      # are handled as a single argument.
      mkdir -p "$(dirname "${file}")"
      curl \
        --fail \
        --location \
        --output "${file}" \
        "https://dl.bintray.com/${BINTRAY_REPOSITORY}/${file}"
    done
}

test_binary() {
local download_dir=binaries
mkdir -p ${download_dir}
pushd ${download_dir}

# takes longer on slow network
for target in centos debian python ubuntu; do
download_bintray_files ${target}
done
python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir}

pushd ${download_dir}

# verify the signature and the checksums of each artifact
find . -name '*.asc' | while read sigfile; do
Expand Down

0 comments on commit 8231fcb

Please sign in to comment.