From be3b78902ff2871e637af2c340e24b75853e32e7 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 3 Apr 2024 10:33:31 +0200 Subject: [PATCH 01/49] GH-40954: [CI] Fix use of obsolete docker-compose command on Github Actions (#40949) ### Rationale for this change The `docker-compose` utility is progressively being removed from GHA-provided runners: https://github.com/actions/runner-images/issues/9557 ### What changes are included in this PR? Use `docker` client CLI directly instead of `docker-compose` where possible. ### Are these changes tested? Yes, this should fix the sporadic CI failures because of the above migration. ### Are there any user-facing changes? No, except additional optional env var `ARCHERY_DEBUG`. * GitHub Issue: #40954 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .github/workflows/archery.yml | 2 + .github/workflows/cpp.yml | 5 ++ .github/workflows/dev.yml | 4 + .github/workflows/docs.yml | 2 + .github/workflows/docs_light.yml | 2 + .github/workflows/go.yml | 9 +++ .github/workflows/integration.yml | 2 + .github/workflows/java.yml | 2 + .github/workflows/java_jni.yml | 2 + .github/workflows/js.yml | 4 + .github/workflows/python.yml | 2 + .github/workflows/r.yml | 2 + .github/workflows/ruby.yml | 2 + .github/workflows/swift.yml | 2 + dev/archery/archery/cli.py | 1 + dev/archery/archery/docker/cli.py | 79 ++++++------------- dev/archery/archery/docker/core.py | 50 +++++++----- .../archery/docker/tests/test_docker_cli.py | 19 +---- dev/tasks/java-jars/github.yml | 2 + dev/tasks/linux-packages/github.linux.yml | 1 + dev/tasks/macros.jinja | 4 + dev/tasks/python-wheels/github.linux.yml | 2 + dev/tasks/tasks.yml | 4 + 23 files changed, 117 insertions(+), 87 deletions(-) diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index cb783dd66c3fb..2aa6b0dcbb91d 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -32,7 +32,9 @@ on: - 'docker-compose.yml' env: + ARCHERY_DEBUG: 1 ARCHERY_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + ARCHERY_USE_DOCKER_CLI: 1 concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index e8e41f1bcb90c..1d10be3b5bc82 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -53,6 +53,7 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" @@ -94,6 +95,7 @@ jobs: cat <> "$GITHUB_OUTPUT" { "arch": "arm64v8", + "archery-use-docker-cli": "0", "clang-tools": "10", "image": "ubuntu-cpp", "llvm": "10", @@ -118,6 +120,9 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} + # By default, use Docker CLI because docker-compose v1 is obsolete, + # except where the Docker client version is too old. 
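A note on the `${{ matrix.archery-use-docker-cli || '1' }}` expression that this comment introduces: in GitHub Actions, `a || b` evaluates to `b` when `a` is unset or empty, so only matrix entries that explicitly set `archery-use-docker-cli: "0"` (the self-hosted ARM runners with older Docker clients) keep using `docker-compose`; every other entry defaults to the plain `docker` CLI. A rough Python analogue of that defaulting, with illustrative matrix data:

```python
# Sketch of the workflow's `|| '1'` fallback: entries that pin
# "archery-use-docker-cli" keep their value, everything else gets "1".
matrix_entries = [
    {"arch": "amd64"},                                   # hosted runner
    {"arch": "arm64v8", "archery-use-docker-cli": "0"},  # self-hosted ARM
]

for entry in matrix_entries:
    # GHA's `a || b` picks `b` for unset/empty `a`, like Python's `or`.
    use_docker_cli = entry.get("archery-use-docker-cli") or "1"
    print(entry["arch"], "-> ARCHERY_USE_DOCKER_CLI =", use_docker_cli)
```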
+ ARCHERY_USE_DOCKER_CLI: ${{ matrix.archery-use-docker-cli || '1' }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 37fda2e313ae2..8af5832f15948 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -29,6 +29,10 @@ concurrency: permissions: contents: read +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 + jobs: lint: diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 9c7701f25f756..fe49e275d908d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -24,6 +24,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 6ec4c3d53d0e3..376c87651d2d0 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -33,6 +33,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 7fca38528260f..11dc29dcae54e 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -41,6 +41,10 @@ concurrency: permissions: contents: read +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 + jobs: docker-targets: @@ -75,12 +79,14 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", + "archery-use-docker-cli": "0", "go": "1.21", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", + "archery-use-docker-cli": "0", "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] } @@ -101,6 +107,9 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} + # By default, use Docker CLI because docker-compose v1 is obsolete, + # except where the Docker client version is too old. 
+ ARCHERY_USE_DOCKER_CLI: ${{ matrix.archery-use-docker-cli || '1' }} GO: ${{ matrix.go }} steps: - name: Checkout Arrow diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 0f186ff6a4527..2c3499c160f9c 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -51,6 +51,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 423f54cd93547..611e202ca0624 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -45,6 +45,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 790ffd5c650e0..958216ac7669d 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -45,6 +45,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index dab89da44c861..c9b7d7b742d88 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -38,6 +38,10 @@ concurrency: permissions: contents: read +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 + jobs: docker: diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 1147ac13e6f93..2db9b17e895b0 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -41,6 +41,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 78677499f3e45..05c85fa6dc2c2 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -51,6 +51,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 311c1c822baf6..ea3e61d55787d 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -53,6 +53,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index f55e9e77503c0..3f039315b505a 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -41,6 +41,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 32921afb2e61b..5fa41e28a3208 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -44,6 +44,7 @@ @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.option("--debug", type=BOOL, is_flag=True, default=False, + envvar='ARCHERY_DEBUG', help="Increase logging with debugging output.") @click.option("--pdb", type=BOOL, is_flag=True, default=False, help="Invoke pdb on uncaught exception.") diff --git a/dev/archery/archery/docker/cli.py b/dev/archery/archery/docker/cli.py index 20d9a16138bac..e6baf0ca1f002 100644 --- a/dev/archery/archery/docker/cli.py +++ b/dev/archery/archery/docker/cli.py @@ -46,10 +46,19 @@ def _execute(self, *args, **kwargs): callback=validate_arrow_sources, help="Specify Arrow source directory.") @click.option('--dry-run/--execute', 
default=False, - help="Display the docker-compose commands instead of executing " - "them.") + help="Display the docker commands instead of executing them.") +@click.option('--using-docker-cli', default=False, is_flag=True, + envvar='ARCHERY_USE_DOCKER_CLI', + help="Use docker CLI directly for building instead of calling " + "docker-compose. This may help to reuse cached layers.") +@click.option('--using-docker-buildx', default=False, is_flag=True, + envvar='ARCHERY_USE_DOCKER_BUILDX', + help="Use buildx with docker CLI directly for building instead " + "of calling docker-compose or the plain docker build " + "command. This option makes the build cache reusable " + "across hosts.") @click.pass_context -def docker(ctx, src, dry_run): +def docker(ctx, src, dry_run, using_docker_cli, using_docker_buildx): """ Interact with docker-compose based builds. """ @@ -64,7 +73,10 @@ def docker(ctx, src, dry_run): # take the docker-compose parameters like PYTHON, PANDAS, UBUNTU from the # environment variables to keep the usage similar to docker-compose + using_docker_cli |= using_docker_buildx compose = DockerCompose(config_path, params=os.environ, + using_docker=using_docker_cli, + using_buildx=using_docker_buildx, debug=ctx.obj.get('debug', False)) if dry_run: _mock_compose_calls(compose) @@ -83,24 +95,19 @@ def check_config(obj): @docker.command('pull') @click.argument('image') -@click.option('--using-docker-cli', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_CLI', - help="Use docker CLI directly for pulling instead of calling " - "docker-compose. This may help to reuse cached layers.") @click.option('--pull-leaf/--no-leaf', default=True, help="Whether to pull leaf images too.") @click.option('--ignore-pull-failures/--no-ignore-pull-failures', default=True, help="Whether to ignore pull failures.") @click.pass_obj -def docker_pull(obj, image, *, using_docker_cli, pull_leaf, - ignore_pull_failures): +def docker_pull(obj, image, *, pull_leaf, ignore_pull_failures): """ Execute docker-compose pull. """ compose = obj['compose'] try: - compose.pull(image, pull_leaf=pull_leaf, using_docker=using_docker_cli, + compose.pull(image, pull_leaf=pull_leaf, ignore_pull_failures=ignore_pull_failures) except UndefinedImage as e: raise click.ClickException( @@ -115,16 +122,6 @@ def docker_pull(obj, image, *, using_docker_cli, pull_leaf, @click.argument('image') @click.option('--force-pull/--no-pull', default=True, help="Whether to force pull the image and its ancestor images") -@click.option('--using-docker-cli', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_CLI', - help="Use docker CLI directly for building instead of calling " - "docker-compose. This may help to reuse cached layers.") -@click.option('--using-docker-buildx', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_BUILDX', - help="Use buildx with docker CLI directly for building instead " - "of calling docker-compose or the plain docker build " - "command. This option makes the build cache reusable " - "across hosts.") @click.option('--use-cache/--no-cache', default=True, help="Whether to use cache when building the image and its " "ancestor images") @@ -133,22 +130,17 @@ def docker_pull(obj, image, *, using_docker_cli, pull_leaf, "passed as the argument. 
To disable caching for both the " "image and its ancestors use --no-cache option.") @click.pass_obj -def docker_build(obj, image, *, force_pull, using_docker_cli, - using_docker_buildx, use_cache, use_leaf_cache): +def docker_build(obj, image, *, force_pull, use_cache, use_leaf_cache): """ Execute docker-compose builds. """ compose = obj['compose'] - using_docker_cli |= using_docker_buildx try: if force_pull: - compose.pull(image, pull_leaf=use_leaf_cache, - using_docker=using_docker_cli) + compose.pull(image, pull_leaf=use_leaf_cache) compose.build(image, use_cache=use_cache, use_leaf_cache=use_leaf_cache, - using_docker=using_docker_cli, - using_buildx=using_docker_buildx, pull_parents=force_pull) except UndefinedImage as e: raise click.ClickException( @@ -172,16 +164,6 @@ def docker_build(obj, image, *, force_pull, using_docker_cli, help="Whether to force build the image and its ancestor images") @click.option('--build-only', default=False, is_flag=True, help="Pull and/or build the image, but do not run it") -@click.option('--using-docker-cli', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_CLI', - help="Use docker CLI directly for building instead of calling " - "docker-compose. This may help to reuse cached layers.") -@click.option('--using-docker-buildx', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_BUILDX', - help="Use buildx with docker CLI directly for building instead " - "of calling docker-compose or the plain docker build " - "command. This option makes the build cache reusable " - "across hosts.") @click.option('--use-cache/--no-cache', default=True, help="Whether to use cache when building the image and its " "ancestor images") @@ -191,7 +173,7 @@ def docker_build(obj, image, *, force_pull, using_docker_cli, "image and its ancestors use --no-cache option.") @click.option('--resource-limit', default=None, help="A CPU/memory limit preset to mimic CI environments like " - "GitHub Actions. Implies --using-docker-cli. Note that " + "GitHub Actions. Mandates --using-docker-cli. Note that " "exporting ARCHERY_DOCKER_BIN=\"sudo docker\" is likely " "required, unless Docker is configured with cgroups v2 " "(else Docker will silently ignore the limits).") @@ -199,8 +181,8 @@ def docker_build(obj, image, *, force_pull, using_docker_cli, help="Set volume within the container") @click.pass_obj def docker_run(obj, image, command, *, env, user, force_pull, force_build, - build_only, using_docker_cli, using_docker_buildx, use_cache, - use_leaf_cache, resource_limit, volume): + build_only, use_cache, use_leaf_cache, resource_limit, + volume): """ Execute docker-compose builds. 
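The restructuring in this file is easier to see end to end: the `--using-docker-cli`/`--using-docker-buildx` flags move from each subcommand up to the `docker` group, buildx implies the plain docker CLI, and the configured `DockerCompose` object is stashed in `ctx.obj` so that subcommands like `pull` and `build` no longer take or forward those flags. A minimal sketch of that Click pattern — only the Click API is real; the stored dict is an illustrative stand-in for `DockerCompose`:

```python
import os
import click

@click.group()
@click.option('--using-docker-cli', is_flag=True, default=False,
              envvar='ARCHERY_USE_DOCKER_CLI')
@click.option('--using-docker-buildx', is_flag=True, default=False,
              envvar='ARCHERY_USE_DOCKER_BUILDX')
@click.pass_context
def docker(ctx, using_docker_cli, using_docker_buildx):
    # buildx implies the docker CLI, mirroring
    # `using_docker_cli |= using_docker_buildx` in the patch.
    using_docker_cli |= using_docker_buildx
    ctx.ensure_object(dict)
    # Stand-in for DockerCompose(...): decided once at the group level.
    ctx.obj['compose'] = {
        'using_docker': using_docker_cli,
        'using_buildx': using_docker_buildx,
        'params': dict(os.environ),
    }

@docker.command('pull')
@click.argument('image')
@click.pass_obj
def docker_pull(obj, image):
    # No per-command --using-docker-cli flag anymore.
    compose = obj['compose']
    click.echo(f"pull {image} using_docker={compose['using_docker']}")

if __name__ == '__main__':
    docker()
```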
@@ -234,18 +216,14 @@ def docker_run(obj, image, command, *, env, user, force_pull, force_build, archery docker run ubuntu-cpp bash """ compose = obj['compose'] - using_docker_cli |= using_docker_buildx env = dict(kv.split('=', 1) for kv in env) try: if force_pull: - compose.pull(image, pull_leaf=use_leaf_cache, - using_docker=using_docker_cli) + compose.pull(image, pull_leaf=use_leaf_cache) if force_build: compose.build(image, use_cache=use_cache, - use_leaf_cache=use_leaf_cache, - using_docker=using_docker_cli, - using_buildx=using_docker_buildx) + use_leaf_cache=use_leaf_cache) if build_only: return compose.run( @@ -253,7 +231,6 @@ def docker_run(obj, image, command, *, env, user, force_pull, force_build, command=command, env=env, user=user, - using_docker=using_docker_cli, resource_limit=resource_limit, volumes=volume ) @@ -273,15 +250,11 @@ def docker_run(obj, image, command, *, env, user, force_pull, force_build, @click.option('--password', '-p', required=False, envvar='ARCHERY_DOCKER_PASSWORD', help='Docker repository password') -@click.option('--using-docker-cli', default=False, is_flag=True, - help="Use docker CLI directly for building instead of calling " - "docker-compose. This may help to reuse cached layers.") @click.pass_obj -def docker_compose_push(obj, image, user, password, using_docker_cli): +def docker_compose_push(obj, image, user, password): """Push the generated docker-compose image.""" compose = obj['compose'] - compose.push(image, user=user, password=password, - using_docker=using_docker_cli) + compose.push(image, user=user, password=password) @docker.command('images') diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index 184d9808759b8..38720e5856a14 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -58,12 +58,21 @@ class UndefinedImage(Exception): class ComposeConfig: - def __init__(self, config_path, dotenv_path, compose_bin, params=None): + def __init__(self, config_path, dotenv_path, compose_bin, + using_docker=False, using_buildx=False, + params=None, debug=False): + self.using_docker = using_docker + self.using_buildx = using_buildx + self.debug = debug config_path = _ensure_path(config_path) if dotenv_path: dotenv_path = _ensure_path(dotenv_path) else: dotenv_path = config_path.parent / '.env' + if self.debug: + # Log docker version + Docker().run('version') + self._read_env(dotenv_path, params) self._read_config(config_path, compose_bin) @@ -122,8 +131,13 @@ def _read_config(self, config_path, compose_bin): ) # trigger docker-compose's own validation - compose = Command('docker-compose') - args = ['--file', str(config_path), 'config'] + if self.using_docker: + compose = Docker() + args = ['compose'] + else: + compose = Command('docker-compose') + args = [] + args += ['--file', str(config_path), 'config'] result = compose.run(*args, env=self.env, check=False, stderr=subprocess.PIPE, stdout=subprocess.PIPE) @@ -164,12 +178,13 @@ def __init__(self, docker_bin=None): class DockerCompose(Command): def __init__(self, config_path, dotenv_path=None, compose_bin=None, - params=None, debug=False): + using_docker=False, using_buildx=False, params=None, + debug=False): compose_bin = default_bin(compose_bin, 'docker-compose') self.config = ComposeConfig(config_path, dotenv_path, compose_bin, - params) + params=params, using_docker=using_docker, + using_buildx=using_buildx, debug=debug) self.bin = compose_bin - self.debug = debug self.pull_memory = set() def clear_pull_memory(self): @@ -215,14 
+230,13 @@ def _execute_docker(self, *args, **kwargs): ) ) - def pull(self, service_name, pull_leaf=True, using_docker=False, - ignore_pull_failures=True): + def pull(self, service_name, pull_leaf=True, ignore_pull_failures=True): def _pull(service): args = ['pull'] if service['image'] in self.pull_memory: return - if using_docker: + if self.config.using_docker: try: self._execute_docker(*args, service['image']) except Exception as e: @@ -245,7 +259,7 @@ def _pull(service): _pull(service) def build(self, service_name, use_cache=True, use_leaf_cache=True, - using_docker=False, using_buildx=False, pull_parents=True): + pull_parents=True): def _build(service, use_cache): if 'build' not in service: # nothing to do @@ -273,7 +287,7 @@ def _build(service, use_cache): if self.config.env.get('BUILDKIT_INLINE_CACHE') == '1': args.extend(['--build-arg', 'BUILDKIT_INLINE_CACHE=1']) - if using_buildx: + if self.config.using_buildx: for k, v in service['build'].get('args', {}).items(): args.extend(['--build-arg', '{}={}'.format(k, v)]) @@ -295,9 +309,9 @@ def _build(service, use_cache): service['build'].get('context', '.') ]) self._execute_docker("buildx", "build", *args) - elif using_docker: + elif self.config.using_docker: # better for caching - if self.debug: + if self.config.debug: args.append("--progress=plain") for k, v in service['build'].get('args', {}).items(): args.extend(['--build-arg', '{}={}'.format(k, v)]) @@ -310,7 +324,7 @@ def _build(service, use_cache): ]) self._execute_docker("build", *args) else: - if self.debug: + if self.config.debug: args.append("--progress=plain") self._execute_compose("build", *args, service['name']) @@ -322,7 +336,7 @@ def _build(service, use_cache): _build(service, use_cache=use_cache and use_leaf_cache) def run(self, service_name, command=None, *, env=None, volumes=None, - user=None, using_docker=False, resource_limit=None): + user=None, resource_limit=None): service = self.config.get(service_name) args = [] @@ -337,7 +351,7 @@ def run(self, service_name, command=None, *, env=None, volumes=None, for volume in volumes: args.extend(['--volume', volume]) - if using_docker or service['need_gpu'] or resource_limit: + if self.config.using_docker or service['need_gpu'] or resource_limit: # use gpus, requires docker>=19.03 if service['need_gpu']: args.extend(['--gpus', 'all']) @@ -399,9 +413,9 @@ def run(self, service_name, command=None, *, env=None, volumes=None, args.append(command) self._execute_compose('run', '--rm', *args) - def push(self, service_name, user=None, password=None, using_docker=False): + def push(self, service_name, user=None, password=None): def _push(service): - if using_docker: + if self.config.using_docker: return self._execute_docker('push', service['image']) else: return self._execute_compose('push', service['name']) diff --git a/dev/archery/archery/docker/tests/test_docker_cli.py b/dev/archery/archery/docker/tests/test_docker_cli.py index ab39c7b9dbb4a..c117a3edfff65 100644 --- a/dev/archery/archery/docker/tests/test_docker_cli.py +++ b/dev/archery/archery/docker/tests/test_docker_cli.py @@ -33,14 +33,12 @@ def test_docker_run_with_custom_command(run, build, pull): assert result.exit_code == 0 pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, using_docker=False + "ubuntu-cpp", pull_leaf=True, ) build.assert_called_once_with( "ubuntu-cpp", use_cache=True, use_leaf_cache=True, - using_docker=False, - using_buildx=False ) run.assert_called_once_with( "ubuntu-cpp", @@ -48,7 +46,6 @@ def test_docker_run_with_custom_command(run, 
build, pull): env={}, resource_limit=None, user=None, - using_docker=False, volumes=(), ) @@ -75,14 +72,12 @@ def test_docker_run_options(run, build, pull): result = CliRunner().invoke(docker, args) assert result.exit_code == 0 pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, using_docker=False + "ubuntu-cpp", pull_leaf=True, ) build.assert_called_once_with( "ubuntu-cpp", use_cache=True, use_leaf_cache=True, - using_docker=False, - using_buildx=False ) run.assert_called_once_with( "ubuntu-cpp", @@ -90,7 +85,6 @@ def test_docker_run_options(run, build, pull): env={"ARROW_GANDIVA": "OFF", "ARROW_FLIGHT": "ON"}, resource_limit=None, user="root", - using_docker=False, volumes=( "./build:/build", "./ccache:/ccache:delegated", @@ -126,7 +120,6 @@ def test_docker_limit_options(run): env={"ARROW_GANDIVA": "OFF", "ARROW_FLIGHT": "ON"}, resource_limit="github", user="root", - using_docker=False, volumes=( "./build:/build", "./ccache:/ccache:delegated", @@ -145,7 +138,6 @@ def test_docker_run_without_pulling_or_building(run): env={}, resource_limit=None, user=None, - using_docker=False, volumes=(), ) @@ -157,14 +149,12 @@ def test_docker_run_only_pulling_and_building(build, pull): result = CliRunner().invoke(docker, args) assert result.exit_code == 0 pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, using_docker=False + "ubuntu-cpp", pull_leaf=True, ) build.assert_called_once_with( "ubuntu-cpp", use_cache=True, use_leaf_cache=True, - using_docker=False, - using_buildx=False ) @@ -187,8 +177,6 @@ def test_docker_run_without_build_cache(run, build): "ubuntu-cpp", use_cache=False, use_leaf_cache=False, - using_docker=False, - using_buildx=False ) run.assert_called_once_with( "ubuntu-cpp", @@ -196,6 +184,5 @@ def test_docker_run_without_build_cache(run, build): env={}, resource_limit=None, user="me", - using_docker=False, volumes=(), ) diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 03cbcc7c98fcc..0437ee7864979 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -30,6 +30,7 @@ jobs: ARCH: {{ '${{ matrix.platform.archery_arch }}' }} ARCH_ALIAS: {{ '${{ matrix.platform.archery_arch_alias }}' }} ARCH_SHORT: {{ '${{ matrix.platform.archery_arch_short }}' }} + ARCHERY_USE_DOCKER_CLI: {{ "${{matrix.platform.archery_use_docker_cli || '1'}}" }} strategy: fail-fast: false matrix: @@ -44,6 +45,7 @@ jobs: archery_arch: "arm64v8" archery_arch_alias: "aarch64" archery_arch_short: "arm64" + archery_use_docker_cli: "0" steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_free_space()|indent }} diff --git a/dev/tasks/linux-packages/github.linux.yml b/dev/tasks/linux-packages/github.linux.yml index 6de3edfce07e1..9e24835b8b627 100644 --- a/dev/tasks/linux-packages/github.linux.yml +++ b/dev/tasks/linux-packages/github.linux.yml @@ -29,6 +29,7 @@ jobs: {% endif %} env: ARCHITECTURE: {{ architecture }} + ARCHERY_USE_DOCKER_CLI: {{ '0' if architecture == 'arm64' else '1' }} steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_login_dockerhub()|indent }} diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index bcafe53066ef8..f55a7f9481e56 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -23,6 +23,10 @@ on: push: branches: - "*-github-*" + +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 {% endmacro %} {%- macro github_checkout_arrow(fetch_depth=1, submodules="recursive", action_v="4") -%} diff --git a/dev/tasks/python-wheels/github.linux.yml 
b/dev/tasks/python-wheels/github.linux.yml index 41b18684cee10..0ff3c56b695eb 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -31,8 +31,10 @@ jobs: # archery uses these environment variables {% if arch == "amd64" %} ARCH: amd64 + ARCHERY_USE_DOCKER_CLI: 1 {% else %} ARCH: arm64v8 + ARCHERY_USE_DOCKER_CLI: 0 {% endif %} PYTHON: "{{ python_version }}" diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 5e1ef8d13b988..cf46cb8c6ad70 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1456,12 +1456,16 @@ tasks: ci: github template: docker-tests/github.cuda.yml params: + env: + ARCHERY_USE_DOCKER_CLI: 0 image: ubuntu-cuda-cpp test-cuda-python: ci: github template: docker-tests/github.cuda.yml params: + env: + ARCHERY_USE_DOCKER_CLI: 0 image: ubuntu-cuda-python ############################## Fuzz tests ################################# From 469430f50085b4b462fcc7db84a08cb554c698cd Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 3 Apr 2024 13:51:49 +0200 Subject: [PATCH 02/49] MINOR: [CI] Avoid docker-compose on Azure (#40972) ### Rationale for this change Same as PR #40949, but for Crossbow builds hosted on Azure Pipelines. ### Are these changes tested? Yes, they fix CI tests. ### Are there any user-facing changes? No. Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .github/workflows/archery.yml | 2 +- dev/archery/requirements-test.txt | 1 + dev/tasks/docker-tests/azure.linux.yml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index 2aa6b0dcbb91d..c698baba2c816 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -61,7 +61,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5.1.0 with: - python-version: '3.12' + python-version: '3.9' - name: Install pygit2 binary wheel run: pip install pygit2 --only-binary pygit2 - name: Install Archery, Crossbow- and Test Dependencies diff --git a/dev/archery/requirements-test.txt b/dev/archery/requirements-test.txt index 208ec64cdf026..e3e62a993c2a2 100644 --- a/dev/archery/requirements-test.txt +++ b/dev/archery/requirements-test.txt @@ -1,2 +1,3 @@ +docker-compose pytest responses diff --git a/dev/tasks/docker-tests/azure.linux.yml b/dev/tasks/docker-tests/azure.linux.yml index be03957e925ed..b66bfbdfe940a 100644 --- a/dev/tasks/docker-tests/azure.linux.yml +++ b/dev/tasks/docker-tests/azure.linux.yml @@ -46,7 +46,7 @@ jobs: displayName: Setup Archery - script: | - archery docker run \ + archery --debug docker --using-docker-cli run \ -e ARROW_DOCS_VERSION="{{ arrow.no_rc_version }}" \ -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ {{ flags|default("") }} \ From e0d73c5fd521e9cf79f8c0bbe74285ab8fd4b6d2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 3 Apr 2024 11:14:27 -0300 Subject: [PATCH 03/49] GH-38828: [R] Ensure that streams can be written to socket connections (#38897) ### Rationale for this change Currently we can't write to socket connection from R. This is a very useful way to send Arrow data around and should work! ### What changes are included in this PR? Implements `Tell()` for non-seekable output streams. Apparently some Arrow code calls this to figure out how many bytes have been written. ### Are these changes tested? I'm not quite sure how to test this...all output streams we can easily test are seekable. 
We could try to spin up a socket server on another thread (like the reprex below) but I'm worried that will be flaky. ### Are there any user-facing changes? Yes (something that should have previously worked now works), although there is no place where we currently document anything about how connections can be used. ``` r tmp <- tempfile() proc <- callr::r_bg(function() { server <- function() { library(arrow) while (TRUE) { writeLines("Listening...") con <- socketConnection(host = "localhost", port = 6011, blocking = TRUE, server = TRUE, open = "r+b") socketTimeout(con, 3600) data <- arrow::read_ipc_stream(con, as_data_frame = FALSE) print(head(as.data.frame(data))) } } server() }, stdout = tmp) Sys.sleep(0.5) library(arrow, warn.conflicts = FALSE) #> Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information. rb <- arrow::record_batch(iris) socketDriver <- socketConnection(host = "localhost", port = "6011", blocking = TRUE, server = FALSE, open = "w+b") write_ipc_stream(rb, socketDriver) Sys.sleep(0.5) cat(brio::read_file(tmp)) #> Listening... #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 1 5.1 3.5 1.4 0.2 setosa #> 2 4.9 3.0 1.4 0.2 setosa #> 3 4.7 3.2 1.3 0.2 setosa #> 4 4.6 3.1 1.5 0.2 setosa #> 5 5.0 3.6 1.4 0.2 setosa #> 6 5.4 3.9 1.7 0.4 setosa #> Listening... # Shutdown server proc$interrupt() #> [1] TRUE Sys.sleep(0.5) proc$is_alive() #> [1] FALSE ``` Created on 2023-11-27 with [reprex v2.0.2](https://reprex.tidyverse.org) * Closes: #38828 * GitHub Issue: #38828 Authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- r/R/csv.R | 7 ++--- r/R/feather.R | 2 +- r/R/ipc-stream.R | 4 +-- r/R/parquet.R | 2 +- r/man/read_delim_arrow.Rd | 5 ++-- r/man/read_feather.Rd | 4 +-- r/man/read_ipc_stream.Rd | 4 +-- r/man/read_json_arrow.Rd | 5 ++-- r/man/read_parquet.Rd | 4 +-- r/man/write_csv_arrow.Rd | 2 +- r/man/write_feather.Rd | 2 +- r/man/write_ipc_stream.Rd | 2 +- r/man/write_parquet.Rd | 2 +- r/src/io.cpp | 57 ++++++++++++++++++++++++++++++--------- 14 files changed, 69 insertions(+), 33 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index 03540006ca0a2..733547570391d 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -78,8 +78,9 @@ #' `col_names`, and the CSV file has a header row that would otherwise be used #' to identify column names, you'll need to add `skip = 1` to skip that row. #' -#' @param file A character file name or URI, literal data (either a single string or a [raw] vector), -#' an Arrow input stream, or a `FileSystem` with path (`SubTreeFileSystem`). +#' @param file A character file name or URI, connection, literal data (either a +#' single string or a [raw] vector), an Arrow input stream, or a `FileSystem` +#' with path (`SubTreeFileSystem`). #' #' If a file name, a memory-mapped Arrow [InputStream] will be opened and #' closed when finished; compression will be detected from the file extension @@ -894,7 +895,7 @@ readr_to_csv_convert_options <- function(na, #' Write CSV file to disk #' #' @param x `data.frame`, [RecordBatch], or [Table] -#' @param sink A string file path, URI, or [OutputStream], or path in a file +#' @param sink A string file path, connection, URI, or [OutputStream], or path in a file #' system (`SubTreeFileSystem`) #' @param file file name. Specify this or `sink`, not both. 
#' @param include_header Whether to write an initial header line with column names diff --git a/r/R/feather.R b/r/R/feather.R index 474fc6118e44f..aa08dfdbc96a5 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -29,7 +29,7 @@ #' [write_ipc_file()] can only write V2 files. #' #' @param x `data.frame`, [RecordBatch], or [Table] -#' @param sink A string file path, URI, or [OutputStream], or path in a file +#' @param sink A string file path, connection, URI, or [OutputStream], or path in a file #' system (`SubTreeFileSystem`) #' @param version integer Feather file version, Version 1 or Version 2. Version 2 is the default. #' @param chunk_size For V2 files, the number of rows that each chunk of data diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R index 37ef0bbaf2126..26a61a790f936 100644 --- a/r/R/ipc-stream.R +++ b/r/R/ipc-stream.R @@ -82,8 +82,8 @@ write_to_raw <- function(x, format = c("stream", "file")) { #' a "stream" format and a "file" format, known as Feather. `read_ipc_stream()` #' and [read_feather()] read those formats, respectively. #' -#' @param file A character file name or URI, `raw` vector, an Arrow input stream, -#' or a `FileSystem` with path (`SubTreeFileSystem`). +#' @param file A character file name or URI, connection, `raw` vector, an +#' Arrow input stream, or a `FileSystem` with path (`SubTreeFileSystem`). #' If a file name or URI, an Arrow [InputStream] will be opened and #' closed when finished. If an input stream is provided, it will be left #' open. diff --git a/r/R/parquet.R b/r/R/parquet.R index d92e913cb5db3..0ee6c62601c1d 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -90,7 +90,7 @@ read_parquet <- function(file, #' article} for examples of this. #' #' @param x `data.frame`, [RecordBatch], or [Table] -#' @param sink A string file path, URI, or [OutputStream], or path in a file +#' @param sink A string file path, connection, URI, or [OutputStream], or path in a file #' system (`SubTreeFileSystem`) #' @param chunk_size how many rows of data to write to disk at once. This #' directly corresponds to how many rows will be in each row group in diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index b56d445c9e2e3..f946785e4a41e 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -90,8 +90,9 @@ read_tsv_arrow( ) } \arguments{ -\item{file}{A character file name or URI, literal data (either a single string or a \link{raw} vector), -an Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, literal data (either a +single string or a \link{raw} vector), an Arrow input stream, or a \code{FileSystem} +with path (\code{SubTreeFileSystem}). If a file name, a memory-mapped Arrow \link{InputStream} will be opened and closed when finished; compression will be detected from the file extension diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index c3b4a54158c7f..95661d9778576 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -10,8 +10,8 @@ read_feather(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) read_ipc_file(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) } \arguments{ -\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, -or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, \code{raw} vector, an +Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). 
If a file name or URI, an Arrow \link{InputStream} will be opened and closed when finished. If an input stream is provided, it will be left open.} diff --git a/r/man/read_ipc_stream.Rd b/r/man/read_ipc_stream.Rd index db930b52bde18..49d3949bfcf22 100644 --- a/r/man/read_ipc_stream.Rd +++ b/r/man/read_ipc_stream.Rd @@ -7,8 +7,8 @@ read_ipc_stream(file, as_data_frame = TRUE, ...) } \arguments{ -\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, -or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, \code{raw} vector, an +Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). If a file name or URI, an Arrow \link{InputStream} will be opened and closed when finished. If an input stream is provided, it will be left open.} diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index 9230a9a017495..f289d3356e559 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -13,8 +13,9 @@ read_json_arrow( ) } \arguments{ -\item{file}{A character file name or URI, literal data (either a single string or a \link{raw} vector), -an Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, literal data (either a +single string or a \link{raw} vector), an Arrow input stream, or a \code{FileSystem} +with path (\code{SubTreeFileSystem}). If a file name, a memory-mapped Arrow \link{InputStream} will be opened and closed when finished; compression will be detected from the file extension diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd index 4f1936529531a..95ee4ac5a8ceb 100644 --- a/r/man/read_parquet.Rd +++ b/r/man/read_parquet.Rd @@ -14,8 +14,8 @@ read_parquet( ) } \arguments{ -\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, -or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, \code{raw} vector, an +Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). If a file name or URI, an Arrow \link{InputStream} will be opened and closed when finished. If an input stream is provided, it will be left open.} diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd index 9fcca49fadc05..9f9fde74cc996 100644 --- a/r/man/write_csv_arrow.Rd +++ b/r/man/write_csv_arrow.Rd @@ -19,7 +19,7 @@ write_csv_arrow( \arguments{ \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} -\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +\item{sink}{A string file path, connection, URI, or \link{OutputStream}, or path in a file system (\code{SubTreeFileSystem})} \item{file}{file name. Specify this or \code{sink}, not both.} diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 0d3a7da3b90b4..823bd2224eac2 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -25,7 +25,7 @@ write_ipc_file( \arguments{ \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} -\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +\item{sink}{A string file path, connection, URI, or \link{OutputStream}, or path in a file system (\code{SubTreeFileSystem})} \item{version}{integer Feather file version, Version 1 or Version 2. 
Version 2 is the default.} diff --git a/r/man/write_ipc_stream.Rd b/r/man/write_ipc_stream.Rd index 094e3ad11a0c8..da9bb6bcacb45 100644 --- a/r/man/write_ipc_stream.Rd +++ b/r/man/write_ipc_stream.Rd @@ -9,7 +9,7 @@ write_ipc_stream(x, sink, ...) \arguments{ \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} -\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +\item{sink}{A string file path, connection, URI, or \link{OutputStream}, or path in a file system (\code{SubTreeFileSystem})} \item{...}{extra parameters passed to \code{write_feather()}.} diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index 480abb12fcf4a..954c692dad2f1 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -22,7 +22,7 @@ write_parquet( \arguments{ \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} -\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +\item{sink}{A string file path, connection, URI, or \link{OutputStream}, or path in a file system (\code{SubTreeFileSystem})} \item{chunk_size}{how many rows of data to write to disk at once. This diff --git a/r/src/io.cpp b/r/src/io.cpp index 4d5ee31794ae8..2f36d51dcd7ab 100644 --- a/r/src/io.cpp +++ b/r/src/io.cpp @@ -212,11 +212,16 @@ void io___BufferOutputStream__Write( class RConnectionFileInterface : public virtual arrow::io::FileInterface { public: explicit RConnectionFileInterface(cpp11::sexp connection_sexp) - : connection_sexp_(connection_sexp), closed_(false) { + : connection_sexp_(connection_sexp), + closed_(false), + seekable_(false), + bytes_written_(0), + bytes_read_(0) { check_closed(); + seekable_ = check_seekable(); } - arrow::Status Close() { + arrow::Status Close() override { if (closed_) { return arrow::Status::OK(); } @@ -227,11 +232,21 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { "close() on R connection"); } - arrow::Result Tell() const { + arrow::Result Tell() const override { if (closed()) { return arrow::Status::IOError("R connection is closed"); } + // R connections use seek() with no additional arguments as a tell() + // implementation; however, non-seekable connections will error if you + // do this. This heuristic allows Tell() to return a reasonable value + // (used by at least the IPC writer). 
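The heuristic described in the comment above is easier to see outside the R/C++ glue: probe seekability once at construction, and when the connection cannot seek, answer `Tell()` from counters maintained by the read and write paths. A minimal Python sketch of the same idea, mirroring the spirit of `seekable_`, `bytes_written_`, and `bytes_read_` (the wrapper class is illustrative, not Arrow API):

```python
import io

class CountingStream:
    """Wraps a raw stream; tell() falls back to byte counters when the
    underlying stream is not seekable."""

    def __init__(self, raw):
        self._raw = raw
        self._seekable = raw.seekable()  # probed once, like check_seekable()
        self._bytes_written = 0
        self._bytes_read = 0

    def write(self, data):
        n = self._raw.write(data)
        self._bytes_written += n
        return n

    def read(self, n=-1):
        data = self._raw.read(n)
        self._bytes_read += len(data)
        return data

    def tell(self):
        if not self._seekable:
            # Non-seekable (e.g. a socket): report how far we have written,
            # else how far we have read -- enough for an IPC writer that
            # only needs to know how many bytes went out.
            return self._bytes_written if self._bytes_written else self._bytes_read
        return self._raw.tell()

# Usage: a pipe/socket-like stream has no meaningful seek(), but tell()
# still returns a sensible position for writers tracking output size.
s = CountingStream(io.BytesIO())  # stand-in for a socket file object
s.write(b"arrow")
print(s.tell())  # -> 5
```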
+ if (!seekable_ && bytes_written_ > 0) { + return bytes_written_; + } else if (!seekable_) { + return bytes_read_; + } + return SafeCallIntoR( [&]() { cpp11::sexp result = cpp11::package("base")["seek"](connection_sexp_); @@ -240,7 +255,7 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { "tell() on R connection"); } - bool closed() const { return closed_; } + bool closed() const override { return closed_; } protected: cpp11::sexp connection_sexp_; @@ -261,13 +276,14 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { return SafeCallIntoR( [&] { cpp11::function read_bin = cpp11::package("base")["readBin"]; - cpp11::writable::raws ptype((R_xlen_t)0); + cpp11::writable::raws ptype(static_cast(0)); cpp11::integers n = cpp11::as_sexp(static_cast(nbytes)); cpp11::sexp result = read_bin(connection_sexp_, ptype, n); int64_t result_size = cpp11::safe[Rf_xlength](result); memcpy(out, cpp11::safe[RAW](result), result_size); + bytes_read_++; return result_size; }, "readBin() on R connection"); @@ -294,6 +310,7 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { cpp11::function write_bin = cpp11::package("base")["writeBin"]; write_bin(data_raw, connection_sexp_); + bytes_written_ += nbytes; }, "writeBin() on R connection"); } @@ -312,6 +329,9 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { private: bool closed_; + bool seekable_; + int64_t bytes_written_; + int64_t bytes_read_; bool check_closed() { if (closed_) { @@ -333,6 +353,15 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { return closed_; } + + bool check_seekable() { + auto is_seekable_result = SafeCallIntoR([&] { + cpp11::sexp result = cpp11::package("base")["isSeekable"](connection_sexp_); + return cpp11::as_cpp(result); + }); + + return is_seekable_result.ok() && *is_seekable_result; + } }; class RConnectionInputStream : public virtual arrow::io::InputStream, @@ -341,9 +370,11 @@ class RConnectionInputStream : public virtual arrow::io::InputStream, explicit RConnectionInputStream(cpp11::sexp connection_sexp) : RConnectionFileInterface(connection_sexp) {} - arrow::Result Read(int64_t nbytes, void* out) { return ReadBase(nbytes, out); } + arrow::Result Read(int64_t nbytes, void* out) override { + return ReadBase(nbytes, out); + } - arrow::Result> Read(int64_t nbytes) { + arrow::Result> Read(int64_t nbytes) override { return ReadBase(nbytes); } }; @@ -373,13 +404,15 @@ class RConnectionRandomAccessFile : public arrow::io::RandomAccessFile, } } - arrow::Result GetSize() { return size_; } + arrow::Result GetSize() override { return size_; } - arrow::Status Seek(int64_t pos) { return SeekBase(pos); } + arrow::Status Seek(int64_t pos) override { return SeekBase(pos); } - arrow::Result Read(int64_t nbytes, void* out) { return ReadBase(nbytes, out); } + arrow::Result Read(int64_t nbytes, void* out) override { + return ReadBase(nbytes, out); + } - arrow::Result> Read(int64_t nbytes) { + arrow::Result> Read(int64_t nbytes) override { return ReadBase(nbytes); } @@ -393,7 +426,7 @@ class RConnectionOutputStream : public arrow::io::OutputStream, explicit RConnectionOutputStream(cpp11::sexp connection_sexp) : RConnectionFileInterface(connection_sexp) {} - arrow::Status Write(const void* data, int64_t nbytes) { + arrow::Status Write(const void* data, int64_t nbytes) override { return WriteBase(data, nbytes); } }; From 8c09c7f1816c03e82761daa44159ebe9c21742eb Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: 
Date:
Wed, 3 Apr 2024 15:02:13 -0300 Subject: [PATCH 04/49] GH-40983: [C++] Fix unused function build error (#40984) ### Rationale for this change Make sure local `CompressedInputArguments` is only defined when it's being used. ### What changes are included in this PR? Wrapping definition and usage under the same `#ifdef`. ### Are these changes tested? By building with `-Werror` and running all existing tests. * GitHub Issue: #40983 Authored-by: Felipe Oliveira Carvalho Signed-off-by: mwish --- cpp/src/arrow/io/compressed_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/io/compressed_benchmark.cc b/cpp/src/arrow/io/compressed_benchmark.cc index 52a30d8cb0887..ae9ecd57143f3 100644 --- a/cpp/src/arrow/io/compressed_benchmark.cc +++ b/cpp/src/arrow/io/compressed_benchmark.cc @@ -170,6 +170,7 @@ static void CompressedInputStreamNonZeroCopyBufferReturnedByCallee( BufferReadMode::ReturnedByCallee>(state, kCompression); } +#ifdef ARROW_WITH_LZ4 static void CompressedInputArguments(::benchmark::internal::Benchmark* b) { b->ArgNames({"num_bytes", "batch_size"}) ->Args({8 * 1024, 8 * 1024}) @@ -180,7 +181,6 @@ static void CompressedInputArguments(::benchmark::internal::Benchmark* b) { ->Args({1024 * 1024, 1024 * 1024}); } -#ifdef ARROW_WITH_LZ4 // Benchmark LZ4 because it's lightweight, which makes benchmarking focused on the // overhead of the compression input stream. BENCHMARK_TEMPLATE(CompressedInputStreamZeroCopyBufferProvidedByCaller, From a4acb643437af2323f683e51d6043907fed496a9 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 4 Apr 2024 03:57:41 +0800 Subject: [PATCH 05/49] GH-40872: [C++][Parquet] Encoding: Optimize DecodeArrow/Decode(bitmap) for PlainBooleanDecoder (#40876) ### Rationale for this change This is for enhance boolean decoding. I optimized the `DecodeArrow` for PlainBoolean ### What changes are included in this PR? Optimize DecodeArrow/Decode(bitmap) for PlainBooleanDecoder, and add benchmarks ### Are these changes tested? Yes ### Are there any user-facing changes? Minor optimization. And `Decode` boolean will change the syntax * GitHub Issue: #40872 Lead-authored-by: mwish Co-authored-by: Antoine Pitrou Signed-off-by: mwish --- cpp/src/parquet/column_reader.cc | 3 +- cpp/src/parquet/encoding.cc | 79 ++++++--- cpp/src/parquet/encoding.h | 4 +- cpp/src/parquet/encoding_benchmark.cc | 236 ++++++++++++++++++++++---- 4 files changed, 261 insertions(+), 61 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3fb224154c4ec..af489c70a5233 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1316,7 +1316,8 @@ class TypedRecordReader : public TypedColumnReaderImpl, levels_position_ = 0; levels_capacity_ = 0; read_dense_for_nullable_ = read_dense_for_nullable; - uses_values_ = !(descr->physical_type() == Type::BYTE_ARRAY); + // BYTE_ARRAY values are not stored in the `values_` buffer. 
+ uses_values_ = descr->physical_type() != Type::BYTE_ARRAY; if (uses_values_) { values_ = AllocateBuffer(pool); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3eed88f08b22a..f16e9b34fc682 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -55,6 +55,7 @@ namespace bit_util = arrow::bit_util; using arrow::Status; using arrow::VisitNullBitmapInline; using arrow::internal::AddWithOverflow; +using arrow::internal::BitBlockCounter; using arrow::internal::checked_cast; using arrow::internal::MultiplyWithOverflow; using arrow::internal::SafeSignedSubtract; @@ -1173,13 +1174,15 @@ class PlainBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { private: std::unique_ptr<::arrow::bit_util::BitReader> bit_reader_; + int total_num_values_{0}; }; PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr) : DecoderImpl(descr, Encoding::PLAIN) {} void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) { - num_values_ = num_values; + DecoderImpl::SetData(num_values, data, len); + total_num_values_ = num_values; bit_reader_ = std::make_unique(data, len); } @@ -1188,19 +1191,52 @@ int PlainBooleanDecoder::DecodeArrow( typename EncodingTraits::Accumulator* builder) { int values_decoded = num_values - null_count; if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) { + // A too large `num_values` was requested. + ParquetException::EofException(); + } + if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(values_decoded))) { ParquetException::EofException(); } - PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); - - VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - bool value; - ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value)); - builder->UnsafeAppend(value); - }, - [&]() { builder->UnsafeAppendNull(); }); + if (null_count == 0) { + // FastPath: can copy the data directly + PARQUET_THROW_NOT_OK(builder->AppendValues(data_, values_decoded, NULLPTR, + total_num_values_ - num_values_)); + } else { + // Handle nulls by BitBlockCounter + PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); + BitBlockCounter bit_counter(valid_bits, valid_bits_offset, num_values); + int64_t value_position = 0; + int64_t valid_bits_offset_position = valid_bits_offset; + int64_t previous_value_offset = 0; + while (value_position < num_values) { + auto block = bit_counter.NextWord(); + if (block.AllSet()) { + // GH-40978: We don't have UnsafeAppendValues for booleans currently, + // so using `AppendValues` here. + PARQUET_THROW_NOT_OK( + builder->AppendValues(data_, block.length, NULLPTR, previous_value_offset)); + previous_value_offset += block.length; + } else if (block.NoneSet()) { + // GH-40978: We don't have UnsafeAppendNulls for booleans currently, + // so using `AppendNulls` here. 
+ PARQUET_THROW_NOT_OK(builder->AppendNulls(block.length)); + } else { + for (int64_t i = 0; i < block.length; ++i) { + if (bit_util::GetBit(valid_bits, valid_bits_offset_position + i)) { + bool value = bit_util::GetBit( + data_, total_num_values_ - num_values_ + previous_value_offset); + builder->UnsafeAppend(value); + previous_value_offset += 1; + } else { + builder->UnsafeAppendNull(); + } + } + } + value_position += block.length; + valid_bits_offset_position += block.length; + } + } num_values_ -= values_decoded; return values_decoded; @@ -1214,18 +1250,15 @@ inline int PlainBooleanDecoder::DecodeArrow( int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) { max_values = std::min(max_values, num_values_); - bool val; - ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); - for (int i = 0; i < max_values; ++i) { - if (!bit_reader_->GetValue(1, &val)) { - ParquetException::EofException(); - } - if (val) { - bit_writer.Set(); - } - bit_writer.Next(); + if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(max_values))) { + ParquetException::EofException(); } - bit_writer.Finish(); + // Copy the data directly + // Parquet's boolean encoding is bit-packed using LSB. So + // we can directly copy the data to the buffer. + ::arrow::internal::CopyBitmap(this->data_, /*offset=*/total_num_values_ - num_values_, + /*length=*/max_values, /*dest=*/buffer, + /*dest_offset=*/0); num_values_ -= max_values; return max_values; } @@ -1692,7 +1725,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { } protected: - Status IndexInBounds(int32_t index) { + Status IndexInBounds(int32_t index) const { if (ARROW_PREDICT_TRUE(0 <= index && index < dictionary_length_)) { return Status::OK(); } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index de47bb7deb839..602009189595e 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -400,7 +400,9 @@ class BooleanDecoder : virtual public TypedDecoder { /// \brief Decode and bit-pack values into a buffer /// /// \param[in] buffer destination for decoded values - /// This buffer will contain bit-packed values. + /// This buffer will contain bit-packed values. If + /// max_values is not a multiple of 8, the trailing bits + /// of the last byte will be undefined. /// \param[in] max_values max values to decode. /// \return The number of values decoded. Should be identical to max_values except /// at the end of the current data page. 
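To make the new boolean fast paths concrete: PLAIN-encoded booleans are an LSB-first bitmap, so `Decode` is essentially a bitmap copy at a bit offset, and `DecodeArrow` only has to walk the validity bitmap when nulls are present (the C++ code additionally processes the validity bitmap in blocks via `BitBlockCounter`; that optimization is collapsed into a per-bit loop here for clarity). A simplified Python sketch using plain lists in place of Arrow builders:

```python
def get_bit(data, i):
    # Parquet packs booleans LSB-first: bit i lives in byte i // 8.
    return (data[i // 8] >> (i % 8)) & 1 == 1

def decode_plain_booleans(data, bit_offset, num_values, valid_bits=None):
    """Decode num_values PLAIN-encoded booleans starting at bit_offset.

    valid_bits is an optional validity bitmap; None means no nulls,
    which takes the bulk-copy fast path (the null_count == 0 branch).
    """
    out = []
    if valid_bits is None:
        # Fast path: a straight bitmap copy at an offset.
        out.extend(get_bit(data, bit_offset + i) for i in range(num_values))
        return out

    # Slow path: walk the validity bitmap; only valid slots consume a
    # value bit (previous_value_offset in the C++ code).
    value_offset = 0
    for i in range(num_values):
        if get_bit(valid_bits, i):
            out.append(get_bit(data, bit_offset + value_offset))
            value_offset += 1
        else:
            out.append(None)
    return out

# 0b01101001 LSB-first -> T, F, F, T, F, T, T, F
print(decode_plain_booleans(bytes([0b01101001]), 0, 8))
# With validity 0b0101: slots 0 and 2 are valid -> [True, None, False, None]
print(decode_plain_booleans(bytes([0b01101001]), 0, 4, valid_bits=bytes([0b0101])))
```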
diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 61959b659f633..9c07d262b350e 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -66,6 +66,7 @@ static void BM_PlainEncodingBoolean(benchmark::State& state) { typed_encoder->FlushValues(); } state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(bool)); + state.SetItemsProcessed(state.iterations() * state.range(0)); } BENCHMARK(BM_PlainEncodingBoolean)->Range(MIN_RANGE, MAX_RANGE); @@ -86,11 +87,34 @@ static void BM_PlainDecodingBoolean(benchmark::State& state) { } state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(bool)); + state.SetItemsProcessed(state.iterations() * state.range(0)); delete[] output; } BENCHMARK(BM_PlainDecodingBoolean)->Range(MIN_RANGE, MAX_RANGE); +static void BM_PlainDecodingBooleanToBitmap(benchmark::State& state) { + std::vector values(state.range(0), true); + int64_t bitmap_bytes = ::arrow::bit_util::BytesForBits(state.range(0)); + std::vector output(bitmap_bytes, 0); + auto encoder = MakeEncoder(Type::BOOLEAN, Encoding::PLAIN); + auto typed_encoder = dynamic_cast(encoder.get()); + typed_encoder->Put(values, static_cast(values.size())); + std::shared_ptr buf = encoder->FlushValues(); + + for (auto _ : state) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + decoder->SetData(static_cast(values.size()), buf->data(), + static_cast(buf->size())); + decoder->Decode(output.data(), static_cast(values.size())); + } + // Still set `BytesProcessed` to byte level. + state.SetBytesProcessed(state.iterations() * bitmap_bytes); + state.SetItemsProcessed(state.iterations() * state.range(0)); +} + +BENCHMARK(BM_PlainDecodingBooleanToBitmap)->Range(MIN_RANGE, MAX_RANGE); + static void BM_PlainEncodingInt64(benchmark::State& state) { std::vector values(state.range(0), 64); auto encoder = MakeTypedEncoder(Encoding::PLAIN); @@ -1097,8 +1121,11 @@ BENCHMARK(BM_DictDecodingByteArray)->Apply(ByteArrayCustomArguments); using ::arrow::BinaryBuilder; using ::arrow::BinaryDictionary32Builder; -class BenchmarkDecodeArrow : public ::benchmark::Fixture { +template +class BenchmarkDecodeArrowBase : public ::benchmark::Fixture { public: + virtual ~BenchmarkDecodeArrowBase() = default; + void SetUp(const ::benchmark::State& state) override { num_values_ = static_cast(state.range()); InitDataInputs(); @@ -1111,37 +1138,18 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { values_.clear(); } - void InitDataInputs() { - // Generate a random string dictionary without any nulls so that this dataset can - // be used for benchmarking the DecodeArrowNonNull API - constexpr int repeat_factor = 8; - constexpr int64_t min_length = 2; - constexpr int64_t max_length = 10; - ::arrow::random::RandomArrayGenerator rag(0); - input_array_ = rag.StringWithRepeats(num_values_, num_values_ / repeat_factor, - min_length, max_length, /*null_probability=*/0); - valid_bits_ = input_array_->null_bitmap_data(); - total_size_ = input_array_->data()->buffers[2]->size(); - - values_.reserve(num_values_); - const auto& binary_array = static_cast(*input_array_); - for (int64_t i = 0; i < binary_array.length(); i++) { - auto view = binary_array.GetView(i); - values_.emplace_back(static_cast(view.length()), - reinterpret_cast(view.data())); - } - } - + virtual void InitDataInputs() = 0; virtual void DoEncodeArrow() = 0; virtual void DoEncodeLowLevel() = 0; - - virtual std::unique_ptr InitializeDecoder() = 0; + virtual std::unique_ptr> 
InitializeDecoder() = 0; + virtual typename EncodingTraits::Accumulator CreateAccumulator() = 0; void EncodeArrowBenchmark(benchmark::State& state) { for (auto _ : state) { DoEncodeArrow(); } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void EncodeLowLevelBenchmark(benchmark::State& state) { @@ -1149,26 +1157,27 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { DoEncodeLowLevel(); } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void DecodeArrowDenseBenchmark(benchmark::State& state) { for (auto _ : state) { auto decoder = InitializeDecoder(); - typename EncodingTraits::Accumulator acc; - acc.builder.reset(new BinaryBuilder); + auto acc = CreateAccumulator(); decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &acc); } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void DecodeArrowNonNullDenseBenchmark(benchmark::State& state) { for (auto _ : state) { auto decoder = InitializeDecoder(); - typename EncodingTraits::Accumulator acc; - acc.builder.reset(new BinaryBuilder); + auto acc = CreateAccumulator(); decoder->DecodeArrowNonNull(num_values_, &acc); } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void DecodeArrowDictBenchmark(benchmark::State& state) { @@ -1179,6 +1188,7 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void DecodeArrowNonNullDictBenchmark(benchmark::State& state) { @@ -1189,20 +1199,56 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } protected: - int num_values_; + int num_values_{0}; std::shared_ptr<::arrow::Array> input_array_; - std::vector values_; - uint64_t total_size_; - const uint8_t* valid_bits_; + uint64_t total_size_{0}; + const uint8_t* valid_bits_{nullptr}; std::shared_ptr buffer_; + std::vector values_; +}; + +class BenchmarkDecodeArrowByteArray : public BenchmarkDecodeArrowBase { + public: + using ByteArrayAccumulator = typename EncodingTraits::Accumulator; + + ByteArrayAccumulator CreateAccumulator() final { + ByteArrayAccumulator acc; + acc.builder = std::make_unique(default_memory_pool()); + return acc; + } + + void InitDataInputs() final { + // Generate a random string dictionary without any nulls so that this dataset can + // be used for benchmarking the DecodeArrowNonNull API + constexpr int repeat_factor = 8; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 10; + ::arrow::random::RandomArrayGenerator rag(0); + input_array_ = rag.StringWithRepeats(num_values_, num_values_ / repeat_factor, + min_length, max_length, /*null_probability=*/0); + valid_bits_ = input_array_->null_bitmap_data(); + total_size_ = input_array_->data()->buffers[2]->size(); + + values_.reserve(num_values_); + const auto& binary_array = static_cast(*input_array_); + for (int64_t i = 0; i < binary_array.length(); i++) { + auto view = binary_array.GetView(i); + values_.emplace_back(static_cast(view.length()), + reinterpret_cast(view.data())); + } + } + + protected: + std::vector values_; }; // ---------------------------------------------------------------------- // Benchmark Decoding 
from Plain Encoding -class BM_ArrowBinaryPlain : public BenchmarkDecodeArrow { +class BM_ArrowBinaryPlain : public BenchmarkDecodeArrowByteArray { public: void DoEncodeArrow() override { auto encoder = MakeTypedEncoder(Encoding::PLAIN); @@ -1251,7 +1297,7 @@ BENCHMARK_REGISTER_F(BM_ArrowBinaryPlain, DecodeArrowNonNull_Dict) // ---------------------------------------------------------------------- // Benchmark Decoding from Dictionary Encoding -class BM_ArrowBinaryDict : public BenchmarkDecodeArrow { +class BM_ArrowBinaryDict : public BenchmarkDecodeArrowByteArray { public: template void DoEncode(PutValuesFunc&& put_values) { @@ -1319,7 +1365,7 @@ class BM_ArrowBinaryDict : public BenchmarkDecodeArrow { } void TearDown(const ::benchmark::State& state) override { - BenchmarkDecodeArrow::TearDown(state); + BenchmarkDecodeArrowByteArray::TearDown(state); dict_buffer_.reset(); descr_.reset(); } @@ -1327,7 +1373,7 @@ class BM_ArrowBinaryDict : public BenchmarkDecodeArrow { protected: std::unique_ptr descr_; std::shared_ptr dict_buffer_; - int num_dict_entries_; + int num_dict_entries_{0}; }; BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, EncodeArrow) @@ -1373,4 +1419,122 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict) BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict) ->Range(MIN_RANGE, MAX_RANGE); +class BenchmarkDecodeArrowBoolean : public BenchmarkDecodeArrowBase { + public: + void InitDataInputs() final { + // Generate a random boolean array with `null_probability_`. + ::arrow::random::RandomArrayGenerator rag(0); + input_array_ = rag.Boolean(num_values_, /*true_probability=*/0.5, null_probability_); + valid_bits_ = input_array_->null_bitmap_data(); + + // Arrow uses a bitmap representation for boolean arrays, + // so, we uses this as "total_size" for the benchmark. 
+ total_size_ = ::arrow::bit_util::BytesForBits(num_values_); + + values_.reserve(num_values_); + const auto& boolean_array = static_cast(*input_array_); + for (int64_t i = 0; i < boolean_array.length(); i++) { + values_.push_back(boolean_array.Value(i)); + } + } + + typename EncodingTraits::Accumulator CreateAccumulator() final { + return typename EncodingTraits::Accumulator(); + } + + void DoEncodeLowLevel() final { ParquetException::NYI(); } + + void DecodeArrowWithNullDenseBenchmark(benchmark::State& state); + + protected: + void DoEncodeArrowImpl(Encoding::type encoding) { + auto encoder = MakeTypedEncoder(encoding); + encoder->Put(*input_array_); + buffer_ = encoder->FlushValues(); + } + + std::unique_ptr> InitializeDecoderImpl( + Encoding::type encoding) const { + auto decoder = MakeTypedDecoder(encoding); + decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + return decoder; + } + + protected: + double null_probability_ = 0.0; +}; + +void BenchmarkDecodeArrowBoolean::DecodeArrowWithNullDenseBenchmark( + benchmark::State& state) { + // Change null_probability + null_probability_ = static_cast(state.range(1)) / 10000; + InitDataInputs(); + this->DoEncodeArrow(); + int num_values_with_nulls = this->num_values_; + + for (auto _ : state) { + auto decoder = this->InitializeDecoder(); + auto acc = this->CreateAccumulator(); + decoder->DecodeArrow( + num_values_with_nulls, + /*null_count=*/static_cast(this->input_array_->null_count()), + this->valid_bits_, 0, &acc); + } + state.SetBytesProcessed(state.iterations() * static_cast(total_size_)); + state.SetItemsProcessed(state.iterations() * state.range(0)); +} + +class BM_DecodeArrowBooleanPlain : public BenchmarkDecodeArrowBoolean { + public: + void DoEncodeArrow() final { DoEncodeArrowImpl(Encoding::PLAIN); } + + std::unique_ptr> InitializeDecoder() override { + return InitializeDecoderImpl(Encoding::PLAIN); + } +}; + +class BM_DecodeArrowBooleanRle : public BenchmarkDecodeArrowBoolean { + public: + void DoEncodeArrow() final { DoEncodeArrowImpl(Encoding::RLE); } + + std::unique_ptr> InitializeDecoder() override { + return InitializeDecoderImpl(Encoding::RLE); + } +}; + +static void BooleanWithNullCustomArguments(benchmark::internal::Benchmark* b) { + b->ArgsProduct({ + benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/4), + {1, 100, 1000, 5000, 10000}, + }) + ->ArgNames({"num_values", "null_in_ten_thousand"}); +} + +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrow)(benchmark::State& state) { + DecodeArrowDenseBenchmark(state); +} +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrow)->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) +(benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) + ->Range(MIN_RANGE, MAX_RANGE); +// TODO(mwish): RleBoolean not implemented DecodeArrow with null slots yet. 
+// BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) +//(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } +// BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) +// ->Apply(BooleanWithNullCustomArguments); + +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrow) +(benchmark::State& state) { DecodeArrowDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanPlain, DecodeArrow) + ->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrowNonNull) +(benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanPlain, DecodeArrowNonNull) + ->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrowWithNull) +(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanPlain, DecodeArrowWithNull) + ->Apply(BooleanWithNullCustomArguments); + } // namespace parquet From a5f80184750fb09fd232f2acbe5558b03b45a156 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 4 Apr 2024 06:41:04 +0900 Subject: [PATCH 06/49] GH-40961: [GLib] Suppress warnings for Vala examples on macOS (#40962) ### Rationale for this change There are some warnings for Vala examples on macOS: ```text FAILED: example/vala/read-file.p/meson-generated_read-file.c.o ccache cc -Iexample/vala/read-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/read-file.p/meson-generated_read-file.c.o -MF example/vala/read-file.p/meson-generated_read-file.c.o.d -o example/vala/read-file.p/meson-generated_read-file.c.o -c example/vala/read-file.p/read-file.c ../../c_glib/example/vala/read-file.vala:123:61: error: format specifies type 'long long' but the argument has type 'gint' (aka 'int') [-Werror,-Wformat] fprintf (_tmp2_, "columns[%" G_GINT64_FORMAT "](%s): ", nth_column, _tmp3_); ~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~ 1 error generated. 
``` ```text FAILED: example/vala/read-stream.p/meson-generated_read-stream.c.o ccache cc -Iexample/vala/read-stream.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/read-stream.p/meson-generated_read-stream.c.o -MF example/vala/read-stream.p/meson-generated_read-stream.c.o.d -o example/vala/read-stream.p/meson-generated_read-stream.c.o -c example/vala/read-stream.p/read-stream.c ../../c_glib/example/vala/read-stream.vala:123:61: error: format specifies type 'long long' but the argument has type 'gint' (aka 'int') [-Werror,-Wformat] fprintf (_tmp2_, "columns[%" G_GINT64_FORMAT "](%s): ", nth_column, _tmp3_); ~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~ 1 error generated. ``` ```text FAILED: example/vala/write-file.p/meson-generated_write-file.c.o ccache cc -Iexample/vala/write-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/write-file.p/meson-generated_write-file.c.o -MF example/vala/write-file.p/meson-generated_write-file.c.o.d -o example/vala/write-file.p/meson-generated_write-file.c.o -c example/vala/write-file.p/write-file.c write-file.c:373:8: error: variable '_tmp45__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp45__length1; ^ write-file.c:504:8: error: variable '_tmp57__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp57__length1; ^ write-file.c:635:8: error: variable '_tmp69__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp69__length1; ^ write-file.c:766:8: error: variable '_tmp81__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp81__length1; ^ write-file.c:897:8: error: variable '_tmp93__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp93__length1; ^ write-file.c:1028:8: error: variable '_tmp105__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp105__length1; ^ write-file.c:1159:8: error: variable '_tmp117__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp117__length1; ^ write-file.c:1290:8: error: variable '_tmp129__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp129__length1; ^ write-file.c:1421:8: error: variable '_tmp141__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp141__length1; ^ write-file.c:1552:8: error: variable '_tmp153__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp153__length1; ^ 10 errors generated. ``` ### What changes are included in this PR? 
* Fix wrong format string
* Disable `unused-but-set-variable` warning

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.
* GitHub Issue: #40961

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 c_glib/example/vala/meson.build      | 6 +++++-
 c_glib/example/vala/read-file.vala   | 4 ++--
 c_glib/example/vala/read-stream.vala | 4 ++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/c_glib/example/vala/meson.build b/c_glib/example/vala/meson.build
index 474f0b1e9a51a..ff65a7328f171 100644
--- a/c_glib/example/vala/meson.build
+++ b/c_glib/example/vala/meson.build
@@ -18,11 +18,15 @@
 # under the License.

 if generate_vapi
+  c_flags = [
+    '-Wno-unused-but-set-variable',
+  ]
+  c_flags = meson.get_compiler('c').get_supported_arguments(c_flags)
   vala_example_executable_kwargs = {
     'c_args': [
       '-I' + project_build_root,
       '-I' + project_source_root,
-    ],
+    ] + c_flags,
     'dependencies': [
       arrow_glib_vapi,
       dependency('gio-2.0'),
diff --git a/c_glib/example/vala/read-file.vala b/c_glib/example/vala/read-file.vala
index a0a06275c4b24..287eddac76352 100644
--- a/c_glib/example/vala/read-file.vala
+++ b/c_glib/example/vala/read-file.vala
@@ -119,8 +119,8 @@ void print_array(GArrow.Array array) {
 void print_record_batch(GArrow.RecordBatch record_batch) {
   var n_columns = record_batch.get_n_columns();
-  for (var nth_column = 0; nth_column < n_columns; nth_column++) {
-    stdout.printf("columns[%" + int64.FORMAT + "](%s): ",
+  for (int nth_column = 0; nth_column < n_columns; nth_column++) {
+    stdout.printf("columns[%d](%s): ",
                   nth_column,
                   record_batch.get_column_name(nth_column));
     var array = record_batch.get_column_data(nth_column);
diff --git a/c_glib/example/vala/read-stream.vala b/c_glib/example/vala/read-stream.vala
index c58dc848930a8..4520c8609bdaf 100644
--- a/c_glib/example/vala/read-stream.vala
+++ b/c_glib/example/vala/read-stream.vala
@@ -119,8 +119,8 @@ void print_array(GArrow.Array array) {
 void print_record_batch(GArrow.RecordBatch record_batch) {
   var n_columns = record_batch.get_n_columns();
-  for (var nth_column = 0; nth_column < n_columns; nth_column++) {
-    stdout.printf("columns[%" + int64.FORMAT + "](%s): ",
+  for (int nth_column = 0; nth_column < n_columns; nth_column++) {
+    stdout.printf("columns[%d](%s): ",
                   nth_column,
                   record_batch.get_column_name(nth_column));
     var array = record_batch.get_column_data(nth_column);

From 5b09059ae40164c173a6ef593ab7b463b24d431d Mon Sep 17 00:00:00 2001
From: Dominik Moritz
Date: Wed, 3 Apr 2024 18:31:19 -0400
Subject: [PATCH 07/49] GH-40851: [JS] Fix nullcount and make vectors created
 from typed arrays not nullable (#40852)

* GitHub Issue: #40851
---
 js/src/data.ts                      | 13 ++++----
 js/src/vector.ts                    |  2 +-
 js/test/unit/vector/vector-tests.ts | 46 ++++++++++++++++++++++++++---
 3 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/js/src/data.ts b/js/src/data.ts
index 6f8792508858b..45fcc35d37676 100644
--- a/js/src/data.ts
+++ b/js/src/data.ts
@@ -109,7 +109,10 @@ export class Data {
         let nullCount = this._nullCount;
         let nullBitmap: Uint8Array | undefined;
         if (nullCount <= kUnknownNullCount && (nullBitmap = this.nullBitmap)) {
-            this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length);
+            this._nullCount = nullCount = nullBitmap.length === 0 ?
+ // no null bitmap, so all values are valid + 0 : + this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); } return nullCount; } @@ -177,16 +180,16 @@ export class Data { // if we have a nullBitmap, truncate + slice and set it over the pre-filled 1s if (this.nullCount > 0) { nullBitmap.set(truncateBitmap(offset, length, this.nullBitmap), 0); + Object.assign(this, { nullBitmap }); + } else { + Object.assign(this, { nullBitmap, _nullCount: 0 }); } - Object.assign(this, { nullBitmap, _nullCount: -1 }); } const byte = nullBitmap[byteOffset]; prev = (byte & mask) !== 0; - value ? - (nullBitmap[byteOffset] = byte | mask) : - (nullBitmap[byteOffset] = byte & ~mask); + nullBitmap[byteOffset] = value ? (byte | mask) : (byte & ~mask); } if (prev !== !!value) { diff --git a/js/src/vector.ts b/js/src/vector.ts index a7c103bc326ee..1b0d9a05796f0 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -445,7 +445,7 @@ export function makeVector(init: any) { if (init instanceof DataView) { init = new Uint8Array(init.buffer); } - const props = { offset: 0, length: init.length, nullCount: 0, data: init }; + const props = { offset: 0, length: init.length, nullCount: -1, data: init }; if (init instanceof Int8Array) { return new Vector([makeData({ ...props, type: new dtypes.Int8 })]); } if (init instanceof Int16Array) { return new Vector([makeData({ ...props, type: new dtypes.Int16 })]); } if (init instanceof Int32Array) { return new Vector([makeData({ ...props, type: new dtypes.Int32 })]); } diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts index bfcf0d8547861..a10d7c757ca17 100644 --- a/js/test/unit/vector/vector-tests.ts +++ b/js/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. 
import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -33,6 +33,47 @@ describe(`makeVectorFromArray`, () => { }); }); +describe(`basic vector methods`, () => { + test(`not nullable`, () => { + const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: -1, type: new Int32() })]); + expect(vector.nullable).toBe(false); + expect(vector.nullCount).toBe(0); + }); + + test(`nullable`, () => { + const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: 0, type: new Int32() })]); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(0); + expect(vector.isValid(0)).toBe(true); + + // set a value to null + vector.set(0, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + expect(vector.isValid(0)).toBe(false); + + // set the same value to null which should not change anything + vector.set(0, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + + // set a different value to null + vector.set(1, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(2); + + // set first value to non-null + vector.set(0, 1); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + + // set last null to non-null + vector.set(1, 2); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(0); + }); +}); + describe(`StructVector`, () => { test(`makeVectorFromArray`, () => { const values: { a?: number; b?: string | null; c?: boolean | null }[] = [ @@ -108,7 +149,6 @@ describe(`DateVector`, () => { }); describe(`DictionaryVector`, () => { - const dictionary = ['foo', 'bar', 'baz']; const extras = ['abc', '123']; // values to search for that should NOT be found const dictionary_vec = vectorFromArray(dictionary, new Utf8).memoize(); @@ -117,7 +157,6 @@ describe(`DictionaryVector`, () => { const validity = Array.from({ length: indices.length }, () => Math.random() > 0.2); describe(`index with nullCount == 0`, () => { - const values = indices.map((d) => dictionary[d]); const vector = makeVector({ data: indices, @@ -133,7 +172,6 @@ describe(`DictionaryVector`, () => { }); describe(`index with nullCount > 0`, () => { - const nullBitmap = util.packBools(validity); const nullCount = validity.reduce((acc, d) => acc + (d ? 0 : 1), 0); const values = indices.map((d, i) => validity[i] ? 
dictionary[d] : null); From 2caec860894945e8bfe5b557c825ba962a6a16bd Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 3 Apr 2024 18:32:12 -0400 Subject: [PATCH 08/49] GH-40891: [JS] Store Dates as TimestampMillisecond (#40892) Fixes #40891 Tested with ```ts const date = new Date("2023-03-29T12:34:56Z"); console.log("original", date) console.log("=> vec") const vec = arrow.vectorFromArray([date]) console.log(vec.toArray()) console.log(vec.toJSON()) console.log(vec.type) console.log(vec.get(0)) console.log("=> vec2") const vec2 = arrow.vectorFromArray([date], new arrow.DateMillisecond) console.log(vec2.toArray()) console.log(vec.toJSON()) console.log(vec2.type) console.log(vec2.get(0)) console.log("=> table") const table = arrow.tableFromJSON([{ date }]) console.log(table.toArray()) console.log(table.schema.fields[0].type) console.log(table.getChildAt(0)?.get(0)) console.log("=> table2") const table2 = arrow.tableFromIPC(arrow.tableToIPC(table)); console.log(table2.toArray()) console.log(table2.schema.fields[0].type) console.log(table2.getChildAt(0)?.get(0)) console.log("=> table3") const table3 = new arrow.Table({ dates: vec2 }) console.log(table3.toArray()) console.log(table3.schema.fields[0].type) console.log(table3.getChildAt(0)?.get(0)) ``` ``` => table [ {"date": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] TimestampMillisecond { typeId: 10, unit: 1, timezone: undefined, toString: [Function: toString], ArrayType: [class Int32Array], [Symbol(Symbol.toStringTag)]: "Timestamp", children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z => table2 [ {"date": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] Timestamp_ { typeId: 10, unit: 1, timezone: null, toString: [Function: toString], ArrayType: [class Int32Array], children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z => table3 [ {"dates": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] DateMillisecond { typeId: 8, unit: 1, toString: [Function: toString], ArrayType: [class Int32Array], [Symbol(Symbol.toStringTag)]: "Date", children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z ``` * GitHub Issue: #40891 --- js/src/factories.ts | 4 ++-- js/src/type.ts | 14 +++++++++++++- js/test/unit/vector/date-vector-tests.ts | 19 ++++++++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/js/src/factories.ts b/js/src/factories.ts index aa54498c50bc0..657ae1b95ab92 100644 --- a/js/src/factories.ts +++ b/js/src/factories.ts @@ -65,7 +65,7 @@ export function makeBuilder(option export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector; export function vectorFromArray(values: readonly (null | undefined | boolean)[], type?: dtypes.Bool): Vector; export function vectorFromArray = dtypes.Dictionary>(values: readonly (null | undefined | string)[], type?: T): Vector; -export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; +export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | number)[], type: T): Vector; export function vectorFromArray(values: readonly (null | undefined | bigint)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | number)[], type?: T): Vector; @@ -145,7 +145,7 @@ function inferType(value: readonly unknown[]): dtypes.DataType { } else if (booleansCount + nullsCount === 
value.length) { return new dtypes.Bool; } else if (datesCount + nullsCount === value.length) { - return new dtypes.DateMillisecond; + return new dtypes.TimestampMillisecond; } else if (arraysCount + nullsCount === value.length) { const array = value as Array[]; const childType = inferType(array[array.findIndex((ary) => ary != null)]); diff --git a/js/src/type.ts b/js/src/type.ts index ae3aefa025999..a42552d65ad27 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -349,7 +349,19 @@ export class Date_ extends DataType { /** @ignore */ export class DateDay extends Date_ { constructor() { super(DateUnit.DAY); } } -/** @ignore */ +/** + * A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) in milliseconds. + * According to the specification, this should be treated as the number of days, in milliseconds, since the UNIX epoch. + * Therefore, values must be evenly divisible by `86_400_000` (the number of milliseconds in a standard day). + * + * Practically, validation that values of this type are evenly divisible by `86_400_000` is not enforced by this library + * for performance and usability reasons. + * + * Users should prefer to use {@link DateDay} to cleanly represent the number of days. For JS dates, + * {@link TimestampMillisecond} is the preferred type. + * + * @ignore + */ export class DateMillisecond extends Date_ { constructor() { super(DateUnit.MILLISECOND); } } /** @ignore */ diff --git a/js/test/unit/vector/date-vector-tests.ts b/js/test/unit/vector/date-vector-tests.ts index f8b4c1c7976d2..e5cd49933eac5 100644 --- a/js/test/unit/vector/date-vector-tests.ts +++ b/js/test/unit/vector/date-vector-tests.ts @@ -15,10 +15,19 @@ // specific language governing permissions and limitations // under the License. -import { DateDay, DateMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; +import { DateDay, DateMillisecond, TimestampMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; + +describe(`TimestampVector`, () => { + test(`Dates are stored in TimestampMillisecond`, () => { + const date = new Date('2023-02-01T12:34:56Z'); + const vec = vectorFromArray([date]); + expect(vec.type).toBeInstanceOf(TimestampMillisecond); + expect(vec.get(0)).toBe(date.valueOf()); + }); +}); describe(`DateVector`, () => { - it('returns days since the epoch as correct JS Dates', () => { + test(`returns days since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis32(); const date32 = table.getChildAt(0)!; @@ -28,7 +37,7 @@ describe(`DateVector`, () => { } }); - it('returns millisecond longs since the epoch as correct JS Dates', () => { + test(`returns millisecond longs since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis64(); const date64 = table.getChildAt(1)!; @@ -38,9 +47,9 @@ describe(`DateVector`, () => { } }); - it('returns the same date that was in the vector', () => { + test(`returns the same date that was in the vector`, () => { const dates = [new Date(1950, 1, 0)]; - const vec = vectorFromArray(dates); + const vec = vectorFromArray(dates, new DateMillisecond()); for (const date of vec) { expect(date).toEqual(dates.shift()); } From 89d5d8b40a5074a7ae30430b3fa95f7a9daf16da Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 3 Apr 2024 17:14:06 -0800 Subject: [PATCH 09/49] MINOR: [R] Replace use of show_query in test-duckdb.R with dbplyr::sql_build 
(#40955)

### Rationale for this change

I just ran the R package tests and saw a printed query mixed amongst testthat output:

```
[ FAIL 0 | WARN 0 | SKIP 1 | PASS 15 ]
SELECT *
FROM arrow_010
```

I thought it would be good to silence this in some way.

### What changes are included in this PR?

I silenced this by changing out the call to `dplyr::show_query` for `dbplyr::sql_build`, which produces different output but (1) doesn't print as a side-effect, (2) is specifically made for testing, and (3) still produces output we can use in this test.

For reference, this is what show_query prints (assuming via `cat`):

```
> show_query(table_four)
SELECT *
FROM arrow_011
```

Whereas `sql_build`:

```
> dbplyr::sql_build(table_four)
[1] `arrow_011`
```

### Are these changes tested?

Yes, but just manually on my system.

### Are there any user-facing changes?

No.

Authored-by: Bryce Mecum
Signed-off-by: Bryce Mecum
---
 r/tests/testthat/test-duckdb.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R
index 33ab1ecc7aa4d..dd7b6dba7fde0 100644
--- a/r/tests/testthat/test-duckdb.R
+++ b/r/tests/testthat/test-duckdb.R
@@ -281,7 +281,7 @@ test_that("to_duckdb passing a connection", {
     to_duckdb(con = con_separate, auto_disconnect = FALSE)

   # Generates a query like SELECT * FROM arrow_xxx
-  table_four_query <- paste(show_query(table_four), collapse = "\n")
+  table_four_query <- paste(dbplyr::sql_build(table_four), collapse = "\n")

   table_four_name <- stringr::str_extract(table_four_query, "arrow_[0-9]{3}")
   expect_false(is.na(table_four_name))

From 36ed0328b43ca39533e58a889f8e091d1f1ca7dc Mon Sep 17 00:00:00 2001
From: James Henderson
Date: Thu, 4 Apr 2024 05:39:37 +0100
Subject: [PATCH 10/49] GH-24826: [Java] Add DUV.setOffset method (#40985)

### Are these changes tested?

Yes

### Are there any user-facing changes?
Yes - the addition of a public DUV.setOffset method * GitHub Issue: #24826 Authored-by: James Henderson Signed-off-by: David Li --- .../codegen/templates/DenseUnionVector.java | 8 ++++ .../arrow/vector/TestDenseUnionVector.java | 38 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index c23caf3bb5a03..8edd167152d7f 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -908,6 +908,14 @@ private int getTypeBufferValueCapacity() { return (int) typeBuffer.capacity() / TYPE_WIDTH; } + public void setOffset(int index, int offset) { + while (index >= getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); + } + + offsetBuffer.setInt((long) index * OFFSET_WIDTH, offset); + } + private long getOffsetBufferValueCapacity() { return offsetBuffer.capacity() / OFFSET_WIDTH; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java index 8fd33eb5a8432..2c29861561bb7 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java @@ -99,6 +99,44 @@ public void testDenseUnionVector() throws Exception { } } + @Test + public void testSetOffset() { + try (DenseUnionVector duv = DenseUnionVector.empty("foo", allocator)) { + duv.allocateNew(); + byte i32TypeId = duv.registerNewTypeId(Field.notNullable("i32", MinorType.INT.getType())); + byte f64TypeId = duv.registerNewTypeId(Field.notNullable("f64", MinorType.FLOAT8.getType())); + + IntVector i32Vector = ((IntVector) duv.addVector(i32TypeId, new IntVector("i32", allocator))); + Float8Vector f64Vector = ((Float8Vector) duv.addVector(f64TypeId, new Float8Vector("f64", allocator))); + + i32Vector.allocateNew(3); + f64Vector.allocateNew(1); + + duv.setTypeId(0, i32TypeId); + duv.setOffset(0, 0); + i32Vector.set(0, 42); + + duv.setTypeId(1, i32TypeId); + duv.setOffset(1, 1); + i32Vector.set(1, 43); + + duv.setTypeId(2, f64TypeId); + duv.setOffset(2, 0); + f64Vector.set(0, 3.14); + + duv.setTypeId(3, i32TypeId); + duv.setOffset(3, 2); + i32Vector.set(2, 44); + + duv.setValueCount(4); + + assertEquals(42, duv.getObject(0)); + assertEquals(43, duv.getObject(1)); + assertEquals(3.14, duv.getObject(2)); + assertEquals(44, duv.getObject(3)); + } + } + @Test public void testTransfer() throws Exception { try (DenseUnionVector srcVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) { From 26631d7504420ff00a827d40273b589c6d38860f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 4 Apr 2024 10:29:26 +0200 Subject: [PATCH 11/49] GH-40806: [C++] Revert changes from PR #40857 (#40980) Revert changes from https://github.com/apache/arrow/pull/40857. `GetRuntimeInfo` returns the SIMD level for dynamic dispatch, but Neon currently does not participate in dynamic dispatch (actually, Neon should be available by default on all modern Arm CPUs AFAIU). 
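For illustration only (not part of this patch), a minimal sketch of how the dynamic-dispatch SIMD level is observed, assuming the public `arrow::GetRuntimeInfo()` API from `arrow/config.h`; after this revert, Arm builds report `none` here instead of `neon`:

```cpp
#include <iostream>

#include "arrow/config.h"

int main() {
  // GetRuntimeInfo() reports the SIMD level selected for dynamic dispatch;
  // Neon does not participate in dynamic dispatch, so it is not listed here.
  const arrow::RuntimeInfo info = arrow::GetRuntimeInfo();
  std::cout << "simd_level: " << info.simd_level << std::endl;
  return 0;
}
```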
Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/config.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/config.cc b/cpp/src/arrow/config.cc index 1f852e84d3d5c..9e32e5437325f 100644 --- a/cpp/src/arrow/config.cc +++ b/cpp/src/arrow/config.cc @@ -58,8 +58,6 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) { return "avx"; } else if (query_flag(CpuInfo::SSE4_2)) { return "sse4_2"; - } else if (query_flag(CpuInfo::ASIMD)) { - return "neon"; } else { return "none"; } From 640c10191a51f6d0f408c72f45dbf5d94ec0b9d7 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 4 Apr 2024 02:51:58 -0700 Subject: [PATCH 12/49] GH-40224: [C++] Fix: improve the backpressure handling in the dataset writer (#40722) ### Rationale for this change The dataset writer would fire the resume callback as soon as the underlying dataset writer's queues freed up, even if there were pending tasks. Backpressure is not applied immediately and so a few tasks will always trickle in. If backpressure is pausing and then resuming frequently this can lead to a buildup of pending tasks and uncontrolled memory growth. ### What changes are included in this PR? The resume callback is not called until all pending write tasks have completed. ### Are these changes tested? There is quite an extensive set of tests for the dataset writer already and they continue to pass. I ran them on repeat, with and without stress, and did not see any issues. However, the underlying problem (dataset writer can have uncontrolled memory growth) is still not tested as it is quite difficult to test. I was able to run the setup described in the issue to reproduce the issue. With this fix the repartitioning task completes for me. ### Are there any user-facing changes? No * GitHub Issue: #40224 Authored-by: Weston Pace Signed-off-by: Antoine Pitrou --- cpp/src/arrow/dataset/dataset_writer.cc | 36 ++++++++++++++++++++----- cpp/src/arrow/util/async_util.cc | 7 +++++ cpp/src/arrow/util/async_util.h | 3 +++ cpp/src/arrow/util/async_util_test.cc | 1 + 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_writer.cc b/cpp/src/arrow/dataset/dataset_writer.cc index 34731d19ab3eb..754386275d60c 100644 --- a/cpp/src/arrow/dataset/dataset_writer.cc +++ b/cpp/src/arrow/dataset/dataset_writer.cc @@ -515,7 +515,7 @@ class DatasetWriter::DatasetWriterImpl { std::function finish_callback, uint64_t max_rows_queued) : scheduler_(scheduler), write_tasks_(util::MakeThrottledAsyncTaskGroup( - scheduler_, 1, /*queue=*/nullptr, + scheduler_, /*max_concurrent_cost=*/1, /*queue=*/nullptr, [finish_callback = std::move(finish_callback)] { finish_callback(); return Status::OK(); @@ -541,6 +541,23 @@ class DatasetWriter::DatasetWriterImpl { } } + void ResumeIfNeeded() { + if (!paused_) { + return; + } + bool needs_resume = false; + { + std::lock_guard lg(mutex_); + if (!write_tasks_ || write_tasks_->QueueSize() == 0) { + needs_resume = true; + } + } + if (needs_resume) { + paused_ = false; + resume_callback_(); + } + } + void WriteRecordBatch(std::shared_ptr batch, const std::string& directory, const std::string& prefix) { write_tasks_->AddSimpleTask( @@ -549,11 +566,14 @@ class DatasetWriter::DatasetWriterImpl { WriteAndCheckBackpressure(std::move(batch), directory, prefix); if (!has_room.is_finished()) { // We don't have to worry about sequencing backpressure here since - // task_group_ serves as our sequencer. 
If batches continue to arrive after - // we pause they will queue up in task_group_ until we free up and call - // Resume + // task_group_ serves as our sequencer. If batches continue to arrive + // after we pause they will queue up in task_group_ until we free up and + // call Resume pause_callback_(); - return has_room.Then([this] { resume_callback_(); }); + paused_ = true; + return has_room.Then([this] { ResumeIfNeeded(); }); + } else { + ResumeIfNeeded(); } return has_room; }, @@ -571,6 +591,9 @@ class DatasetWriter::DatasetWriterImpl { return Future<>::MakeFinished(); }, "DatasetWriter::FinishAll"sv); + // Reset write_tasks_ to signal that we are done adding tasks, this will allow + // us to invoke the finish callback once the tasks wrap up. + std::lock_guard lg(mutex_); write_tasks_.reset(); } @@ -660,7 +683,7 @@ class DatasetWriter::DatasetWriterImpl { } util::AsyncTaskScheduler* scheduler_ = nullptr; - std::unique_ptr write_tasks_; + std::unique_ptr write_tasks_; Future<> finish_fut_ = Future<>::Make(); FileSystemDatasetWriteOptions write_options_; DatasetWriterState writer_state_; @@ -670,6 +693,7 @@ class DatasetWriter::DatasetWriterImpl { std::unordered_map> directory_queues_; std::mutex mutex_; + bool paused_ = false; Status err_; }; diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc index 63e27bfbe5773..fbd45eadac2cd 100644 --- a/cpp/src/arrow/util/async_util.cc +++ b/cpp/src/arrow/util/async_util.cc @@ -118,6 +118,8 @@ class FifoQueue : public ThrottledAsyncTaskScheduler::Queue { void Purge() override { tasks_.clear(); } + std::size_t Size() const override { return tasks_.size(); } + private: std::list> tasks_; }; @@ -332,6 +334,10 @@ class ThrottledAsyncTaskSchedulerImpl void Pause() override { throttle_->Pause(); } void Resume() override { throttle_->Resume(); } + std::size_t QueueSize() override { + std::lock_guard lk(mutex_); + return queue_->Size(); + } const util::tracing::Span& span() const override { return target_->span(); } private: @@ -499,6 +505,7 @@ class ThrottledAsyncTaskGroup : public ThrottledAsyncTaskScheduler { : throttle_(std::move(throttle)), task_group_(std::move(task_group)) {} void Pause() override { throttle_->Pause(); } void Resume() override { throttle_->Resume(); } + std::size_t QueueSize() override { return throttle_->QueueSize(); } const util::tracing::Span& span() const override { return task_group_->span(); } bool AddTask(std::unique_ptr task) override { return task_group_->AddTask(std::move(task)); diff --git a/cpp/src/arrow/util/async_util.h b/cpp/src/arrow/util/async_util.h index 7a675da59facd..d9ed63bdbce22 100644 --- a/cpp/src/arrow/util/async_util.h +++ b/cpp/src/arrow/util/async_util.h @@ -226,6 +226,7 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { virtual bool Empty() = 0; /// Purge the queue of all items virtual void Purge() = 0; + virtual std::size_t Size() const = 0; }; class Throttle { @@ -277,6 +278,8 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { /// Allows task to be submitted again. If there is a max_concurrent_cost limit then /// it will still apply. 
virtual void Resume() = 0; + /// Return the number of tasks queued but not yet submitted + virtual std::size_t QueueSize() = 0; /// Create a throttled view of a scheduler /// diff --git a/cpp/src/arrow/util/async_util_test.cc b/cpp/src/arrow/util/async_util_test.cc index 313ca91912335..1f9aad453e9c4 100644 --- a/cpp/src/arrow/util/async_util_test.cc +++ b/cpp/src/arrow/util/async_util_test.cc @@ -680,6 +680,7 @@ class PriorityQueue : public ThrottledAsyncTaskScheduler::Queue { queue_.pop(); } } + std::size_t Size() const { return queue_.size(); } private: std::priority_queue, From ad6758900da1706d3cbfd59e5fe7d1d548c4235b Mon Sep 17 00:00:00 2001 From: James Henderson Date: Thu, 4 Apr 2024 12:34:40 +0100 Subject: [PATCH 13/49] MINOR: [Java] `DenseUnionVector.empty` should create not-nullable DUVs (#41001) ### Rationale for this change DUVs do not have a validity vector, so cannot be set to null - `isNull`, for example, always returns false. This change ensures vectors created through `DenseUnionVector.empty` reflect this in their FieldType. ### What changes are included in this PR? `DenseUnionVector.empty` now creates DUVs with a not-nullable `FieldType` ### Are these changes tested? I haven't added an explicit test for this as it would essentially be testing `FieldType.notNullable` - confirmed that it doesn't break any existing tests. ### Are there any user-facing changes? Yes, strictly speaking this is a public change, correcting a bug in a public API. **This PR includes breaking changes to public APIs.** Authored-by: James Henderson Signed-off-by: David Li --- java/vector/src/main/codegen/templates/DenseUnionVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index 8edd167152d7f..27fd8e9798b67 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -124,7 +124,7 @@ public class DenseUnionVector extends AbstractContainerVector implements FieldVe ArrowType.Struct.INSTANCE, /*dictionary*/ null, /*metadata*/ null); public static DenseUnionVector empty(String name, BufferAllocator allocator) { - FieldType fieldType = FieldType.nullable(new ArrowType.Union( + FieldType fieldType = FieldType.notNullable(new ArrowType.Union( UnionMode.Dense, null)); return new DenseUnionVector(name, allocator, fieldType, null); } From bbeeb33a2fb65f40caf6c3176ee377de2b9de6e5 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 4 Apr 2024 14:38:16 +0200 Subject: [PATCH 14/49] GH-40974: [CI][Python] CI failures on Python builds due to pytest_cython (#40975) ### Rationale for this change We are seeing sporadic CI failures on Python builds due to `pytest_cython`. ### What changes are included in this PR? Upper pin of 0.3.0 added for `pytest-cython`. ### Are these changes tested? With a green CI. ### Are there any user-facing changes? No. 
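As an aside, the same constraint can be written as an explicit upper bound; a hypothetical pip-style equivalent of the conda pin below (the committed change uses the exact `==0.2.2` pin instead):

```text
# hypothetical equivalent upper-bound pin; remove once a fixed 0.3.* ships
pytest-cython>=0.2.2,<0.3.0
```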
* GitHub Issue: #40974 Authored-by: AlenkaF Signed-off-by: Sutou Kouhei --- ci/conda_env_sphinx.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 6899f9c36a7f6..0a356d5722c42 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -29,5 +29,8 @@ sphinx-copybutton sphinxcontrib-jquery sphinx==6.2 # Requirement for doctest-cython -pytest-cython +# Needs upper pin of 0.3.0, see: +# https://github.com/lgpage/pytest-cython/issues/67 +# With 0.3.* bug fix release, the pin can be removed +pytest-cython==0.2.2 pandas From b99b00dd66586cf54b04ce6a51eb1cf68b1510a3 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 4 Apr 2024 21:35:00 +0800 Subject: [PATCH 15/49] GH-40994: [C++][Parquet] RleBooleanDecoder supports DecodeArrow with nulls (#40995) ### Rationale for this change Supports DecodeArrow with nulls in RleBooleanDecoder ### What changes are included in this PR? Supports DecodeArrow with nulls in RleBooleanDecoder ### Are these changes tested? Yes ### Are there any user-facing changes? currently not * GitHub Issue: #40994 Lead-authored-by: mwish Co-authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 63 ++++++++++++++++++++------- cpp/src/parquet/encoding_benchmark.cc | 9 ++-- cpp/src/parquet/encoding_test.cc | 46 +++++++++---------- 3 files changed, 74 insertions(+), 44 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f16e9b34fc682..6e93b493392c9 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3143,27 +3143,58 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out) override { - if (null_count != 0) { - // TODO(ARROW-34660): implement DecodeArrow with null slots. - ParquetException::NYI("RleBoolean DecodeArrow with null slots"); + if (null_count == num_values) { + PARQUET_THROW_NOT_OK(out->AppendNulls(null_count)); + return 0; } constexpr int kBatchSize = 1024; std::array values; - int sum_decode_count = 0; - do { - int current_batch = std::min(kBatchSize, num_values); - int decoded_count = decoder_->GetBatch(values.data(), current_batch); - if (decoded_count == 0) { - break; + const int num_non_null_values = num_values - null_count; + // Remaining non-null boolean values to read from decoder. + // We decode from `decoder_` with maximum 1024 size batches. + int num_remain_non_null_values = num_non_null_values; + int current_index_in_batch = 0; + int current_batch_size = 0; + auto next_boolean_batch = [&]() { + DCHECK_GT(num_remain_non_null_values, 0); + DCHECK_EQ(current_index_in_batch, current_batch_size); + current_batch_size = std::min(num_remain_non_null_values, kBatchSize); + int decoded_count = decoder_->GetBatch(values.data(), current_batch_size); + if (ARROW_PREDICT_FALSE(decoded_count != current_batch_size)) { + // required values is more than values in decoder. + ParquetException::EofException(); } - sum_decode_count += decoded_count; - PARQUET_THROW_NOT_OK(out->Reserve(sum_decode_count)); - for (int i = 0; i < decoded_count; ++i) { - PARQUET_THROW_NOT_OK(out->Append(values[i])); + num_remain_non_null_values -= current_batch_size; + current_index_in_batch = 0; + }; + + // Reserve all values including nulls first + PARQUET_THROW_NOT_OK(out->Reserve(num_values)); + if (null_count == 0) { + // Fast-path for not having nulls. 
+ do { + next_boolean_batch(); + PARQUET_THROW_NOT_OK( + out->AppendValues(values.begin(), values.begin() + current_batch_size)); + num_values -= current_batch_size; + current_index_in_batch = 0; + } while (num_values > 0); + return num_non_null_values; + } + auto next_value = [&]() -> bool { + if (current_index_in_batch == current_batch_size) { + next_boolean_batch(); + DCHECK_GT(current_batch_size, 0); } - num_values -= decoded_count; - } while (num_values > 0); - return sum_decode_count; + DCHECK_LT(current_index_in_batch, current_batch_size); + bool value = values[current_index_in_batch]; + ++current_index_in_batch; + return value; + }; + VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { out->UnsafeAppend(next_value()); }, [&]() { out->UnsafeAppendNull(); }); + return num_non_null_values; } int DecodeArrow( diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 9c07d262b350e..a858c53e931d8 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -1518,11 +1518,10 @@ BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) (benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); } BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) ->Range(MIN_RANGE, MAX_RANGE); -// TODO(mwish): RleBoolean not implemented DecodeArrow with null slots yet. -// BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) -//(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } -// BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) -// ->Apply(BooleanWithNullCustomArguments); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) +(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) + ->Apply(BooleanWithNullCustomArguments); BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrow) (benchmark::State& state) { DecodeArrowDenseBenchmark(state); } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index ea0029f4c7d7f..bb5126ce251d4 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -602,7 +602,7 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { // Check that one can put several Arrow arrays into a given encoder // and decode to the right values (see GH-36939) -TEST(PlainBooleanArrayEncoding, AdHocRoundTrip) { +TEST(BooleanArrayEncoding, AdHocRoundTrip) { std::vector> arrays{ ::arrow::ArrayFromJSON(::arrow::boolean(), R"([])"), ::arrow::ArrayFromJSON(::arrow::boolean(), R"([false, null, true])"), @@ -610,27 +610,29 @@ TEST(PlainBooleanArrayEncoding, AdHocRoundTrip) { ::arrow::ArrayFromJSON(::arrow::boolean(), R"([true, null, false])"), }; - auto encoder = MakeTypedEncoder(Encoding::PLAIN, - /*use_dictionary=*/false); - for (const auto& array : arrays) { - encoder->Put(*array); - } - auto buffer = encoder->FlushValues(); - auto decoder = MakeTypedDecoder(Encoding::PLAIN); - EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays)); - decoder->SetData(static_cast(expected->length()), buffer->data(), - static_cast(buffer->size())); - - ::arrow::BooleanBuilder builder; - ASSERT_EQ(static_cast(expected->length() - expected->null_count()), - decoder->DecodeArrow(static_cast(expected->length()), - static_cast(expected->null_count()), - expected->null_bitmap_data(), 0, &builder)); + for (auto encoding : {Encoding::PLAIN, Encoding::RLE}) { + auto 
encoder = MakeTypedEncoder<BooleanType>(encoding,
+                                                 /*use_dictionary=*/false);
+    for (const auto& array : arrays) {
+      encoder->Put(*array);
+    }
+    auto buffer = encoder->FlushValues();
+    auto decoder = MakeTypedDecoder<BooleanType>(encoding);
+    EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays));
+    decoder->SetData(static_cast<int>(expected->length()), buffer->data(),
+                     static_cast<int>(buffer->size()));
+
+    ::arrow::BooleanBuilder builder;
+    ASSERT_EQ(static_cast<int>(expected->length() - expected->null_count()),
+              decoder->DecodeArrow(static_cast<int>(expected->length()),
+                                   static_cast<int>(expected->null_count()),
+                                   expected->null_bitmap_data(), 0, &builder));

-  std::shared_ptr<::arrow::Array> result;
-  ASSERT_OK(builder.Finish(&result));
-  ASSERT_EQ(expected->length(), result->length());
-  ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true);
+    std::shared_ptr<::arrow::Array> result;
+    ASSERT_OK(builder.Finish(&result));
+    ASSERT_EQ(expected->length(), result->length());
+    ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true);
+  }
 }

 template
@@ -963,8 +965,6 @@ TYPED_TEST(EncodingAdHocTyped, ByteStreamSplitArrowDirectPut) {
 }

 TYPED_TEST(EncodingAdHocTyped, RleArrowDirectPut) {
-  // TODO: test with nulls once RleBooleanDecoder::DecodeArrow supports them
-  this->null_probability_ = 0;
   for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) {
     this->Rle(seed);
   }

From 72d20ad719021c5513620e23a0a65fb724f0e299 Mon Sep 17 00:00:00 2001
From: Clif Houck
Date: Thu, 4 Apr 2024 08:59:30 -0500
Subject: [PATCH 16/49] GH-20213: [C++] Implement cast to/from halffloat (#40067)

### Rationale for this change

### What changes are included in this PR?

This PR implements casting to and from float16 types using the vendored float16 library included in arrow at `cpp/arrow/util/float16.*`. (A short usage sketch of these helpers follows the TODO list below.)

### Are these changes tested?

Unit tests are included in this PR.

### Are there any user-facing changes?

Yes, in that casts to and from float16 will now work.

* Closes: #20213

### TODO
- [x] Add casts to/from float64.
- [x] String <-> float16 casts.
- [x] Integer <-> float16 casts.
- [x] Tests.
- [x] Update https://github.com/apache/arrow/blob/main/docs/source/status.rst about half float.
- [x] Rebase.
- [x] Run clang format over this PR.
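As referenced above, a short usage sketch of the vendored `Float16` helpers that the new cast kernels build on; this is an illustration rather than code from this PR, but every call used here (`Float16(float)`, `bits()`, `FromBits()`, `ToFloat()`) appears in the kernels added by this change:

```cpp
#include <cstdint>
#include <iostream>

#include "arrow/util/float16.h"

int main() {
  using arrow::util::Float16;
  // Round-trip a float through the half-float storage representation,
  // mirroring what the new cast kernels do element by element.
  const float value = 1.5f;                              // exactly representable
  const uint16_t bits = Float16(value).bits();           // float -> half bits
  const float back = Float16::FromBits(bits).ToFloat();  // half bits -> float
  std::cout << back << std::endl;                        // prints 1.5
  return 0;
}
```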
* GitHub Issue: #20213 Authored-by: Clif Houck Signed-off-by: Sutou Kouhei --- c_glib/test/test-half-float-scalar.rb | 2 +- cpp/src/arrow/compare.cc | 30 +++++ .../compute/kernels/scalar_cast_internal.cc | 70 ++++++++++++ .../compute/kernels/scalar_cast_numeric.cc | 103 +++++++++++++++--- .../compute/kernels/scalar_cast_string.cc | 4 + .../arrow/compute/kernels/scalar_cast_test.cc | 25 +++-- cpp/src/arrow/ipc/json_simple.cc | 32 +++++- cpp/src/arrow/ipc/json_simple_test.cc | 35 +++++- cpp/src/arrow/record_batch_test.cc | 3 + cpp/src/arrow/type_traits.h | 1 + cpp/src/arrow/util/formatting.cc | 11 ++ cpp/src/arrow/util/formatting.h | 7 ++ cpp/src/arrow/util/value_parsing.cc | 14 +++ cpp/src/arrow/util/value_parsing.h | 17 +++ docs/source/status.rst | 11 +- 15 files changed, 325 insertions(+), 40 deletions(-) diff --git a/c_glib/test/test-half-float-scalar.rb b/c_glib/test/test-half-float-scalar.rb index ac41f91ece621..3073d84d796cf 100644 --- a/c_glib/test/test-half-float-scalar.rb +++ b/c_glib/test/test-half-float-scalar.rb @@ -41,7 +41,7 @@ def test_equal end def test_to_s - assert_equal("[\n #{@half_float}\n]", @scalar.to_s) + assert_equal("1.0009765625", @scalar.to_s) end def test_value diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index bb632e2eb912d..e983b47e39dc4 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -44,6 +44,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -59,6 +60,7 @@ using internal::BitmapReader; using internal::BitmapUInt64Reader; using internal::checked_cast; using internal::OptionalBitmapEquals; +using util::Float16; // ---------------------------------------------------------------------- // Public method implementations @@ -95,6 +97,30 @@ struct FloatingEquality { const T epsilon; }; +// For half-float equality. 
+template +struct FloatingEquality { + explicit FloatingEquality(const EqualOptions& options) + : epsilon(static_cast(options.atol())) {} + + bool operator()(uint16_t x, uint16_t y) const { + Float16 f_x = Float16::FromBits(x); + Float16 f_y = Float16::FromBits(y); + if (x == y) { + return Flags::signed_zeros_equal || (f_x.signbit() == f_y.signbit()); + } + if (Flags::nans_equal && f_x.is_nan() && f_y.is_nan()) { + return true; + } + if (Flags::approximate && (fabs(f_x.ToFloat() - f_y.ToFloat()) <= epsilon)) { + return true; + } + return false; + } + + const float epsilon; +}; + template struct FloatingEqualityDispatcher { const EqualOptions& options; @@ -259,6 +285,8 @@ class RangeDataEqualsImpl { Status Visit(const DoubleType& type) { return CompareFloating(type); } + Status Visit(const HalfFloatType& type) { return CompareFloating(type); } + // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } @@ -863,6 +891,8 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } + Status Visit(const HalfFloatScalar& left) { return CompareFloating(left); } + template enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc index 8cf5a04addb00..d8c4088759643 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc @@ -19,10 +19,13 @@ #include "arrow/compute/cast_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/extension_type.h" +#include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" namespace arrow { +using arrow::util::Float16; using internal::checked_cast; using internal::PrimitiveScalarBase; @@ -47,6 +50,42 @@ struct CastPrimitive { } }; +// Converting floating types to half float. +template +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + using InT = typename InType::c_type; + const InT* in_values = arr.GetValues(1); + uint16_t* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = Float16(*in_values++).bits(); + } + } +}; + +// Converting from half float to other floating types. 
+template <> +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + const uint16_t* in_values = arr.GetValues(1); + float* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = Float16::FromBits(*in_values++).ToFloat(); + } + } +}; + +template <> +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + const uint16_t* in_values = arr.GetValues(1); + double* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = Float16::FromBits(*in_values++).ToDouble(); + } + } +}; + template struct CastPrimitive::value>> { // memcpy output @@ -56,6 +95,33 @@ struct CastPrimitive: } }; +// Cast int to half float +template +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + using InT = typename InType::c_type; + const InT* in_values = arr.GetValues(1); + uint16_t* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + float temp = static_cast(*in_values++); + *out_values++ = Float16(temp).bits(); + } + } +}; + +// Cast half float to int +template +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + using OutT = typename OutType::c_type; + const uint16_t* in_values = arr.GetValues(1); + OutT* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = static_cast(Float16::FromBits(*in_values++).ToFloat()); + } + } +}; + template void CastNumberImpl(Type::type out_type, const ArraySpan& input, ArraySpan* out) { switch (out_type) { @@ -79,6 +145,8 @@ void CastNumberImpl(Type::type out_type, const ArraySpan& input, ArraySpan* out) return CastPrimitive::Exec(input, out); case Type::DOUBLE: return CastPrimitive::Exec(input, out); + case Type::HALF_FLOAT: + return CastPrimitive::Exec(input, out); default: break; } @@ -109,6 +177,8 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, return CastNumberImpl(out_type, input, out); case Type::DOUBLE: return CastNumberImpl(out_type, input, out); + case Type::HALF_FLOAT: + return CastNumberImpl(out_type, input, out); default: DCHECK(false); break; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index b054e57f04d12..3df86e7d6936c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -23,6 +23,7 @@ #include "arrow/compute/kernels/util_internal.h" #include "arrow/scalar.h" #include "arrow/util/bit_block_counter.h" +#include "arrow/util/float16.h" #include "arrow/util/int_util.h" #include "arrow/util/value_parsing.h" @@ -34,6 +35,7 @@ using internal::IntegersCanFit; using internal::OptionalBitBlockCounter; using internal::ParseValue; using internal::PrimitiveScalarBase; +using util::Float16; namespace compute { namespace internal { @@ -56,18 +58,37 @@ Status CastFloatingToFloating(KernelContext*, const ExecSpan& batch, ExecResult* // ---------------------------------------------------------------------- // Implement fast safe floating point to integer cast +// +template +struct WasTruncated { + static bool Check(OutT out_val, InT in_val) { + return static_cast(out_val) != in_val; + } + + static bool CheckMaybeNull(OutT out_val, InT in_val, bool is_valid) { + return is_valid && static_cast(out_val) != in_val; + } +}; + +// Half float to int +template +struct WasTruncated { + using OutT = typename OutType::c_type; + static bool Check(OutT out_val, 
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
index b054e57f04d12..3df86e7d6936c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
@@ -23,6 +23,7 @@
 #include "arrow/compute/kernels/util_internal.h"
 #include "arrow/scalar.h"
 #include "arrow/util/bit_block_counter.h"
+#include "arrow/util/float16.h"
 #include "arrow/util/int_util.h"
 #include "arrow/util/value_parsing.h"
 
@@ -34,6 +35,7 @@ using internal::IntegersCanFit;
 using internal::OptionalBitBlockCounter;
 using internal::ParseValue;
 using internal::PrimitiveScalarBase;
+using util::Float16;
 
 namespace compute {
 namespace internal {
@@ -56,18 +58,37 @@ Status CastFloatingToFloating(KernelContext*, const ExecSpan& batch, ExecResult*
 
 // ----------------------------------------------------------------------
 // Implement fast safe floating point to integer cast
+//
+template <typename InType, typename OutType, typename InT = typename InType::c_type,
+          typename OutT = typename OutType::c_type>
+struct WasTruncated {
+  static bool Check(OutT out_val, InT in_val) {
+    return static_cast<InT>(out_val) != in_val;
+  }
+
+  static bool CheckMaybeNull(OutT out_val, InT in_val, bool is_valid) {
+    return is_valid && static_cast<InT>(out_val) != in_val;
+  }
+};
+
+// Half float to int
+template <typename OutType>
+struct WasTruncated<HalfFloatType, OutType> {
+  using OutT = typename OutType::c_type;
+  static bool Check(OutT out_val, uint16_t in_val) {
+    return static_cast<float>(out_val) != Float16::FromBits(in_val).ToFloat();
+  }
+
+  static bool CheckMaybeNull(OutT out_val, uint16_t in_val, bool is_valid) {
+    return is_valid && static_cast<float>(out_val) != Float16::FromBits(in_val).ToFloat();
+  }
+};
 
 // InType is a floating point type we are planning to cast to integer
 template <typename InType, typename OutType>
 ARROW_DISABLE_UBSAN("float-cast-overflow")
 Status CheckFloatTruncation(const ArraySpan& input, const ArraySpan& output) {
-  auto WasTruncated = [&](OutT out_val, InT in_val) -> bool {
-    return static_cast<InT>(out_val) != in_val;
-  };
-  auto WasTruncatedMaybeNull = [&](OutT out_val, InT in_val, bool is_valid) -> bool {
-    return is_valid && static_cast<InT>(out_val) != in_val;
-  };
   auto GetErrorMessage = [&](InT val) {
     return Status::Invalid("Float value ", val, " was truncated converting to ",
                            *output.type);
@@ -86,26 +107,28 @@ Status CheckFloatTruncation(const ArraySpan& input, const ArraySpan& output) {
     if (block.popcount == block.length) {
       // Fast path: branchless
       for (int64_t i = 0; i < block.length; ++i) {
-        block_out_of_bounds |= WasTruncated(out_data[i], in_data[i]);
+        block_out_of_bounds |=
+            WasTruncated<InType, OutType>::Check(out_data[i], in_data[i]);
       }
     } else if (block.popcount > 0) {
       // Indices have nulls, must only boundscheck non-null values
       for (int64_t i = 0; i < block.length; ++i) {
-        block_out_of_bounds |= WasTruncatedMaybeNull(
+        block_out_of_bounds |= WasTruncated<InType, OutType>::CheckMaybeNull(
             out_data[i], in_data[i], bit_util::GetBit(bitmap, offset_position + i));
       }
     }
     if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
       if (input.GetNullCount() > 0) {
         for (int64_t i = 0; i < block.length; ++i) {
-          if (WasTruncatedMaybeNull(out_data[i], in_data[i],
-                                    bit_util::GetBit(bitmap, offset_position + i))) {
+          if (WasTruncated<InType, OutType>::CheckMaybeNull(
+                  out_data[i], in_data[i],
+                  bit_util::GetBit(bitmap, offset_position + i))) {
             return GetErrorMessage(in_data[i]);
           }
         }
       } else {
         for (int64_t i = 0; i < block.length; ++i) {
-          if (WasTruncated(out_data[i], in_data[i])) {
+          if (WasTruncated<InType, OutType>::Check(out_data[i], in_data[i])) {
             return GetErrorMessage(in_data[i]);
           }
         }
@@ -151,6 +174,9 @@ Status CheckFloatToIntTruncation(const ExecValue& input, const ExecResult& output) {
       return CheckFloatToIntTruncationImpl<FloatType>(input.array, *output.array_span());
     case Type::DOUBLE:
       return CheckFloatToIntTruncationImpl<DoubleType>(input.array, *output.array_span());
+    case Type::HALF_FLOAT:
+      return CheckFloatToIntTruncationImpl<HalfFloatType>(input.array,
+                                                          *output.array_span());
     default:
       break;
   }
@@ -293,6 +319,15 @@ struct CastFunctor<
   }
 };
 
+template <typename I>
+struct CastFunctor<HalfFloatType, I, enable_if_t<is_base_binary_type<I>::value>> {
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    return applicator::ScalarUnaryNotNull<HalfFloatType, I,
+                                          ParseString<HalfFloatType>>::Exec(ctx, batch,
+                                                                            out);
+  }
+};
+
 // ----------------------------------------------------------------------
 // Decimal to integer
 
@@ -689,6 +724,10 @@ std::shared_ptr<CastFunction> GetCastToInteger(std::string name) {
     DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastFloatingToInteger));
   }
 
+  // Cast from half-float
+  DCHECK_OK(func->AddKernel(Type::HALF_FLOAT, {InputType(Type::HALF_FLOAT)}, out_ty,
+                            CastFloatingToInteger));
+
   // From other numbers to integer
   AddCommonNumberCasts(out_ty, func.get());
 
@@ -715,6 +754,10 @@ std::shared_ptr<CastFunction> GetCastToFloating(std::string name) {
     DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastFloatingToFloating));
   }
 
+  // From half-float to float/double
+  DCHECK_OK(func->AddKernel(Type::HALF_FLOAT, {InputType(Type::HALF_FLOAT)}, out_ty,
+                            CastFloatingToFloating));
+
   // From other numbers to floating point
   AddCommonNumberCasts(out_ty, func.get());
 
@@ -723,6 +766,7 @@ std::shared_ptr<CastFunction> GetCastToFloating(std::string name) {
                             CastFunctor<OutType, Decimal128Type>::Exec));
   DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
                             CastFunctor<OutType, Decimal256Type>::Exec));
+
   return func;
 }
 
@@ -795,6 +839,32 @@ std::shared_ptr<CastFunction> GetCastToDecimal256() {
   return func;
 }
 
+std::shared_ptr<CastFunction> GetCastToHalfFloat() {
+  // HalfFloat is a bit brain-damaged for now
+  auto func = std::make_shared<CastFunction>("func", Type::HALF_FLOAT);
+  AddCommonCasts(Type::HALF_FLOAT, float16(), func.get());
+
+  // Casts from integer to floating point
+  for (const std::shared_ptr<DataType>& in_ty : IntTypes()) {
+    DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty},
+                              TypeTraits<HalfFloatType>::type_singleton(),
+                              CastIntegerToFloating));
+  }
+
+  // Cast from other strings to half float.
+  for (const std::shared_ptr<DataType>& in_ty : BaseBinaryTypes()) {
+    auto exec = GenerateVarBinaryBase<CastFunctor, HalfFloatType>(*in_ty);
+    DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty},
+                              TypeTraits<HalfFloatType>::type_singleton(), exec));
+  }
+
+  DCHECK_OK(func->AddKernel(Type::FLOAT, {InputType(Type::FLOAT)}, float16(),
+                            CastFloatingToFloating));
+  DCHECK_OK(func->AddKernel(Type::DOUBLE, {InputType(Type::DOUBLE)}, float16(),
+                            CastFloatingToFloating));
+  return func;
+}
+
 }  // namespace
 
 std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
@@ -830,13 +900,14 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
   functions.push_back(GetCastToInteger<UInt64Type>("cast_uint64"));
 
   // HalfFloat is a bit brain-damaged for now
-  auto cast_half_float =
-      std::make_shared<CastFunction>("cast_half_float", Type::HALF_FLOAT);
-  AddCommonCasts(Type::HALF_FLOAT, float16(), cast_half_float.get());
+  auto cast_half_float = GetCastToHalfFloat();
   functions.push_back(cast_half_float);
 
-  functions.push_back(GetCastToFloating<FloatType>("cast_float"));
-  functions.push_back(GetCastToFloating<DoubleType>("cast_double"));
+  auto cast_float = GetCastToFloating<FloatType>("cast_float");
+  functions.push_back(cast_float);
+
+  auto cast_double = GetCastToFloating<DoubleType>("cast_double");
+  functions.push_back(cast_double);
 
   functions.push_back(GetCastToDecimal128());
   functions.push_back(GetCastToDecimal256());
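Worth noting why WasTruncated became a struct: the generic lambda-based check cast the integer result back to the input type and compared, which is meaningless for a uint16_t bit pattern, so the half-float specialization compares in float space instead. A small illustration of the check it performs (hypothetical helper name and values, not from the patch):

    #include <cstdint>
    #include "arrow/util/float16.h"

    // Returns true when converting the half float to int lost information,
    // e.g. the bits of Float16(1.5f) -> out_val 1: static_cast<float>(1) != 1.5f.
    bool HalfToIntWasTruncated(int32_t out_val, uint16_t in_bits) {
      return static_cast<float>(out_val) !=
             arrow::util::Float16::FromBits(in_bits).ToFloat();
    }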
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
index a6576e4e4c26f..3a8352a9b870f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
@@ -437,6 +437,10 @@ void AddNumberToStringCasts(CastFunction* func) {
                               GenerateNumeric<NumericToStringCastFunctor, OutType>(*in_ty),
                               NullHandling::COMPUTED_NO_PREALLOCATE));
   }
+
+  DCHECK_OK(func->AddKernel(Type::HALF_FLOAT, {float16()}, out_ty,
+                            NumericToStringCastFunctor<OutType, HalfFloatType>::Exec,
+                            NullHandling::COMPUTED_NO_PREALLOCATE));
 }
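The string path needs no new formatting logic here because the kernel decodes to float and defers to the shared float formatter (the FormatFloat(uint16_t, ...) overload added to formatting.cc later in this patch). A rough sketch of the equivalent decode-then-format step, using std::to_string as a stand-in for Arrow's shortest-round-trip formatter (the helper name is hypothetical):

    #include <cstdint>
    #include <string>
    #include "arrow/util/float16.h"

    // Stand-in only: Arrow emits shortest-round-trip digits ("1.5", "inf", "nan"),
    // while std::to_string prints fixed notation.
    std::string HalfBitsToStringSketch(uint16_t bits) {
      return std::to_string(arrow::util::Float16::FromBits(bits).ToFloat());
    }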
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index a8acf68f66c8b..af62b4da2caa5 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -389,7 +389,7 @@ TEST(Cast, ToIntDowncastUnsafe) {
 }
 
 TEST(Cast, FloatingToInt) {
-  for (auto from : {float32(), float64()}) {
+  for (auto from : {float16(), float32(), float64()}) {
     for (auto to : {int32(), int64()}) {
       // float to int no truncation
       CheckCast(ArrayFromJSON(from, "[1.0, null, 0.0, -1.0, 5.0]"),
@@ -407,6 +407,15 @@ TEST(Cast, FloatingToInt) {
   }
 }
 
+TEST(Cast, FloatingToFloating) {
+  for (auto from : {float16(), float32(), float64()}) {
+    for (auto to : {float16(), float32(), float64()}) {
+      CheckCast(ArrayFromJSON(from, "[1.0, 0.0, -1.0, 5.0]"),
+                ArrayFromJSON(to, "[1.0, 0.0, -1.0, 5.0]"));
+    }
+  }
+}
+
 TEST(Cast, IntToFloating) {
   for (auto from : {uint32(), int32()}) {
     std::string two_24 = "[16777216, 16777217]";
@@ -2220,14 +2229,12 @@ TEST(Cast, IntToString) {
 }
 
 TEST(Cast, FloatingToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
-    CheckCast(
-        ArrayFromJSON(float32(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"),
-        ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])"));
-
-    CheckCast(
-        ArrayFromJSON(float64(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"),
-        ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])"));
+  for (auto float_type : {float16(), float32(), float64()}) {
+    for (auto string_type : {utf8(), large_utf8()}) {
+      CheckCast(ArrayFromJSON(float_type, "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"),
+                ArrayFromJSON(string_type,
+                              R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])"));
+    }
   }
 }
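For an end-to-end view of what these tests exercise, a minimal caller-side sketch using the public compute API (error handling elided; assumes the standard arrow::compute::Cast entry point, and the function name is hypothetical):

    #include "arrow/api.h"
    #include "arrow/compute/api.h"

    // Casts a float16() array to int32(), which this patch enables; values
    // round-trip through float, and truncation is reported as an error unless
    // the caller passes CastOptions::Unsafe().
    arrow::Result<arrow::Datum> HalfToInt32(const std::shared_ptr<arrow::Array>& half) {
      return arrow::compute::Cast(half, arrow::int32());
    }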
diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc
index ceeabe01677ed..9fd449831c980 100644
--- a/cpp/src/arrow/ipc/json_simple.cc
+++ b/cpp/src/arrow/ipc/json_simple.cc
@@ -36,6 +36,7 @@
 #include "arrow/type_traits.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/float16.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/value_parsing.h"
 
@@ -52,6 +53,7 @@ namespace rj = arrow::rapidjson;
 namespace arrow {
 
 using internal::ParseValue;
+using util::Float16;
 
 namespace ipc {
 namespace internal {
@@ -232,9 +234,9 @@ enable_if_physical_signed_integer<T, Status> ConvertNumber(const rj::Value& json
 
 // Convert single unsigned integer value
 template <typename T>
-enable_if_physical_unsigned_integer<T, Status> ConvertNumber(const rj::Value& json_obj,
-                                                             const DataType& type,
-                                                             typename T::c_type* out) {
+enable_if_unsigned_integer<T, Status> ConvertNumber(const rj::Value& json_obj,
+                                                    const DataType& type,
+                                                    typename T::c_type* out) {
   if (json_obj.IsUint64()) {
     uint64_t v64 = json_obj.GetUint64();
     *out = static_cast<typename T::c_type>(v64);
@@ -249,6 +251,30 @@ enable_if_unsigned_integer<T, Status> ConvertNumber(const rj::Value& js
   }
 }
 
+// Convert float16/HalfFloatType
+template <typename T>
+enable_if_half_float<T, Status> ConvertNumber(const rj::Value& json_obj,
+                                              const DataType& type, uint16_t* out) {
+  if (json_obj.IsDouble()) {
+    double f64 = json_obj.GetDouble();
+    *out = Float16(f64).bits();
+    return Status::OK();
+  } else if (json_obj.IsUint()) {
+    uint32_t u32t = json_obj.GetUint();
+    double f64 = static_cast<double>(u32t);
+    *out = Float16(f64).bits();
+    return Status::OK();
+  } else if (json_obj.IsInt()) {
+    int32_t i32t = json_obj.GetInt();
+    double f64 = static_cast<double>(i32t);
+    *out = Float16(f64).bits();
+    return Status::OK();
+  } else {
+    *out = static_cast<uint16_t>(0);
+    return JSONTypeError("half float", json_obj.GetType());
+  }
+}
+
 // Convert single floating point value
 template <typename T>
 enable_if_physical_floating_point<T, Status> ConvertNumber(const rj::Value& json_obj,
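All three accepting branches above funnel through double before narrowing, so integer and floating JSON literals take the same path. The equivalent scalar step, assuming only the Float16(double) constructor already used in the hunk (the helper name is hypothetical):

    #include <cstdint>
    #include "arrow/util/float16.h"

    // A JSON 5, 5.0, or -5 all reach this point as a double and produce the
    // same half-precision bit pattern.
    uint16_t JsonNumberToHalfBits(double f64) {
      return arrow::util::Float16(f64).bits();
    }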
diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc
index ea3a9ae1a14a9..b3f7fc5b3458b 100644
--- a/cpp/src/arrow/ipc/json_simple_test.cc
+++ b/cpp/src/arrow/ipc/json_simple_test.cc
@@ -44,6 +44,7 @@
 #include "arrow/util/bitmap_builders.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/float16.h"
 
 #if defined(_MSC_VER)
 // "warning C4307: '+': integral constant overflow"
@@ -51,6 +52,9 @@
 #endif
 
 namespace arrow {
+
+using util::Float16;
+
 namespace ipc {
 namespace internal {
 namespace json {
@@ -185,6 +189,21 @@ class TestIntegers : public ::testing::Test {
 
 TYPED_TEST_SUITE_P(TestIntegers);
 
+template <typename CType>
+std::vector<CType> TestIntegersMutateIfNeeded(std::vector<CType> data) {
+  return data;
+}
+
+// TODO: This works, but is it the right way to do this?
+template <>
+std::vector<HalfFloatType::c_type> TestIntegersMutateIfNeeded(
+    std::vector<HalfFloatType::c_type> data) {
+  std::for_each(data.begin(), data.end(),
+                [](HalfFloatType::c_type& value) { value = Float16(value).bits(); });
+  return data;
+}
+
 TYPED_TEST_P(TestIntegers, Basics) {
   using T = TypeParam;
   using c_type = typename T::c_type;
@@ -193,16 +212,17 @@ TYPED_TEST_P(TestIntegers, Basics) {
   auto type = this->type();
 
   AssertJSONArray<T>(type, "[]", {});
-  AssertJSONArray<T>(type, "[4, 0, 5]", {4, 0, 5});
-  AssertJSONArray<T>(type, "[4, null, 5]", {true, false, true}, {4, 0, 5});
+  AssertJSONArray<T>(type, "[4, 0, 5]", TestIntegersMutateIfNeeded<c_type>({4, 0, 5}));
+  AssertJSONArray<T>(type, "[4, null, 5]", {true, false, true},
+                     TestIntegersMutateIfNeeded<c_type>({4, 0, 5}));
 
   // Test limits
   const auto min_val = std::numeric_limits<c_type>::min();
   const auto max_val = std::numeric_limits<c_type>::max();
 
   std::string json_string = JSONArray(0, 1, min_val);
-  AssertJSONArray<T>(type, json_string, {0, 1, min_val});
+  AssertJSONArray<T>(type, json_string,
+                     TestIntegersMutateIfNeeded<c_type>({0, 1, min_val}));
   json_string = JSONArray(0, 1, max_val);
-  AssertJSONArray<T>(type, json_string, {0, 1, max_val});
+  AssertJSONArray<T>(type, json_string,
+                     TestIntegersMutateIfNeeded<c_type>({0, 1, max_val}));
 }
 
 TYPED_TEST_P(TestIntegers, Errors) {
@@ -269,7 +289,12 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt8, TestIntegers, UInt8Type);
 INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt16, TestIntegers, UInt16Type);
 INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt32, TestIntegers, UInt32Type);
 INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt64, TestIntegers, UInt64Type);
-INSTANTIATE_TYPED_TEST_SUITE_P(TestHalfFloat, TestIntegers, HalfFloatType);
+// FIXME: I understand that HalfFloatType is backed by a uint16_t, but does it
+// make sense to run this test over it?
+// The way ConvertNumber for HalfFloatType is currently written, it allows the
+// conversion of floating point notation to a half float, which causes failures
+// in this test, one example is asserting 0.0 cannot be parsed as a half float.
+// INSTANTIATE_TYPED_TEST_SUITE_P(TestHalfFloat, TestIntegers, HalfFloatType);
 
 template <typename T>
 class TestStrings : public ::testing::Test {
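The helper above exists because the typed integer fixture feeds plain literals such as 4 into a column whose physical values are half-float bit patterns, so the specialization re-encodes each expected value. Sketched on its own, using the same Float16 calls as the test (the function name is hypothetical):

    #include <cstdint>
    #include <vector>
    #include "arrow/util/float16.h"

    // {4, 0, 5} becomes the bit patterns of Float16(4), Float16(0), Float16(5).
    std::vector<uint16_t> ToHalfBits(std::vector<uint16_t> values) {
      for (auto& v : values) v = arrow::util::Float16(static_cast<float>(v)).bits();
      return values;
    }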
diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc
index 7e0eb1d460555..95f601465b440 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -36,11 +36,14 @@
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/random.h"
 #include "arrow/type.h"
+#include "arrow/util/float16.h"
 #include "arrow/util/iterator.h"
 #include "arrow/util/key_value_metadata.h"
 
 namespace arrow {
 
+using util::Float16;
+
 class TestRecordBatch : public ::testing::Test {};
 
 TEST_F(TestRecordBatch, Equals) {
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index ed66c9367dc36..8caf4400fe86d 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -305,6 +305,7 @@ struct TypeTraits<HalfFloatType> {
   using BuilderType = HalfFloatBuilder;
   using ScalarType = HalfFloatScalar;
   using TensorType = HalfFloatTensor;
+  using CType = uint16_t;
 
   static constexpr int64_t bytes_required(int64_t elements) {
     return elements * static_cast<int64_t>(sizeof(uint16_t));
diff --git a/cpp/src/arrow/util/formatting.cc b/cpp/src/arrow/util/formatting.cc
index c16d42ce5cfe2..c5a7e03f8573a 100644
--- a/cpp/src/arrow/util/formatting.cc
+++ b/cpp/src/arrow/util/formatting.cc
@@ -18,10 +18,12 @@
 #include "arrow/util/formatting.h"
 #include "arrow/util/config.h"
 #include "arrow/util/double_conversion.h"
+#include "arrow/util/float16.h"
 #include "arrow/util/logging.h"
 
 namespace arrow {
 
+using util::Float16;
 using util::double_conversion::DoubleToStringConverter;
 
 static constexpr int kMinBufferSize = DoubleToStringConverter::kBase10MaximalLength + 1;
@@ -87,5 +89,14 @@ int FloatToStringFormatter::FormatFloat(double v, char* out_buffer, int out_size
   return builder.position();
 }
 
+int FloatToStringFormatter::FormatFloat(uint16_t v, char* out_buffer, int out_size) {
+  DCHECK_GE(out_size, kMinBufferSize);
+  util::double_conversion::StringBuilder builder(out_buffer, out_size);
+  bool result = impl_->converter_.ToShortest(Float16::FromBits(v).ToFloat(), &builder);
+  DCHECK(result);
+  ARROW_UNUSED(result);
+  return builder.position();
+}
+
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/formatting.h b/cpp/src/arrow/util/formatting.h
index 71bae74629e35..6125f792ff988 100644
--- a/cpp/src/arrow/util/formatting.h
+++ b/cpp/src/arrow/util/formatting.h
@@ -268,6 +268,7 @@ class ARROW_EXPORT FloatToStringFormatter {
   // Returns the number of characters written
   int FormatFloat(float v, char* out_buffer, int out_size);
   int FormatFloat(double v, char* out_buffer, int out_size);
+  int FormatFloat(uint16_t v, char* out_buffer, int out_size);
 
  protected:
   struct Impl;
@@ -301,6 +302,12 @@ class FloatToStringFormatterMixin : public FloatToStringFormatter {
   }
 };
 
+template <>
+class StringFormatter<HalfFloatType> : public FloatToStringFormatterMixin<HalfFloatType> {
+ public:
+  using FloatToStringFormatterMixin::FloatToStringFormatterMixin;
+};
+
 template <>
 class StringFormatter<FloatType> : public FloatToStringFormatterMixin<FloatType> {
 public:
diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc
index f6a24ac1467f8..e84aac995e35f 100644
--- a/cpp/src/arrow/util/value_parsing.cc
+++ b/cpp/src/arrow/util/value_parsing.cc
@@ -22,8 +22,11 @@
 #include <string>
 #include <utility>
 
+#include "arrow/util/float16.h"
 #include "arrow/vendored/fast_float/fast_float.h"
 
+using arrow::util::Float16;
+
 namespace arrow {
 namespace internal {
 
@@ -43,6 +46,17 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, double* out
   return res.ec == std::errc() && res.ptr == s + length;
 }
 
+// Half float
+bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* out) {
+  ::arrow_vendored::fast_float::parse_options options{
+      ::arrow_vendored::fast_float::chars_format::general, decimal_point};
+  float temp_out;
+  const auto res =
+      ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, temp_out, options);
+  *out = Float16::FromFloat(temp_out).bits();
+  return res.ec == std::errc() && res.ptr == s + length;
+}
+
 // ----------------------------------------------------------------------
 // strptime-like parsing
 
diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h
index b3c711840f3e2..609906052cd20 100644
--- a/cpp/src/arrow/util/value_parsing.h
+++ b/cpp/src/arrow/util/value_parsing.h
@@ -135,6 +135,9 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, float* out)
 ARROW_EXPORT
 bool StringToFloat(const char* s, size_t length, char decimal_point, double* out);
 
+ARROW_EXPORT
+bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* out);
+
 template <>
 struct StringConverter<FloatType> {
   using value_type = float;
@@ -163,6 +166,20 @@ struct StringConverter<DoubleType> {
   const char decimal_point;
 };
 
+template <>
+struct StringConverter<HalfFloatType> {
+  using value_type = uint16_t;
+
+  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}
+
+  bool Convert(const HalfFloatType&, const char* s, size_t length, value_type* out) {
+    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
+  }
+
+ private:
+  const char decimal_point;
+};
+
 // NOTE: HalfFloatType would require a half<->float conversion library
 
 inline uint8_t ParseDecimalDigit(char c) { return static_cast<uint8_t>(c - '0'); }
diff --git a/docs/source/status.rst b/docs/source/status.rst
index 9af2fd1921e22..71d33eaa6520c 100644
--- a/docs/source/status.rst
+++ b/docs/source/status.rst
@@ -40,7 +40,7 @@ Data Types
 +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
 | UInt8/16/32/64    | ✓     | ✓     | ✓     | ✓          | ✓     | ✓     | ✓     | ✓     |
 +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
-| Float16           | ✓ (1) | ✓ (2) | ✓     | ✓          | ✓ (3) | ✓     | ✓     |       |
+| Float16           | ✓     | ✓ (1) | ✓     | ✓          | ✓ (2) | ✓     | ✓     |       |
 +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
 | Float32/64        | ✓     | ✓     | ✓     | ✓          | ✓     | ✓     | ✓     | ✓     |
 +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
@@ -104,7 +104,7 @@ Data Types
 | Data type         | C++   | Java  | Go    | JavaScript | C#    | Rust  | Julia | Swift |
 | (special)         |       |       |       |            |       |       |       |       |
 +===================+=======+=======+=======+============+=======+=======+=======+=======+
-| Dictionary        | ✓     | ✓ (4) | ✓     | ✓          | ✓     | ✓ (3) | ✓     |       |
+| Dictionary        | ✓     | ✓ (3) | ✓     | ✓          | ✓     | ✓ (3) | ✓     |       |
 +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
 | Extension         | ✓     | ✓     | ✓     |            |       | ✓     | ✓     |       |
 +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
@@ -113,10 +113,9 @@ Data Types
 
 Notes:
 
-* \(1) Casting to/from Float16 in C++ is not supported.
-* \(2) Casting to/from Float16 in Java is not supported.
-* \(3) Float16 support in C# is only available when targeting .NET 6+.
-* \(4) Nested dictionaries not supported
+* \(1) Casting to/from Float16 in Java is not supported.
+* \(2) Float16 support in C# is only available when targeting .NET 6+. +* \(3) Nested dictionaries not supported .. seealso:: The :ref:`format_columnar` specification. From 1a1d2c87d11c48569fa603cc157e6bddc1f7d306 Mon Sep 17 00:00:00 2001 From: Joe Marshall Date: Fri, 5 Apr 2024 03:34:32 +0100 Subject: [PATCH 17/49] GH-23221: [C++] Add support for building with Emscripten (#37821) Split from #37696 This is just the cmake changes to enable building on emscripten. Changes are: 1) Support for target system "emscripten" 2) Cmake preset for building libarrow ` ninja-release-python-emscripten` (same as `ninja-release-python`, but with emscripten support) 3) Override file for cmake on emscripten, to set various build parameters that need setting to make it build there. 4) Changes in pyarrow cmake so it works if you are building libarrow as shared library, and also an option to enable the cmake file there to just dump the current arrow configuration, which is useful for cross-compile builds. * Closes: #23221 Lead-authored-by: Joe Marshall Co-authored-by: Sutou Kouhei Co-authored-by: Joris Van den Bossche Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ci/docker/ubuntu-22.04-cpp.dockerfile | 16 +- ci/scripts/cpp_build.sh | 232 ++++++++++-------- ci/scripts/cpp_test.sh | 4 + ci/scripts/go_bench.sh | 0 cpp/CMakePresets.json | 46 +++- cpp/build-support/emscripten-test-init.js | 24 ++ cpp/cmake_modules/BuildUtils.cmake | 4 +- cpp/cmake_modules/SetupCxxFlags.cmake | 89 ++++++- cpp/cmake_modules/ThirdpartyToolchain.cmake | 184 +++++++++++--- cpp/src/arrow/array/array_dict_test.cc | 2 +- .../compute/kernels/scalar_string_test.cc | 11 + .../compute/kernels/scalar_temporal_test.cc | 9 +- cpp/src/arrow/filesystem/localfs_test.cc | 3 + cpp/src/arrow/io/file.cc | 6 + cpp/src/arrow/io/file_test.cc | 15 ++ cpp/src/arrow/ipc/read_write_test.cc | 7 + cpp/src/arrow/util/async_generator_test.cc | 8 + cpp/src/arrow/util/atfork_test.cc | 20 ++ cpp/src/arrow/util/cache_test.cc | 5 + cpp/src/arrow/util/cancel_test.cc | 24 ++ cpp/src/arrow/util/counting_semaphore_test.cc | 13 + cpp/src/arrow/util/future_test.cc | 16 ++ cpp/src/arrow/util/io_util.cc | 9 +- cpp/src/arrow/util/io_util_test.cc | 35 ++- cpp/src/arrow/util/mutex.cc | 6 +- cpp/src/arrow/util/rle_encoding_test.cc | 16 +- cpp/src/arrow/util/value_parsing_test.cc | 5 + dev/tasks/tasks.yml | 8 + docker-compose.yml | 26 ++ docs/source/developers/cpp/emscripten.rst | 99 ++++++++ docs/source/developers/cpp/index.rst | 1 + 31 files changed, 784 insertions(+), 159 deletions(-) mode change 100644 => 100755 ci/scripts/go_bench.sh create mode 100644 cpp/build-support/emscripten-test-init.js create mode 100644 docs/source/developers/cpp/emscripten.rst diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index e8416c1378a9a..eb189841cd344 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -65,6 +65,7 @@ RUN latest_system_llvm=14 && \ RUN apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ autoconf \ + bzip2 \ ca-certificates \ ccache \ cmake \ @@ -115,10 +116,20 @@ RUN apt-get update -y -q && \ rapidjson-dev \ rsync \ tzdata \ - wget && \ + wget \ + xz-utils && \ apt-get clean && \ rm -rf /var/lib/apt/lists* +# install emscripten using EMSDK +ARG emscripten_version="3.1.45" +RUN cd ~ && git clone https://github.com/emscripten-core/emsdk.git && \ + cd emsdk && \ + ./emsdk install ${emscripten_version} && \ + ./emsdk activate ${emscripten_version} && \ + 
echo "Installed emsdk to:" ~/emsdk + + ARG gcc_version="" RUN if [ "${gcc_version}" = "" ]; then \ apt-get update -y -q && \ @@ -151,6 +162,9 @@ RUN if [ "${gcc_version}" = "" ]; then \ update-alternatives --set c++ /usr/bin/g++; \ fi +# make sure zlib is cached in the EMSDK folder +RUN source ~/emsdk/emsdk_env.sh && embuilder --pic build zlib + COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 1e09924a5e576..52c89acb9a76a 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -92,112 +92,132 @@ esac mkdir -p ${build_dir} pushd ${build_dir} -cmake \ - -Dabsl_SOURCE=${absl_SOURCE:-} \ - -DARROW_ACERO=${ARROW_ACERO:-OFF} \ - -DARROW_AZURE=${ARROW_AZURE:-OFF} \ - -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ - -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ - -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ - -DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \ - -DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \ - -DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \ - -DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \ - -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ - -DARROW_BUILD_UTILITIES=${ARROW_BUILD_UTILITIES:-ON} \ - -DARROW_COMPUTE=${ARROW_COMPUTE:-ON} \ - -DARROW_CSV=${ARROW_CSV:-ON} \ - -DARROW_CUDA=${ARROW_CUDA:-OFF} \ - -DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \ - -DARROW_CXX_FLAGS_DEBUG="${ARROW_CXX_FLAGS_DEBUG:-}" \ - -DARROW_CXX_FLAGS_RELEASE="${ARROW_CXX_FLAGS_RELEASE:-}" \ - -DARROW_CXX_FLAGS_RELWITHDEBINFO="${ARROW_CXX_FLAGS_RELWITHDEBINFO:-}" \ - -DARROW_C_FLAGS_DEBUG="${ARROW_C_FLAGS_DEBUG:-}" \ - -DARROW_C_FLAGS_RELEASE="${ARROW_C_FLAGS_RELEASE:-}" \ - -DARROW_C_FLAGS_RELWITHDEBINFO="${ARROW_C_FLAGS_RELWITHDEBINFO:-}" \ - -DARROW_DATASET=${ARROW_DATASET:-OFF} \ - -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ - -DARROW_ENABLE_THREADING=${ARROW_ENABLE_THREADING:-ON} \ - -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ - -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ - -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ - -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ - -DARROW_FLIGHT_SQL=${ARROW_FLIGHT_SQL:-OFF} \ - -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ - -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ - -DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \ - -DARROW_GCS=${ARROW_GCS:-OFF} \ - -DARROW_HDFS=${ARROW_HDFS:-ON} \ - -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ - -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ - -DARROW_JSON=${ARROW_JSON:-ON} \ - -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ - -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ - -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ - -DARROW_ORC=${ARROW_ORC:-OFF} \ - -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ - -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ - -DARROW_S3=${ARROW_S3:-OFF} \ - -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ - -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ - -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} \ - -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ - -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ - -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ - -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \ - -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \ - -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ - -DARROW_USE_MOLD=${ARROW_USE_MOLD:-OFF} \ - -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ - 
-DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ - -DARROW_USE_TSAN=${ARROW_USE_TSAN:-OFF} \ - -DARROW_USE_UBSAN=${ARROW_USE_UBSAN:-OFF} \ - -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ - -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-OFF} \ - -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-OFF} \ - -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-OFF} \ - -DARROW_WITH_OPENTELEMETRY=${ARROW_WITH_OPENTELEMETRY:-OFF} \ - -DARROW_WITH_MUSL=${ARROW_WITH_MUSL:-OFF} \ - -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-OFF} \ - -DARROW_WITH_UCX=${ARROW_WITH_UCX:-OFF} \ - -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ - -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ - -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ - -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ - -DAzure_SOURCE=${Azure_SOURCE:-} \ - -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ - -DBOOST_SOURCE=${BOOST_SOURCE:-} \ - -DBrotli_SOURCE=${Brotli_SOURCE:-} \ - -DBUILD_WARNING_LEVEL=${BUILD_WARNING_LEVEL:-CHECKIN} \ - -Dc-ares_SOURCE=${cares_SOURCE:-} \ - -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \ - -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ - -DCMAKE_C_FLAGS="${CFLAGS:-}" \ - -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ - -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ - -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ - -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ - -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ - -Dgflags_SOURCE=${gflags_SOURCE:-} \ - -Dgoogle_cloud_cpp_storage_SOURCE=${google_cloud_cpp_storage_SOURCE:-} \ - -DgRPC_SOURCE=${gRPC_SOURCE:-} \ - -DGTest_SOURCE=${GTest_SOURCE:-} \ - -Dlz4_SOURCE=${lz4_SOURCE:-} \ - -DORC_SOURCE=${ORC_SOURCE:-} \ - -DPARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-OFF} \ - -DPARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-OFF} \ - -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} \ - -DProtobuf_SOURCE=${Protobuf_SOURCE:-} \ - -DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \ - -Dre2_SOURCE=${re2_SOURCE:-} \ - -DSnappy_SOURCE=${Snappy_SOURCE:-} \ - -DThrift_SOURCE=${Thrift_SOURCE:-} \ - -Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \ - -Dzstd_SOURCE=${zstd_SOURCE:-} \ - -Dxsimd_SOURCE=${xsimd_SOURCE:-} \ - -G "${CMAKE_GENERATOR:-Ninja}" \ - ${ARROW_CMAKE_ARGS} \ - ${source_dir} +if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then + if [ "${UBUNTU}" = "20.04" ]; then + echo "arrow emscripten build is not supported on Ubuntu 20.04, run with UBUNTU=22.04" + exit -1 + fi + n_jobs=2 # Emscripten build fails on docker unless this is set really low + source ~/emsdk/emsdk_env.sh + emcmake cmake \ + --preset=ninja-${ARROW_BUILD_TYPE:-debug}-emscripten \ + -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ + -DCMAKE_C_FLAGS="${CFLAGS:-}" \ + -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ + -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ + -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ + ${ARROW_CMAKE_ARGS} \ + ${source_dir} +else + cmake \ + -Dabsl_SOURCE=${absl_SOURCE:-} \ + -DARROW_ACERO=${ARROW_ACERO:-OFF} \ + -DARROW_AZURE=${ARROW_AZURE:-OFF} \ + -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ + -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ + -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ + -DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \ + -DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \ + -DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \ + -DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \ + 
-DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ + -DARROW_BUILD_UTILITIES=${ARROW_BUILD_UTILITIES:-ON} \ + -DARROW_COMPUTE=${ARROW_COMPUTE:-ON} \ + -DARROW_CSV=${ARROW_CSV:-ON} \ + -DARROW_CUDA=${ARROW_CUDA:-OFF} \ + -DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \ + -DARROW_CXX_FLAGS_DEBUG="${ARROW_CXX_FLAGS_DEBUG:-}" \ + -DARROW_CXX_FLAGS_RELEASE="${ARROW_CXX_FLAGS_RELEASE:-}" \ + -DARROW_CXX_FLAGS_RELWITHDEBINFO="${ARROW_CXX_FLAGS_RELWITHDEBINFO:-}" \ + -DARROW_C_FLAGS_DEBUG="${ARROW_C_FLAGS_DEBUG:-}" \ + -DARROW_C_FLAGS_RELEASE="${ARROW_C_FLAGS_RELEASE:-}" \ + -DARROW_C_FLAGS_RELWITHDEBINFO="${ARROW_C_FLAGS_RELWITHDEBINFO:-}" \ + -DARROW_DATASET=${ARROW_DATASET:-OFF} \ + -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ + -DARROW_ENABLE_THREADING=${ARROW_ENABLE_THREADING:-ON} \ + -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ + -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ + -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ + -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ + -DARROW_FLIGHT_SQL=${ARROW_FLIGHT_SQL:-OFF} \ + -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ + -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ + -DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \ + -DARROW_GCS=${ARROW_GCS:-OFF} \ + -DARROW_HDFS=${ARROW_HDFS:-ON} \ + -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ + -DARROW_JSON=${ARROW_JSON:-ON} \ + -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ + -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ + -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ + -DARROW_ORC=${ARROW_ORC:-OFF} \ + -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ + -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ + -DARROW_S3=${ARROW_S3:-OFF} \ + -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ + -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ + -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} \ + -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ + -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ + -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ + -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \ + -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \ + -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ + -DARROW_USE_MOLD=${ARROW_USE_MOLD:-OFF} \ + -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ + -DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ + -DARROW_USE_TSAN=${ARROW_USE_TSAN:-OFF} \ + -DARROW_USE_UBSAN=${ARROW_USE_UBSAN:-OFF} \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ + -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-OFF} \ + -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-OFF} \ + -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-OFF} \ + -DARROW_WITH_OPENTELEMETRY=${ARROW_WITH_OPENTELEMETRY:-OFF} \ + -DARROW_WITH_MUSL=${ARROW_WITH_MUSL:-OFF} \ + -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-OFF} \ + -DARROW_WITH_UCX=${ARROW_WITH_UCX:-OFF} \ + -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ + -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ + -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ + -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ + -DAzure_SOURCE=${Azure_SOURCE:-} \ + -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ + -DBOOST_SOURCE=${BOOST_SOURCE:-} \ + -DBrotli_SOURCE=${Brotli_SOURCE:-} \ + -DBUILD_WARNING_LEVEL=${BUILD_WARNING_LEVEL:-CHECKIN} \ + -Dc-ares_SOURCE=${cares_SOURCE:-} \ + -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \ + -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ + -DCMAKE_C_FLAGS="${CFLAGS:-}" \ + -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ + -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ + 
-DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ + -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ + -Dgflags_SOURCE=${gflags_SOURCE:-} \ + -Dgoogle_cloud_cpp_storage_SOURCE=${google_cloud_cpp_storage_SOURCE:-} \ + -DgRPC_SOURCE=${gRPC_SOURCE:-} \ + -DGTest_SOURCE=${GTest_SOURCE:-} \ + -Dlz4_SOURCE=${lz4_SOURCE:-} \ + -DORC_SOURCE=${ORC_SOURCE:-} \ + -DPARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-OFF} \ + -DPARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-OFF} \ + -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} \ + -DProtobuf_SOURCE=${Protobuf_SOURCE:-} \ + -DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \ + -Dre2_SOURCE=${re2_SOURCE:-} \ + -DSnappy_SOURCE=${Snappy_SOURCE:-} \ + -DThrift_SOURCE=${Thrift_SOURCE:-} \ + -Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \ + -Dzstd_SOURCE=${zstd_SOURCE:-} \ + -Dxsimd_SOURCE=${xsimd_SOURCE:-} \ + -G "${CMAKE_GENERATOR:-Ninja}" \ + ${ARROW_CMAKE_ARGS} \ + ${source_dir} +fi export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-$[${n_jobs} + 1]} time cmake --build . --target install diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index f388825fd0a98..2c640f2c1fb6a 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -80,6 +80,10 @@ case "$(uname)" in ;; esac +if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then + n_jobs=1 # avoid spurious fails on emscripten due to loading too many big executables +fi + pushd ${build_dir} if [ -z "${PYTHON}" ] && ! which python > /dev/null 2>&1; then diff --git a/ci/scripts/go_bench.sh b/ci/scripts/go_bench.sh old mode 100644 new mode 100755 diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 9d99b3b2a79e0..13d1241990c31 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -46,6 +46,32 @@ "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { + "name": "features-emscripten", + "hidden": true, + "cacheVariables": { + "ARROW_ACERO": "ON", + "ARROW_BUILD_SHARED": "OFF", + "ARROW_BUILD_STATIC": "ON", + "ARROW_CUDA": "OFF", + "ARROW_DEPENDENCY_SOURCE": "BUNDLED", + "ARROW_DEPENDENCY_USE_SHARED": "OFF", + "ARROW_ENABLE_THREADING": "OFF", + "ARROW_FLIGHT": "OFF", + "ARROW_IPC": "ON", + "ARROW_JEMALLOC": "OFF", + "ARROW_MIMALLOC": "OFF", + "ARROW_ORC": "ON", + "ARROW_RUNTIME_SIMD_LEVEL": "NONE", + "ARROW_S3": "OFF", + "ARROW_SIMD_LEVEL": "NONE", + "ARROW_SUBSTRAIT": "ON", + "ARROW_WITH_BROTLI": "ON", + "ARROW_WITH_OPENTELEMETRY": "OFF", + "ARROW_WITH_SNAPPY": "ON", + "CMAKE_C_BYTE_ORDER": "LITTLE_ENDIAN" + } + }, { "name": "features-minimal", "hidden": true, @@ -341,6 +367,24 @@ "displayName": "Release build with CUDA integration", "cacheVariables": {} }, + { + "name": "ninja-debug-emscripten", + "inherits": [ + "features-emscripten", + "base-debug" + ], + "displayName": "Debug build which builds an Emscripten library", + "cacheVariables": {} + }, + { + "name": "ninja-release-emscripten", + "inherits": [ + "features-emscripten", + "base-release" + ], + "displayName": "Release build which builds an Emscripten library", + "cacheVariables": {} + }, { "name": "ninja-release-flight", "inherits": [ @@ -447,4 +491,4 @@ } } ] -} +} \ No newline at end of file diff --git a/cpp/build-support/emscripten-test-init.js b/cpp/build-support/emscripten-test-init.js new file mode 100644 index 0000000000000..bbb542a29f021 --- /dev/null +++ b/cpp/build-support/emscripten-test-init.js @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +var Module = { +}; + +// make sure tests can access the current parquet test data files +Module.preRun = () => {ENV.PARQUET_TEST_DATA = process.env.PARQUET_TEST_DATA; + ENV.ARROW_TEST_DATA = process.env.ARROW_TEST_DATA; +}; \ No newline at end of file diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 7a45e9cca59de..e7523add27223 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -760,8 +760,8 @@ function(ADD_TEST_CASE REL_TEST_NAME) valgrind --suppressions=valgrind.supp --tool=memcheck --gen-suppressions=all \ --num-callers=500 --leak-check=full --leak-check-heuristics=stdstring \ --error-exitcode=1 ${TEST_PATH} ${ARG_TEST_ARGUMENTS}") - elseif(WIN32) - add_test(${TEST_NAME} ${TEST_PATH} ${ARG_TEST_ARGUMENTS}) + elseif(WIN32 OR CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME} ${ARG_TEST_ARGUMENTS}) else() add_test(${TEST_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 90decb4224ec6..1d709fe98d7fe 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -24,7 +24,9 @@ include(CheckCXXSourceCompiles) message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") if(NOT DEFINED ARROW_CPU_FLAG) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64") + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(ARROW_CPU_FLAG "emscripten") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64") set(ARROW_CPU_FLAG "x86") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") set(ARROW_CPU_FLAG "aarch64") @@ -312,7 +314,12 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wextra") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wdocumentation") - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wshorten-64-to-32") + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # size_t is 32 bit in Emscripten wasm32 - ignore conversion errors + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-shorten-64-to-32") + else() + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wshorten-64-to-32") + endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-missing-braces") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-parameter") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-constant-logical-operand") @@ -692,17 +699,36 @@ if(NOT MSVC) set(C_DEBUG_FLAGS "") set(CXX_DEBUG_FLAGS "") if(NOT MSVC) - if(NOT CMAKE_C_FLAGS_DEBUG MATCHES "-O") - string(APPEND C_DEBUG_FLAGS " -O0") - endif() - if(NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-O") - string(APPEND CXX_DEBUG_FLAGS " -O0") - endif() - if(ARROW_GGDB_DEBUG) - string(APPEND C_DEBUG_FLAGS " -ggdb") - string(APPEND 
CXX_DEBUG_FLAGS " -ggdb") - string(APPEND C_RELWITHDEBINFO_FLAGS " -ggdb") - string(APPEND CXX_RELWITHDEBINFO_FLAGS " -ggdb") + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # with -g it uses DWARF debug info, which is really slow to build + # on emscripten (and uses tons of memory) + string(REPLACE "-g" " " CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + string(REPLACE "-g" " " CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + string(APPEND C_DEBUG_FLAGS " -g2") + string(APPEND CXX_DEBUG_FLAGS " -g2") + string(APPEND C_RELWITHDEBINFO_FLAGS " -g2") + string(APPEND CXX_RELWITHDEBINFO_FLAGS " -g2") + # without -O1, emscripten executables are *MASSIVE*. Don't use -O0 + if(NOT CMAKE_C_FLAGS_DEBUG MATCHES "-O") + string(APPEND C_DEBUG_FLAGS " -O1") + endif() + if(NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-O") + string(APPEND CXX_DEBUG_FLAGS " -O1") + endif() + else() + if(NOT CMAKE_C_FLAGS_DEBUG MATCHES "-O") + string(APPEND C_DEBUG_FLAGS " -O0") + endif() + if(NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-O") + string(APPEND CXX_DEBUG_FLAGS " -O0") + endif() + + if(ARROW_GGDB_DEBUG) + string(APPEND C_DEBUG_FLAGS " -ggdb") + string(APPEND CXX_DEBUG_FLAGS " -ggdb") + string(APPEND C_RELWITHDEBINFO_FLAGS " -ggdb") + string(APPEND CXX_RELWITHDEBINFO_FLAGS " -ggdb") + endif() endif() endif() @@ -733,3 +759,40 @@ if(MSVC) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MSVC_LINKER_FLAGS}") endif() endif() + +if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # flags are: + # 1) We force *everything* to build as position independent + # 2) And with support for C++ exceptions + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -fexceptions") + # deprecated-literal-operator error is thrown in datetime (vendored lib in arrow) + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -fPIC -fexceptions -Wno-error=deprecated-literal-operator") + + # flags for creating shared libraries (only used in pyarrow, because + # Emscripten builds libarrow as static) + # flags are: + # 1) Tell it to use JavaScript / WebAssembly 64 bit number support. 
+ # 2) Tell it to build with support for C++ exceptions + # 3) Skip linker flags error which happens with -soname parameter + set(ARROW_EMSCRIPTEN_LINKER_FLAGS "-sWASM_BIGINT=1 -fexceptions -Wno-error=linkflags") + set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS + "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") + set(CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS + "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") + if(ARROW_TESTING) + # flags for building test executables for use in node + if("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + set(CMAKE_EXE_LINKER_FLAGS + "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sALLOW_MEMORY_GROWTH -lnodefs.js -lnoderawfs.js --pre-js ${BUILD_SUPPORT_DIR}/emscripten-test-init.js" + ) + else() + set(CMAKE_EXE_LINKER_FLAGS + "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sERROR_ON_WASM_CHANGES_AFTER_LINK=1 -sALLOW_MEMORY_GROWTH -lnodefs.js -lnoderawfs.js --pre-js ${BUILD_SUPPORT_DIR}/emscripten-test-init.js" + ) + endif() + else() + set(CMAKE_EXE_LINKER_FLAGS "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sALLOW_MEMORY_GROWTH") + endif() +endif() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ad7344b09dd4e..4a67eac1d4d59 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -976,6 +976,23 @@ set(EP_COMMON_CMAKE_ARGS -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE}) +# if building with a toolchain file, pass that through +if(CMAKE_TOOLCHAIN_FILE) + list(APPEND EP_COMMON_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}) +endif() + +# and crosscompiling emulator (for try_run() ) +if(CMAKE_CROSSCOMPILING_EMULATOR) + string(REPLACE ";" ${EP_LIST_SEPARATOR} EP_CMAKE_CROSSCOMPILING_EMULATOR + "${CMAKE_CROSSCOMPILING_EMULATOR}") + list(APPEND EP_COMMON_CMAKE_ARGS + -DCMAKE_CROSSCOMPILING_EMULATOR=${EP_CMAKE_CROSSCOMPILING_EMULATOR}) +endif() + +if(CMAKE_PROJECT_INCLUDE) + list(APPEND EP_COMMON_CMAKE_ARGS -DCMAKE_PROJECT_INCLUDE=${CMAKE_PROJECT_INCLUDE}) +endif() + # Enable s/ccache if set by parent. if(CMAKE_C_COMPILER_LAUNCHER AND CMAKE_CXX_COMPILER_LAUNCHER) list(APPEND EP_COMMON_CMAKE_ARGS @@ -1349,6 +1366,14 @@ macro(build_snappy) set(SNAPPY_PATCH_COMMAND) endif() + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # ignore linker flag errors, as Snappy sets + # -Werror -Wall, and Emscripten doesn't support -soname + list(APPEND SNAPPY_CMAKE_ARGS + "-DCMAKE_SHARED_LINKER_FLAGS=${CMAKE_SHARED_LINKER_FLAGS}" + "-Wno-error=linkflags") + endif() + externalproject_add(snappy_ep ${EP_COMMON_OPTIONS} BUILD_IN_SOURCE 1 @@ -1394,6 +1419,7 @@ macro(build_brotli) message(STATUS "Building brotli from source") set(BROTLI_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/brotli_ep/src/brotli_ep-install") set(BROTLI_INCLUDE_DIR "${BROTLI_PREFIX}/include") + set(BROTLI_LIB_DIR "${BROTLI_PREFIX}/lib") set(BROTLI_STATIC_LIBRARY_ENC "${BROTLI_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" ) @@ -1405,6 +1431,26 @@ macro(build_brotli) ) set(BROTLI_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${BROTLI_PREFIX}") + set(BROTLI_EP_OPTIONS) + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # "cmake install" is disabled for Brotli on Emscripten, so the + # default INSTALL_COMMAND fails. We need to disable the default + # INSTALL_COMMAND. 
+ list(APPEND + BROTLI_EP_OPTIONS + INSTALL_COMMAND + ${CMAKE_COMMAND} + -E + true) + + set(BROTLI_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/brotli_ep-prefix/src/brotli_ep-build) + set(BROTLI_BUILD_LIBS + "${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlidec-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlicommon-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + endif() + externalproject_add(brotli_ep ${EP_COMMON_OPTIONS} URL ${BROTLI_SOURCE_URL} @@ -1414,7 +1460,20 @@ macro(build_brotli) "${BROTLI_STATIC_LIBRARY_COMMON}" ${BROTLI_BUILD_BYPRODUCTS} CMAKE_ARGS ${BROTLI_CMAKE_ARGS} - STEP_TARGETS headers_copy) + STEP_TARGETS headers_copy ${BROTLI_EP_OPTIONS}) + + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # Copy the libraries to our install directory manually. + set(BROTLI_BUILD_INCLUDE_DIR + ${CMAKE_CURRENT_BINARY_DIR}/brotli_ep-prefix/src/brotli_ep/c/include/brotli) + add_custom_command(TARGET brotli_ep + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}*${CMAKE_STATIC_LIBRARY_SUFFIX} + ${BROTLI_LIB_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${BROTLI_BUILD_INCLUDE_DIR} ${BROTLI_INCLUDE_DIR}/brotli) + endif() file(MAKE_DIRECTORY "${BROTLI_INCLUDE_DIR}") @@ -1657,6 +1716,9 @@ macro(build_thrift) if(DEFINED BOOST_ROOT) list(APPEND THRIFT_CMAKE_ARGS "-DBOOST_ROOT=${BOOST_ROOT}") endif() + if(DEFINED Boost_INCLUDE_DIR) + list(APPEND THRIFT_CMAKE_ARGS "-DBoost_INCLUDE_DIR=${Boost_INCLUDE_DIR}") + endif() if(DEFINED Boost_NAMESPACE) list(APPEND THRIFT_CMAKE_ARGS "-DBoost_NAMESPACE=${Boost_NAMESPACE}") endif() @@ -1798,6 +1860,36 @@ macro(build_protobuf) add_dependencies(arrow::protobuf::protoc protobuf_ep) list(APPEND ARROW_BUNDLED_STATIC_LIBS arrow::protobuf::libprotobuf) + + if(CMAKE_CROSSCOMPILING) + # If we are cross compiling, we need to build protoc for the host + # system also, as it is used when building Arrow + # We do this by calling CMake as a child process + # with CXXFLAGS / CFLAGS and CMake flags cleared. + set(PROTOBUF_HOST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/protobuf_ep_host-install") + set(PROTOBUF_HOST_COMPILER "${PROTOBUF_HOST_PREFIX}/bin/protoc") + + set(PROTOBUF_HOST_CMAKE_ARGS + "-DCMAKE_CXX_FLAGS=" + "-DCMAKE_C_FLAGS=" + "-DCMAKE_INSTALL_PREFIX=${PROTOBUF_HOST_PREFIX}" + -Dprotobuf_BUILD_TESTS=OFF + -Dprotobuf_DEBUG_POSTFIX=) + + externalproject_add(protobuf_ep_host + ${EP_COMMON_OPTIONS} + CMAKE_ARGS ${PROTOBUF_HOST_CMAKE_ARGS} + BUILD_BYPRODUCTS "${PROTOBUF_HOST_COMPILER}" + BUILD_IN_SOURCE 1 + URL ${PROTOBUF_SOURCE_URL} + URL_HASH "SHA256=${ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM}") + + add_executable(arrow::protobuf::host_protoc IMPORTED) + set_target_properties(arrow::protobuf::host_protoc + PROPERTIES IMPORTED_LOCATION "${PROTOBUF_HOST_COMPILER}") + + add_dependencies(arrow::protobuf::host_protoc protobuf_ep_host) + endif() endmacro() if(ARROW_WITH_PROTOBUF) @@ -1862,7 +1954,11 @@ if(ARROW_WITH_PROTOBUF) else() set(ARROW_PROTOBUF_LIBPROTOC protobuf::libprotoc) endif() - if(TARGET arrow::protobuf::protoc) + if(TARGET arrow::protobuf::host_protoc) + # make sure host protoc is used for compiling protobuf files + # during build of e.g. 
orc + set(ARROW_PROTOBUF_PROTOC arrow::protobuf::host_protoc) + elseif(TARGET arrow::protobuf::protoc) set(ARROW_PROTOBUF_PROTOC arrow::protobuf::protoc) else() if(NOT TARGET protobuf::protoc) @@ -2164,8 +2260,15 @@ function(build_gtest) if(APPLE) string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-value" " -Wno-ignored-attributes") endif() - set(BUILD_SHARED_LIBS ON) - set(BUILD_STATIC_LIBS OFF) + # If we're building static libs for Emscripten, we need to build *everything* as + # static libs. + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(BUILD_SHARED_LIBS OFF) + set(BUILD_STATIC_LIBS ON) + else() + set(BUILD_SHARED_LIBS ON) + set(BUILD_STATIC_LIBS OFF) + endif() # We need to use "cache" variable to override the default # INSTALL_GTEST option by this value. See also: # https://cmake.org/cmake/help/latest/policy/CMP0077.html @@ -2403,37 +2506,58 @@ endif() macro(build_zlib) message(STATUS "Building ZLIB from source") - set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") - if(MSVC) - if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") - set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib) - else() - set(ZLIB_STATIC_LIB_NAME zlibstatic.lib) + + # ensure zlib is built with -fpic + # and make sure that the build finds the version in Emscripten ports + # - n.b. the actual linking happens because -sUSE_ZLIB=1 is + # set in the compiler variables, but cmake expects + # it to exist at configuration time if we aren't building it as + # bundled. We need to do this for all packages + # not just zlib as some depend on zlib, but we don't rebuild + # if it exists already + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # build zlib using Emscripten ports + if(NOT EXISTS ${EMSCRIPTEN_SYSROOT}/lib/wasm32-emscripten/pic/libz.a) + execute_process(COMMAND embuilder --pic --force build zlib) endif() + add_library(ZLIB::ZLIB STATIC IMPORTED) + set_property(TARGET ZLIB::ZLIB + PROPERTY IMPORTED_LOCATION + "${EMSCRIPTEN_SYSROOT}/lib/wasm32-emscripten/pic/libz.a") + list(APPEND ARROW_BUNDLED_STATIC_LIBS ZLIB::ZLIB) else() - set(ZLIB_STATIC_LIB_NAME libz.a) - endif() - set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") - set(ZLIB_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}") + set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") + if(MSVC) + if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") + set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib) + else() + set(ZLIB_STATIC_LIB_NAME zlibstatic.lib) + endif() + else() + set(ZLIB_STATIC_LIB_NAME libz.a) + endif() + set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") + set(ZLIB_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}") - externalproject_add(zlib_ep - ${EP_COMMON_OPTIONS} - URL ${ZLIB_SOURCE_URL} - URL_HASH "SHA256=${ARROW_ZLIB_BUILD_SHA256_CHECKSUM}" - BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" - CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) + externalproject_add(zlib_ep + ${EP_COMMON_OPTIONS} + URL ${ZLIB_SOURCE_URL} + URL_HASH "SHA256=${ARROW_ZLIB_BUILD_SHA256_CHECKSUM}" + BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" + CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) - file(MAKE_DIRECTORY "${ZLIB_PREFIX}/include") + file(MAKE_DIRECTORY "${ZLIB_PREFIX}/include") - add_library(ZLIB::ZLIB STATIC IMPORTED) - set(ZLIB_LIBRARIES ${ZLIB_STATIC_LIB}) - set(ZLIB_INCLUDE_DIRS "${ZLIB_PREFIX}/include") - set_target_properties(ZLIB::ZLIB PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES}) - target_include_directories(ZLIB::ZLIB BEFORE INTERFACE "${ZLIB_INCLUDE_DIRS}") + add_library(ZLIB::ZLIB STATIC IMPORTED) + 
set(ZLIB_LIBRARIES ${ZLIB_STATIC_LIB}) + set(ZLIB_INCLUDE_DIRS "${ZLIB_PREFIX}/include") + set_target_properties(ZLIB::ZLIB PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES}) + target_include_directories(ZLIB::ZLIB BEFORE INTERFACE "${ZLIB_INCLUDE_DIRS}") - add_dependencies(ZLIB::ZLIB zlib_ep) + add_dependencies(ZLIB::ZLIB zlib_ep) + list(APPEND ARROW_BUNDLED_STATIC_LIBS ZLIB::ZLIB) + endif() - list(APPEND ARROW_BUNDLED_STATIC_LIBS ZLIB::ZLIB) set(ZLIB_VENDORED TRUE) endmacro() @@ -4390,6 +4514,10 @@ macro(build_orc) "-DPROTOBUF_LIBRARY=$" "-DPROTOC_LIBRARY=$" "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}" + "-DSNAPPY_LIBRARY=$" + "-DLZ4_LIBRARY=$" + "-DLZ4_STATIC_LIBRARY=$" + "-DLZ4_INCLUDE_DIR=${ORC_LZ4_ROOT}/include" "-DSNAPPY_INCLUDE_DIR=${ORC_SNAPPY_INCLUDE_DIR}" "-DZSTD_HOME=${ORC_ZSTD_ROOT}" "-DZSTD_INCLUDE_DIR=$" diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 4ae9e3d6dcbfc..22d6d1fc5ae92 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1129,7 +1129,7 @@ TEST(TestDictionary, Validate) { arr = std::make_shared(dict_type, indices, MakeArray(invalid_data)); ASSERT_RAISES(Invalid, arr->ValidateFull()); -#if !defined(__APPLE__) && !defined(ARROW_VALGRIND) +#if !defined(__APPLE__) && !defined(ARROW_VALGRIND) && !defined(__EMSCRIPTEN__) // GH-35712: ASSERT_DEATH would make testing slow on macOS. ASSERT_DEATH( { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 26289a7f787e1..c7dbdef2436c3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -1988,6 +1988,11 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) { #endif TYPED_TEST(TestStringKernels, Strptime) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Skipping some strptime tests due to emscripten bug " + "https://github.com/emscripten-core/emscripten/issues/20466"; +#endif + std::string input1 = R"(["5/1/2020", null, null, "12/13/1900", null])"; std::string input2 = R"(["5-1-2020", "12/13/1900"])"; std::string input3 = R"(["5/1/2020", "AA/BB/CCCC"])"; @@ -2008,6 +2013,7 @@ TYPED_TEST(TestStringKernels, Strptime) { this->CheckUnary("strptime", input4, unit, output4, &options); options.format = "%m/%d/%Y %%z"; + // emscripten bug https://github.com/emscripten-core/emscripten/issues/20466 this->CheckUnary("strptime", input5, unit, output1, &options); options.error_is_null = false; @@ -2019,6 +2025,11 @@ TYPED_TEST(TestStringKernels, Strptime) { } TYPED_TEST(TestStringKernels, StrptimeZoneOffset) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() + << "Emscripten bug https://github.com/emscripten-core/emscripten/issues/20467"; +#endif + if (!arrow::internal::kStrptimeSupportsZone) { GTEST_SKIP() << "strptime does not support %z on this platform"; } diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 8dac6525fe2e6..8da8c760ea22b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -2143,7 +2143,10 @@ TEST_F(ScalarTemporalTest, StrftimeCLocale) { TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { #ifdef _WIN32 GTEST_SKIP() << "There is a known bug in strftime for locales on Windows (ARROW-15922)"; -#else +#elif defined(__EMSCRIPTEN__) + GTEST_SKIP() << "Emscripten doesn't build with multiple locales as default"; +#endif + if (!LocaleExists("fr_FR.UTF-8")) { 
GTEST_SKIP() << "locale 'fr_FR.UTF-8' doesn't exist on this system"; } @@ -2155,10 +2158,12 @@ TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { ["01 janvier 1970 00:00:59,123", "18 août 2021 15:11:50,456", null])"; CheckScalarUnary("strftime", timestamp(TimeUnit::MILLI, "UTC"), milliseconds, utf8(), expected, &options); -#endif } TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Emscripten doesn't build with multiple locales as default"; +#endif auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "nonexistent"); const char* seconds = R"(["1970-01-01T00:00:59", null])"; auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), seconds); diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index f90833a88d118..b76c7ebad45db 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -138,6 +138,9 @@ TEST(FileSystemFromUri, LinkedRegisteredFactory) { } TEST(FileSystemFromUri, LoadedRegisteredFactory) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Emscripten dynamic library testing disabled"; +#endif // Since the registrar's definition is in libarrow_filesystem_example.so, // its factory will be registered only after the library is dynamically loaded. std::string path; diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 00426f9957b1f..cc3a5187059e9 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -398,8 +398,14 @@ class MemoryMappedFile::MemoryMap ~Region() { if (data_ != nullptr) { +#ifndef __EMSCRIPTEN__ int result = munmap(data(), static_cast(size_)); + // emscripten erroneously reports failures in munmap + // https://github.com/emscripten-core/emscripten/issues/20459 ARROW_CHECK_EQ(result, 0) << "munmap failed"; +#else + munmap(data(), static_cast(size_)); +#endif } } diff --git a/cpp/src/arrow/io/file_test.cc b/cpp/src/arrow/io/file_test.cc index e7e7ba949c9fd..af414891b950e 100644 --- a/cpp/src/arrow/io/file_test.cc +++ b/cpp/src/arrow/io/file_test.cc @@ -42,6 +42,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" +#include "arrow/util/config.h" #include "arrow/util/future.h" #include "arrow/util/io_util.h" @@ -486,6 +487,10 @@ TEST_F(TestReadableFile, CustomMemoryPool) { } TEST_F(TestReadableFile, ThreadSafety) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + std::string data = "foobar"; { std::ofstream stream; @@ -540,6 +545,9 @@ class TestPipeIO : public ::testing::Test { }; TEST_F(TestPipeIO, TestWrite) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Pipes not supported on Emscripten"; +#endif std::string data1 = "test", data2 = "data!"; std::shared_ptr file; uint8_t buffer[10]; @@ -570,6 +578,9 @@ TEST_F(TestPipeIO, TestWrite) { } TEST_F(TestPipeIO, ReadableFileFails) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Pipes not supported on Emscripten"; +#endif // ReadableFile fails on non-seekable fd ASSERT_RAISES(IOError, ReadableFile::Open(pipe_.rfd.fd())); } @@ -1048,6 +1059,10 @@ TEST_F(TestMemoryMappedFile, CastableToFileInterface) { } TEST_F(TestMemoryMappedFile, ThreadSafety) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + std::string data = "foobar"; std::string path = TempFile("ipc-multithreading-test"); CreateFile(path, static_cast(data.size())); diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 
c5075299a3e35..ff7838cc39d72 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -1046,6 +1046,9 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { }; TEST_F(RecursionLimits, WriteLimit) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "This crashes the Emscripten runtime."; +#endif int32_t metadata_length = -1; int64_t body_length = -1; std::shared_ptr schema; @@ -1078,6 +1081,10 @@ TEST_F(RecursionLimits, ReadLimit) { // Test fails with a structured exception on Windows + Debug #if !defined(_WIN32) || defined(NDEBUG) TEST_F(RecursionLimits, StressLimit) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "This crashes the Emscripten runtime."; +#endif + auto CheckDepth = [this](int recursion_depth, bool* it_works) { int32_t metadata_length = -1; int64_t body_length = -1; diff --git a/cpp/src/arrow/util/async_generator_test.cc b/cpp/src/arrow/util/async_generator_test.cc index 2b74313db279b..afb03b67209a6 100644 --- a/cpp/src/arrow/util/async_generator_test.cc +++ b/cpp/src/arrow/util/async_generator_test.cc @@ -399,6 +399,10 @@ TEST(TestAsyncUtil, MapParallelStress) { } TEST(TestAsyncUtil, MapQueuingFailStress) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int NTASKS = 10; constexpr int NITEMS = 10; for (bool slow : {true, false}) { @@ -1872,6 +1876,10 @@ TEST(PushGenerator, DanglingProducer) { } TEST(PushGenerator, Stress) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + const int NTHREADS = 20; const int NVALUES = 2000; const int NFUTURES = NVALUES + 100; diff --git a/cpp/src/arrow/util/atfork_test.cc b/cpp/src/arrow/util/atfork_test.cc index 004e28e19514a..750f4d138793b 100644 --- a/cpp/src/arrow/util/atfork_test.cc +++ b/cpp/src/arrow/util/atfork_test.cc @@ -35,6 +35,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/atfork_internal.h" +#include "arrow/util/config.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" @@ -109,6 +110,10 @@ class TestAtFork : public ::testing::Test { #ifndef _WIN32 TEST_F(TestAtFork, EmptyHandlers) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + auto handlers = std::make_shared(); RegisterAtFork(handlers); @@ -130,6 +135,10 @@ TEST_F(TestAtFork, EmptyHandlers) { } TEST_F(TestAtFork, SingleThread) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + auto handlers1 = std::make_shared(PushBefore(1), PushParentAfter(11), PushChildAfter(21)); auto handlers2 = std::make_shared(PushBefore(2), PushParentAfter(12), @@ -188,6 +197,10 @@ TEST_F(TestAtFork, SingleThread) { // https://github.com/google/sanitizers/issues/950. 
TEST_F(TestAtFork, MultipleThreads) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + const int kNumThreads = 5; const int kNumIterations = 40; const int kParentAfterAddend = 10000; @@ -245,6 +258,9 @@ TEST_F(TestAtFork, NestedChild) { #ifdef __APPLE__ GTEST_SKIP() << "Nested fork is not supported on macOS"; #endif +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif auto handlers1 = std::make_shared(PushBefore(1), PushParentAfter(11), PushChildAfter(21)); @@ -286,6 +302,10 @@ TEST_F(TestAtFork, NestedChild) { #ifdef _WIN32 TEST_F(TestAtFork, NoOp) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + auto handlers = std::make_shared(PushBefore(1), PushParentAfter(11), PushChildAfter(21)); diff --git a/cpp/src/arrow/util/cache_test.cc b/cpp/src/arrow/util/cache_test.cc index 6b71baa369b9b..264bfe68ec5d2 100644 --- a/cpp/src/arrow/util/cache_test.cc +++ b/cpp/src/arrow/util/cache_test.cc @@ -26,6 +26,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/cache_internal.h" +#include "arrow/util/config.h" namespace arrow { namespace internal { @@ -255,6 +256,10 @@ TYPED_TEST(TestMemoizeLru, Basics) { this->TestBasics(); } class TestMemoizeLruThreadSafe : public TestMemoizeLru {}; TEST_F(TestMemoizeLruThreadSafe, Threads) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + using V = IntValue; Callable c; diff --git a/cpp/src/arrow/util/cancel_test.cc b/cpp/src/arrow/util/cancel_test.cc index 45f6cde4f5579..713418f15a0cc 100644 --- a/cpp/src/arrow/util/cancel_test.cc +++ b/cpp/src/arrow/util/cancel_test.cc @@ -232,6 +232,10 @@ class SignalCancelTest : public CancelTest { }; TEST_F(SignalCancelTest, Register) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + RegisterHandler(); TriggerSignal(); @@ -239,6 +243,10 @@ TEST_F(SignalCancelTest, Register) { } TEST_F(SignalCancelTest, RegisterUnregister) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + // The signal stop source was set up but no handler was registered, // so the token shouldn't be signalled. 
TriggerSignal(); @@ -261,6 +269,10 @@ TEST_F(SignalCancelTest, RegisterUnregister) { #if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) || \ defined(THREAD_SANITIZER)) TEST_F(SignalCancelTest, ForkSafetyUnregisteredHandlers) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + RunInChild([&]() { // Child TriggerSignal(); @@ -284,6 +296,10 @@ TEST_F(SignalCancelTest, ForkSafetyUnregisteredHandlers) { } TEST_F(SignalCancelTest, ForkSafetyRegisteredHandlers) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + RegisterHandler(); RunInChild([&]() { @@ -307,6 +323,10 @@ TEST_F(SignalCancelTest, ForkSafetyRegisteredHandlers) { #endif TEST_F(CancelTest, ThreadedPollSuccess) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int kNumThreads = 10; std::vector results(kNumThreads); @@ -339,6 +359,10 @@ TEST_F(CancelTest, ThreadedPollSuccess) { } TEST_F(CancelTest, ThreadedPollCancel) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int kNumThreads = 10; std::vector results(kNumThreads); diff --git a/cpp/src/arrow/util/counting_semaphore_test.cc b/cpp/src/arrow/util/counting_semaphore_test.cc index a5fa9f6bde891..4de11ce852a03 100644 --- a/cpp/src/arrow/util/counting_semaphore_test.cc +++ b/cpp/src/arrow/util/counting_semaphore_test.cc @@ -22,12 +22,17 @@ #include #include "arrow/testing/gtest_util.h" +#include "arrow/util/config.h" #include "gtest/gtest.h" namespace arrow { namespace util { TEST(CountingSemaphore, Basic) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + CountingSemaphore semaphore; std::atomic acquired{false}; std::atomic started{false}; @@ -50,6 +55,10 @@ TEST(CountingSemaphore, Basic) { } TEST(CountingSemaphore, CloseAborts) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + CountingSemaphore semaphore; std::atomic cleanup{false}; std::thread acquirer([&] { @@ -64,6 +73,10 @@ TEST(CountingSemaphore, CloseAborts) { } TEST(CountingSemaphore, Stress) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr uint32_t NTHREADS = 10; CountingSemaphore semaphore; std::vector max_allowed_cases = {1, 3}; diff --git a/cpp/src/arrow/util/future_test.cc b/cpp/src/arrow/util/future_test.cc index 87891e48efa5e..2ed2b69aed524 100644 --- a/cpp/src/arrow/util/future_test.cc +++ b/cpp/src/arrow/util/future_test.cc @@ -415,6 +415,10 @@ TEST(FutureRefTest, HeadRemoved) { } TEST(FutureStressTest, Callback) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + #ifdef ARROW_VALGRIND const int NITERS = 2; #else @@ -471,6 +475,10 @@ TEST(FutureStressTest, Callback) { } TEST(FutureStressTest, TryAddCallback) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + for (unsigned int n = 0; n < 1; n++) { auto fut = Future<>::Make(); std::atomic callbacks_added(0); @@ -527,6 +535,10 @@ TEST(FutureStressTest, TryAddCallback) { } TEST(FutureStressTest, DeleteAfterWait) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int kNumTasks = 100; for (int i = 0; i < kNumTasks; i++) { { @@ -1543,6 +1555,10 @@ TEST(FnOnceTest, MoveOnlyDataType) { } TEST(FutureTest, MatcherExamples) { +#ifndef ARROW_ENABLE_THREADING + 
GTEST_SKIP() << "Test requires threading support"; +#endif + EXPECT_THAT(Future::MakeFinished(Status::Invalid("arbitrary error")), Finishes(Raises(StatusCode::Invalid))); diff --git a/cpp/src/arrow/util/io_util.cc b/cpp/src/arrow/util/io_util.cc index 5928ebcb88959..d48f9eb97d562 100644 --- a/cpp/src/arrow/util/io_util.cc +++ b/cpp/src/arrow/util/io_util.cc @@ -95,6 +95,7 @@ #include "arrow/result.h" #include "arrow/util/atfork_internal.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/config.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" #include "arrow/util/mutex.h" @@ -1485,6 +1486,7 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, } Status MemoryAdviseWillNeed(const std::vector& regions) { +#ifndef __EMSCRIPTEN__ const auto page_size = static_cast(GetPageSize()); DCHECK_GT(page_size, 0); const size_t page_mask = ~(page_size - 1); @@ -1543,6 +1545,9 @@ Status MemoryAdviseWillNeed(const std::vector& regions) { #else return Status::OK(); #endif +#else + return Status::OK(); +#endif } // @@ -2067,7 +2072,9 @@ Status SendSignal(int signum) { } Status SendSignalToThread(int signum, uint64_t thread_id) { -#ifdef _WIN32 +#ifndef ARROW_ENABLE_THREADING + return Status::NotImplemented("Can't send signal with no threads"); +#elif defined(_WIN32) return Status::NotImplemented("Cannot send signal to specific thread on Windows"); #else // Have to use a C-style cast because pthread_t can be a pointer *or* integer type diff --git a/cpp/src/arrow/util/io_util_test.cc b/cpp/src/arrow/util/io_util_test.cc index d0569c799561f..73213bf9ce48a 100644 --- a/cpp/src/arrow/util/io_util_test.cc +++ b/cpp/src/arrow/util/io_util_test.cc @@ -40,6 +40,7 @@ #include "arrow/buffer.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/bit_util.h" +#include "arrow/util/config.h" #include "arrow/util/cpu_info.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" @@ -146,8 +147,8 @@ TEST(MemoryAdviseWillNeed, Basics) { ASSERT_OK(MemoryAdviseWillNeed({{addr1, 0}, {addr2 + 1, 0}})); // Should probably fail - // (but on Windows, MemoryAdviseWillNeed can be a no-op) -#ifndef _WIN32 + // (but on Windows or Emscripten, MemoryAdviseWillNeed can be a no-op) +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) ASSERT_RAISES(IOError, MemoryAdviseWillNeed({{nullptr, std::numeric_limits::max()}})); #endif @@ -368,6 +369,10 @@ TestSelfPipe* TestSelfPipe::instance_; TEST_F(TestSelfPipe, MakeAndShutdown) {} TEST_F(TestSelfPipe, WaitAndSend) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + StartReading(); SleepABit(); AssertPayloadsEventually({}); @@ -380,6 +385,10 @@ TEST_F(TestSelfPipe, WaitAndSend) { } TEST_F(TestSelfPipe, SendAndWait) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + self_pipe_->Send(123456789123456789ULL); StartReading(); SleepABit(); @@ -390,6 +399,10 @@ TEST_F(TestSelfPipe, SendAndWait) { } TEST_F(TestSelfPipe, WaitAndShutdown) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + StartReading(); SleepABit(); ASSERT_OK(self_pipe_->Shutdown()); @@ -401,6 +414,9 @@ TEST_F(TestSelfPipe, WaitAndShutdown) { } TEST_F(TestSelfPipe, ShutdownAndWait) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif self_pipe_->Send(123456789123456789ULL); ASSERT_OK(self_pipe_->Shutdown()); StartReading(); @@ -413,6 +429,10 @@ TEST_F(TestSelfPipe, ShutdownAndWait) { } 
TEST_F(TestSelfPipe, WaitAndSendFromSignal) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + signal_received_.store(0); SignalHandlerGuard guard(SIGINT, &HandleSignal); @@ -431,6 +451,10 @@ TEST_F(TestSelfPipe, WaitAndSendFromSignal) { } TEST_F(TestSelfPipe, SendFromSignalAndWait) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + signal_received_.store(0); SignalHandlerGuard guard(SIGINT, &HandleSignal); @@ -450,6 +474,10 @@ TEST_F(TestSelfPipe, SendFromSignalAndWait) { #if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) || \ defined(THREAD_SANITIZER)) TEST_F(TestSelfPipe, ForkSafety) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + self_pipe_->Send(123456789123456789ULL); auto child_pid = fork(); @@ -1025,6 +1053,9 @@ TEST_F(TestSendSignal, Generic) { } TEST_F(TestSendSignal, ToThread) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "SendSignalToThread requires threading"; +#endif #ifdef _WIN32 uint64_t dummy_thread_id = 42; ASSERT_RAISES(NotImplemented, SendSignalToThread(SIGINT, dummy_thread_id)); diff --git a/cpp/src/arrow/util/mutex.cc b/cpp/src/arrow/util/mutex.cc index 9f82ad45b0740..bbf2a9a93e692 100644 --- a/cpp/src/arrow/util/mutex.cc +++ b/cpp/src/arrow/util/mutex.cc @@ -24,6 +24,7 @@ #include #endif +#include "arrow/util/config.h" #include "arrow/util/logging.h" namespace arrow { @@ -35,9 +36,12 @@ struct Mutex::Impl { Mutex::Guard::Guard(Mutex* locked) : locked_(locked, [](Mutex* locked) { +#ifdef ARROW_ENABLE_THREADING DCHECK(!locked->impl_->mutex_.try_lock()); +#endif locked->impl_->mutex_.unlock(); - }) {} + }) { +} Mutex::Guard Mutex::TryLock() { DCHECK_NE(impl_, nullptr); diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 01d1ffd767fc9..26984e5f7735d 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -214,7 +214,14 @@ TEST(BitUtil, RoundTripIntValues) { void ValidateRle(const std::vector& values, int bit_width, uint8_t* expected_encoding, int expected_len) { const int len = 64 * 1024; +#ifdef __EMSCRIPTEN__ + // don't make this on the stack as it is + // too big for emscripten + std::vector buffer_vec(static_cast(len)); + uint8_t* buffer = buffer_vec.data(); +#else uint8_t buffer[len]; +#endif EXPECT_LE(expected_len, len); RleEncoder encoder(buffer, len, bit_width); @@ -227,7 +234,7 @@ void ValidateRle(const std::vector& values, int bit_width, if (expected_len != -1) { EXPECT_EQ(encoded_len, expected_len); } - if (expected_encoding != NULL) { + if (expected_encoding != NULL && encoded_len == expected_len) { EXPECT_EQ(memcmp(buffer, expected_encoding, encoded_len), 0); } @@ -256,7 +263,14 @@ void ValidateRle(const std::vector& values, int bit_width, // the returned values are not all the same bool CheckRoundTrip(const std::vector& values, int bit_width) { const int len = 64 * 1024; +#ifdef __EMSCRIPTEN__ + // don't make this on the stack as it is + // too big for emscripten + std::vector buffer_vec(static_cast(len)); + uint8_t* buffer = buffer_vec.data(); +#else uint8_t buffer[len]; +#endif RleEncoder encoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { bool result = encoder.Put(values[i]); diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index 30c5e6aae74ba..92d727019aaf5 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ 
b/cpp/src/arrow/util/value_parsing_test.cc @@ -794,6 +794,11 @@ TEST(TimestampParser, StrptimeZoneOffset) { if (!kStrptimeSupportsZone) { GTEST_SKIP() << "strptime does not support %z on this platform"; } +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Test temporarily disabled due to emscripten bug " + "https://github.com/emscripten-core/emscripten/issues/20467 "; +#endif + std::string format = "%Y-%d-%m %H:%M:%S%z"; auto parser = TimestampParser::MakeStrptime(format); diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index cf46cb8c6ad70..f6ba03552dcce 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1164,6 +1164,14 @@ tasks: flags: "-e ARROW_CSV=ON -e ARROW_PARQUET=ON" image: ubuntu-cpp-minimal + test-ubuntu-22.04-cpp-emscripten: + ci: github + template: docker-tests/github.linux.yml + params: + env: + UBUNTU: 22.04 + image: ubuntu-cpp-emscripten + {% for python_version in ["3.8", "3.9", "3.10", "3.11", "3.12"] %} test-conda-python-{{ python_version }}: ci: github diff --git a/docker-compose.yml b/docker-compose.yml index 9b0610fe553b5..46717557bc337 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -157,6 +157,7 @@ x-hierarchy: - ubuntu-csharp - ubuntu-cpp-sanitizer - ubuntu-cpp-thread-sanitizer + - ubuntu-cpp-emscripten - ubuntu-r-sanitizer - ubuntu-r-valgrind - ubuntu-swift @@ -652,6 +653,31 @@ services: ARROW_USE_TSAN: "ON" command: *cpp-command + ubuntu-cpp-emscripten: + # Usage: + # docker-compose build ubuntu-cpp-emscripten + # docker-compose run --rm ubuntu-cpp-emscripten + # Parameters: + # ARCH: amd64, arm64v8, ... + # UBUNTU: 22.04 + image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp + build: + context: . + dockerfile: ci/docker/ubuntu-${UBUNTU}-cpp.dockerfile + cache_from: + - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp + args: + arch: ${ARCH} + clang_tools: ${CLANG_TOOLS} + llvm: ${LLVM} + shm_size: *shm-size + volumes: *ubuntu-volumes + environment: + <<: [*common, *ccache, *sccache, *cpp] + ARROW_EMSCRIPTEN: "ON" + UBUNTU: + command: *cpp-command + fedora-cpp: # Usage: # docker-compose build fedora-cpp diff --git a/docs/source/developers/cpp/emscripten.rst b/docs/source/developers/cpp/emscripten.rst new file mode 100644 index 0000000000000..b4c563aae1a3b --- /dev/null +++ b/docs/source/developers/cpp/emscripten.rst @@ -0,0 +1,99 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + + +.. highlight:: console + +.. _developers-cpp-emscripten: +=============================================== +Cross compiling for WebAssembly with Emscripten +=============================================== + +Prerequisites +------------- +You need CMake and compilers etc. installed as per the normal build instructions. 
Before building with Emscripten, you also need to install Emscripten and +activate it using the commands below (see https://emscripten.org/docs/getting_started/downloads.html for details). + +.. code:: shell + + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + # replace with the desired EMSDK version. + # e.g. for Pyodide 0.24, you need EMSDK version 3.1.45 + ./emsdk install + ./emsdk activate + source ./emsdk_env.sh + +If you want to build PyArrow for `Pyodide `_, you +need ``pyodide-build`` installed via ``pip``, and to be running with the +same version of Python that Pyodide is built for, along with the same +versions of emsdk tools. + +.. code:: shell + + # install Pyodide build tools. + # e.g. for version 0.24 of Pyodide: + pip install pyodide-build==0.24 + +Then build with the ``ninja-release-emscripten`` CMake preset, +as shown below: + +.. code:: shell + + emcmake cmake --preset "ninja-release-emscripten" + ninja install + +This will install a static library build of ``libarrow`` into the +Emscripten sysroot cache, meaning you can build things that depend on it +and they will find ``libarrow``. + +For example, if you want to build for Pyodide, run the commands above, and then +go to ``arrow/python`` and run + +.. code:: shell + + pyodide build + +It should make a wheel targeting the currently enabled version of +Pyodide (i.e. the version corresponding to the currently installed +``pyodide-build``) in the ``dist`` subdirectory. + + +Manual Build +------------ + +If you want to manually build for Emscripten, take a look at the +``CMakePresets.json`` file in the ``arrow/cpp`` directory for a list of things +you will need to override. In particular, you will need: + +#. Build dependencies set to ``BUNDLED``, so it uses properly + cross-compiled build dependencies. + +#. ``CMAKE_TOOLCHAIN_FILE`` set by using ``emcmake cmake`` instead of just ``cmake``. + +#. You will quite likely need to set ``ARROW_ENABLE_THREADING`` to ``OFF`` + for builds targeting single-threaded Emscripten environments such as + Pyodide. + +#. ``ARROW_FLIGHT`` and anything else that uses the network probably won't + work. + +#. ``ARROW_JEMALLOC`` and ``ARROW_MIMALLOC`` again probably need to be + ``OFF``. + +#. ``ARROW_BUILD_STATIC`` set to ``ON`` and ``ARROW_BUILD_SHARED`` set to + ``OFF`` is most likely to work. diff --git a/docs/source/developers/cpp/index.rst b/docs/source/developers/cpp/index.rst index 36c9778bea1b0..603e1607dc543 100644 --- a/docs/source/developers/cpp/index.rst +++ b/docs/source/developers/cpp/index.rst @@ -27,5 +27,6 @@ C++ Development building development windows + emscripten conventions fuzzing From 139afe592a16e6fd168abc9f28b6c140071b4d4e Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Thu, 4 Apr 2024 19:51:29 -0700 Subject: [PATCH 18/49] GH-40775: [Benchmarking][Java] Fix conbench timeout (#40786) ### Rationale for this change The Java build script was recently updated and it is affecting conbench, which is now seeing timeouts when building Java. The logs are producing hundreds of GB of data due to an unnecessary debug log message. ### What changes are included in this PR? * Delete the log message on write to memory ### Are these changes tested? Yes, via conbench ### Are there any user-facing changes?
No * GitHub Issue: #40775 Authored-by: Dane Pitkin Signed-off-by: Sutou Kouhei --- .../org/apache/arrow/memory/netty/TestNettyAllocator.java | 7 +++++-- .../java/org/apache/arrow/vector/ipc/WriteChannel.java | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/memory/memory-netty/src/test/java/org/apache/arrow/memory/netty/TestNettyAllocator.java b/java/memory/memory-netty/src/test/java/org/apache/arrow/memory/netty/TestNettyAllocator.java index 07fdc3f784e43..a6da36bb35aa7 100644 --- a/java/memory/memory-netty/src/test/java/org/apache/arrow/memory/netty/TestNettyAllocator.java +++ b/java/memory/memory-netty/src/test/java/org/apache/arrow/memory/netty/TestNettyAllocator.java @@ -69,9 +69,12 @@ public void testMemoryUsage() { break; } } - assertTrue("Log messages are:\n" + + synchronized (memoryLogsAppender.list) { + assertTrue("Log messages are:\n" + memoryLogsAppender.list.stream().map(ILoggingEvent::toString).collect(Collectors.joining("\n")), - result); + result); + } + } finally { memoryLogsAppender.stop(); logger.detachAppender(memoryLogsAppender); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java index 9ad71f6fe8847..73bc2ecb12714 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java @@ -105,9 +105,6 @@ public long align() throws IOException { */ public long write(ByteBuffer buffer) throws IOException { long length = buffer.remaining(); - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("Writing buffer with size: {}", length); - } while (buffer.hasRemaining()) { out.write(buffer); } From 074d45f1c133e82da09ff7e6da706a3ad73c293a Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 5 Apr 2024 10:54:14 +0200 Subject: [PATCH 19/49] GH-39440: [Python] Calling pyarrow.dataset.ParquetFileFormat.make_write_options as a class method results in a segfault (#40976) ### Rationale for this change Calling the `make_write_options()` method as a class method instead of as an instance method results in a segfault. ### What changes are included in this PR? Adds a type check on `self` and raises an error if it is not a `ParquetFileFormat`. ### Are these changes tested? Yes. ### Are there any user-facing changes? No.
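As a quick usage sketch (not part of the patch, and assuming the `pyarrow.dataset` API as exercised by the new test below), the guard turns the previous crash into an ordinary Python exception:

```python
import pyarrow.dataset as ds

# Calling on the class itself used to segfault; with the type check on
# `self`, it now raises a TypeError instead.
try:
    ds.ParquetFileFormat.make_write_options(43)
except TypeError as exc:
    print(exc)  # make_write_options() should be called on an instance of ParquetFileFormat

# The supported pattern: call it on an instance.
options = ds.ParquetFileFormat().make_write_options()
```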
* GitHub Issue: #39440 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Antoine Pitrou Signed-off-by: AlenkaF --- python/pyarrow/_dataset_parquet.pyx | 4 ++++ python/pyarrow/tests/test_dataset.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 58ef6145cf7d1..a55e889ba8246 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -198,6 +198,10 @@ cdef class ParquetFileFormat(FileFormat): ------- pyarrow.dataset.FileWriteOptions """ + # Safeguard from calling make_write_options as a static class method + if not isinstance(self, ParquetFileFormat): + raise TypeError("make_write_options() should be called on " + "an instance of ParquetFileFormat") opts = FileFormat.make_write_options(self) (<ParquetFileWriteOptions> opts).update(**kwargs) return opts diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3d77214c174c5..6bba7240c05df 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -5630,3 +5630,16 @@ def test_checksum_write_dataset_read_dataset_to_table(tempdir): corrupted_dir_path, format=pq_read_format_crc ).to_table() + + +def test_make_write_options_error(): + # GH-39440 + msg = ("make_write_options\\(\\) should be called on an " + "instance of ParquetFileFormat") + with pytest.raises(TypeError, match=msg): + pa.dataset.ParquetFileFormat.make_write_options(43) + + pformat = pa.dataset.ParquetFileFormat() + msg = "make_write_options\\(\\) takes exactly 0 positional arguments" + with pytest.raises(TypeError, match=msg): + pformat.make_write_options(43) From 110efed4d0d8cf16dc201a422809efdca6de9518 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 5 Apr 2024 20:01:52 +0800 Subject: [PATCH 20/49] GH-41024: [C++] IO: fixing compiling in gcc 7.5.0 (#41025) ### Rationale for this change A previous PR ( https://github.com/apache/arrow/pull/39807 ) removed std::move when returning a value; however, this is not allowed by some old compilers. ### What changes are included in this PR? Add std::move on return, and add the reason for it. ### Are these changes tested? Should be tested by other CI. ### Are there any user-facing changes? No * GitHub Issue: #41024 Authored-by: mwish Signed-off-by: Sutou Kouhei --- cpp/src/arrow/io/compressed.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/io/compressed.cc b/cpp/src/arrow/io/compressed.cc index d06101748dc0c..5faa4d095eb1e 100644 --- a/cpp/src/arrow/io/compressed.cc +++ b/cpp/src/arrow/io/compressed.cc @@ -405,7 +405,9 @@ class CompressedInputStream::Impl { ARROW_ASSIGN_OR_RAISE(auto buf, AllocateResizableBuffer(nbytes, pool_)); ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buf->mutable_data())); RETURN_NOT_OK(buf->Resize(bytes_read)); - return buf; + // Using std::move because some compilers might have the issue below: + // https://wg21.cmeerw.net/cwg/issue1579 + return std::move(buf); } const std::shared_ptr<InputStream>& raw() const { return raw_; } From 925ca66474692c46c6dcdda29a732b5e7d628b64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 5 Apr 2024 14:26:20 +0200 Subject: [PATCH 21/49] GH-40549: [Java] Revert bump org.apache.maven.plugins:maven-shade-plugin from 3.2.4 to 3.5.2 in /java (#40462)" (#41006) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 8ee9679d401183220a4566681ca7ef9e887ba4d2.
### Rationale for this change Spark integration tests are failing due to this bump. ### What changes are included in this PR? Revert bump org.apache.maven.plugins:maven-shade-plugin from 3.2.4 to 3.5.2 ### Are these changes tested? Via CI. ### Are there any user-facing changes? They shouldn't. * GitHub Issue: #40549 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- java/flight/flight-core/pom.xml | 2 +- java/pom.xml | 2 +- java/vector/pom.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index 830caf8a28246..9ea6393f0f6df 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -164,7 +164,7 @@ issues in the arrow-tools tests looking up FlatBuffer dependencies. --> - 3.5.2 + 3.2.4 shade-main diff --git a/java/pom.xml b/java/pom.xml index 8e9ddd5480ea8..b6eb774f580d4 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -449,7 +449,7 @@ org.apache.maven.plugins maven-shade-plugin - 3.5.2 + 3.5.1 maven-surefire-plugin diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 20af3dbd38443..436ffd15b297d 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -179,7 +179,7 @@ issues in the arrow-tools tests looking up FlatBuffer dependencies. --> - 3.5.2 + 3.2.4 package From 7da285e39a89269f8aa175369d8da4e577805319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 5 Apr 2024 15:16:33 +0200 Subject: [PATCH 22/49] GH-41005: [CI] HDFS and skyhook tests require docker compose usage because they require multiple containers (#41027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change We are currently using docker run, but we also need the linked containers to be started. ### What changes are included in this PR? Use docker compose instead of docker. ### Are these changes tested? Via archery ### Are there any user-facing changes? No * GitHub Issue: #41005 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- dev/tasks/tasks.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index f6ba03552dcce..fc8d03b3d5bfb 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1090,6 +1090,7 @@ tasks: template: docker-tests/github.linux.yml params: env: + ARCHERY_USE_DOCKER_CLI: 0 UBUNTU: 20.04 flags: -e ARROW_SKYHOOK=ON image: ubuntu-cpp @@ -1534,8 +1535,9 @@ tasks: template: docker-tests/github.linux.yml params: env: - PYTHON: "3.10" + ARCHERY_USE_DOCKER_CLI: 0 HDFS: "{{ hdfs_version }}" + PYTHON: "3.10" image: conda-python-hdfs {% endfor %} From 17e51ccaace120cbcef3bfc8bf28553a1e7985a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 5 Apr 2024 16:04:56 +0200 Subject: [PATCH 23/49] GH-41007: [CI][Archery] Correctly interpolate environment variables from docker compose when using docker cli on archery docker (#41026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Currently our verification jobs are failing because environment variables are not being correctly interpolated. docker compose expects a double `$$` sign in the command, as explained here: https://docs.docker.com/compose/compose-file/12-interpolation/ When we use `ARCHERY_USE_DOCKER_CLI=1` we are using `docker run` instead of `docker compose run`. The behaviour of the command changes and we have to update the environment variable.
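A minimal sketch of the mismatch (illustrative command and variable names, not from the repository): a command written for docker-compose escapes a literal dollar sign as `$$`, so when archery bypasses `docker compose run`, it has to undo that escaping itself, which is what the `cmd.replace("$$", "$")` call in the diff below does:

```python
import shlex

# Command as it would be written in docker-compose.yml, where "$$" means a
# literal "$" that the shell inside the container should expand.
compose_command = 'bash -c "echo $${PYTHON}"'

# When rebuilding the command for a plain `docker run`, undo the escaping
# so the container shell still sees ${PYTHON}.
docker_cli_command = compose_command.replace("$$", "$")
print(shlex.split(docker_cli_command))
# -> ['bash', '-c', 'echo ${PYTHON}']
```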
### What changes are included in this PR? Use correct docker compose binary when using `docker cli` and remove double $$. ### Are these changes tested? Via archery ### Are there any user-facing changes? No * GitHub Issue: #41007 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- dev/archery/archery/docker/cli.py | 4 +++- dev/archery/archery/docker/core.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dev/archery/archery/docker/cli.py b/dev/archery/archery/docker/cli.py index e6baf0ca1f002..7053db2afccff 100644 --- a/dev/archery/archery/docker/cli.py +++ b/dev/archery/archery/docker/cli.py @@ -77,7 +77,9 @@ def docker(ctx, src, dry_run, using_docker_cli, using_docker_buildx): compose = DockerCompose(config_path, params=os.environ, using_docker=using_docker_cli, using_buildx=using_docker_buildx, - debug=ctx.obj.get('debug', False)) + debug=ctx.obj.get('debug', False), + compose_bin=("docker compose" if using_docker_cli + else "docker-compose")) if dry_run: _mock_compose_calls(compose) ctx.obj['compose'] = compose diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index 38720e5856a14..0b49111dd6944 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -402,6 +402,10 @@ def run(self, service_name, command=None, *, env=None, volumes=None, # on the docker-compose yaml file. if isinstance(cmd, list): cmd = shlex.join(cmd) + # Match behaviour from docker compose + # to interpolate environment variables + # https://docs.docker.com/compose/compose-file/12-interpolation/ + cmd = cmd.replace("$$", "$") args.extend(shlex.split(cmd)) # execute as a plain docker cli command From 83359d6958273be534f376f40976196c14675c8b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 5 Apr 2024 23:35:59 +0900 Subject: [PATCH 24/49] GH-40962: [GLib] Add missing "no" to suppress warnings (#41030) ### Rationale for this change There are some `unused-but-set-variable` warnings for Vala examples on macOS: ```text FAILED: example/vala/write-file.p/meson-generated_write-file.c.o ccache cc -Iexample/vala/write-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/write-file.p/meson-generated_write-file.c.o -MF example/vala/write-file.p/meson-generated_write-file.c.o.d -o example/vala/write-file.p/meson-generated_write-file.c.o -c example/vala/write-file.p/write-file.c write-file.c:373:8: error: variable '_tmp45__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp45__length1; ^ write-file.c:504:8: error: variable '_tmp57__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp57__length1; ^ write-file.c:635:8: error: variable '_tmp69__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp69__length1; ^ write-file.c:766:8: error: variable '_tmp81__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp81__length1; ^ write-file.c:897:8: error: variable '_tmp93__length1' set but not used [-Werror,-Wunused-but-set-variable] gint 
_tmp93__length1; ^ write-file.c:1028:8: error: variable '_tmp105__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp105__length1; ^ write-file.c:1159:8: error: variable '_tmp117__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp117__length1; ^ write-file.c:1290:8: error: variable '_tmp129__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp129__length1; ^ write-file.c:1421:8: error: variable '_tmp141__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp141__length1; ^ write-file.c:1552:8: error: variable '_tmp153__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp153__length1; ^ 10 errors generated. ``` ### What changes are included in this PR? Add the missing `no` to the option that suppresses this warning: -Wunused-but-set-variable -> -Wno-unused-but-set-variable ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40962 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/example/vala/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_glib/example/vala/meson.build b/c_glib/example/vala/meson.build index ff65a7328f171..b7eb86200ddd6 100644 --- a/c_glib/example/vala/meson.build +++ b/c_glib/example/vala/meson.build @@ -19,7 +19,7 @@ if generate_vapi c_flags = [ - '-Wunused-but-set-variable', + '-Wno-unused-but-set-variable', ] c_flags = meson.get_compiler('c').get_supported_arguments(c_flags) vala_example_executable_kwargs = { From bf3c1b99c3398e3ed390e4eb7f2895f64e9abf80 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:09:42 -0700 Subject: [PATCH 25/49] GH-41039: [Python] ListView pandas tests should use np.nan instead of None (#41040) ### Rationale for this change ListView pandas conversion tests are failing in upstream development branches of pandas/numpy. ### What changes are included in this PR? * Use np.nan instead of None when comparing with an expected Pandas Series. ### Are these changes tested? Yes, unit tests updated. ### Are there any user-facing changes? No.
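A minimal sketch of the updated expectation (a hypothetical standalone example using a plain list array for brevity, assuming it converts the same way as the `ListViewArray`/`LargeListViewArray` types exercised by the patched tests below): inner nulls in an integer list come back as float `np.nan` after conversion, so the expected Series must use `np.nan` rather than `None` for values inside lists:

```python
import numpy as np
import pandas as pd
import pyarrow as pa

arr = pa.array([[1, None], [], None], type=pa.list_(pa.int64()))
actual = arr.to_pandas()

# Inner nulls become np.nan (float), while a null list stays None.
expected = pd.Series([[1, np.nan], [], None])
pd.testing.assert_series_equal(actual, expected)
```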
* GitHub Issue: #41039 Authored-by: Dane Pitkin Signed-off-by: AlenkaF --- python/pyarrow/tests/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 90b9bd8b8c453..3678b4e57a9a8 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2571,7 +2571,7 @@ def test_list_view_to_pandas_with_null_values(self, klass): ) actual = arr.to_pandas() - expected = pd.Series([[1, None], [], None]) + expected = pd.Series([[1, np.nan], [], None]) tm.assert_series_equal(actual, expected) @@ -2593,7 +2593,7 @@ def test_list_view_to_pandas_multiple_chunks(self, klass): arr = pa.chunked_array([arr1, arr2]) actual = arr.to_pandas() - expected = pd.Series([[3, 4], [2, 3], [1, 2], [5, 6, 7], [6, 7, None], None]) + expected = pd.Series([[3, 4], [2, 3], [1, 2], [5, 6, 7], [6, 7, np.nan], None]) tm.assert_series_equal(actual, expected) From 02bc6537f5989bc37896429394e8c6b0cb843889 Mon Sep 17 00:00:00 2001 From: Austin Dickey Date: Fri, 5 Apr 2024 16:19:16 -0500 Subject: [PATCH 26/49] GH-41015: [JS] [Benchmarking] allow JS benchmarks to run more portably (#41031) --- js/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/package.json b/js/package.json index bb70fd0a395b0..773cf23e35b36 100644 --- a/js/package.json +++ b/js/package.json @@ -11,7 +11,7 @@ "build": "cross-env NODE_NO_WARNINGS=1 gulp build", "clean": "cross-env NODE_NO_WARNINGS=1 gulp clean", "debug": "cross-env NODE_NO_WARNINGS=1 gulp debug", - "perf": "perf/index.ts", + "perf": "node --no-warnings --loader ts-node/esm/transpile-only perf/index.ts", "test:integration": "bin/integration.ts --mode validate", "release": "./npm-release.sh", "clean:all": "yarn clean && yarn clean:testdata", From be3baf2697fa449d527e475749d0619f4f77bead Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Fri, 5 Apr 2024 15:05:11 -0700 Subject: [PATCH 27/49] GH-40680: [Java] Test JDK 22 in CI (#41038) ### Rationale for this change JDK 22 is released. Let's ensure Arrow java CI tests this version successfully. ### What changes are included in this PR? * Add JDK 22 to CI ### Are these changes tested? Tested by CI jobs ### Are there any user-facing changes? No * GitHub Issue: #40680 Authored-by: Dane Pitkin Signed-off-by: David Li --- .github/workflows/java.yml | 4 ++-- docs/source/java/install.rst | 10 +++++----- java/pom.xml | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 611e202ca0624..e92d3f4fc5877 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -58,8 +58,8 @@ jobs: strategy: fail-fast: false matrix: - jdk: [8, 11, 17, 21] - maven: [3.9.5] + jdk: [8, 11, 17, 21, 22] + maven: [3.9.6] image: [java] env: JDK: ${{ matrix.jdk }} diff --git a/docs/source/java/install.rst b/docs/source/java/install.rst index 7ac1a4990f37d..a551edc36c477 100644 --- a/docs/source/java/install.rst +++ b/docs/source/java/install.rst @@ -29,8 +29,8 @@ Java modules are regularly built and tested on macOS and Linux distributions. Java Compatibility ================== -Java modules are compatible with JDK 8 and above. -Currently, JDK 8, 11, 17, and 21 are tested in CI. +Java modules are compatible with JDK 8 and above. Currently, JDK versions +8, 11, 17, and 21 are tested in CI. The latest JDK is also tested in CI. 
When using Java 9 or later, some JDK internals must be exposed by adding ``--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED`` to the ``java`` command: @@ -61,7 +61,7 @@ Modifying the command above for Flight: $ env _JAVA_OPTIONS="--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... Otherwise, you may see errors like ``java.lang.IllegalAccessError: superclass access check failed: class -org.apache.arrow.flight.ArrowMessage$ArrowBufRetainingCompositeByteBuf (in module org.apache.arrow.flight.core) +org.apache.arrow.flight.ArrowMessage$ArrowBufRetainingCompositeByteBuf (in module org.apache.arrow.flight.core) cannot access class io.netty.buffer.CompositeByteBuf (in unnamed module ...) because module org.apache.arrow.flight.core does not read unnamed module ... @@ -74,8 +74,8 @@ Modifying the command above for arrow-memory: # Indirectly via environment variables $ env _JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.dataset,org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... -Otherwise you may see errors such as ``java.lang.RuntimeException: java.lang.reflect.InaccessibleObjectException: -Unable to make static void java.nio.Bits.reserveMemory(long,long) accessible: module +Otherwise you may see errors such as ``java.lang.RuntimeException: java.lang.reflect.InaccessibleObjectException: +Unable to make static void java.nio.Bits.reserveMemory(long,long) accessible: module java.base does not "opens java.nio" to module org.apache.arrow.dataset`` If using Maven and Surefire for unit testing, :ref:`this argument must diff --git a/java/pom.xml b/java/pom.xml index b6eb774f580d4..95b27922eafa9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -46,7 +46,7 @@ 9+181-r4173-1 2.24.0 3.12.1 - 5.5.0 + 5.11.0 5.2.0 3.42.0 From 3886cf1d43701afe25f3d008b81ae62e880e7878 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 5 Apr 2024 17:55:28 -0500 Subject: [PATCH 28/49] GH-40991: [R] Prefer r-universe, add a startup message (#41019) ### Rationale for this change If someone loads a version of Arrow on macOS with features disabled, warn them on startup that they can use `install_arrow()`. By default, prefer R-Universe in `install_arrow()` ### What changes are included in this PR? ^^^ ### Are these changes tested? Yes ### Are there any user-facing changes? 
Yes **This PR contains a "Critical Fix".** * GitHub Issue: #40991 Lead-authored-by: Jonathan Keane Co-authored-by: Neal Richardson Signed-off-by: Jonathan Keane --- r/DESCRIPTION | 2 +- r/R/arrow-info.R | 3 +- r/R/arrow-package.R | 71 ++++++++++++++++++++++++++---------------- r/R/install-arrow.R | 14 ++------- r/man/arrow-package.Rd | 4 +-- r/man/format_schema.Rd | 18 +++++++++++ 6 files changed, 70 insertions(+), 42 deletions(-) create mode 100644 r/man/format_schema.Rd diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 6062a8c4f4689..c9f84e2e794c4 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -43,7 +43,7 @@ Imports: utils, vctrs Roxygen: list(markdown = TRUE, r6 = FALSE, load = "source") -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Config/testthat/edition: 3 Config/build/bootstrap: TRUE Suggests: diff --git a/r/R/arrow-info.R b/r/R/arrow-info.R index 54145b2ee1204..916b6683fbcce 100644 --- a/r/R/arrow-info.R +++ b/r/R/arrow-info.R @@ -139,7 +139,8 @@ arrow_with_json <- function() { some_features_are_off <- function(features) { # `features` is a named logical vector (as in arrow_info()$capabilities) # Let's exclude some less relevant ones - blocklist <- c("lzo", "bz2", "brotli", "substrait") + # jemalloc is only included because it is sometimes disabled in our build process + blocklist <- c("lzo", "bz2", "brotli", "substrait", "jemalloc") # Return TRUE if any of the other features are FALSE !all(features[setdiff(names(features), blocklist)]) } diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 54e237192e080..f6977e626276b 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -182,37 +182,54 @@ configure_tzdb <- function() { .onAttach <- function(libname, pkgname) { # Just to be extra safe, let's wrap this in a try(); # we don't want a failed startup message to prevent the package from loading - try({ - # On macOS only, Check if we are running in under emulation, and warn this will not work - if (on_rosetta()) { - packageStartupMessage( - paste( - "Warning:", - " It appears that you are running R and Arrow in emulation (i.e. you're", - " running an Intel version of R on a non-Intel mac). This configuration is", - " not supported by arrow, you should install a native (arm64) build of R", - " and use arrow with that. See https://cran.r-project.org/bin/macosx/", - "", - sep = "\n" + try( + { + # On macOS only, Check if we are running in under emulation, and warn this will not work + if (on_rosetta()) { + packageStartupMessage( + paste( + "Warning:", + " It appears that you are running R and Arrow in emulation (i.e. you're", + " running an Intel version of R on a non-Intel mac). This configuration is", + " not supported by arrow, you should install a native (arm64) build of R", + " and use arrow with that. See https://cran.r-project.org/bin/macosx/", + "", + sep = "\n" + ) ) - ) - } + } - features <- arrow_info()$capabilities - # That has all of the #ifdef features, plus the compression libs and the - # string libraries (but not the memory allocators, they're added elsewhere) - # - # Let's print a message if some are off - if (some_features_are_off(features)) { - packageStartupMessage( - paste( - "Some features are not enabled in this build of Arrow.", - "Run `arrow_info()` for more information." 
+ features <- arrow_info()$capabilities + # That has all of the #ifdef features, plus the compression libs and the + # string libraries (but not the memory allocators, they're added elsewhere) + # + # Let's print a message if some are off + if (some_features_are_off(features)) { + packageStartupMessage( + paste( + "Some features are not enabled in this build of Arrow.", + "Run `arrow_info()` for more information." + ) ) - ) - } - }, silent = TRUE) + # On macOS binaries from CRAN can be hobbled. They sometimes restrict access to our + # dependency source downloading even though we can build from source on their machines. + # They also refuse to allow libarrow binaries to be downloaded, so instead distribute + # hobbled arrow binaries + # If on macOS, and features are disabled, advise that reinstalling might help + if (identical(tolower(Sys.info()[["sysname"]]), "darwin")) { + packageStartupMessage( + paste0( + "The repository you retrieved Arrow from did not include all of Arrow's features.\n", + "You can install a fully-featured version by running:\n", + "`install.packages('arrow', repos = 'https://apache.r-universe.dev')`." + ) + ) + } + } + }, + silent = TRUE + ) } # Clean up the StopSource that was registered in .onLoad() so that if the diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 74d3a96454777..44b876537490c 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -76,22 +76,14 @@ install_arrow <- function(nightly = FALSE, ARROW_R_DEV = verbose, ARROW_USE_PKG_CONFIG = use_system ) - # On the M1, we can't use the usual autobrew, which pulls Intel dependencies - apple_m1 <- grepl("arm-apple|aarch64.*darwin", R.Version()$platform) - # On Rosetta, we have to build without JEMALLOC, so we also can't autobrew - rosetta <- on_rosetta() - if (rosetta) { + # On Rosetta, we have to build without JEMALLOC + if (on_rosetta()) { Sys.setenv(ARROW_JEMALLOC = "OFF") - } - if (apple_m1 || rosetta) { Sys.setenv(FORCE_BUNDLED_BUILD = "true") } opts <- list() - if (apple_m1 || rosetta) { - # Skip binaries (esp. 
for rosetta) - opts$pkgType <- "source" - } else if (isTRUE(binary)) { + if (isTRUE(binary)) { # Unless otherwise directed, don't consider newer source packages when # options(pkgType) == "both" (default on win/mac) opts$install.packages.check.source <- "no" diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd index 15f672a1fe949..41ec7956fe815 100644 --- a/r/man/arrow-package.Rd +++ b/r/man/arrow-package.Rd @@ -18,15 +18,15 @@ Useful links: } \author{ -\strong{Maintainer}: Nic Crane \email{thisisnic@gmail.com} +\strong{Maintainer}: Jonathan Keane \email{jkeane@gmail.com} Authors: \itemize{ \item Neal Richardson \email{neal.p.richardson@gmail.com} \item Ian Cook \email{ianmcook@gmail.com} + \item Nic Crane \email{thisisnic@gmail.com} \item Dewey Dunnington \email{dewey@fishandwhistle.net} (\href{https://orcid.org/0000-0002-9415-4582}{ORCID}) \item Romain François (\href{https://orcid.org/0000-0002-2444-4226}{ORCID}) - \item Jonathan Keane \email{jkeane@gmail.com} \item Dragoș Moldovan-Grünfeld \email{dragos.mold@gmail.com} \item Jeroen Ooms \email{jeroen@berkeley.edu} \item Jacob Wujciak-Jens \email{jacob@wujciak.de} diff --git a/r/man/format_schema.Rd b/r/man/format_schema.Rd new file mode 100644 index 0000000000000..d1c81e3fe7623 --- /dev/null +++ b/r/man/format_schema.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/schema.R +\name{format_schema} +\alias{format_schema} +\title{Get a string representing a Dataset or RecordBatchReader object's schema} +\usage{ +format_schema(obj) +} +\arguments{ +\item{obj}{a Dataset or RecordBatchReader} +} +\value{ +A string containing a formatted representation of the schema of \code{obj} +} +\description{ +Get a string representing a Dataset or RecordBatchReader object's schema +} +\keyword{internal} From df7da77acbecdc617b8fa03863cbdb38321b629c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 6 Apr 2024 08:13:55 +0900 Subject: [PATCH 29/49] GH-40855: [C++][ORC] Fix `std::filesystem` related link error with ORC 2.0.0 or later (#41023) ### Rationale for this change If we use `std::filesystem`, we need `-lstdc++fs` with GCC 8 and `-lc++fs` for clang 7. We don't want to maintain CMake code for GCC 8/clang 7. ### What changes are included in this PR? * Don't use `std::filesystem` with ORC 2.0.0 or later. * Use missing `-lstdc++fs`/`-lc++fs` with bundled ORC 2.0.0. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* GitHub Issue: #40855 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/FindorcAlt.cmake | 12 +++++++++++- cpp/cmake_modules/ThirdpartyToolchain.cmake | 14 ++++++++++++++ cpp/src/arrow/CMakeLists.txt | 4 ++++ cpp/src/arrow/adapters/orc/adapter.cc | 14 ++++++++++---- 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/cpp/cmake_modules/FindorcAlt.cmake b/cpp/cmake_modules/FindorcAlt.cmake index dc3b978cf4037..289416678ad39 100644 --- a/cpp/cmake_modules/FindorcAlt.cmake +++ b/cpp/cmake_modules/FindorcAlt.cmake @@ -29,6 +29,7 @@ endif() find_package(orc ${find_package_args}) if(orc_FOUND) set(orcAlt_FOUND TRUE) + set(orcAlt_VERSION ${orc_VERSION}) return() endif() @@ -51,8 +52,17 @@ else() NAMES orc/orc-config.hh PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() +if(ORC_INCLUDE_DIR) + file(READ "${ORC_INCLUDE_DIR}/orc/orc-config.hh" ORC_CONFIG_HH_CONTENT) + string(REGEX MATCH "#define ORC_VERSION \"[0-9.]+\"" ORC_VERSION_DEFINITION + "${ORC_CONFIG_HH_CONTENT}") + string(REGEX MATCH "[0-9.]+" ORC_VERSION "${ORC_VERSION_DEFINITION}") +endif() -find_package_handle_standard_args(orcAlt REQUIRED_VARS ORC_STATIC_LIB ORC_INCLUDE_DIR) +find_package_handle_standard_args( + orcAlt + REQUIRED_VARS ORC_STATIC_LIB ORC_INCLUDE_DIR + VERSION_VAR ORC_VERSION) if(orcAlt_FOUND) if(NOT TARGET orc::orc) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4a67eac1d4d59..7d54ccccf7c19 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4559,6 +4559,15 @@ macro(build_orc) endif() target_link_libraries(orc::orc INTERFACE ${CMAKE_DL_LIBS}) endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9") + target_link_libraries(orc::orc INTERFACE stdc++fs) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8") + target_link_libraries(orc::orc INTERFACE c++fs) + endif() + endif() add_dependencies(orc::orc orc_ep) @@ -4568,6 +4577,11 @@ endmacro() if(ARROW_ORC) resolve_dependency(orc HAVE_ALT TRUE) target_link_libraries(orc::orc INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) + if(ORC_VENDORED) + set(ARROW_ORC_VERSION ${ARROW_ORC_BUILD_VERSION}) + else() + set(ARROW_ORC_VERSION ${orcAlt_VERSION}) + endif() message(STATUS "Found ORC static library: ${ORC_STATIC_LIB}") message(STATUS "Found ORC headers: ${ORC_INCLUDE_DIR}") endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 617bfedabf373..026bb5c77e066 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -899,6 +899,10 @@ if(ARROW_ORC) adapters/orc/util.cc) foreach(ARROW_ORC_TARGET ${ARROW_ORC_TARGETS}) target_link_libraries(${ARROW_ORC_TARGET} PRIVATE orc::orc) + if(ARROW_ORC_VERSION VERSION_LESS "2.0.0") + target_compile_definitions(${ARROW_ORC_TARGET} + PRIVATE ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK) + endif() endforeach() else() set(ARROW_ORC_TARGET_SHARED) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 127ec49ba990f..98784450b3cce 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -18,13 +18,16 @@ #include "arrow/adapters/orc/adapter.h" #include -#include #include #include #include #include #include +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK +#include +#endif + #include "arrow/adapters/orc/util.h" #include "arrow/builder.h" #include "arrow/io/interfaces.h" @@ -183,11 
+186,9 @@ liborc::RowReaderOptions DefaultRowReaderOptions() { return options; } +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK // Proactively check timezone database availability for ORC versions older than 2.0.0 Status CheckTimeZoneDatabaseAvailability() { - if (GetOrcMajorVersion() >= 2) { - return Status::OK(); - } auto tz_dir = std::getenv("TZDIR"); bool is_tzdb_avaiable = tz_dir != nullptr ? std::filesystem::exists(tz_dir) @@ -200,6 +201,7 @@ Status CheckTimeZoneDatabaseAvailability() { } return Status::OK(); } +#endif } // namespace @@ -559,7 +561,9 @@ ORCFileReader::~ORCFileReader() {} Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); +#endif auto result = std::unique_ptr(new ORCFileReader()); RETURN_NOT_OK(result->impl_->Open(file, pool)); return std::move(result); @@ -826,7 +830,9 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } Result> ORCFileWriter::Open( io::OutputStream* output_stream, const WriteOptions& writer_options) { +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); +#endif std::unique_ptr result = std::unique_ptr(new ORCFileWriter()); Status status = result->impl_->Open(output_stream, writer_options); From 6f6b3b49c013ab2982fbd8b50313c3b0539bc7ca Mon Sep 17 00:00:00 2001 From: "Marcus D. Hanwell" Date: Fri, 5 Apr 2024 19:20:13 -0400 Subject: [PATCH 30/49] GH-40400: [C++] Add support for LLD (#40927) ### Rationale for this change Add support for the LLD LLVM linker, it is faster than the default linker. ### What changes are included in this PR? Added `ARROW_USE_LLD` as a CMake option, test if GCC supports the flags and then set the linker flags if so when the option is enabled. ### Are these changes tested? When this is enabled then the libraries and tests will use the linker flag. ### Are there any user-facing changes? No * GitHub Issue: #40400 Authored-by: Marcus D. 
Hanwell Signed-off-by: Sutou Kouhei --- ci/docker/ubuntu-20.04-cpp.dockerfile | 2 ++ ci/scripts/cpp_build.sh | 1 + cpp/cmake_modules/DefineOptions.cmake | 2 ++ cpp/cmake_modules/SetupCxxFlags.cmake | 28 +++++++++++++++++++++++++++ 4 files changed, 33 insertions(+) diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 3e3b7ac3a6d99..124256378b287 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -101,6 +101,7 @@ RUN apt-get update -y -q && \ libutf8proc-dev \ libxml2-dev \ libzstd-dev \ + lld \ make \ ninja-build \ nlohmann-json3-dev \ @@ -164,6 +165,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_SUBSTRAIT=ON \ ARROW_USE_ASAN=OFF \ ARROW_USE_CCACHE=ON \ + ARROW_USE_LLD=ON \ ARROW_USE_UBSAN=OFF \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 52c89acb9a76a..e28ceae8801f0 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -166,6 +166,7 @@ else -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \ -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \ -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ + -DARROW_USE_LLD=${ARROW_USE_LLD:-OFF} \ -DARROW_USE_MOLD=${ARROW_USE_MOLD:-OFF} \ -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ -DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 5b8bcb3ac6965..dc0e5da63adb7 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -170,6 +170,8 @@ takes precedence over ccache if a storage backend is configured" ON) define_option(ARROW_USE_LD_GOLD "Use ld.gold for linking on Linux (if available)" OFF) + define_option(ARROW_USE_LLD "Use the LLVM lld for linking (if available)" OFF) + define_option(ARROW_USE_MOLD "Use mold for linking on Linux (if available)" OFF) define_option(ARROW_USE_PRECOMPILED_HEADERS "Use precompiled headers when compiling" diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 1d709fe98d7fe..3e580dfc109cf 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -663,6 +663,34 @@ if(NOT WIN32 AND NOT APPLE) endif() endif() +if(ARROW_USE_LLD) + find_program(LD_LLD ld.lld) + if(LD_LLD) + unset(LLD_LINKER_FLAGS) + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.1.0") + set(LLD_LINKER_FLAGS "-fuse-ld=lld") + else() + message(STATUS "Need GCC 9.1.0 or later to use LLD linker: ${CMAKE_CXX_COMPILER_VERSION}" + ) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(LLD_LINKER_FLAGS "--ld-path=${LD_LLD}") + else() + message(STATUS "Using the default linker because compiler doesn't support LLD: ${CMAKE_CXX_COMPILER_ID}" + ) + endif() + if(LLD_LINKER_FLAGS) + message(STATUS "Using optional LLVM LLD linker") + string(APPEND CMAKE_EXE_LINKER_FLAGS " ${LLD_LINKER_FLAGS}") + string(APPEND CMAKE_MODULE_LINKER_FLAGS " ${LLD_LINKER_FLAGS}") + string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${LLD_LINKER_FLAGS}") + else() + message(STATUS "Using the default linker because the LLD isn't supported") + endif() + endif() +endif() + # compiler flags for different build types (run 'cmake -DCMAKE_BUILD_TYPE= .') # For all builds: # For CMAKE_BUILD_TYPE=Debug From 6aa33210ecd9773f18ceb775306b79de132b6c6f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 5 Apr 2024 23:05:12 -0300 Subject: [PATCH 31/49] GH-41044: 
[C++] formatting.h: Make sure space is allocated for the 'Z' when formatting timestamps (#41045) ### What changes are included in this PR? A test that reproduces an issue found by the fuzzer and a fix for it. ### Are these changes tested? - A test - Comments clarifying somethings around `formatting.h` - Increasing the size of the local buffer used to format timestamps The issue was introduced only recently (unreleased): #39272 * GitHub Issue: #41044 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/util/formatting.h | 11 ++++++++--- cpp/src/arrow/util/formatting_util_test.cc | 8 ++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/formatting.h b/cpp/src/arrow/util/formatting.h index 6125f792ff988..dd9af907ecc37 100644 --- a/cpp/src/arrow/util/formatting.h +++ b/cpp/src/arrow/util/formatting.h @@ -126,8 +126,10 @@ namespace detail { ARROW_EXPORT extern const char digit_pairs[]; // Based on fmtlib's format_int class: -// Write digits from right to left into a stack allocated buffer -inline void FormatOneChar(char c, char** cursor) { *--*cursor = c; } +// Write digits from right to left into a stack allocated buffer. +// \pre *cursor points to the byte after the one that will be written. +// \post *cursor points to the byte that was written. +inline void FormatOneChar(char c, char** cursor) { *(--(*cursor)) = c; } template void FormatOneDigit(Int value, char** cursor) { @@ -326,6 +328,7 @@ class StringFormatter : public FloatToStringFormatterMixin constexpr size_t BufferSizeHH_MM_SS() { + // "23:59:59" ("." "9"+)? return detail::Digits10(23) + 1 + detail::Digits10(59) + 1 + detail::Digits10(59) + 1 + detail::Digits10(Duration::period::den) - 1; } @@ -505,8 +509,9 @@ class StringFormatter { timepoint_days -= days(1); } + // YYYY_MM_DD " " HH_MM_SS "Z"? constexpr size_t buffer_size = - detail::BufferSizeYYYY_MM_DD() + 1 + detail::BufferSizeHH_MM_SS(); + detail::BufferSizeYYYY_MM_DD() + 1 + detail::BufferSizeHH_MM_SS() + 1; std::array buffer; char* cursor = buffer.data() + buffer_size; diff --git a/cpp/src/arrow/util/formatting_util_test.cc b/cpp/src/arrow/util/formatting_util_test.cc index 13f57a495d639..fcbeec347d32a 100644 --- a/cpp/src/arrow/util/formatting_util_test.cc +++ b/cpp/src/arrow/util/formatting_util_test.cc @@ -533,6 +533,14 @@ TEST(Formatting, Timestamp) { } } + { + constexpr int64_t kMillisInDay = 24 * 60 * 60 * 1000; + auto ty = timestamp(TimeUnit::MILLI, "+01:00"); + StringFormatter formatter(ty.get()); + AssertFormatting(formatter, -15000 * 365 * kMillisInDay + 1, + "-13021-12-17 00:00:00.001Z"); + } + { auto ty = timestamp(TimeUnit::MILLI, "Pacific/Maruesas"); StringFormatter formatter(ty.get()); From 8fd3ce9e64565935b17c87bd00037d8e96528cce Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sun, 7 Apr 2024 12:23:43 -0300 Subject: [PATCH 32/49] GH-40898: [C#] Do not import length-zero buffers from C Data Interface Arrays (#41054) ### Rationale for this change When implementing integration tests for nanoarrow, it was observed that C# never released arrays where `array->buffers[i]` was `NULL` (including any buffers of any recursive child arrays). This is allowed ( https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowArray.buffers ); however, every other implementation appears to allocate even for length zero buffers (including nanoarrow after https://github.com/apache/arrow-nanoarrow/pull/399 ). ### What changes are included in this PR? 
`AddMemory()` is replaced with `ArrowBuffer.Empty` if the length of the imported buffer would have been 0 bytes. For other buffers (or anywhere I saw dereferencing a buffer pointer), I added a `Debug.Assert` just to be sure. ### Are these changes tested? I'm not sure what the best way to test them is! They won't be tested in the nanoarrow integration tests since at the point that they run, nanoarrow will no longer export arrays that would trigger this. ### Are there any user-facing changes? No * GitHub Issue: #40898 Authored-by: Dewey Dunnington Signed-off-by: Curt Hagenlocher --- .../src/Apache.Arrow/C/CArrowArrayImporter.cs | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index fbb2be661fc5d..abe02dcbb591f 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -17,6 +17,7 @@ using System; using System.Collections.Generic; +using System.Diagnostics; using System.Runtime.InteropServices; using Apache.Arrow.Memory; using Apache.Arrow.Types; @@ -36,7 +37,7 @@ public static class CArrowArrayImporter /// Typically, you will allocate an uninitialized CArrowArray pointer, /// pass that to external function, and then use this method to import /// the result. - /// + /// /// /// CArrowArray* importedPtr = CArrowArray.Create(); /// foreign_export_function(importedPtr); @@ -71,7 +72,7 @@ public static unsafe IArrowArray ImportArray(CArrowArray* ptr, IArrowType type) /// Typically, you will allocate an uninitialized CArrowArray pointer, /// pass that to external function, and then use this method to import /// the result. - /// + /// /// /// CArrowArray* importedPtr = CArrowArray.Create(); /// foreign_export_function(importedPtr); @@ -256,6 +257,19 @@ private ArrowBuffer ImportValidityBuffer(CArrowArray* cArray) return (cArray->buffers[0] == null) ? 
ArrowBuffer.Empty : new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, validityLength)); } + private ArrowBuffer ImportCArrayBuffer(CArrowArray* cArray, int i, int lengthBytes) + { + if (lengthBytes > 0) + { + Debug.Assert(cArray->buffers[i] != null); + return new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[i], 0, lengthBytes)); + } + else + { + return ArrowBuffer.Empty; + } + } + private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 3) @@ -266,12 +280,13 @@ private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray) int length = checked((int)cArray->length); int offsetsLength = (length + 1) * 4; int* offsets = (int*)cArray->buffers[1]; + Debug.Assert(offsets != null); int valuesLength = offsets[length]; ArrowBuffer[] buffers = new ArrowBuffer[3]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); - buffers[2] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[2], 0, valuesLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); + buffers[2] = ImportCArrayBuffer(cArray, 2, valuesLength); return buffers; } @@ -289,10 +304,10 @@ private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray) long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1]; ArrowBuffer[] buffers = new ArrowBuffer[cArray->n_buffers - 1]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, viewsLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, viewsLength); for (int i = 2; i < buffers.Length; i++) { - buffers[i] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[i], 0, checked((int)bufferLengths[i - 2]))); + buffers[i] = ImportCArrayBuffer(cArray, i, checked((int)bufferLengths[i - 2])); } return buffers; @@ -310,7 +325,7 @@ private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) ArrowBuffer[] buffers = new ArrowBuffer[2]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); return buffers; } @@ -327,8 +342,8 @@ private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray) ArrowBuffer[] buffers = new ArrowBuffer[3]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); - buffers[2] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[2], 0, offsetsLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); + buffers[2] = ImportCArrayBuffer(cArray, 2, offsetsLength); return buffers; } @@ -356,8 +371,8 @@ private ArrowBuffer[] ImportDenseUnionBuffers(CArrowArray* cArray) int offsetsLength = length * 4; ArrowBuffer[] buffers = new ArrowBuffer[2]; - buffers[0] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, length)); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); + buffers[0] = ImportCArrayBuffer(cArray, 0, length); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); return buffers; } @@ -370,7 +385,7 @@ private ArrowBuffer[] ImportSparseUnionBuffers(CArrowArray* cArray) } ArrowBuffer[] buffers = new ArrowBuffer[1]; - buffers[0] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, checked((int)cArray->length))); + buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->length)); return buffers; } @@ -392,7 +407,7 @@ private ArrowBuffer[] ImportFixedWidthBuffers(CArrowArray* cArray, int 
bitWidth) ArrowBuffer[] buffers = new ArrowBuffer[2]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, valuesLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, valuesLength); return buffers; } From 6c1417220151fe91e6ea9c5e6af7916863f40bfe Mon Sep 17 00:00:00 2001 From: James Duong Date: Mon, 8 Apr 2024 08:05:16 +0900 Subject: [PATCH 33/49] GH-40745: [Java][FlightRPC] Support configuring backpressure threshold (#41051) ### Rationale for this change gRPC uses a default backpressure threshold that is too low for services that send large amounts of data such as Arrow Flight. This causes excessive blocking and reduces throughput. ### What changes are included in this PR? * Update to grpc-java 1.63.0 * Add to FlightServer.Builder an option to set the number of bytes queued before blocking due to backpressure. Set the default to 10MB instead of gRPC's default of 64K. * Add a ServerInterceptor for automating setting the backpressure threshold on ServerCalls. ### Are these changes tested? Tested through existing unit tests. ### Are there any user-facing changes? The FlightServer.Builder class has an extra configuration option to let users change the backpressure threshold themselves. * GitHub Issue: #40745 Authored-by: James Duong Signed-off-by: David Li --- .../org/apache/arrow/flight/FlightServer.java | 15 +++++++ ...erverBackpressureThresholdInterceptor.java | 43 +++++++++++++++++++ java/pom.xml | 2 +- 3 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/ServerBackpressureThresholdInterceptor.java diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java index d873f7d2828d0..dc545c131828a 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java @@ -42,6 +42,7 @@ import org.apache.arrow.flight.auth2.Auth2Constants; import org.apache.arrow.flight.auth2.CallHeaderAuthenticator; import org.apache.arrow.flight.auth2.ServerCallHeaderAuthMiddleware; +import org.apache.arrow.flight.grpc.ServerBackpressureThresholdInterceptor; import org.apache.arrow.flight.grpc.ServerInterceptorAdapter; import org.apache.arrow.flight.grpc.ServerInterceptorAdapter.KeyFactory; import org.apache.arrow.memory.BufferAllocator; @@ -79,6 +80,9 @@ public class FlightServer implements AutoCloseable { /** The maximum size of an individual gRPC message. This effectively disables the limit. */ static final int MAX_GRPC_MESSAGE_SIZE = Integer.MAX_VALUE; + /** The default number of bytes that can be queued on an output stream before blocking. */ + public static final int DEFAULT_BACKPRESSURE_THRESHOLD = 10 * 1024 * 1024; // 10MB + /** Create a new instance from a gRPC server. For internal use only. 
*/ private FlightServer(Location location, Server server, ExecutorService grpcExecutor) { this.location = location; @@ -179,6 +183,7 @@ public static final class Builder { private CallHeaderAuthenticator headerAuthenticator = CallHeaderAuthenticator.NO_OP; private ExecutorService executor = null; private int maxInboundMessageSize = MAX_GRPC_MESSAGE_SIZE; + private int backpressureThreshold = DEFAULT_BACKPRESSURE_THRESHOLD; private InputStream certChain; private InputStream key; private InputStream mTlsCACert; @@ -300,6 +305,7 @@ public FlightServer build() { .addService( ServerInterceptors.intercept( flightService, + new ServerBackpressureThresholdInterceptor(backpressureThreshold), new ServerAuthInterceptor(authHandler))); // Allow hooking into the gRPC builder. This is not guaranteed to be available on all Arrow versions or @@ -336,6 +342,15 @@ public Builder maxInboundMessageSize(int maxMessageSize) { return this; } + /** + * Set the number of bytes that may be queued on a server output stream before writes are blocked. + */ + public Builder backpressureThreshold(int backpressureThreshold) { + Preconditions.checkArgument(backpressureThreshold > 0); + this.backpressureThreshold = backpressureThreshold; + return this; + } + /** * A small utility function to ensure that InputStream attributes. * are closed if they are not null diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/ServerBackpressureThresholdInterceptor.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/ServerBackpressureThresholdInterceptor.java new file mode 100644 index 0000000000000..bd42fbc8ad6a4 --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/ServerBackpressureThresholdInterceptor.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight.grpc; + +import io.grpc.Metadata; +import io.grpc.ServerCall; +import io.grpc.ServerCallHandler; +import io.grpc.ServerInterceptor; + +/** + * An interceptor for specifying the number of bytes that can be queued before a call with an output stream + * gets blocked by backpressure. 
+ */ +public class ServerBackpressureThresholdInterceptor implements ServerInterceptor { + + private final int numBytes; + + public ServerBackpressureThresholdInterceptor(int numBytes) { + this.numBytes = numBytes; + } + + @Override + public ServerCall.Listener interceptCall(ServerCall call, Metadata headers, + ServerCallHandler next) { + call.setOnReadyThreshold(numBytes); + return next.startCall(call, headers); + } +} diff --git a/java/pom.xml b/java/pom.xml index 95b27922eafa9..9892061677d09 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -34,7 +34,7 @@ 2.0.11 33.0.0-jre 4.1.108.Final - 1.62.2 + 1.63.0 3.23.1 2.17.0 3.4.0 From 805327e48d81cc1ccd3ae4de749547c4f1f4842c Mon Sep 17 00:00:00 2001 From: normanj-bitquill <78755797+normanj-bitquill@users.noreply.github.com> Date: Sun, 7 Apr 2024 18:17:09 -0500 Subject: [PATCH 34/49] GH-40249: [Java] Fix NPE in ArrowDatabaseMetadata (#40988) ### Rationale for this change When retrieving database metadata using the JDBC driver, some data such as SQL keywords could be null. Before this change, an NPE would be thrown when trying to convert the list of SQL keywords into a String. ### What changes are included in this PR? The following database metadata fields: * SQL keywords * Numeric functions * String functions * System functions * Time/date functions will convert to an empty string when they are null. ### Are these changes tested? A unit test has been added to verify that the fields above are converted to the empty string when null, with no exceptions thrown. ### Are there any user-facing changes? The fields above will now return an empty string rather than throw an NPE. * GitHub Issue: #40249 Authored-by: Norman Jordan Signed-off-by: David Li --- .../driver/jdbc/ArrowDatabaseMetadata.java | 24 +++++++++++++------ .../jdbc/ArrowDatabaseMetadataTest.java | 19 ++++++++++++++- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java index d68b8070e2bb7..4af3e55ee11c0 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java @@ -49,6 +49,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -197,31 +198,36 @@ public boolean isReadOnly() throws SQLException { @Override public String getSQLKeywords() throws SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_KEYWORDS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_KEYWORDS, List.class)) + .orElse(""); } @Override public String getNumericFunctions() throws SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_NUMERIC_FUNCTIONS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_NUMERIC_FUNCTIONS, List.class)) + .orElse(""); } @Override public String getStringFunctions() throws SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_STRING_FUNCTIONS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_STRING_FUNCTIONS, List.class)) + .orElse(""); } @Override public String getSystemFunctions() throws 
SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_SYSTEM_FUNCTIONS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_SYSTEM_FUNCTIONS, List.class)) + .orElse(""); } @Override public String getTimeDateFunctions() throws SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_DATETIME_FUNCTIONS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_DATETIME_FUNCTIONS, List.class)) + .orElse(""); } @Override @@ -753,8 +759,12 @@ private T getSqlInfoAndCacheIfCacheIsEmpty(final SqlInfo sqlInfoCommand, return desiredType.cast(cachedSqlInfo.get(sqlInfoCommand)); } - private String convertListSqlInfoToString(final List sqlInfoList) { - return sqlInfoList.stream().map(Object::toString).collect(Collectors.joining(", ")); + private Optional convertListSqlInfoToString(final List sqlInfoList) { + if (sqlInfoList == null) { + return Optional.empty(); + } else { + return Optional.of(sqlInfoList.stream().map(Object::toString).collect(Collectors.joining(", "))); + } } private boolean getSqlInfoEnumOptionAndCacheIfCacheIsEmpty( diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadataTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadataTest.java index 0d930f4c44e1f..51334c77486bf 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadataTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadataTest.java @@ -95,9 +95,14 @@ public class ArrowDatabaseMetadataTest { public static final boolean EXPECTED_MAX_ROW_SIZE_INCLUDES_BLOBS = false; private static final MockFlightSqlProducer FLIGHT_SQL_PRODUCER = new MockFlightSqlProducer(); + private static final MockFlightSqlProducer FLIGHT_SQL_PRODUCER_EMPTY_SQLINFO = + new MockFlightSqlProducer(); @ClassRule public static final FlightServerTestRule FLIGHT_SERVER_TEST_RULE = FlightServerTestRule .createStandardTestRule(FLIGHT_SQL_PRODUCER); + @ClassRule + public static final FlightServerTestRule FLIGHT_SERVER_EMPTY_SQLINFO_TEST_RULE = + FlightServerTestRule.createStandardTestRule(FLIGHT_SQL_PRODUCER_EMPTY_SQLINFO); private static final int ROW_COUNT = 10; private static final List> EXPECTED_GET_CATALOGS_RESULTS = range(0, ROW_COUNT) @@ -604,7 +609,7 @@ public static void setUpBeforeClass() throws SQLException { @AfterClass public static void tearDown() throws Exception { - AutoCloseables.close(connection, FLIGHT_SQL_PRODUCER); + AutoCloseables.close(connection, FLIGHT_SQL_PRODUCER, FLIGHT_SQL_PRODUCER_EMPTY_SQLINFO); } @@ -1420,4 +1425,16 @@ public void testSqlToRegexLike() { Assert.assertEquals("\\*", ArrowDatabaseMetadata.sqlToRegexLike("*")); Assert.assertEquals("T\\*E.S.*T", ArrowDatabaseMetadata.sqlToRegexLike("T*E_S%T")); } + + @Test + public void testEmptySqlInfo() throws Exception { + try (final Connection testConnection = FLIGHT_SERVER_EMPTY_SQLINFO_TEST_RULE.getConnection(false)) { + final DatabaseMetaData metaData = testConnection.getMetaData(); + collector.checkThat(metaData.getSQLKeywords(), is("")); + collector.checkThat(metaData.getNumericFunctions(), is("")); + collector.checkThat(metaData.getStringFunctions(), is("")); + collector.checkThat(metaData.getSystemFunctions(), is("")); + collector.checkThat(metaData.getTimeDateFunctions(), is("")); + } + } } From 84f6edef697fd0fa0f5fce252c017a31e4ba3944 Mon Sep 17 00:00:00 2001 From: James 
Henderson Date: Mon, 8 Apr 2024 00:34:27 +0100 Subject: [PATCH 35/49] GH-40999: [Java] Fix AIOOBE trying to splitAndTransfer DUV within nullable struct (#41000) We add a `typeId >= 0` guard to `DUV.TransferImpl.splitAndTransfer` to fix #40999. ### Are these changes tested? Yes ### Are there any user-facing changes? No * GitHub Issue: #40999 Authored-by: James Henderson Signed-off-by: David Li --- .../codegen/templates/DenseUnionVector.java | 10 ++++--- .../arrow/vector/TestDenseUnionVector.java | 28 +++++++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index 27fd8e9798b67..42e96f7aca335 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -676,10 +676,12 @@ public void splitAndTransfer(int startIndex, int length) { for (int i = startIndex; i < startIndex + length; i++) { byte typeId = typeBuffer.getByte(i); - to.offsetBuffer.setInt((long) (i - startIndex) * OFFSET_WIDTH, typeCounts[typeId]); - typeCounts[typeId] += 1; - if (typeStarts[typeId] == -1) { - typeStarts[typeId] = offsetBuffer.getInt((long) i * OFFSET_WIDTH); + if (typeId >= 0) { + to.offsetBuffer.setInt((long) (i - startIndex) * OFFSET_WIDTH, typeCounts[typeId]); + typeCounts[typeId] += 1; + if (typeStarts[typeId] == -1) { + typeStarts[typeId] = offsetBuffer.getInt((long) i * OFFSET_WIDTH); + } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java index 2c29861561bb7..0621fd4527520 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java @@ -363,6 +363,34 @@ public void testSplitAndTransferWithMixedVectors() throws Exception { } } + @Test + public void testSplitAndTransferDuvInStruct() { + try (StructVector struct = StructVector.empty("struct", allocator)) { + DenseUnionVector duv = struct.addOrGet("duv", + FieldType.notNullable(MinorType.DENSEUNION.getType()), + DenseUnionVector.class); + byte i32TypeId = duv.registerNewTypeId(Field.notNullable("i32", MinorType.INT.getType())); + duv.addVector(i32TypeId, new IntVector("i32", allocator)); + + struct.setIndexDefined(0); + duv.setTypeId(0, i32TypeId); + duv.setSafe(0, newIntHolder(42)); + + struct.setNull(1); + struct.setValueCount(2); + + try (StructVector dest = StructVector.empty("dest", allocator)) { + TransferPair pair = struct.makeTransferPair(dest); + pair.splitAndTransfer(0, 2); + + assertEquals(2, dest.getValueCount()); + assertFalse(dest.isNull(0)); + assertEquals(42, dest.getObject(0).get("duv")); + assertTrue(dest.isNull(1)); + } + } + } + @Test public void testGetFieldTypeInfo() throws Exception { Map metadata = new HashMap<>(); From 1343ac74aad59181ff0322e35d42f337edcc1afc Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 8 Apr 2024 18:29:07 +0900 Subject: [PATCH 36/49] GH-41061: [C++] Ignore ARROW_USE_MOLD/ARROW_USE_LLD with clang < 12 (#41062) ### Rationale for this change `--ld-path` is available since clang 12 or later. ### What changes are included in this PR? Ignore `ARROW_USE_MOLD`/`ARROW_USE_LLD` with clang < 12. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
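For reference, the same version gate in a standalone CMake project could look like the sketch below. This is an illustration only, assuming CMake 3.13+ for `add_link_options`; the actual Arrow logic appends to the `CMAKE_*_LINKER_FLAGS` variables instead, as shown in the diff that follows.

```cmake
# Select lld only when the compiler can actually be told to use it:
# clang supports --ld-path= starting with 12.0.0, while GCC needs
# -fuse-ld=lld (usable for LLD from GCC 9.1.0 onward).
find_program(LD_LLD ld.lld)
if(LD_LLD)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang"
     AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12.0.0")
    add_link_options("--ld-path=${LD_LLD}")
  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU"
         AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.1.0")
    add_link_options("-fuse-ld=lld")
  endif()
endif()
```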
* GitHub Issue: #41061 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/SetupCxxFlags.cmake | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 3e580dfc109cf..d56609c123968 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -640,19 +640,23 @@ if(NOT WIN32 AND NOT APPLE) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12.1.0") set(MOLD_LINKER_FLAGS "-fuse-ld=mold") - message(STATUS "Using optional mold linker") else() message(STATUS "Need GCC 12.1.0 or later to use mold linker: ${CMAKE_CXX_COMPILER_VERSION}" ) endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(MOLD_LINKER_FLAGS "--ld-path=${LD_MOLD}") - message(STATUS "Using optional mold linker") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12.0.0") + set(MOLD_LINKER_FLAGS "--ld-path=${LD_MOLD}") + else() + message(STATUS "Need clang 12.0.0 or later to use mold linker: ${CMAKE_CXX_COMPILER_VERSION}" + ) + endif() else() message(STATUS "Using the default linker because compiler doesn't support mold: ${CMAKE_CXX_COMPILER_ID}" ) endif() if(MOLD_LINKER_FLAGS) + message(STATUS "Using optional mold linker") string(APPEND CMAKE_EXE_LINKER_FLAGS " ${MOLD_LINKER_FLAGS}") string(APPEND CMAKE_MODULE_LINKER_FLAGS " ${MOLD_LINKER_FLAGS}") string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${MOLD_LINKER_FLAGS}") @@ -675,7 +679,12 @@ if(ARROW_USE_LLD) ) endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(LLD_LINKER_FLAGS "--ld-path=${LD_LLD}") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12.0.0") + set(LLD_LINKER_FLAGS "--ld-path=${LD_LLD}") + else() + message(STATUS "Need clang 12.0.0 or later to use LLD linker: ${CMAKE_CXX_COMPILER_VERSION}" + ) + endif() else() message(STATUS "Using the default linker because compiler doesn't support LLD: ${CMAKE_CXX_COMPILER_ID}" ) From f8784acbfcd4a2811dd404d4d53a89b26569b05a Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 8 Apr 2024 22:15:05 +0800 Subject: [PATCH 37/49] GH-41032: [C++][Parquet] Bugfixes and more tests in boolean arrow decoding (#41037) ### Rationale for this change Previous patches ( https://github.com/apache/arrow/pull/40876 , https://github.com/apache/arrow/pull/40995 ) introduce some bugs in boolean decoding. This patch add tests for DecodeArrow and fix the bug ### What changes are included in this PR? 1. Add tests for DecodeArrow with nulls and without nulls 2. Fix 2 bugs ### Are these changes tested? Yes ### Are there any user-facing changes? no * GitHub Issue: #41032 Lead-authored-by: mwish Co-authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 15 ++-- cpp/src/parquet/encoding_test.cc | 144 +++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 6e93b493392c9..3da5c64ace5dd 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1192,10 +1192,12 @@ int PlainBooleanDecoder::DecodeArrow( int values_decoded = num_values - null_count; if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) { // A too large `num_values` was requested. 
- ParquetException::EofException(); + ParquetException::EofException( + "A too large `num_values` was requested in PlainBooleanDecoder: remain " + + std::to_string(num_values_) + ", requested: " + std::to_string(values_decoded)); } if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(values_decoded))) { - ParquetException::EofException(); + ParquetException::EofException("PlainDecoder doesn't have enough values in page"); } if (null_count == 0) { @@ -1208,7 +1210,7 @@ int PlainBooleanDecoder::DecodeArrow( BitBlockCounter bit_counter(valid_bits, valid_bits_offset, num_values); int64_t value_position = 0; int64_t valid_bits_offset_position = valid_bits_offset; - int64_t previous_value_offset = 0; + int64_t previous_value_offset = total_num_values_ - num_values_; while (value_position < num_values) { auto block = bit_counter.NextWord(); if (block.AllSet()) { @@ -1224,8 +1226,7 @@ int PlainBooleanDecoder::DecodeArrow( } else { for (int64_t i = 0; i < block.length; ++i) { if (bit_util::GetBit(valid_bits, valid_bits_offset_position + i)) { - bool value = bit_util::GetBit( - data_, total_num_values_ - num_values_ + previous_value_offset); + bool value = bit_util::GetBit(data_, previous_value_offset); builder->UnsafeAppend(value); previous_value_offset += 1; } else { @@ -3177,7 +3178,9 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { PARQUET_THROW_NOT_OK( out->AppendValues(values.begin(), values.begin() + current_batch_size)); num_values -= current_batch_size; - current_index_in_batch = 0; + // set current_index_in_batch to current_batch_size means + // the whole batch is totally consumed. + current_index_in_batch = current_batch_size; } while (num_values > 0); return num_non_null_values; } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index bb5126ce251d4..b91fcb0839cba 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -635,6 +635,150 @@ TEST(BooleanArrayEncoding, AdHocRoundTrip) { } } +class TestBooleanArrowDecoding : public ::testing::Test { + public: + // number of values including nulls + constexpr static int kNumValues = 10000; + + void SetUp() override { + null_probabilities_ = {0.0, 0.001, 0.5, 0.999, 1.0}; + read_batch_sizes_ = {1024, 4096, 10000}; + true_probabilities_ = {0.0, 0.001, 0.5, 0.999, 1.0}; + } + void TearDown() override {} + + void InitTestCase(Encoding::type encoding, double null_probability, + double true_probability) { + GenerateInputData(null_probability, true_probability); + SetupEncoderDecoder(encoding); + } + + void GenerateInputData(double null_probability, double true_probability) { + ::arrow::random::RandomArrayGenerator rag(0); + expected_dense_ = rag.Boolean(kNumValues, true_probability, null_probability); + null_count_ = static_cast(expected_dense_->null_count()); + valid_bits_ = expected_dense_->null_bitmap_data(); + + // Initialize input_data_ for the encoder from the expected_array_ values + const auto& boolean_array = + checked_cast(*expected_dense_); + input_data_.resize(boolean_array.length()); + + for (int64_t i = 0; i < boolean_array.length(); ++i) { + input_data_[i] = boolean_array.Value(i); + } + } + + // Setup encoder/decoder pair for testing with boolean encoding + void SetupEncoderDecoder(Encoding::type encoding) { + encoder_ = MakeTypedEncoder(encoding); + decoder_ = MakeTypedDecoder(encoding); + const auto* data_ptr = reinterpret_cast(input_data_.data()); + if (valid_bits_ != nullptr) { + ASSERT_NO_THROW(encoder_->PutSpaced(data_ptr, kNumValues, 
valid_bits_, 0)); + } else { + ASSERT_NO_THROW(encoder_->Put(data_ptr, kNumValues)); + } + buffer_ = encoder_->FlushValues(); + ResetTheDecoder(); + } + + void ResetTheDecoder() { + ASSERT_NE(nullptr, buffer_); + decoder_->SetData(kNumValues, buffer_->data(), static_cast(buffer_->size())); + } + + void CheckDense(int actual_num_values, const ::arrow::Array& chunk) { + ASSERT_EQ(actual_num_values, kNumValues - null_count_); + ASSERT_ARRAYS_EQUAL(chunk, *expected_dense_); + } + + void CheckDecodeArrow(Encoding::type encoding) { + for (double np : null_probabilities_) { + for (double true_prob : true_probabilities_) { + InitTestCase(encoding, np, true_prob); + for (int read_batch_size : this->read_batch_sizes_) { + ResetTheDecoder(); + + int num_values_left = kNumValues; + ::arrow::BooleanBuilder acc; + int actual_num_not_null_values = 0; + while (num_values_left > 0) { + int batch_size = std::min(num_values_left, read_batch_size); + ASSERT_NE(0, batch_size); + // Counting nulls + int64_t batch_null_count = 0; + if (null_count_ != 0) { + batch_null_count = + batch_size - ::arrow::internal::CountSetBits( + valid_bits_, kNumValues - num_values_left, batch_size); + } + int batch_num_values = + decoder_->DecodeArrow(batch_size, static_cast(batch_null_count), + valid_bits_, kNumValues - num_values_left, &acc); + actual_num_not_null_values += batch_num_values; + num_values_left -= batch_size; + } + std::shared_ptr<::arrow::Array> chunk; + ASSERT_OK(acc.Finish(&chunk)); + CheckDense(actual_num_not_null_values, *chunk); + } + } + } + } + + void CheckDecodeArrowNonNull(Encoding::type encoding) { + // NonNull skips tests for null_prob != 0. + for (auto true_prob : true_probabilities_) { + InitTestCase(encoding, /*null_probability=*/0, true_prob); + for (int read_batch_size : this->read_batch_sizes_) { + // Resume the decoder + ResetTheDecoder(); + ::arrow::BooleanBuilder acc; + int actual_num_values = 0; + int num_values_left = kNumValues; + while (num_values_left > 0) { + int batch_size = std::min(num_values_left, read_batch_size); + int batch_num_values = decoder_->DecodeArrowNonNull(batch_size, &acc); + actual_num_values += batch_num_values; + num_values_left -= batch_num_values; + } + std::shared_ptr<::arrow::Array> chunk; + ASSERT_OK(acc.Finish(&chunk)); + CheckDense(actual_num_values, *chunk); + } + } + } + + protected: + std::vector null_probabilities_; + std::vector true_probabilities_; + std::vector read_batch_sizes_; + std::shared_ptr<::arrow::Array> expected_dense_; + int null_count_{0}; + std::vector input_data_; + const uint8_t* valid_bits_ = NULLPTR; + std::unique_ptr encoder_; + std::unique_ptr decoder_; + std::shared_ptr buffer_; +}; + +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowUsingPlain) { + this->CheckDecodeArrow(Encoding::PLAIN); +} + +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowNonNullPlain) { + this->CheckDecodeArrowNonNull(Encoding::PLAIN); +} + +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowRle) { + this->CheckDecodeArrow(Encoding::RLE); +} + +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowNonNullRle) { + this->CheckDecodeArrowNonNull(Encoding::RLE); +} + template void GetDictDecoder(DictEncoder* encoder, int64_t num_values, std::shared_ptr* out_values, From e9b3d40fa75a2f86f871d5c3720407a23f07c679 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Apr 2024 09:42:26 -0700 Subject: [PATCH 38/49] MINOR: [C#] Bump BenchmarkDotNet.Diagnostics.Windows, BenchmarkDotNet and 
System.Runtime.CompilerServices.Unsafe in /csharp (#41073) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [BenchmarkDotNet.Diagnostics.Windows](https://github.com/dotnet/BenchmarkDotNet), [BenchmarkDotNet](https://github.com/dotnet/BenchmarkDotNet) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `BenchmarkDotNet.Diagnostics.Windows` from 0.13.10 to 0.13.12
Updates `BenchmarkDotNet` from 0.13.10 to 0.13.12
Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 5.0.0
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj index df76fd4a7b45f..d44b7488e3b17 100644 --- a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj +++ b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj @@ -6,8 +6,8 @@ - - + + From ebd21490c656be00642be7eddbbe3486b037c86d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Apr 2024 09:46:55 -0700 Subject: [PATCH 39/49] MINOR: [C#] Bump Grpc.AspNetCore, Grpc.AspNetCore.Server, System.Runtime.CompilerServices.Unsafe and Grpc.Net.Client in /csharp (#41078) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Grpc.AspNetCore](https://github.com/grpc/grpc-dotnet), [Grpc.AspNetCore.Server](https://github.com/grpc/grpc-dotnet), [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime) and [Grpc.Net.Client](https://github.com/grpc/grpc-dotnet). These dependencies needed to be updated together. Updates `Grpc.AspNetCore` from 2.59.0 to 2.62.0
Updates `Grpc.AspNetCore.Server` from 2.60.0 to 2.62.0
Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 6.0.0
Updates `Grpc.Net.Client` from 2.59.0 to 2.62.0
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Flight.TestWeb.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj index 3aaebb103f9da..d7a2042a4581a 100644 --- a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj +++ b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj @@ -5,7 +5,7 @@ - + From 072eff362742c2309dc89ca816c98c523c6887df Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 8 Apr 2024 20:37:52 +0200 Subject: [PATCH 40/49] GH-40799: [Doc][Format] Implementation status page should list canonical extension types (#41053) ### Rationale for this change Two specifications and one implementation of canonical extension types were added and this should be documented. ### What changes are included in this PR? This represents current state of canonical extension types. ### Are these changes tested? No, docs only. ### Are there any user-facing changes? In so much as they read docs. * GitHub Issue: #40799 Authored-by: Rok Mihevc Signed-off-by: Rok Mihevc --- docs/source/status.rst | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/source/status.rst b/docs/source/status.rst index 71d33eaa6520c..dc60b311a1489 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -111,6 +111,15 @@ Data Types | Run-End Encoded | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Canonical | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift | +| Extension types | | | | | | | | | ++=======================+=======+=======+=======+============+=======+=======+=======+=======+ +| Fixed shape tensor | ✓ | | | | | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Variable shape tensor | | | | | | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ + Notes: * \(1) Casting to/from Float16 in Java is not supported. @@ -118,7 +127,8 @@ Notes: * \(3) Nested dictionaries not supported .. seealso:: - The :ref:`format_columnar` specification. + The :ref:`format_columnar` and the + :ref:`format_canonical_extensions` specification. IPC Format From fe38d47e82c6ffc231f0918be15aa5cd0da98607 Mon Sep 17 00:00:00 2001 From: keshen-msft <53491277+keshen-msft@users.noreply.github.com> Date: Mon, 8 Apr 2024 13:14:31 -0700 Subject: [PATCH 41/49] GH-41047: [C#] Address performance issue of reading from StringArray (#41048) ### Rationale for this change The motivation here is to address https://github.com/apache/arrow/issues/41047. There is severe performance drawback in reading a StringArray as value array of a DictionaryArray, because of repeated and unnecessary UTF 8 string decoding. ### What changes are included in this PR? - Added a new function Materialize() to materialize the values to a list. When materialized, GetString() reads from the vector directly. - Added test coverage. ### Are these changes tested? Yes ### Are there any user-facing changes? No. This change maintains backwards compatibility on the API surface. It is up to the client application to decide whether to materialize the array and gain performance. 
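For illustration, a minimal usage sketch of the new API (made-up values; `Materialize()` and `IsMaterialized()` are the methods added in this PR, while the builder calls are the existing Apache.Arrow ones):

```csharp
using System;
using Apache.Arrow;

class MaterializeExample
{
    static void Main()
    {
        // Hypothetical input; in the motivating case this would be the
        // value array of a DictionaryArray that is read many times.
        StringArray array = new StringArray.Builder()
            .Append("apple")
            .Append("banana")
            .AppendNull()
            .Build();

        // Decode every value once up front; subsequent GetString() calls
        // return the cached strings instead of redoing UTF-8 decoding.
        array.Materialize();
        Console.WriteLine(array.IsMaterialized()); // True

        for (int i = 0; i < array.Length; i++)
        {
            Console.WriteLine(array.GetString(i) ?? "<null>");
        }
    }
}
```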
* GitHub Issue: #41047 Authored-by: Keshuang Shen Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow/Arrays/StringArray.cs | 60 ++++++++++++++++++- .../Apache.Arrow.Tests/StringArrayTests.cs | 31 ++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs b/csharp/src/Apache.Arrow/Arrays/StringArray.cs index af77fe1b1a83d..a3ec596adc7ba 100644 --- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs @@ -13,12 +13,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -using Apache.Arrow.Types; using System; using System.Collections; using System.Collections.Generic; using System.Runtime.InteropServices; using System.Text; +using Apache.Arrow.Types; namespace Apache.Arrow { @@ -26,6 +26,8 @@ public class StringArray: BinaryArray, IReadOnlyList { public static readonly Encoding DefaultEncoding = Encoding.UTF8; + private Dictionary materializedStringStore; + public new class Builder : BuilderBase { public Builder() : base(StringType.Default) { } @@ -71,16 +73,28 @@ public StringArray(int length, public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + /// + /// Get the string value at the given index + /// + /// Input index + /// Optional: the string encoding, default is UTF8 + /// The string object at the given index public string GetString(int index, Encoding encoding = default) { encoding ??= DefaultEncoding; + if (materializedStringStore != null && materializedStringStore.TryGetValue(encoding, out string[] materializedStrings)) + { + return materializedStrings[index]; + } + ReadOnlySpan bytes = GetBytes(index, out bool isNull); if (isNull) { return null; } + if (bytes.Length == 0) { return string.Empty; @@ -93,6 +107,50 @@ public string GetString(int index, Encoding encoding = default) } } + /// + /// Materialize the array for the given encoding to accelerate the string access + /// + /// Optional: the string encoding, default is UTF8 + /// This method is not thread safe when it is called in parallel with or . 
+ public void Materialize(Encoding encoding = default) + { + encoding ??= DefaultEncoding; + + if (IsMaterialized(encoding)) + { + return; + } + + if (materializedStringStore == null) + { + materializedStringStore = new Dictionary<Encoding, string[]>(); + } + + var stringStore = new string[Length]; + for (int i = 0; i < Length; i++) + { + stringStore[i] = GetString(i, encoding); + } + + materializedStringStore[encoding] = stringStore; + } + + /// <summary> + /// Check if the array has been materialized for the given encoding + /// </summary> + /// <param name="encoding">Optional: the string encoding, default is UTF8</param> + /// <returns>True or false whether the array has been materialized</returns> + public bool IsMaterialized(Encoding encoding = default) + { + if (materializedStringStore == null) + { + return false; + } + + encoding ??= DefaultEncoding; + return materializedStringStore.ContainsKey(encoding); + } + int IReadOnlyCollection<string>.Count => Length; string IReadOnlyList<string>.this[int index] => GetString(index); diff --git a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs index 0fd3d3d105a70..b19731535a29d 100644 --- a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs @@ -49,6 +49,37 @@ public void ReturnsAppendedValue(string firstValue, string secondValue) // Assert Assert.Equal(firstValue, retrievedValue); } + + [Theory] + [InlineData(null, null)] + [InlineData(null, "")] + [InlineData(null, "value")] + [InlineData("", null)] + [InlineData("", "")] + [InlineData("", "value")] + [InlineData("value", null)] + [InlineData("value", "")] + [InlineData("value", "value")] + public void ReturnsAppendedValueMaterialize(string firstValue, string secondValue) + { + // Arrange + // Create an array with two elements. The second element being null, + // empty, or non-empty may influence the underlying BinaryArray + // storage such that retrieving an empty first element could result + // in an empty span or a 0-length span backed by storage. + var array = new StringArray.Builder() + .Append(firstValue) + .Append(secondValue) + .Build(); + + // Act + array.Materialize(); + var retrievedValue = array.GetString(0); + + // Assert + Assert.True(array.IsMaterialized()); + Assert.Equal(firstValue, retrievedValue); + } } } } From 83341920f76ef0b4b232706b508b04ca14a68e2f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 9 Apr 2024 11:42:39 +0900 Subject: [PATCH 42/49] GH-41085: [CI][Java] Add Spark integration tests to "java" group in Crossbow tasks (#41086) ### Rationale for this change Spark integration tests are related to our Java implementation. ### What changes are included in this PR? Add `test-*spark*` task to `java` group. ### Are these changes tested? Yes. ### Are there any user-facing changes? No.
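As a usage note, the practical effect of the new entry can be previewed with archery's crossbow CLI before anything is submitted (a sketch: the `--dry-run` and `--group` flags are assumed from the crossbow README, and the task name shown is illustrative rather than captured output):

```console
# Ask crossbow which tasks the "java" group now expands to,
# without actually queueing any builds.
$ archery crossbow submit --dry-run --group java
...
test-conda-python-3.10-spark-master
...
```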
* GitHub Issue: #41085 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/tasks/tasks.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index fc8d03b3d5bfb..f98c0a2b48caa 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -80,6 +80,7 @@ groups: java: - "*java*" + - test-*spark* python: - test-*python* From 84aac1ecb11daae45ee22f2e74369debf7988b6f Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Tue, 9 Apr 2024 03:20:48 -0400 Subject: [PATCH 43/49] MINOR: [Go] Bump golang.org/x/crypto,net,sys to latest versions in /go (#41033) This PR bumps - `golang.org/x/crypto` to 0.22.0 - `golang.org/x/net` to 0.24.0 - `golang.org/x/sys` to 0.19.0 to resolve issues described at https://security.snyk.io/vuln/SNYK-GOLANG-GOLANGORGXNETHTTP2-6531285. --- go/go.mod | 4 ++-- go/go.sum | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/go/go.mod b/go/go.mod index 9975ecfc69d34..d2871edbf0c91 100644 --- a/go/go.mod +++ b/go/go.mod @@ -36,7 +36,7 @@ require ( github.com/zeebo/xxh3 v1.0.2 golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 golang.org/x/sync v0.6.0 - golang.org/x/sys v0.18.0 + golang.org/x/sys v0.19.0 golang.org/x/tools v0.19.0 golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 gonum.org/v1/gonum v0.15.0 @@ -76,7 +76,7 @@ require ( github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.0 // indirect golang.org/x/mod v0.16.0 // indirect - golang.org/x/net v0.22.0 // indirect + golang.org/x/net v0.24.0 // indirect golang.org/x/text v0.14.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240123012728-ef4313101c80 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go/go.sum b/go/go.sum index 462c43021a29e..850a3b10224e8 100644 --- a/go/go.sum +++ b/go/go.sum @@ -113,21 +113,21 @@ github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= -golang.org/x/crypto v0.21.0 h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA= -golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= +golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30= +golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= golang.org/x/mod v0.16.0 h1:QX4fJ0Rr5cPQCF7O9lh9Se4pmwfwskqZfq5moyldzic= golang.org/x/mod v0.16.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/net v0.22.0 h1:9sGLhx7iRIHEiX0oAJ3MRZMUCElJgy7Br1nO+AMN3Tc= -golang.org/x/net v0.22.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= +golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= -golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.19.0 h1:tfGCXNR1OsFG+sVdLAitlpjAvD/I6dHDKnYrpEZUHkw= From 5dc50865b794bfe50cceabdb5e8a2ed25c1d573f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Finn=20V=C3=B6lkel?= Date: Tue, 9 Apr 2024 10:25:46 +0200 Subject: [PATCH 44/49] GH-30866: [Java] fix SplitAndTransfer throws for (0,0) if vector empty (#41066) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This addresses https://issues.apache.org/jira/browse/ARROW-15382 and is a reopening of https://github.com/apache/arrow/pull/12250 (which I asked to be reopened). I tried to address all the comments from the previous discussion, added some more tests and fixed an issue in the old commit. * GitHub Issue: #30866 Authored-by: Finn Völkel Signed-off-by: David Li --- .../vector/BaseLargeVariableWidthVector.java | 12 +- .../arrow/vector/BaseVariableWidthVector.java | 6 +- .../arrow/vector/complex/ListVector.java | 28 ++-- .../arrow/vector/TestLargeVarCharVector.java | 4 +- .../arrow/vector/TestSplitAndTransfer.java | 121 +++++++++++++++++- 5 files changed, 140 insertions(+), 31 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 34c9e73a0b072..2ef6e4bd8b374 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -763,16 +763,14 @@ public void transferTo(BaseLargeVariableWidthVector target) { */ public void splitAndTransferTo(int startIndex, int length, BaseLargeVariableWidthVector target) { - Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, - "Invalid startIndex: %s", startIndex); - Preconditions.checkArgument(startIndex + length <= valueCount, - "Invalid length: %s", length); + Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); compareTypes(target, "splitAndTransferTo"); target.clear(); - splitAndTransferValidityBuffer(startIndex, length, target); - splitAndTransferOffsetBuffer(startIndex, length, target); - target.setLastSet(length - 1); if (length > 0) { + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); target.setValueCount(length); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 6b82dd7729a6c..d533629cdd44e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -808,10 +808,10 @@ public void splitAndTransferTo(int startIndex, int length, "Invalid parameters startIndex: %s,
length: %s for valueCount: %s", startIndex, length, valueCount); compareTypes(target, "splitAndTransferTo"); target.clear(); - splitAndTransferValidityBuffer(startIndex, length, target); - splitAndTransferOffsetBuffer(startIndex, length, target); - target.setLastSet(length - 1); if (length > 0) { + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); target.setValueCount(length); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 91275ae73d2c3..220f177659038 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -552,21 +552,23 @@ public void transfer() { public void splitAndTransfer(int startIndex, int length) { Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); - final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); - final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); - to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); - /* splitAndTransfer offset buffer */ - for (int i = 0; i < length + 1; i++) { - final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; - to.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeOffset); + if (length > 0) { + final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); + final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; + to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); + /* splitAndTransfer offset buffer */ + for (int i = 0; i < length + 1; i++) { + final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; + to.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeOffset); + } + /* splitAndTransfer validity buffer */ + splitAndTransferValidityBuffer(startIndex, length, to); + /* splitAndTransfer data buffer */ + dataTransferPair.splitAndTransfer(startPoint, sliceLength); + to.lastSet = length - 1; + to.setValueCount(length); } - /* splitAndTransfer validity buffer */ - splitAndTransferValidityBuffer(startIndex, length, to); - /* splitAndTransfer data buffer */ - dataTransferPair.splitAndTransfer(startPoint, sliceLength); - to.lastSet = length - 1; - to.setValueCount(length); } /* diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java index 62d09da86d652..06b27a9eba156 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java @@ -166,7 +166,7 @@ public void testInvalidStartIndex() { IllegalArgumentException.class, () -> tp.splitAndTransfer(valueCount, 10)); - assertEquals("Invalid startIndex: 500", e.getMessage()); + assertEquals("Invalid parameters startIndex: 500, length: 10 for valueCount: 500", e.getMessage()); } } @@ -185,7 +185,7 @@ public void testInvalidLength() { IllegalArgumentException.class, () -> tp.splitAndTransfer(0, valueCount * 2)); - assertEquals("Invalid length: 1000", e.getMessage()); + 
assertEquals("Invalid parameters startIndex: 0, length: 1000 for valueCount: 500", e.getMessage()); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java index 3580a321f01c9..396f5665e0382 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java @@ -28,6 +28,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; @@ -49,7 +50,7 @@ public class TestSplitAndTransfer { public void init() { allocator = new RootAllocator(Long.MAX_VALUE); } - + @After public void terminate() throws Exception { allocator.close(); @@ -65,7 +66,116 @@ private void populateVarcharVector(final VarCharVector vector, int valueCount, S } vector.setValueCount(valueCount); } - + + private void populateIntVector(final IntVector vector, int valueCount) { + for (int i = 0; i < valueCount; i++) { + vector.set(i, i); + } + vector.setValueCount(valueCount); + } + + private void populateDenseUnionVector(final DenseUnionVector vector, int valueCount) { + VarCharVector varCharVector = vector.addOrGet("varchar", FieldType.nullable(new ArrowType.Utf8()), + VarCharVector.class); + BigIntVector intVector = vector.addOrGet("int", + FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + for (int i = 0; i < valueCount; i++) { + vector.setTypeId(i, (byte) (i % 2)); + if (i % 2 == 0) { + final String s = String.format("%010d", i); + varCharVector.setSafe(i / 2, s.getBytes(StandardCharsets.UTF_8)); + } else { + intVector.setSafe(i / 2, i); + } + } + vector.setValueCount(valueCount); + } + + @Test + public void testWithEmptyVector() { + // MapVector use TransferImpl from ListVector + ListVector listVector = ListVector.empty("", allocator); + TransferPair transferPair = listVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // BaseFixedWidthVector + IntVector intVector = new IntVector("", allocator); + transferPair = intVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // BaseVariableWidthVector + VarCharVector varCharVector = new VarCharVector("", allocator); + transferPair = varCharVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // BaseLargeVariableWidthVector + LargeVarCharVector largeVarCharVector = new LargeVarCharVector("", allocator); + transferPair = largeVarCharVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // StructVector + StructVector structVector = StructVector.empty("", allocator); + transferPair = structVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // FixedSizeListVector + FixedSizeListVector fixedSizeListVector = FixedSizeListVector.empty("", 0, allocator); + transferPair = fixedSizeListVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, 
transferPair.getTo().getValueCount()); + // FixedSizeBinaryVector + FixedSizeBinaryVector fixedSizeBinaryVector = new FixedSizeBinaryVector("", allocator, 4); + transferPair = fixedSizeBinaryVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // UnionVector + UnionVector unionVector = UnionVector.empty("", allocator); + transferPair = unionVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // DenseUnionVector + DenseUnionVector duv = DenseUnionVector.empty("", allocator); + transferPair = duv.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + + // non empty from vector + + // BaseFixedWidthVector + IntVector fromIntVector = new IntVector("", allocator); + fromIntVector.allocateNew(100); + populateIntVector(fromIntVector, 100); + transferPair = fromIntVector.getTransferPair(allocator); + IntVector toIntVector = (IntVector) transferPair.getTo(); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, toIntVector.getValueCount()); + + transferPair.splitAndTransfer(50, 0); + assertEquals(0, toIntVector.getValueCount()); + + transferPair.splitAndTransfer(100, 0); + assertEquals(0, toIntVector.getValueCount()); + fromIntVector.clear(); + toIntVector.clear(); + + // DenseUnionVector + DenseUnionVector fromDuv = DenseUnionVector.empty("", allocator); + populateDenseUnionVector(fromDuv, 100); + transferPair = fromDuv.getTransferPair(allocator); + DenseUnionVector toDUV = (DenseUnionVector) transferPair.getTo(); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, toDUV.getValueCount()); + + transferPair.splitAndTransfer(50, 0); + assertEquals(0, toDUV.getValueCount()); + + transferPair.splitAndTransfer(100, 0); + assertEquals(0, toDUV.getValueCount()); + fromDuv.clear(); + toDUV.clear(); + } + @Test /* VarCharVector */ public void test() throws Exception { try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { @@ -73,13 +183,13 @@ public void test() throws Exception { final int valueCount = 500; final String[] compareArray = new String[valueCount]; - + populateVarcharVector(varCharVector, valueCount, compareArray); - + final TransferPair tp = varCharVector.getTransferPair(allocator); final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); final int[][] startLengths = {{0, 201}, {201, 0}, {201, 200}, {401, 99}}; - + for (final int[] startLength : startLengths) { final int start = startLength[0]; final int length = startLength[1]; @@ -429,5 +539,4 @@ public void testMapVectorZeroStartIndexAndLength() { newMapVector.clear(); } } - } From 823e1c81cf01a9764a88440a56a0a219eb6687dd Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Tue, 9 Apr 2024 05:45:40 -0400 Subject: [PATCH 45/49] MINOR: [Docs] Change Twitter logo to X logo (#41096) Thanks Elon --- docs/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index ad8fa798d6aac..05340dc923c89 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -346,9 +346,9 @@ "icon": "fa-brands fa-square-github", }, { - "name": "Twitter", + "name": "X", "url": "https://twitter.com/ApacheArrow", - "icon": "fa-brands fa-square-twitter", + "icon": "fa-brands fa-square-x-twitter", }, ], "show_version_warning_banner": True, From aeb1618a3093bc05cd81edbc0f1de54f8c7f406b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira 
Carvalho Date: Tue, 9 Apr 2024 11:13:39 -0300 Subject: [PATCH 46/49] GH-41020: [C++] Introduce portable compiler assumptions (#41021) ### Rationale for this change Allow portable use of the "assumption" feature of modern GCC, Clang, and MSVC compilers. ### What changes are included in this PR? - Documenting of macros in `macros.h` - Addition of `ARROW_COMPILER_ASSUME` - One simple change where this macro simplifies the existing code while keeping the common case of a loop branch dependent only on local variables ### Are these changes tested? By existing tests. * GitHub Issue: #41020 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/util/macros.h | 72 +++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 1d23e829d74a9..d80828869b33c 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -36,28 +36,74 @@ TypeName& operator=(TypeName&&) = default #endif -#define ARROW_UNUSED(x) (void)(x) -#define ARROW_ARG_UNUSED(x) +// With ARROW_PREDICT_FALSE, GCC and clang can be told that a certain branch is +// not likely to be taken (for instance, a CHECK failure), and use that information in +// static analysis. Giving the compiler this information can affect the generated code +// layout in the absence of better information (i.e. -fprofile-arcs). [1] explains how +// this feature can be used to improve code generation. It was written as a positive +// comment to a negative article about the use of these annotations. // -// GCC can be told that a certain branch is not likely to be taken (for -// instance, a CHECK failure), and use that information in static analysis. -// Giving it this information can help it optimize for the common case in -// the absence of better information (ie. -fprofile-arcs). +// ARROW_COMPILER_ASSUME allows the compiler to assume that a given expression is +// true, without evaluating it, and to optimise based on this assumption [2]. If this +// condition is violated at runtime, the behavior is undefined. This can be useful to +// generate both faster and smaller code in compute kernels. // -#if defined(__GNUC__) -#define ARROW_PREDICT_FALSE(x) (__builtin_expect(!!(x), 0)) -#define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) +// IMPORTANT: Different optimisers are likely to react differently to this annotation! +// It should be used with care when we can prove by some means that the assumption +// is (1) guaranteed to always hold and (2) is useful for optimization [3]. If the +// assumption is pessimistic, it might even block the compiler from decisions that +// could lead to better code [4]. If you have a good intuition for what the compiler +// can do with assumptions [5], you can use this macro to guide it and end up with +// results you would only get with more complex code transformations. +// `clang -S -emit-llvm` can be used to check how the generated code changes with +// your specific use of this macro. +// +// [1] https://lobste.rs/s/uwgtkt/don_t_use_likely_unlikely_attributes#c_xi3wmc +// [2] "Portable assumptions" +// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1774r4.pdf +// [3] "Assertions Are Pessimistic, Assumptions Are Optimistic" +// https://blog.regehr.org/archives/1096 +// [4] https://discourse.llvm.org/t/llvm-assume-blocks-optimization/71609 +// [5] J. Doerfert et al. 2019. "Performance Exploration Through Optimistic Static +// Program Annotations". 
https://github.com/jdoerfert/PETOSPA/blob/master/ISC19.pdf +#define ARROW_UNUSED(x) (void)(x) +#define ARROW_ARG_UNUSED(x) +#if defined(__GNUC__) // GCC and compatible compilers (clang, Intel ICC) #define ARROW_NORETURN __attribute__((noreturn)) #define ARROW_NOINLINE __attribute__((noinline)) #define ARROW_FORCE_INLINE __attribute__((always_inline)) +#define ARROW_PREDICT_FALSE(x) (__builtin_expect(!!(x), 0)) +#define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) #define ARROW_PREFETCH(addr) __builtin_prefetch(addr) -#elif defined(_MSC_VER) +#define ARROW_RESTRICT __restrict +#if defined(__clang__) // clang-specific +#define ARROW_COMPILER_ASSUME(expr) __builtin_assume(expr) +#else // GCC-specific +#if __GNUC__ >= 13 +#define ARROW_COMPILER_ASSUME(expr) __attribute__((assume(expr))) +#else +// GCC does not have a built-in assume intrinsic before GCC 13, so we use an +// if statement and __builtin_unreachable() to achieve the same effect [2]. +// Unlike clang's __builtin_assume and C++23's [[assume(expr)]], using this +// on GCC won't warn about side-effects in the expression, so make sure expr +// is side-effect free when working with GCC versions before 13 (Jan-2024), +// otherwise clang/MSVC builds will fail in CI. +#define ARROW_COMPILER_ASSUME(expr) \ + if (expr) { \ + } else { \ + __builtin_unreachable(); \ + } +#endif // __GNUC__ >= 13 +#endif +#elif defined(_MSC_VER) // MSVC #define ARROW_NORETURN __declspec(noreturn) #define ARROW_NOINLINE __declspec(noinline) #define ARROW_FORCE_INLINE __declspec(forceinline) #define ARROW_PREDICT_FALSE(x) (x) #define ARROW_PREDICT_TRUE(x) (x) #define ARROW_PREFETCH(addr) +#define ARROW_RESTRICT __restrict +#define ARROW_COMPILER_ASSUME(expr) __assume(expr) #else #define ARROW_NORETURN #define ARROW_NOINLINE @@ -65,12 +111,8 @@ #define ARROW_PREDICT_FALSE(x) (x) #define ARROW_PREDICT_TRUE(x) (x) #define ARROW_PREFETCH(addr) -#endif - -#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) -#define ARROW_RESTRICT __restrict -#else #define ARROW_RESTRICT +#define ARROW_COMPILER_ASSUME(expr) #endif // ---------------------------------------------------------------------- From 75a100a11332a3cdae70ce6e66e9af52666f3dd7 Mon Sep 17 00:00:00 2001 From: LucasG0 <44552904+LucasG0@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:05:03 +0200 Subject: [PATCH 47/49] GH-38768: [Python] Empty slicing an array backwards beyond the start is now empty (#40682) ### What changes are included in this PR? `_normalize_slice` now relies on `slice.indices` (https://docs.python.org/3/reference/datamodel.html#slice.indices). ### Are these changes tested? Yes. ### Are there any user-facing changes? Fixing wrong data returned in an edge case. 
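To make the edge case concrete, here is a short sketch of the new behavior (a plain-Python mirror of what `slice.indices` guarantees; the example values are illustrative):

```python
import pyarrow as pa

arr = pa.array(range(10))

# slice.indices() performs the clamping that _normalize_slice used to
# hand-roll: for a negative step, bounds beyond the start collapse to an
# empty range, (-1, -1, -1) here.
assert slice(-22, -22, -1).indices(len(arr)) == (-1, -1, -1)

# Slicing backwards beyond the start is now empty, matching list semantics
# (this is the GH-40642 case added to the test suite below).
assert arr[-22:-22:-1].to_pylist() == list(range(10))[-22:-22:-1] == []
```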
* GitHub Issue: #40642 * GitHub Issue: #38768 Lead-authored-by: LucasG0 Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 29 +---------------------------- python/pyarrow/tests/test_array.py | 1 + 2 files changed, 2 insertions(+), 28 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 59d2e91ef6c70..45fd29ad3b3f3 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -561,34 +561,7 @@ def _normalize_slice(object arrow_obj, slice key): Py_ssize_t start, stop, step Py_ssize_t n = len(arrow_obj) - step = key.step or 1 - - if key.start is None: - if step < 0: - start = n - 1 - else: - start = 0 - elif key.start < 0: - start = key.start + n - if start < 0: - start = 0 - elif key.start >= n: - start = n - else: - start = key.start - - if step < 0 and (key.stop is None or key.stop < -n): - stop = -1 - elif key.stop is None: - stop = n - elif key.stop < 0: - stop = key.stop + n - if stop < 0: # step > 0 in this case. - stop = 0 - elif key.stop >= n: - stop = n - else: - stop = key.stop + start, stop, step = key.indices(n) if step != 1: indices = np.arange(start, stop, step) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 60794ebef3281..472a6c5dce750 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -486,6 +486,7 @@ def test_array_slice_negative_step(): slice(None, None, 2), slice(0, 10, 2), slice(15, -25, -1), # GH-38768 + slice(-22, -22, -1), # GH-40642 ] for case in cases: From 835e21873522bba2928beb972e863790f4c270cb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Apr 2024 11:05:21 -0400 Subject: [PATCH 48/49] MINOR: [Go] Bump golang.org/x/tools from 0.19.0 to 0.20.0 in /go (#41082) Bumps [golang.org/x/tools](https://github.com/golang/tools) from 0.19.0 to 0.20.0.
Commits:
  • 11c692e gopls/internal/test/marker/testdata: skip hover size tests on 32-bit arm
  • fc660e5 go.mod: update golang.org/x dependencies
  • 6590f47 internal/gcimporter: renable tests of issue50259.go
  • f1d5252 gopls/internal/golang: Hover: show wasted % of struct space
  • 951bb40 gopls/internal/test/integration/misc: fix flaky test
  • c9b0c65 gopls/internal/analysis/fillreturns: skip test if gotypesalias=1
  • c623a28 gopls/internal/cache: fix crash in snapshot.Analyze with patch versions
  • f345449 gopls/internal/server: filter diagnostics to "best" views
  • 42d590c gopls/internal/test/integration: add a WriteGoSum run option
  • 53d35a5 gopls/internal/golang: RenderPackageDoc: fix doc links
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=golang.org/x/tools&package-manager=go_modules&previous-version=0.19.0&new-version=0.20.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 6 +++--- go/go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go/go.mod b/go/go.mod index d2871edbf0c91..af7c937800ff1 100644 --- a/go/go.mod +++ b/go/go.mod @@ -35,9 +35,9 @@ require ( github.com/stretchr/testify v1.9.0 github.com/zeebo/xxh3 v1.0.2 golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 - golang.org/x/sync v0.6.0 + golang.org/x/sync v0.7.0 golang.org/x/sys v0.19.0 - golang.org/x/tools v0.19.0 + golang.org/x/tools v0.20.0 golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 gonum.org/v1/gonum v0.15.0 google.golang.org/grpc v1.62.1 @@ -75,7 +75,7 @@ require ( github.com/tidwall/gjson v1.14.2 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.0 // indirect - golang.org/x/mod v0.16.0 // indirect + golang.org/x/mod v0.17.0 // indirect golang.org/x/net v0.24.0 // indirect golang.org/x/text v0.14.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240123012728-ef4313101c80 // indirect diff --git a/go/go.sum b/go/go.sum index 850a3b10224e8..9d42ed49ab57c 100644 --- a/go/go.sum +++ b/go/go.sum @@ -117,12 +117,12 @@ golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30= golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= -golang.org/x/mod v0.16.0 h1:QX4fJ0Rr5cPQCF7O9lh9Se4pmwfwskqZfq5moyldzic= -golang.org/x/mod v0.16.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= -golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= -golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -130,8 +130,8 @@ golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/tools v0.19.0 h1:tfGCXNR1OsFG+sVdLAitlpjAvD/I6dHDKnYrpEZUHkw= -golang.org/x/tools v0.19.0/go.mod h1:qoJWxmGSIBmAeriMx19ogtrEPrGtDbPK634QFIcLAhc= +golang.org/x/tools v0.20.0 h1:hz/CVckiOxybQvFw6h7b/q80NTr9IUQb4s1IIzW7KNY= +golang.org/x/tools v0.20.0/go.mod h1:WvitBU7JJf6A4jOdg4S1tviW9bhUxkgeCui/0JHctQg= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU= golang.org/x/xerrors 
v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= From 433ceef8a292932b31ddb3127f95eeb85ae41df5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Apr 2024 11:05:56 -0400 Subject: [PATCH 49/49] MINOR: [Go] Bump golang.org/x/sync from 0.6.0 to 0.7.0 in /go (#41079) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [golang.org/x/sync](https://github.com/golang/sync) from 0.6.0 to 0.7.0.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=golang.org/x/sync&package-manager=go_modules&previous-version=0.6.0&new-version=0.7.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol