Merge branch 'master' into char_varchar
# Conflicts:
#	sql/core/src/test/resources/sql-tests/results/group-by-all-mosha.sql.out
jovanm-db committed Nov 19, 2024
2 parents 8d08abc + 87a5b37 commit 0809b25
Showing 46 changed files with 3,834 additions and 835 deletions.
25 changes: 24 additions & 1 deletion .github/workflows/build_and_test.yml
@@ -62,6 +62,8 @@ jobs:
image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }}
image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}
image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -154,6 +156,14 @@ jobs:
IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Generate infra image URL (SparkR)
id: infra-image-sparkr-outputs
run: |
# Convert to lowercase to meet Docker repo name requirement
REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Link the docker images
id: infra-image-link
run: |
@@ -162,9 +172,11 @@
if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
else
echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT
fi
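
The new SparkR step mirrors the existing docs and lint image steps: compose a per-run image name and export it as a step output. A minimal sketch of what it computes, with an invented fork owner and run id (Docker repository names must be lowercase, hence the tr):

REPO_OWNER=$(echo "MyOrg" | tr '[:upper:]' '[:lower:]')    # -> myorg
IMG_NAME="apache-spark-ci-image-sparkr:master-1234567890"  # <branch>-<run_id>
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "$IMG_URL"  # ghcr.io/myorg/apache-spark-ci-image-sparkr:master-1234567890

Note the branch-3.5 fallback in the link step above: on that branch, which predates the per-tool images, all three *_url_link outputs point at the single combined infra image.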
# Build: build Spark and run the tests for specified modules.
@@ -405,6 +417,17 @@ jobs:
${{ needs.precondition.outputs.image_lint_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }}
- name: Build and push (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
id: docker_build_sparkr
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/sparkr/
push: true
tags: |
${{ needs.precondition.outputs.image_sparkr_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }}
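
The hashFiles() guard returns an empty string when the Dockerfile is absent, so this step is skipped on branches that predate it. The tags value is wired through the precondition job's outputs (the image_sparkr_url lines at the top of this diff). Condensed to the mechanism — a sketch with a hypothetical consumer job name, not the literal workflow:

jobs:
  precondition:
    outputs:
      image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
    steps:
      - id: infra-image-sparkr-outputs
        run: echo "image_sparkr_url=ghcr.io/..." >> $GITHUB_OUTPUT
  build-images:                  # hypothetical job name
    needs: precondition          # makes needs.precondition.outputs.* visible here
    steps:
      - uses: docker/build-push-action@v6
        with:
          tags: ${{ needs.precondition.outputs.image_sparkr_url }}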


pyspark:
@@ -564,7 +587,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 180
container:
-image: ${{ needs.precondition.outputs.image_url }}
+image: ${{ needs.precondition.outputs.image_sparkr_url_link }}
env:
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
14 changes: 14 additions & 0 deletions .github/workflows/build_infra_images_cache.yml
@@ -29,6 +29,7 @@ on:
- 'dev/infra/Dockerfile'
- 'dev/spark-test-image/docs/Dockerfile'
- 'dev/spark-test-image/lint/Dockerfile'
- 'dev/spark-test-image/sparkr/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
@@ -88,3 +89,16 @@ jobs:
- name: Image digest (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
run: echo ${{ steps.docker_build_lint.outputs.digest }}
- name: Build and push (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
id: docker_build_sparkr
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/sparkr/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }},mode=max
- name: Image digest (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
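
The two workflows form a producer/consumer pair around the registry cache: this one exports layers with cache-to ... mode=max (mode=max also publishes intermediate layers, not just those in the final image), while build_and_test.yml above only consumes them via cache-from. Roughly the buildx invocation the action wraps — a sketch, with ${{ github.ref_name }} substituted as master:

docker buildx build ./dev/spark-test-image/sparkr/ --push \
  --tag ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:master-static \
  --cache-from type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:master \
  --cache-to type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:master,mode=max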
32 changes: 32 additions & 0 deletions .github/workflows/build_python_3.11_macos.yml
@@ -0,0 +1,32 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: "Build / Python-only (master, Python 3.11, MacOS)"

on:
schedule:
- cron: '0 21 * * *'

jobs:
run-build:
permissions:
packages: write
name: Run
uses: ./.github/workflows/python_macos_test.yml
if: github.repository == 'apache/spark'
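
For readers who don't parse cron by eye, the schedule reads (annotation only, not part of the workflow):

# 0 21 * * *
# │ │  └──── every day, every month, any weekday
# │ └─────── hour 21 (UTC)
# └───────── minute 0   → once daily at 21:00 UTC

and the github.repository guard keeps the nightly from firing on forks.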
165 changes: 165 additions & 0 deletions .github/workflows/python_macos_test.yml
@@ -0,0 +1,165 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: Build and test PySpark on macOS

on:
workflow_call:
inputs:
java:
required: false
type: string
default: 17
python:
required: false
type: string
default: 3.11
branch:
description: Branch to run the build against
required: false
type: string
default: master
hadoop:
description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
required: false
type: string
default: hadoop3
envs:
description: Additional environment variables to set when running the tests. Should be in JSON format.
required: false
type: string
default: '{}'
jobs:
build:
name: "PySpark test on macos: ${{ matrix.modules }}"
runs-on: macos-15
strategy:
fail-fast: false
matrix:
java:
- ${{ inputs.java }}
python:
- ${{inputs.python}}
modules:
- >-
pyspark-sql, pyspark-resource, pyspark-testing
- >-
pyspark-core, pyspark-errors, pyspark-streaming
- >-
pyspark-mllib, pyspark-ml, pyspark-ml-connect
- >-
pyspark-connect
- >-
pyspark-pandas
- >-
pyspark-pandas-slow
- >-
pyspark-pandas-connect-part0
- >-
pyspark-pandas-connect-part1
- >-
pyspark-pandas-connect-part2
- >-
pyspark-pandas-connect-part3
env:
MODULES_TO_TEST: ${{ matrix.modules }}
PYTHON_TO_TEST: python${{inputs.python}}
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
SKIP_UNIDOC: true
SKIP_MIMA: true
SKIP_PACKAGING: true
METASPACE_SIZE: 1g
BRANCH: ${{ inputs.branch }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v4
with:
path: ~/.cache/coursier
key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
pyspark-coursier-
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ matrix.java }}
- name: Install Python packages (Python ${{matrix.python}})
run: |
python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2'
python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0'
python${{matrix.python}} -m pip install py-cpuinfo && \
python${{matrix.python}} -m pip install numpy 'pyarrow>=15.0.0' 'six==1.16.0' 'pandas==2.2.3' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \
python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && \
python${{matrix.python}} -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
python${{matrix.python}} -m pip install deepspeed torcheval && \
python${{matrix.python}} -m pip cache purge && \
python${{matrix.python}} -m pip list
# Run the tests.
- name: Run tests
env: ${{ fromJSON(inputs.envs) }}
run: |
if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
export SKIP_PACKAGING=false
echo "Python Packaging Tests Enabled!"
fi
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
- name: Upload test results to report
env: ${{ fromJSON(inputs.envs) }}
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
env: ${{ fromJSON(inputs.envs) }}
if: ${{ !success() }}
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
path: "**/target/unit-tests.log"
77 changes: 77 additions & 0 deletions dev/spark-test-image/sparkr/Dockerfile
@@ -0,0 +1,77 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Image for building and testing Spark branches. Based on Ubuntu 22.04.
# See also in https://hub.docker.com/_/ubuntu
FROM ubuntu:jammy-20240911.1
LABEL org.opencontainers.image.authors="Apache Spark project <[email protected]>"
LABEL org.opencontainers.image.licenses="Apache-2.0"
LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image for SparkR"
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE 20241114

ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

RUN apt-get update && apt-get install -y \
build-essential \
ca-certificates \
curl \
gfortran \
git \
gnupg \
libcurl4-openssl-dev \
libfontconfig1-dev \
libfreetype6-dev \
libfribidi-dev \
libgit2-dev \
libharfbuzz-dev \
libjpeg-dev \
liblapack-dev \
libopenblas-dev \
libpng-dev \
libpython3-dev \
libssl-dev \
libtiff5-dev \
libxml2-dev \
pandoc \
pkg-config \
qpdf \
r-base \
software-properties-common \
wget \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/*

RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' >> /etc/apt/sources.list
RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9
RUN gpg -a --export E084DAB9 | apt-key add -
RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/'

# See more in SPARK-39959, roxygen2 < 7.2.1
RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', \
'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', \
'ggplot2', 'mvtnorm', 'statmod', 'xml2'), repos='https://cloud.r-project.org/')" && \
Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='https://cloud.r-project.org')" && \
Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" && \
Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \
Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"

# See more in SPARK-39735
ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"
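
A quick local smoke test of the image — the tag name here is arbitrary — is to build it and confirm the roxygen2 pin took effect (SPARK-39959 requires roxygen2 < 7.2.1):

docker build -t spark-ci-sparkr dev/spark-test-image/sparkr/
docker run --rm spark-ci-sparkr Rscript -e 'packageVersion("roxygen2")'  # expect 7.2.0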
6 changes: 3 additions & 3 deletions python/pyspark/sql/connect/conversion.py
@@ -322,7 +322,7 @@ def convert_other(value: Any) -> Any:
return lambda value: value

@staticmethod
-def convert(data: Sequence[Any], schema: StructType) -> "pa.Table":
+def convert(data: Sequence[Any], schema: StructType, verifySchema: bool = False) -> "pa.Table":
assert isinstance(data, list) and len(data) > 0

assert schema is not None and isinstance(schema, StructType)
@@ -372,8 +372,8 @@ def convert(data: Sequence[Any], schema: StructType) -> "pa.Table":
]
)
)

-return pa.Table.from_arrays(pylist, schema=pa_schema)
+table = pa.Table.from_arrays(pylist, schema=pa_schema)
+return table.cast(pa_schema, safe=verifySchema)


class ArrowTableToRowsConversion:
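
The new verifySchema flag maps onto PyArrow's safe-cast semantics: safe=True makes the cast raise on lossy conversions instead of silently truncating. A standalone illustration (plain PyArrow, not Spark code):

import pyarrow as pa

t = pa.table({"x": [1, 2, 256]})          # column inferred as int64
schema = pa.schema([("x", pa.int8())])

t.cast(schema, safe=False)   # unsafe: 256 overflows int8 and wraps to 0
# t.cast(schema, safe=True)  # raises pa.lib.ArrowInvalid (value out of range)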
2 changes: 1 addition & 1 deletion python/pyspark/sql/connect/functions/builtin.py
@@ -1552,7 +1552,7 @@ def count_if(col: "ColumnOrName") -> Column:
count_if.__doc__ = pysparkfuncs.count_if.__doc__


-def histogram_numeric(col: "ColumnOrName", nBins: "ColumnOrName") -> Column:
+def histogram_numeric(col: "ColumnOrName", nBins: Column) -> Column:
return _invoke_function_over_columns("histogram_numeric", col, nBins)


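
The annotation fix narrows nBins from "ColumnOrName" to Column: the bin count is a Column expression such as lit(5), not a column-name string. Usage sketch, assuming a DataFrame df with a numeric column value:

from pyspark.sql import functions as sf

# nBins is passed as a literal Column; a plain name string would not be valid here.
df.select(sf.histogram_numeric("value", sf.lit(5)))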
