Merge branch 'apache:master' into char_varchar
jovanm-db authored Nov 15, 2024
2 parents 90c9c99 + cf90271 commit 863dd33
Showing 671 changed files with 30,845 additions and 31,022 deletions.
6 changes: 2 additions & 4 deletions .github/labeler.yml
@@ -26,16 +26,14 @@ INFRA:
'.asf.yaml',
'.gitattributes',
'.gitignore',
'dev/merge_spark_pr.py',
'dev/run-tests-jenkins*'
'dev/merge_spark_pr.py'
]

BUILD:
- changed-files:
- all-globs-to-any-file: [
'dev/**/*',
'!dev/merge_spark_pr.py',
'!dev/run-tests-jenkins*'
'!dev/merge_spark_pr.py'
]
- any-glob-to-any-file: [
'build/**/*',
118 changes: 75 additions & 43 deletions .github/workflows/build_and_test.yml
@@ -58,6 +58,10 @@ jobs:
outputs:
required: ${{ steps.set-outputs.outputs.required }}
image_url: ${{ steps.infra-image-outputs.outputs.image_url }}
image_docs_url: ${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}
image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }}
image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}
image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -134,6 +138,34 @@ jobs:
IMG_NAME="apache-spark-ci-image:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Generate infra image URL (Documentation)
id: infra-image-docs-outputs
run: |
# Convert to lowercase to meet Docker repo name requirement
REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
IMG_NAME="apache-spark-ci-image-docs:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_docs_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Generate infra image URL (Linter)
id: infra-image-lint-outputs
run: |
# Convert to lowercase to meet Docker repo name requirement
REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Link the docker images
id: infra-image-link
run: |
# Set the image URL for job "docs"
# Should delete the link and directly use image_docs_url after SPARK 3.x EOL
if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
else
echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
fi
# Build: build Spark and run the tests for specified modules.
build:
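Reviewer note: the three URLs generated above share one scheme (lowercase owner, a purpose-specific image name, and a branch-run_id tag), and on branch-3.5 the linking step falls back to the single legacy image. A minimal shell sketch with purely illustrative values (owner "Apache", run id 1234567890; not taken from this commit):

    # Reproduces the URL scheme of the steps above; all values are hypothetical.
    REPO_OWNER=$(echo "Apache" | tr '[:upper:]' '[:lower:]')   # -> apache
    for SUFFIX in "" "-docs" "-lint"; do
      echo "ghcr.io/$REPO_OWNER/apache-spark-ci-image${SUFFIX}:master-1234567890"
    done
    # ghcr.io/apache/apache-spark-ci-image:master-1234567890
    # ghcr.io/apache/apache-spark-ci-image-docs:master-1234567890
    # ghcr.io/apache/apache-spark-ci-image-lint:master-1234567890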
@@ -276,7 +308,7 @@ jobs:
- name: Install Python packages (Python 3.11)
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect')
run: |
python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1'
python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3'
python3.11 -m pip list
# Run the tests.
- name: Run tests
@@ -351,6 +383,29 @@ jobs:
${{ needs.precondition.outputs.image_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }}
- name: Build and push (Documentation)
if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
id: docker_build_docs
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/docs/
push: true
tags: |
${{ needs.precondition.outputs.image_docs_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ inputs.branch }}
- name: Build and push (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
id: docker_build_lint
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/lint/
push: true
tags: |
${{ needs.precondition.outputs.image_lint_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }}
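Reviewer note: the hashFiles('…/Dockerfile') != '' guards make the two new build steps no-ops on branches that do not yet carry those Dockerfiles (e.g. branch-3.5). Downstream jobs consume the pushed per-run images through their container: setting, as the lint and docs hunks below show. A quick way to confirm a pushed tag resolves, using a hypothetical tag that is not from this commit:

    # Check the manifest without pulling layers; the tag here is illustrative.
    docker manifest inspect \
      ghcr.io/apache/apache-spark-ci-image-docs:master-1234567890 \
      > /dev/null && echo "image published"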


pyspark:
needs: [precondition, infra-image]
@@ -614,10 +669,14 @@ jobs:
python-version: '3.11'
- name: Install dependencies for Python CodeGen check
run: |
python3.11 -m pip install 'black==23.9.1' 'protobuf==4.25.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
python3.11 -m pip install 'black==23.9.1' 'protobuf==5.28.3' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
python3.11 -m pip list
- name: Python CodeGen check
- name: Python CodeGen check for branch-3.5
if: inputs.branch == 'branch-3.5'
run: ./dev/connect-check-protos.py
- name: Python CodeGen check
if: inputs.branch != 'branch-3.5'
run: ./dev/check-protos.py

# Static analysis
lint:
@@ -635,7 +694,7 @@ jobs:
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
container:
image: ${{ needs.precondition.outputs.image_url }}
image: ${{ needs.precondition.outputs.image_lint_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -702,32 +761,15 @@ jobs:
run: ./dev/lint-java
- name: Spark connect jvm client mima check
run: ./dev/connect-jvm-client-mima-check
- name: Install Python linter dependencies for branch-3.4
if: inputs.branch == 'branch-3.4'
run: |
# SPARK-44554: Copy from https://github.com/apache/spark/blob/a05c27e85829fe742c1828507a1fd180cdc84b54/.github/workflows/build_and_test.yml#L571-L578
# Should delete this section after SPARK 3.4 EOL.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.48.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
- name: Install Python linter dependencies for branch-3.5
if: inputs.branch == 'branch-3.5'
run: |
# SPARK-45212: Copy from https://github.com/apache/spark/blob/555c8def51e5951c7bf5165a332795e9e330ec9d/.github/workflows/build_and_test.yml#L631-L638
# Should delete this section after SPARK 3.5 EOL.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
- name: Install Python dependencies for python linter and documentation generation
if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5'
run: |
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
# See 'ipython_genutils' in SPARK-38517
# See 'docutils<0.18.0' in SPARK-39421
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive numpy pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
python3.9 -m pip list
- name: List Python packages
run: python3.9 -m pip list
- name: Python linter
run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
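Reviewer note: the pip-install blocks removed above only disappear for master; their packages are presumably baked into the new lint image, which is why the added "List Python packages" step is enough to surface the inventory. A hypothetical local spot-check against that image (tag illustrative, not part of this commit):

    # Confirm the lint image ships the pinned linters; tag is hypothetical.
    docker run --rm ghcr.io/apache/apache-spark-ci-image-lint:master-1234567890 \
      python3.9 -m pip list | grep -Ei 'flake8|mypy|black'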
# Should delete this section after SPARK 3.5 EOL.
@@ -745,16 +787,16 @@ jobs:
if: inputs.branch == 'branch-3.5'
run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi
# Should delete this section after SPARK 3.5 EOL.
- name: Install JavaScript linter dependencies for branch-3.4, branch-3.5
if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
- name: Install JavaScript linter dependencies for branch-3.5
if: inputs.branch == 'branch-3.5'
run: |
apt update
apt-get install -y nodejs npm
- name: JS linter
run: ./dev/lint-js
# Should delete this section after SPARK 3.5 EOL.
- name: Install R linter dependencies for branch-3.4, branch-3.5
if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
- name: Install R linter dependencies for branch-3.5
if: inputs.branch == 'branch-3.5'
run: |
apt update
apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \
@@ -783,7 +825,7 @@ jobs:
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
container:
image: ${{ needs.precondition.outputs.image_url }}
image: ${{ needs.precondition.outputs.image_docs_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -833,20 +875,8 @@ jobs:
with:
distribution: zulu
java-version: ${{ inputs.java }}
- name: Install Python dependencies for python linter and documentation generation
if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5'
run: |
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
# See 'ipython_genutils' in SPARK-38517
# See 'docutils<0.18.0' in SPARK-39421
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
python3.9 -m pip list
- name: Install dependencies for documentation generation for branch-3.4, branch-3.5
if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
- name: Install dependencies for documentation generation for branch-3.5
if: inputs.branch == 'branch-3.5'
run: |
# pandoc is required to generate PySpark APIs as well in nbsphinx.
apt-get update -y
@@ -860,6 +890,8 @@ jobs:
python3.9 -m pip install ipython_genutils # See SPARK-38517
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
- name: List Python packages
run: python3.9 -m pip list
- name: Install dependencies for documentation generation
run: |
# Keep the version of Bundler here in sync with the following locations:
@@ -1134,7 +1166,7 @@ jobs:
export PVC_TESTS_VM_PATH=$PVC_TMP_DIR
minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 &
kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true
if [[ "${{ inputs.branch }}" == 'branch-3.5' || "${{ inputs.branch }}" == 'branch-3.4' ]]; then
if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true
else
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.9.0/installer/volcano-development.yaml || true
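Reviewer note: only the Volcano manifest version now differs between branch-3.5 and master. A sanity check after either kubectl apply could look like the following; this is a hypothetical addition (Volcano's installer manifests deploy into the volcano-system namespace), not part of this commit:

    # Optional verification after the apply above; not in this commit.
    kubectl get pods -n volcano-system
    kubectl wait --for=condition=Available deployment --all \
      -n volcano-system --timeout=300s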
51 changes: 0 additions & 51 deletions .github/workflows/build_branch34.yml

This file was deleted.

28 changes: 28 additions & 0 deletions .github/workflows/build_infra_images_cache.yml
@@ -27,6 +27,8 @@ on:
- 'branch-*'
paths:
- 'dev/infra/Dockerfile'
- 'dev/spark-test-image/docs/Dockerfile'
- 'dev/spark-test-image/lint/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
@@ -60,3 +62,29 @@ jobs:
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ github.ref_name }},mode=max
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
- name: Build and push (Documentation)
if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
id: docker_build_docs
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/docs/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ github.ref_name }},mode=max
- name: Image digest (Documentation)
if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
run: echo ${{ steps.docker_build_docs.outputs.digest }}
- name: Build and push (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
id: docker_build_lint
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/lint/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }},mode=max
- name: Image digest (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
run: echo ${{ steps.docker_build_lint.outputs.digest }}
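Reviewer note: the new steps push the image itself under a -static suffix, while cache-to writes layer metadata to the unsuffixed ref that build_and_test.yml consumes via cache-from in its docs/lint build steps earlier in this diff; mode=max exports intermediate layers as well as final ones. A hedged CLI restatement of the producer side, with the branch name as a placeholder:

    # Approximates "Build and push (Documentation)" above; BRANCH is illustrative.
    BRANCH="master"   # stands in for ${{ github.ref_name }}
    CACHE_REF="ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${BRANCH}"
    docker buildx build ./dev/spark-test-image/docs/ \
      --push \
      --tag "${CACHE_REF}-static" \
      --cache-from "type=registry,ref=${CACHE_REF}" \
      --cache-to "type=registry,ref=${CACHE_REF},mode=max"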
@@ -17,11 +17,11 @@
# under the License.
#

name: "Build / Python-only (branch-3.4)"
name: "Build / Python-only (master, Python 3.9)"

on:
schedule:
- cron: '0 9 * * *'
- cron: '0 21 * * *'

jobs:
run-build:
@@ -31,12 +31,12 @@ jobs:
uses: ./.github/workflows/build_and_test.yml
if: github.repository == 'apache/spark'
with:
java: 8
branch: branch-3.4
java: 17
branch: master
hadoop: hadoop3
envs: >-
{
"PYTHON_TO_TEST": ""
"PYTHON_TO_TEST": "python3.9"
}
jobs: >-
{
2 changes: 1 addition & 1 deletion .github/workflows/build_python_connect35.yml
@@ -70,7 +70,7 @@ jobs:
pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
# Add Python deps for Spark Connect.
pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' 'graphviz==0.20.3'
pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3'
# Add torch as a testing dependency for TorchDistributor
pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval
@@ -17,7 +17,7 @@
# under the License.
#

name: "Build / Python-only (master, PyPy 3.9)"
name: "Build / Python-only (master, PyPy 3.10)"

on:
schedule:
6 changes: 3 additions & 3 deletions .github/workflows/build_sparkr_window.yml
@@ -16,7 +16,7 @@
# specific language governing permissions and limitations
# under the License.
#
name: "Build / SparkR-only (master, 4.4.1, windows-2022)"
name: "Build / SparkR-only (master, 4.4.2, windows-2022)"

on:
schedule:
@@ -50,10 +50,10 @@ jobs:
with:
distribution: zulu
java-version: 17
- name: Install R 4.4.1
- name: Install R 4.4.2
uses: r-lib/actions/setup-r@v2
with:
r-version: 4.4.1
r-version: 4.4.2
- name: Install R dependencies
run: |
Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')"