diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4a3707404bccf..3117872e21680 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -62,6 +62,8 @@ jobs: image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }} image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }} image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }} + image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }} + image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }} steps: - name: Checkout Spark repository uses: actions/checkout@v4 @@ -154,6 +156,14 @@ jobs: IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}" IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT + - name: Generate infra image URL (SparkR) + id: infra-image-sparkr-outputs + run: | + # Convert to lowercase to meet Docker repo name requirement + REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}" + IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" + echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT - name: Link the docker images id: infra-image-link run: | @@ -162,9 +172,11 @@ jobs: if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT + echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT else echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT + echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT fi # Build: build Spark and run the tests for specified modules. 
@@ -405,6 +417,17 @@ jobs: ${{ needs.precondition.outputs.image_lint_url }} # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }} + - name: Build and push (SparkR) + if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != '' + id: docker_build_sparkr + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/sparkr/ + push: true + tags: | + ${{ needs.precondition.outputs.image_sparkr_url }} + # Use the infra image cache to speed up + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }} pyspark: @@ -564,7 +587,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 180 container: - image: ${{ needs.precondition.outputs.image_url }} + image: ${{ needs.precondition.outputs.image_sparkr_url_link }} env: HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml index b82d0633b0cee..a6beacedeebd4 100644 --- a/.github/workflows/build_infra_images_cache.yml +++ b/.github/workflows/build_infra_images_cache.yml @@ -29,6 +29,7 @@ on: - 'dev/infra/Dockerfile' - 'dev/spark-test-image/docs/Dockerfile' - 'dev/spark-test-image/lint/Dockerfile' + - 'dev/spark-test-image/sparkr/Dockerfile' - '.github/workflows/build_infra_images_cache.yml' # Create infra image when cutting down branches/tags create: @@ -88,3 +89,16 @@ jobs: - name: Image digest (Linter) if: hashFiles('dev/spark-test-image/lint/Dockerfile') != '' run: echo ${{ steps.docker_build_lint.outputs.digest }} + - name: Build and push (SparkR) + if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != '' + id: docker_build_sparkr + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/sparkr/ + push: true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }},mode=max + - name: Image digest (SparkR) + if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != '' + run: echo ${{ steps.docker_build_sparkr.outputs.digest }} diff --git a/.github/workflows/build_python_3.11_macos.yml b/.github/workflows/build_python_3.11_macos.yml new file mode 100644 index 0000000000000..4caae55b5fea8 --- /dev/null +++ b/.github/workflows/build_python_3.11_macos.yml @@ -0,0 +1,32 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Build / Python-only (master, Python 3.11, MacOS)" + +on: + schedule: + - cron: '0 21 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/python_macos_test.yml + if: github.repository == 'apache/spark' diff --git a/.github/workflows/python_macos_test.yml b/.github/workflows/python_macos_test.yml new file mode 100644 index 0000000000000..32a8e21801db3 --- /dev/null +++ b/.github/workflows/python_macos_test.yml @@ -0,0 +1,165 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: Build and test PySpark on macOS + +on: + workflow_call: + inputs: + java: + required: false + type: string + default: 17 + python: + required: false + type: string + default: 3.11 + branch: + description: Branch to run the build against + required: false + type: string + default: master + hadoop: + description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it. + required: false + type: string + default: hadoop3 + envs: + description: Additional environment variables to set when running the tests. Should be in JSON format. + required: false + type: string + default: '{}' +jobs: + build: + name: "PySpark test on macos: ${{ matrix.modules }}" + runs-on: macos-15 + strategy: + fail-fast: false + matrix: + java: + - ${{ inputs.java }} + python: + - ${{inputs.python}} + modules: + - >- + pyspark-sql, pyspark-resource, pyspark-testing + - >- + pyspark-core, pyspark-errors, pyspark-streaming + - >- + pyspark-mllib, pyspark-ml, pyspark-ml-connect + - >- + pyspark-connect + - >- + pyspark-pandas + - >- + pyspark-pandas-slow + - >- + pyspark-pandas-connect-part0 + - >- + pyspark-pandas-connect-part1 + - >- + pyspark-pandas-connect-part2 + - >- + pyspark-pandas-connect-part3 + env: + MODULES_TO_TEST: ${{ matrix.modules }} + PYTHON_TO_TEST: python${{inputs.python}} + HADOOP_PROFILE: ${{ inputs.hadoop }} + HIVE_PROFILE: hive2.3 + # GitHub Actions' default miniconda to use in pip packaging test. 
+ CONDA_PREFIX: /usr/share/miniconda + GITHUB_PREV_SHA: ${{ github.event.before }} + SPARK_LOCAL_IP: localhost + SKIP_UNIDOC: true + SKIP_MIMA: true + SKIP_PACKAGING: true + METASPACE_SIZE: 1g + BRANCH: ${{ inputs.branch }} + steps: + - name: Checkout Spark repository + uses: actions/checkout@v4 + # In order to fetch changed files + with: + fetch-depth: 0 + repository: apache/spark + ref: ${{ inputs.branch }} + - name: Sync the current branch with the latest in Apache Spark + if: github.repository != 'apache/spark' + run: | + echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty + # Cache local repositories. Note that GitHub Actions cache has a 10G limit. + - name: Cache SBT and Maven + uses: actions/cache@v4 + with: + path: | + build/apache-maven-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Coursier local repository + uses: actions/cache@v4 + with: + path: ~/.cache/coursier + key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + pyspark-coursier- + - name: Install Java ${{ matrix.java }} + uses: actions/setup-java@v4 + with: + distribution: zulu + java-version: ${{ matrix.java }} + - name: Install Python packages (Python ${{matrix.python}}) + run: | + python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2' + python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0' + python${{matrix.python}} -m pip install py-cpuinfo && \ + python${{matrix.python}} -m pip install numpy 'pyarrow>=15.0.0' 'six==1.16.0' 'pandas==2.2.3' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \ + python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && \ + python${{matrix.python}} -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + python${{matrix.python}} -m pip install deepspeed torcheval && \ + python${{matrix.python}} -m pip cache purge && \ + python${{matrix.python}} -m pip list + # Run the tests. + - name: Run tests + env: ${{ fromJSON(inputs.envs) }} + run: | + if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then + export SKIP_PACKAGING=false + echo "Python Packaging Tests Enabled!" 
+ fi + ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST" + - name: Upload test results to report + env: ${{ fromJSON(inputs.envs) }} + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} + path: "**/target/test-reports/*.xml" + - name: Upload unit tests log files + env: ${{ fromJSON(inputs.envs) }} + if: ${{ !success() }} + uses: actions/upload-artifact@v4 + with: + name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} + path: "**/target/unit-tests.log" diff --git a/dev/spark-test-image/sparkr/Dockerfile b/dev/spark-test-image/sparkr/Dockerfile new file mode 100644 index 0000000000000..43260c714a550 --- /dev/null +++ b/dev/spark-test-image/sparkr/Dockerfile @@ -0,0 +1,77 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 22.04. 
+# See also in https://hub.docker.com/_/ubuntu +FROM ubuntu:jammy-20240911.1 +LABEL org.opencontainers.image.authors="Apache Spark project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image for SparkR" +# Overwrite this label to avoid exposing the underlying Ubuntu OS version label +LABEL org.opencontainers.image.version="" + +ENV FULL_REFRESH_DATE 20241114 + +ENV DEBIAN_FRONTEND noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN true + +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + pandoc \ + pkg-config \ + qpdf \ + r-base \ + software-properties-common \ + wget \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' >> /etc/apt/sources.list +RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN gpg -a --export E084DAB9 | apt-key add - +RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' + +# See more in SPARK-39959, roxygen2 < 7.2.1 +RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', \ + 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', \ + 'ggplot2', 'mvtnorm', 'statmod', 'xml2'), repos='https://cloud.r-project.org/')" && \ + Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='https://cloud.r-project.org')" && \ + Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" && \ + Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \ + Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" + +# See more in SPARK-39735 +ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" diff --git a/python/pyspark/sql/connect/conversion.py b/python/pyspark/sql/connect/conversion.py index d803f37c5b9f1..f689c439f5f68 100644 --- a/python/pyspark/sql/connect/conversion.py +++ b/python/pyspark/sql/connect/conversion.py @@ -322,7 +322,7 @@ def convert_other(value: Any) -> Any: return lambda value: value @staticmethod - def convert(data: Sequence[Any], schema: StructType) -> "pa.Table": + def convert(data: Sequence[Any], schema: StructType, verifySchema: bool = False) -> "pa.Table": assert isinstance(data, list) and len(data) > 0 assert schema is not None and isinstance(schema, StructType) @@ -372,8 +372,8 @@ def convert(data: Sequence[Any], schema: StructType) -> "pa.Table": ] ) ) - - return pa.Table.from_arrays(pylist, schema=pa_schema) + table = pa.Table.from_arrays(pylist, schema=pa_schema) + return table.cast(pa_schema, safe=verifySchema) class ArrowTableToRowsConversion: diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py index fa1a81ab04eba..f52cdffb84b7c 100644 --- a/python/pyspark/sql/connect/functions/builtin.py +++ b/python/pyspark/sql/connect/functions/builtin.py @@ -1552,7 +1552,7 @@ def count_if(col: "ColumnOrName") -> Column: count_if.__doc__ = pysparkfuncs.count_if.__doc__ -def histogram_numeric(col: "ColumnOrName", nBins: "ColumnOrName") -> Column: 
+def histogram_numeric(col: "ColumnOrName", nBins: Column) -> Column: return _invoke_function_over_columns("histogram_numeric", col, nBins) diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py index 83b0496a84274..e7292bf8804f8 100644 --- a/python/pyspark/sql/connect/session.py +++ b/python/pyspark/sql/connect/session.py @@ -50,6 +50,7 @@ ) import urllib +from pyspark._globals import _NoValue, _NoValueType from pyspark.sql.connect.dataframe import DataFrame from pyspark.sql.dataframe import DataFrame as ParentDataFrame from pyspark.sql.connect.logging import logger @@ -449,7 +450,7 @@ def createDataFrame( data: Union["pd.DataFrame", "np.ndarray", "pa.Table", Iterable[Any]], schema: Optional[Union[AtomicType, StructType, str, List[str], Tuple[str, ...]]] = None, samplingRatio: Optional[float] = None, - verifySchema: Optional[bool] = None, + verifySchema: Union[_NoValueType, bool] = _NoValue, ) -> "ParentDataFrame": assert data is not None if isinstance(data, DataFrame): @@ -461,9 +462,6 @@ def createDataFrame( if samplingRatio is not None: warnings.warn("'samplingRatio' is ignored. It is not supported with Spark Connect.") - if verifySchema is not None: - warnings.warn("'verifySchema' is ignored. It is not supported with Spark Connect.") - _schema: Optional[Union[AtomicType, StructType]] = None _cols: Optional[List[str]] = None _num_cols: Optional[int] = None @@ -576,7 +574,10 @@ def createDataFrame( "spark.sql.session.timeZone", "spark.sql.execution.pandas.convertToArrowArraySafely" ) - ser = ArrowStreamPandasSerializer(cast(str, timezone), safecheck == "true") + if verifySchema is _NoValue: + verifySchema = safecheck == "true" + + ser = ArrowStreamPandasSerializer(cast(str, timezone), verifySchema) _table = pa.Table.from_batches( [ @@ -596,6 +597,9 @@ def createDataFrame( ).cast(arrow_schema) elif isinstance(data, pa.Table): + if verifySchema is _NoValue: + verifySchema = False + prefer_timestamp_ntz = is_timestamp_ntz_preferred() (timezone,) = self._client.get_configs("spark.sql.session.timeZone") @@ -613,7 +617,10 @@ def createDataFrame( _table = ( _check_arrow_table_timestamps_localize(data, schema, True, timezone) - .cast(to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True)) + .cast( + to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True), + safe=verifySchema, + ) .rename_columns(schema.names) ) @@ -652,6 +659,12 @@ def createDataFrame( # The _table should already have the proper column names. _cols = None + if verifySchema is not _NoValue: + warnings.warn( + "'verifySchema' is ignored. It is not supported" + " with np.ndarray input on Spark Connect." + ) + else: _data = list(data) @@ -683,12 +696,15 @@ def createDataFrame( errorClass="CANNOT_DETERMINE_TYPE", messageParameters={} ) + if verifySchema is _NoValue: + verifySchema = True + from pyspark.sql.connect.conversion import LocalDataToArrowConversion # Spark Connect will try its best to build the Arrow table with the # inferred schema in the client side, and then rename the columns and # cast the datatypes in the server side. - _table = LocalDataToArrowConversion.convert(_data, _schema) + _table = LocalDataToArrowConversion.convert(_data, _schema, cast(bool, verifySchema)) # TODO: Beside the validation on number of columns, we should also check # whether the Arrow Schema is compatible with the user provided Schema. 
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 842fdf7ae4234..5fce4a67f5662 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -6078,10 +6078,11 @@ def dense_rank() -> Column: Examples -------- - >>> from pyspark.sql import Window, types - >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType()) + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql import Window + >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], "int") >>> w = Window.orderBy("value") - >>> df.withColumn("drank", dense_rank().over(w)).show() + >>> df.withColumn("drank", sf.dense_rank().over(w)).show() +-----+-----+ |value|drank| +-----+-----+ @@ -6121,10 +6122,11 @@ def rank() -> Column: Examples -------- - >>> from pyspark.sql import Window, types - >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType()) + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql import Window + >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], "int") >>> w = Window.orderBy("value") - >>> df.withColumn("drank", rank().over(w)).show() + >>> df.withColumn("drank", sf.rank().over(w)).show() +-----+-----+ |value|drank| +-----+-----+ @@ -6157,10 +6159,11 @@ def cume_dist() -> Column: Examples -------- - >>> from pyspark.sql import Window, types - >>> df = spark.createDataFrame([1, 2, 3, 3, 4], types.IntegerType()) + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql import Window + >>> df = spark.createDataFrame([1, 2, 3, 3, 4], "int") >>> w = Window.orderBy("value") - >>> df.withColumn("cd", cume_dist().over(w)).show() + >>> df.withColumn("cd", sf.cume_dist().over(w)).show() +-----+---+ |value| cd| +-----+---+ @@ -6191,10 +6194,11 @@ def percent_rank() -> Column: Examples -------- - >>> from pyspark.sql import Window, types - >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType()) + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql import Window + >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], "int") >>> w = Window.orderBy("value") - >>> df.withColumn("pr", percent_rank().over(w)).show() + >>> df.withColumn("pr", sf.percent_rank().over(w)).show() +-----+---+ |value| pr| +-----+---+ @@ -6240,7 +6244,7 @@ def approx_count_distinct(col: "ColumnOrName", rsd: Optional[float] = None) -> C Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name The label of the column to count distinct values in. rsd : float, optional The maximum allowed relative standard deviation (default = 0.05). 
@@ -6259,47 +6263,46 @@ def approx_count_distinct(col: "ColumnOrName", rsd: Optional[float] = None) -> C -------- Example 1: Counting distinct values in a single column DataFrame representing integers - >>> from pyspark.sql.functions import approx_count_distinct + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([1,2,2,3], "int") - >>> df.agg(approx_count_distinct("value").alias('distinct_values')).show() - +---------------+ - |distinct_values| - +---------------+ - | 3| - +---------------+ + >>> df.agg(sf.approx_count_distinct("value")).show() + +----------------------------+ + |approx_count_distinct(value)| + +----------------------------+ + | 3| + +----------------------------+ Example 2: Counting distinct values in a single column DataFrame representing strings - >>> from pyspark.sql.functions import approx_count_distinct + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("apple",), ("orange",), ("apple",), ("banana",)], ['fruit']) - >>> df.agg(approx_count_distinct("fruit").alias('distinct_fruits')).show() - +---------------+ - |distinct_fruits| - +---------------+ - | 3| - +---------------+ + >>> df.agg(sf.approx_count_distinct("fruit")).show() + +----------------------------+ + |approx_count_distinct(fruit)| + +----------------------------+ + | 3| + +----------------------------+ Example 3: Counting distinct values in a DataFrame with multiple columns - >>> from pyspark.sql.functions import approx_count_distinct, struct - >>> df = spark.createDataFrame([("Alice", 1), - ... ("Alice", 2), - ... ("Bob", 3), - ... ("Bob", 3)], ["name", "value"]) - >>> df = df.withColumn("combined", struct("name", "value")) - >>> df.agg(approx_count_distinct("combined").alias('distinct_pairs')).show() - +--------------+ - |distinct_pairs| - +--------------+ - | 3| - +--------------+ + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame( + ... [("Alice", 1), ("Alice", 2), ("Bob", 3), ("Bob", 3)], ["name", "value"]) + >>> df = df.withColumn("combined", sf.struct("name", "value")) + >>> df.agg(sf.approx_count_distinct(df.combined)).show() + +-------------------------------+ + |approx_count_distinct(combined)| + +-------------------------------+ + | 3| + +-------------------------------+ Example 4: Counting distinct values with a specified relative standard deviation - >>> from pyspark.sql.functions import approx_count_distinct - >>> df = spark.range(100000) - >>> df.agg(approx_count_distinct("id").alias('with_default_rsd'), - ... approx_count_distinct("id", 0.1).alias('with_rsd_0.1')).show() + >>> from pyspark.sql import functions as sf + >>> spark.range(100000).agg( + ... sf.approx_count_distinct("id").alias('with_default_rsd'), + ... sf.approx_count_distinct("id", 0.1).alias('with_rsd_0.1') + ... 
).show() +----------------+------------+ |with_default_rsd|with_rsd_0.1| +----------------+------------+ @@ -6331,10 +6334,10 @@ def broadcast(df: "ParentDataFrame") -> "ParentDataFrame": Examples -------- - >>> from pyspark.sql import types - >>> df = spark.createDataFrame([1, 2, 3, 3, 4], types.IntegerType()) + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([1, 2, 3, 3, 4], "int") >>> df_small = spark.range(3) - >>> df_b = broadcast(df_small) + >>> df_b = sf.broadcast(df_small) >>> df.join(df_b, df.value == df_small.id).show() +-----+---+ |value| id| @@ -6360,7 +6363,7 @@ def coalesce(*cols: "ColumnOrName") -> Column: Parameters ---------- - cols : :class:`~pyspark.sql.Column` or str + cols : :class:`~pyspark.sql.Column` or column name list of columns to work on. Returns @@ -6370,8 +6373,9 @@ def coalesce(*cols: "ColumnOrName") -> Column: Examples -------- - >>> cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b")) - >>> cDf.show() + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b")) + >>> df.show() +----+----+ | a| b| +----+----+ @@ -6380,16 +6384,16 @@ def coalesce(*cols: "ColumnOrName") -> Column: |NULL| 2| +----+----+ - >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show() - +--------------+ - |coalesce(a, b)| - +--------------+ - | NULL| - | 1| - | 2| - +--------------+ + >>> df.select('*', sf.coalesce("a", df["b"])).show() + +----+----+--------------+ + | a| b|coalesce(a, b)| + +----+----+--------------+ + |NULL|NULL| NULL| + | 1|NULL| 1| + |NULL| 2| 2| + +----+----+--------------+ - >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show() + >>> df.select('*', sf.coalesce(df["a"], lit(0.0))).show() +----+----+----------------+ | a| b|coalesce(a, 0.0)| +----+----+----------------+ @@ -6413,9 +6417,9 @@ def corr(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: Parameters ---------- - col1 : :class:`~pyspark.sql.Column` or str + col1 : :class:`~pyspark.sql.Column` or column name first column to calculate correlation. - col2 : :class:`~pyspark.sql.Column` or str + col2 : :class:`~pyspark.sql.Column` or column name second column to calculate correlation. Returns @@ -6425,11 +6429,16 @@ def corr(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> a = range(20) >>> b = [2 * x for x in range(20)] >>> df = spark.createDataFrame(zip(a, b), ["a", "b"]) - >>> df.agg(corr("a", "b").alias('c')).collect() - [Row(c=1.0)] + >>> df.agg(sf.corr("a", df.b)).show() + +----------+ + |corr(a, b)| + +----------+ + | 1.0| + +----------+ """ return _invoke_function_over_columns("corr", col1, col2) @@ -6446,9 +6455,9 @@ def covar_pop(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: Parameters ---------- - col1 : :class:`~pyspark.sql.Column` or str + col1 : :class:`~pyspark.sql.Column` or column name first column to calculate covariance. - col2 : :class:`~pyspark.sql.Column` or str + col2 : :class:`~pyspark.sql.Column` or column name second column to calculate covariance. Returns @@ -6456,13 +6465,22 @@ def covar_pop(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` covariance of these two column values. 
+ See Also + -------- + :meth:`pyspark.sql.functions.covar_samp` + Examples -------- + >>> from pyspark.sql import functions as sf >>> a = [1] * 10 >>> b = [1] * 10 >>> df = spark.createDataFrame(zip(a, b), ["a", "b"]) - >>> df.agg(covar_pop("a", "b").alias('c')).collect() - [Row(c=0.0)] + >>> df.agg(sf.covar_pop("a", df.b)).show() + +---------------+ + |covar_pop(a, b)| + +---------------+ + | 0.0| + +---------------+ """ return _invoke_function_over_columns("covar_pop", col1, col2) @@ -6479,9 +6497,9 @@ def covar_samp(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: Parameters ---------- - col1 : :class:`~pyspark.sql.Column` or str + col1 : :class:`~pyspark.sql.Column` or column name first column to calculate covariance. - col2 : :class:`~pyspark.sql.Column` or str + col2 : :class:`~pyspark.sql.Column` or column name second column to calculate covariance. Returns @@ -6489,13 +6507,22 @@ def covar_samp(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` sample covariance of these two column values. + See Also + -------- + :meth:`pyspark.sql.functions.covar_pop` + Examples -------- + >>> from pyspark.sql import functions as sf >>> a = [1] * 10 >>> b = [1] * 10 >>> df = spark.createDataFrame(zip(a, b), ["a", "b"]) - >>> df.agg(covar_samp("a", "b").alias('c')).collect() - [Row(c=0.0)] + >>> df.agg(sf.covar_samp("a", df.b)).show() + +----------------+ + |covar_samp(a, b)| + +----------------+ + | 0.0| + +----------------+ """ return _invoke_function_over_columns("covar_samp", col1, col2) @@ -6544,9 +6571,9 @@ def count_distinct(col: "ColumnOrName", *cols: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name first column to compute on. - cols : :class:`~pyspark.sql.Column` or str + cols : :class:`~pyspark.sql.Column` or column name other columns to compute on. Returns @@ -6616,7 +6643,7 @@ def first(col: "ColumnOrName", ignorenulls: bool = False) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name column to fetch first value for. ignorenulls : bool if first value is null then look for first non-null value. ``False``` by default. @@ -6628,9 +6655,10 @@ def first(col: "ColumnOrName", ignorenulls: bool = False) -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age")) >>> df = df.orderBy(df.age) - >>> df.groupby("name").agg(first("age")).orderBy("name").show() + >>> df.groupby("name").agg(sf.first("age")).orderBy("name").show() +-----+----------+ | name|first(age)| +-----+----------+ @@ -6640,7 +6668,7 @@ def first(col: "ColumnOrName", ignorenulls: bool = False) -> Column: To ignore any null values, set ``ignorenulls`` to `True` - >>> df.groupby("name").agg(first("age", ignorenulls=True)).orderBy("name").show() + >>> df.groupby("name").agg(sf.first("age", ignorenulls=True)).orderBy("name").show() +-----+----------+ | name|first(age)| +-----+----------+ @@ -6666,7 +6694,7 @@ def grouping(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name column to check if it's aggregated. 
Returns @@ -6676,8 +6704,9 @@ def grouping(col: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) - >>> df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show() + >>> df.cube("name").agg(sf.grouping("name"), sf.sum("age")).orderBy("name").show() +-----+--------------+--------+ | name|grouping(name)|sum(age)| +-----+--------------+--------+ @@ -6708,7 +6737,7 @@ def grouping_id(*cols: "ColumnOrName") -> Column: Parameters ---------- - cols : :class:`~pyspark.sql.Column` or str + cols : :class:`~pyspark.sql.Column` or column name columns to check for. Returns @@ -6718,10 +6747,10 @@ def grouping_id(*cols: "ColumnOrName") -> Column: Examples -------- - >>> df = spark.createDataFrame([(1, "a", "a"), - ... (3, "a", "a"), - ... (4, "b", "c")], ["c1", "c2", "c3"]) - >>> df.cube("c2", "c3").agg(grouping_id(), sum("c1")).orderBy("c2", "c3").show() + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame( + ... [(1, "a", "a"), (3, "a", "a"), (4, "b", "c")], ["c1", "c2", "c3"]) + >>> df.cube("c2", "c3").agg(sf.grouping_id(), sf.sum("c1")).orderBy("c2", "c3").show() +----+----+-------------+-------+ | c2| c3|grouping_id()|sum(c1)| +----+----+-------------+-------+ @@ -6754,7 +6783,7 @@ def count_min_sketch( Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to compute on. eps : :class:`~pyspark.sql.Column` or float relative error, must be positive @@ -6855,9 +6884,10 @@ def input_file_name() -> Column: Examples -------- >>> import os + >>> from pyspark.sql import functions as sf >>> path = os.path.abspath(__file__) >>> df = spark.read.text(path) - >>> df.select(input_file_name()).first() + >>> df.select(sf.input_file_name()).first() Row(input_file_name()='file:///...') """ return _invoke_function("input_file_name") @@ -6874,7 +6904,7 @@ def isnan(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to compute on. Returns @@ -6884,14 +6914,15 @@ def isnan(col: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b")) - >>> df.select("a", "b", isnan("a").alias("r1"), isnan(df.b).alias("r2")).show() - +---+---+-----+-----+ - | a| b| r1| r2| - +---+---+-----+-----+ - |1.0|NaN|false| true| - |NaN|2.0| true|false| - +---+---+-----+-----+ + >>> df.select("*", sf.isnan("a"), sf.isnan(df.b)).show() + +---+---+--------+--------+ + | a| b|isnan(a)|isnan(b)| + +---+---+--------+--------+ + |1.0|NaN| false| true| + |NaN|2.0| true| false| + +---+---+--------+--------+ """ return _invoke_function_over_columns("isnan", col) @@ -6907,7 +6938,7 @@ def isnull(col: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to compute on. 
Returns @@ -6917,14 +6948,15 @@ def isnull(col: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b")) - >>> df.select("a", "b", isnull("a").alias("r1"), isnull(df.b).alias("r2")).show() - +----+----+-----+-----+ - | a| b| r1| r2| - +----+----+-----+-----+ - | 1|NULL|false| true| - |NULL| 2| true|false| - +----+----+-----+-----+ + >>> df.select("*", sf.isnull("a"), isnull(df.b)).show() + +----+----+-----------+-----------+ + | a| b|(a IS NULL)|(b IS NULL)| + +----+----+-----------+-----------+ + | 1|NULL| false| true| + |NULL| 2| true| false| + +----+----+-----------+-----------+ """ return _invoke_function_over_columns("isnull", col) @@ -6948,7 +6980,7 @@ def last(col: "ColumnOrName", ignorenulls: bool = False) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name column to fetch last value for. ignorenulls : bool if last value is null then look for non-null value. ``False``` by default. @@ -6960,9 +6992,10 @@ def last(col: "ColumnOrName", ignorenulls: bool = False) -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age")) >>> df = df.orderBy(df.age.desc()) - >>> df.groupby("name").agg(last("age")).orderBy("name").show() + >>> df.groupby("name").agg(sf.last("age")).orderBy("name").show() +-----+---------+ | name|last(age)| +-----+---------+ @@ -6972,7 +7005,7 @@ def last(col: "ColumnOrName", ignorenulls: bool = False) -> Column: To ignore any null values, set ``ignorenulls`` to `True` - >>> df.groupby("name").agg(last("age", ignorenulls=True)).orderBy("name").show() + >>> df.groupby("name").agg(sf.last("age", ignorenulls=True)).orderBy("name").show() +-----+---------+ | name|last(age)| +-----+---------+ @@ -7015,21 +7048,24 @@ def monotonically_increasing_id() -> Column: Examples -------- >>> from pyspark.sql import functions as sf - >>> spark.range(0, 10, 1, 2).select(sf.monotonically_increasing_id()).show() - +-----------------------------+ - |monotonically_increasing_id()| - +-----------------------------+ - | 0| - | 1| - | 2| - | 3| - | 4| - | 8589934592| - | 8589934593| - | 8589934594| - | 8589934595| - | 8589934596| - +-----------------------------+ + >>> spark.range(0, 10, 1, 2).select( + ... "*", + ... sf.spark_partition_id(), + ... sf.monotonically_increasing_id()).show() + +---+--------------------+-----------------------------+ + | id|SPARK_PARTITION_ID()|monotonically_increasing_id()| + +---+--------------------+-----------------------------+ + | 0| 0| 0| + | 1| 0| 1| + | 2| 0| 2| + | 3| 0| 3| + | 4| 0| 4| + | 5| 1| 8589934592| + | 6| 1| 8589934593| + | 7| 1| 8589934594| + | 8| 1| 8589934595| + | 9| 1| 8589934596| + +---+--------------------+-----------------------------+ """ return _invoke_function("monotonically_increasing_id") @@ -7047,9 +7083,9 @@ def nanvl(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: Parameters ---------- - col1 : :class:`~pyspark.sql.Column` or str + col1 : :class:`~pyspark.sql.Column` or column name first column to check. - col2 : :class:`~pyspark.sql.Column` or str + col2 : :class:`~pyspark.sql.Column` or column name second column to return if first is NaN. 
Returns @@ -7059,9 +7095,15 @@ def nanvl(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: Examples -------- + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b")) - >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect() - [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)] + >>> df.select("*", sf.nanvl("a", "b"), sf.nanvl(df.a, df.b)).show() + +---+---+-----------+-----------+ + | a| b|nanvl(a, b)|nanvl(a, b)| + +---+---+-----------+-----------+ + |1.0|NaN| 1.0| 1.0| + |NaN|2.0| 2.0| 2.0| + +---+---+-----------+-----------+ """ return _invoke_function_over_columns("nanvl", col1, col2) @@ -7079,7 +7121,7 @@ def percentile( Parameters ---------- - col : :class:`~pyspark.sql.Column` or str input column. + col : :class:`~pyspark.sql.Column` or column name percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats percentage in decimal (must be between 0.0 and 1.0). frequency : :class:`~pyspark.sql.Column` or int is a positive numeric literal which @@ -7092,28 +7134,25 @@ def percentile( Examples -------- - >>> key = (col("id") % 3).alias("key") - >>> value = (randn(42) + key * 10).alias("value") + >>> from pyspark.sql import functions as sf + >>> key = (sf.col("id") % 3).alias("key") + >>> value = (sf.randn(42) + key * 10).alias("value") >>> df = spark.range(0, 1000, 1, 1).select(key, value) - >>> df.select( - ... percentile("value", [0.25, 0.5, 0.75], lit(1)).alias("quantiles") - ... ).show() - +--------------------+ - | quantiles| - +--------------------+ - |[0.74419914941216...| - +--------------------+ + >>> df.select(sf.percentile("value", [0.25, 0.5, 0.75], sf.lit(1))).show(truncate=False) + +--------------------------------------------------------+ + |percentile(value, array(0.25, 0.5, 0.75), 1) | + +--------------------------------------------------------+ + |[0.7441991494121..., 9.9900713756..., 19.33740203080...]| + +--------------------------------------------------------+ - >>> df.groupBy("key").agg( - ... percentile("value", 0.5, lit(1)).alias("median") - ... ).show() - +---+--------------------+ - |key| median| - +---+--------------------+ - | 0|-0.03449962216667901| - | 1| 9.990389751837329| - | 2| 19.967859769284075| - +---+--------------------+ + >>> df.groupBy("key").agg(sf.percentile("value", sf.lit(0.5), sf.lit(1))).show() + +---+-------------------------+ + |key|percentile(value, 0.5, 1)| + +---+-------------------------+ + | 0| -0.03449962216667901| + | 1| 9.990389751837329| + | 2| 19.967859769284075| + +---+-------------------------+ """ percentage = lit(list(percentage)) if isinstance(percentage, (list, tuple)) else lit(percentage) return _invoke_function_over_columns("percentile", col, percentage, lit(frequency)) @@ -7137,7 +7176,7 @@ def percentile_approx( Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name input column. percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats percentage in decimal (must be between 0.0 and 1.0). @@ -7156,22 +7195,25 @@ def percentile_approx( Examples -------- - >>> key = (col("id") % 3).alias("key") - >>> value = (randn(42) + key * 10).alias("value") + >>> from pyspark.sql import functions as sf + >>> key = (sf.col("id") % 3).alias("key") + >>> value = (sf.randn(42) + key * 10).alias("value") >>> df = spark.range(0, 1000, 1, 1).select(key, value) - >>> df.select( - ... 
percentile_approx("value", [0.25, 0.5, 0.75], 1000000).alias("quantiles") - ... ).printSchema() - root - |-- quantiles: array (nullable = true) - | |-- element: double (containsNull = false) + >>> df.select(sf.percentile_approx("value", [0.25, 0.5, 0.75], 1000000)).show(truncate=False) + +----------------------------------------------------------+ + |percentile_approx(value, array(0.25, 0.5, 0.75), 1000000) | + +----------------------------------------------------------+ + |[0.7264430125286..., 9.98975299938..., 19.335304783039...]| + +----------------------------------------------------------+ - >>> df.groupBy("key").agg( - ... percentile_approx("value", 0.5, lit(1000000)).alias("median") - ... ).printSchema() - root - |-- key: long (nullable = true) - |-- median: double (nullable = true) + >>> df.groupBy("key").agg(sf.percentile_approx("value", sf.lit(0.5), sf.lit(1000000))).show() + +---+--------------------------------------+ + |key|percentile_approx(value, 0.5, 1000000)| + +---+--------------------------------------+ + | 0| -0.03519435193070...| + | 1| 9.990389751837...| + | 2| 19.967859769284...| + +---+--------------------------------------+ """ percentage = lit(list(percentage)) if isinstance(percentage, (list, tuple)) else lit(percentage) return _invoke_function_over_columns("percentile_approx", col, percentage, lit(accuracy)) @@ -7191,7 +7233,7 @@ def approx_percentile( Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name input column. percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats percentage in decimal (must be between 0.0 and 1.0). @@ -7210,23 +7252,25 @@ def approx_percentile( Examples -------- - >>> import pyspark.sql.functions as sf + >>> from pyspark.sql import functions as sf >>> key = (sf.col("id") % 3).alias("key") >>> value = (sf.randn(42) + key * 10).alias("value") >>> df = spark.range(0, 1000, 1, 1).select(key, value) - >>> df.select( - ... sf.approx_percentile("value", [0.25, 0.5, 0.75], 1000000) - ... ).printSchema() - root - |-- approx_percentile(value, array(0.25, 0.5, 0.75), 1000000): array (nullable = true) - | |-- element: double (containsNull = false) + >>> df.select(sf.approx_percentile("value", [0.25, 0.5, 0.75], 1000000)).show(truncate=False) + +----------------------------------------------------------+ + |approx_percentile(value, array(0.25, 0.5, 0.75), 1000000) | + +----------------------------------------------------------+ + |[0.7264430125286507, 9.98975299938167, 19.335304783039014]| + +----------------------------------------------------------+ - >>> df.groupBy("key").agg( - ... sf.approx_percentile("value", 0.5, sf.lit(1000000)) - ... 
).printSchema() - root - |-- key: long (nullable = true) - |-- approx_percentile(value, 0.5, 1000000): double (nullable = true) + >>> df.groupBy("key").agg(sf.approx_percentile("value", sf.lit(0.5), sf.lit(1000000))).show() + +---+--------------------------------------+ + |key|approx_percentile(value, 0.5, 1000000)| + +---+--------------------------------------+ + | 0| -0.03519435193070876| + | 1| 9.990389751837329| + | 2| 19.967859769284075| + +---+--------------------------------------+ """ percentage = lit(list(percentage)) if isinstance(percentage, (list, tuple)) else lit(percentage) return _invoke_function_over_columns("approx_percentile", col, percentage, lit(accuracy)) @@ -7261,22 +7305,22 @@ def rand(seed: Optional[int] = None) -> Column: Example 1: Generate a random column without a seed >>> from pyspark.sql import functions as sf - >>> spark.range(0, 2, 1, 1).withColumn('rand', sf.rand()).show() # doctest: +SKIP - +---+-------------------+ - | id| rand| - +---+-------------------+ - | 0|0.14879325244215424| - | 1| 0.4640631044275454| - +---+-------------------+ + >>> spark.range(0, 2, 1, 1).select("*", sf.rand()).show() # doctest: +SKIP + +---+-------------------------+ + | id|rand(-158884697681280011)| + +---+-------------------------+ + | 0| 0.9253464547887...| + | 1| 0.6533254118758...| + +---+-------------------------+ Example 2: Generate a random column with a specific seed - >>> spark.range(0, 2, 1, 1).withColumn('rand', sf.rand(seed=42) * 3).show() + >>> spark.range(0, 2, 1, 1).select("*", sf.rand(seed=42)).show() +---+------------------+ - | id| rand| + | id| rand(42)| +---+------------------+ - | 0|1.8575681106759028| - | 1|1.5288056527339444| + | 0| 0.619189370225...| + | 1|0.5096018842446...| +---+------------------+ """ if seed is not None: @@ -7314,22 +7358,22 @@ def randn(seed: Optional[int] = None) -> Column: Example 1: Generate a random column without a seed >>> from pyspark.sql import functions as sf - >>> spark.range(0, 2, 1, 1).withColumn('randn', sf.randn()).show() # doctest: +SKIP - +---+--------------------+ - | id| randn| - +---+--------------------+ - | 0|-0.45011372342934214| - | 1| 0.6567304165329736| - +---+--------------------+ + >>> spark.range(0, 2, 1, 1).select("*", sf.randn()).show() # doctest: +SKIP + +---+--------------------------+ + | id|randn(3968742514375399317)| + +---+--------------------------+ + | 0| -0.47968645355788...| + | 1| -0.4950952457305...| + +---+--------------------------+ Example 2: Generate a random column with a specific seed - >>> spark.range(0, 2, 1, 1).withColumn('randn', sf.randn(seed=42)).show() + >>> spark.range(0, 2, 1, 1).select("*", sf.randn(seed=42)).show() +---+------------------+ - | id| randn| + | id| randn(42)| +---+------------------+ - | 0| 2.384479054241165| - | 1|0.1920934041293524| + | 0| 2.384479054241...| + | 1|0.1920934041293...| +---+------------------+ """ if seed is not None: @@ -7351,7 +7395,7 @@ def round(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Co Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name The target column or column name to compute the round on. scale : :class:`~pyspark.sql.Column` or int, optional An optional parameter to control the rounding behavior. 
@@ -7407,7 +7451,7 @@ def bround(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> C Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name The target column or column name to compute the round on. scale : :class:`~pyspark.sql.Column` or int, optional An optional parameter to control the rounding behavior. @@ -7477,7 +7521,7 @@ def shiftleft(col: "ColumnOrName", numBits: int) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name input column of values to shift. numBits : int number of bits to shift. @@ -7489,8 +7533,16 @@ def shiftleft(col: "ColumnOrName", numBits: int) -> Column: Examples -------- - >>> spark.createDataFrame([(21,)], ['a']).select(shiftleft('a', 1).alias('r')).collect() - [Row(r=42)] + >>> import pyspark.sql.functions as sf + >>> spark.range(4).select("*", sf.shiftleft('id', 1)).show() + +---+----------------+ + | id|shiftleft(id, 1)| + +---+----------------+ + | 0| 0| + | 1| 2| + | 2| 4| + | 3| 6| + +---+----------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -7524,7 +7576,7 @@ def shiftright(col: "ColumnOrName", numBits: int) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name input column of values to shift. numBits : int number of bits to shift. @@ -7536,8 +7588,16 @@ def shiftright(col: "ColumnOrName", numBits: int) -> Column: Examples -------- - >>> spark.createDataFrame([(42,)], ['a']).select(shiftright('a', 1).alias('r')).collect() - [Row(r=21)] + >>> import pyspark.sql.functions as sf + >>> spark.range(4).select("*", sf.shiftright('id', 1)).show() + +---+-----------------+ + | id|shiftright(id, 1)| + +---+-----------------+ + | 0| 0| + | 1| 0| + | 2| 1| + | 3| 1| + +---+-----------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -7571,7 +7631,7 @@ def shiftrightunsigned(col: "ColumnOrName", numBits: int) -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name input column of values to shift. numBits : int number of bits to shift. 
@@ -7583,9 +7643,16 @@ def shiftrightunsigned(col: "ColumnOrName", numBits: int) -> Column: Examples -------- - >>> df = spark.createDataFrame([(-42,)], ['a']) - >>> df.select(shiftrightunsigned('a', 1).alias('r')).collect() - [Row(r=9223372036854775787)] + >>> import pyspark.sql.functions as sf + >>> spark.range(4).select("*", sf.shiftrightunsigned(sf.col('id') - 2, 1)).show() + +---+-------------------------------+ + | id|shiftrightunsigned((id - 2), 1)| + +---+-------------------------------+ + | 0| 9223372036854775807| + | 1| 9223372036854775807| + | 2| 0| + | 3| 0| + +---+-------------------------------+ """ from pyspark.sql.classic.column import _to_java_column @@ -7612,9 +7679,22 @@ def spark_partition_id() -> Column: Examples -------- - >>> df = spark.range(2) - >>> df.repartition(1).select(spark_partition_id().alias("pid")).collect() - [Row(pid=0), Row(pid=0)] + >>> import pyspark.sql.functions as sf + >>> spark.range(10, numPartitions=5).select("*", sf.spark_partition_id()).show() + +---+--------------------+ + | id|SPARK_PARTITION_ID()| + +---+--------------------+ + | 0| 0| + | 1| 0| + | 2| 1| + | 3| 1| + | 4| 2| + | 5| 2| + | 6| 3| + | 7| 3| + | 8| 4| + | 9| 4| + +---+--------------------+ """ return _invoke_function("spark_partition_id") @@ -7630,7 +7710,7 @@ def expr(str: str) -> Column: Parameters ---------- - str : str + str : expression string expression defined in string. Returns @@ -7640,8 +7720,9 @@ def expr(str: str) -> Column: Examples -------- + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([["Alice"], ["Bob"]], ["name"]) - >>> df.select("name", expr("length(name)")).show() + >>> df.select("*", sf.expr("length(name)")).show() +-----+------------+ | name|length(name)| +-----+------------+ @@ -8557,7 +8638,7 @@ def count_if(col: "ColumnOrName") -> Column: @_try_remote_functions -def histogram_numeric(col: "ColumnOrName", nBins: "ColumnOrName") -> Column: +def histogram_numeric(col: "ColumnOrName", nBins: Column) -> Column: """Computes a histogram on numeric 'col' using nb bins. The return value is an array of (x,y) pairs representing the centers of the histogram's bins. As the value of 'nb' is increased, the histogram approximation @@ -8573,9 +8654,9 @@ def histogram_numeric(col: "ColumnOrName", nBins: "ColumnOrName") -> Column: Parameters ---------- - col : :class:`~pyspark.sql.Column` or str + col : :class:`~pyspark.sql.Column` or column name target column to work on. - nBins : :class:`~pyspark.sql.Column` or str + nBins : :class:`~pyspark.sql.Column` number of Histogram columns. Returns @@ -8585,17 +8666,14 @@ def histogram_numeric(col: "ColumnOrName", nBins: "ColumnOrName") -> Column: Examples -------- - >>> df = spark.createDataFrame([("a", 1), - ... ("a", 2), - ... ("a", 3), - ... ("b", 8), - ... 
("b", 2)], ["c1", "c2"]) - >>> df.select(histogram_numeric('c2', lit(5))).show() - +------------------------+ - |histogram_numeric(c2, 5)| - +------------------------+ - | [{1, 1.0}, {2, 1....| - +------------------------+ + >>> from pyspark.sql import functions as sf + >>> df = spark.range(100, numPartitions=1) + >>> df.select(sf.histogram_numeric('id', sf.lit(5))).show(truncate=False) + +-----------------------------------------------------------+ + |histogram_numeric(id, 5) | + +-----------------------------------------------------------+ + |[{11, 25.0}, {36, 24.0}, {59, 23.0}, {84, 25.0}, {98, 3.0}]| + +-----------------------------------------------------------+ """ return _invoke_function_over_columns("histogram_numeric", col, nBins) diff --git a/python/pyspark/sql/tests/connect/test_parity_arrow.py b/python/pyspark/sql/tests/connect/test_parity_arrow.py index d47a367a5460a..99d03ad1a4409 100644 --- a/python/pyspark/sql/tests/connect/test_parity_arrow.py +++ b/python/pyspark/sql/tests/connect/test_parity_arrow.py @@ -137,9 +137,8 @@ def test_toPandas_udt(self): def test_create_dataframe_namedtuples(self): self.check_create_dataframe_namedtuples(True) - @unittest.skip("Spark Connect does not support verifySchema.") def test_createDataFrame_verifySchema(self): - super().test_createDataFrame_verifySchema() + self.check_createDataFrame_verifySchema(True) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index 19d0db9894317..99149d1a23d3e 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -533,6 +533,11 @@ def test_createDataFrame_arrow_pandas(self): self.assertEqual(df_arrow.collect(), df_pandas.collect()) def test_createDataFrame_verifySchema(self): + for arrow_enabled in [True, False]: + with self.subTest(arrow_enabled=arrow_enabled): + self.check_createDataFrame_verifySchema(arrow_enabled) + + def check_createDataFrame_verifySchema(self, arrow_enabled): data = {"id": [1, 2, 3], "value": [100000000000, 200000000000, 300000000000]} # data.value should fail schema validation when verifySchema is True schema = StructType( @@ -547,29 +552,32 @@ def test_createDataFrame_verifySchema(self): table = pa.table(data) df = self.spark.createDataFrame(table, schema=schema) self.assertEqual(df.collect(), expected) - with self.assertRaises(Exception): self.spark.createDataFrame(table, schema=schema, verifySchema=True) - # pandas DataFrame with Arrow optimization - pdf = pd.DataFrame(data) - df = self.spark.createDataFrame(pdf, schema=schema) - # verifySchema defaults to `spark.sql.execution.pandas.convertToArrowArraySafely`, - # which is false by default - self.assertEqual(df.collect(), expected) - with self.assertRaises(Exception): - with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": True}): + if arrow_enabled: + with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": True}): + # pandas DataFrame with Arrow optimization + pdf = pd.DataFrame(data) df = self.spark.createDataFrame(pdf, schema=schema) - with self.assertRaises(Exception): - df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=True) - - # pandas DataFrame without Arrow optimization - with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}): - pdf = pd.DataFrame(data) - with self.assertRaises(Exception): - df = self.spark.createDataFrame(pdf, schema=schema) # verifySchema defaults to True - df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=False) 
- self.assertEqual(df.collect(), expected) + # verifySchema defaults to `spark.sql.execution.pandas.convertToArrowArraySafely`, + # which is false by default + self.assertEqual(df.collect(), expected) + with self.assertRaises(Exception): + with self.sql_conf( + {"spark.sql.execution.pandas.convertToArrowArraySafely": True} + ): + df = self.spark.createDataFrame(pdf, schema=schema) + with self.assertRaises(Exception): + df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=True) + else: + # pandas DataFrame without Arrow optimization + with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}): + pdf = pd.DataFrame(data) + with self.assertRaises(Exception): + self.spark.createDataFrame(pdf, schema=schema) # verifySchema defaults to True + df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=False) + self.assertEqual(df.collect(), expected) def _createDataFrame_toggle(self, data, schema=None): with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}): diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index 9688ed4923737..8a30608cd4087 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -28,6 +28,7 @@ from pyspark.sql import Row from pyspark.sql import functions as F from pyspark.errors import ( + AnalysisException, ParseException, PySparkTypeError, PySparkValueError, @@ -1129,17 +1130,10 @@ def test_cast_to_string_with_udt(self): def test_cast_to_udt_with_udt(self): row = Row(point=ExamplePoint(1.0, 2.0), python_only_point=PythonOnlyPoint(1.0, 2.0)) df = self.spark.createDataFrame([row]) - result = df.select(F.col("point").cast(PythonOnlyUDT())).collect() - self.assertEqual( - result, - [Row(point=PythonOnlyPoint(1.0, 2.0))], - ) - - result = df.select(F.col("python_only_point").cast(ExamplePointUDT())).collect() - self.assertEqual( - result, - [Row(python_only_point=ExamplePoint(1.0, 2.0))], - ) + with self.assertRaises(AnalysisException): + df.select(F.col("point").cast(PythonOnlyUDT())).collect() + with self.assertRaises(AnalysisException): + df.select(F.col("python_only_point").cast(ExamplePointUDT())).collect() def test_struct_type(self): struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala index 6f2fd41f1f799..4993e249b3059 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala @@ -66,10 +66,6 @@ private[sql] object UpCastRule { case (from: UserDefinedType[_], to: UserDefinedType[_]) if to.acceptsType(from) => true - case (udt: UserDefinedType[_], toType) => canUpCast(udt.sqlType, toType) - - case (fromType, udt: UserDefinedType[_]) => canUpCast(fromType, udt.sqlType) - case _ => false } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala index e2e9bde856781..4a6504666d41f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ import org.apache.spark.sql.errors.QueryCompilationErrors import 
org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy -import org.apache.spark.sql.types.{ArrayType, DataType, DecimalType, IntegralType, MapType, StructType} +import org.apache.spark.sql.types.{ArrayType, DataType, DecimalType, IntegralType, MapType, StructType, UserDefinedType} object TableOutputResolver extends SQLConfHelper with Logging { @@ -539,7 +539,8 @@ object TableOutputResolver extends SQLConfHelper with Logging { } } else { val nullCheckedQueryExpr = checkNullability(queryExpr, tableAttr, conf, colPath) - val casted = cast(nullCheckedQueryExpr, attrTypeWithoutCharVarchar, conf, colPath.quoted) + val udtUnwrapped = unwrapUDT(nullCheckedQueryExpr) + val casted = cast(udtUnwrapped, attrTypeWithoutCharVarchar, conf, colPath.quoted) val exprWithStrLenCheck = if (conf.charVarcharAsString || !attrTypeHasCharVarchar) { casted } else { @@ -558,6 +559,39 @@ object TableOutputResolver extends SQLConfHelper with Logging { if (canWriteExpr) outputField else None } + private def unwrapUDT(expr: Expression): Expression = expr.dataType match { + case ArrayType(et, containsNull) => + val param = NamedLambdaVariable("element", et, containsNull) + val func = LambdaFunction(unwrapUDT(param), Seq(param)) + ArrayTransform(expr, func) + + case MapType(kt, vt, valueContainsNull) => + val keyParam = NamedLambdaVariable("key", kt, nullable = false) + val valueParam = NamedLambdaVariable("value", vt, valueContainsNull) + val keyFunc = LambdaFunction(unwrapUDT(keyParam), Seq(keyParam)) + val valueFunc = LambdaFunction(unwrapUDT(valueParam), Seq(valueParam)) + val newKeys = ArrayTransform(MapKeys(expr), keyFunc) + val newValues = ArrayTransform(MapValues(expr), valueFunc) + MapFromArrays(newKeys, newValues) + + case st: StructType => + val newFieldExprs = st.indices.map { i => + unwrapUDT(GetStructField(expr, i)) + } + val struct = CreateNamedStruct(st.zip(newFieldExprs).flatMap { + case (field, newExpr) => Seq(Literal(field.name), newExpr) + }) + if (expr.nullable) { + If(IsNull(expr), Literal(null, struct.dataType), struct) + } else { + struct + } + + case _: UserDefinedType[_] => UnwrapUDT(expr) + + case _ => expr + } + private def cast( expr: Expression, expectedType: DataType, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index b1e3a4ad21e41..154199d37c46d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -150,10 +150,6 @@ object Cast extends QueryErrorsBase { case (udt1: UserDefinedType[_], udt2: UserDefinedType[_]) if udt2.acceptsType(udt1) => true - case (udt: UserDefinedType[_], toType) => canAnsiCast(udt.sqlType, toType) - - case (fromType, udt: UserDefinedType[_]) => canAnsiCast(fromType, udt.sqlType) - case _ => false } @@ -271,10 +267,6 @@ object Cast extends QueryErrorsBase { case (udt1: UserDefinedType[_], udt2: UserDefinedType[_]) if udt2.acceptsType(udt1) => true - case (udt: UserDefinedType[_], toType) => canCast(udt.sqlType, toType) - - case (fromType, udt: UserDefinedType[_]) => canCast(fromType, udt.sqlType) - case _ => false } @@ -1131,42 +1123,33 @@ case class Cast( variant.VariantGet.cast(v, to, evalMode != EvalMode.TRY, timeZoneId, zoneId) }) } else { - from match { - // `castToString` has special handling for `UserDefinedType` - case udt: UserDefinedType[_] if 
!to.isInstanceOf[StringType] => - castInternal(udt.sqlType, to) - case _ => - to match { - case dt if dt == from => identity[Any] - case VariantType => input => - variant.VariantExpressionEvalUtils.castToVariant(input, from) - case _: StringType => castToString(from) - case BinaryType => castToBinary(from) - case DateType => castToDate(from) - case decimal: DecimalType => castToDecimal(from, decimal) - case TimestampType => castToTimestamp(from) - case TimestampNTZType => castToTimestampNTZ(from) - case CalendarIntervalType => castToInterval(from) - case it: DayTimeIntervalType => castToDayTimeInterval(from, it) - case it: YearMonthIntervalType => castToYearMonthInterval(from, it) - case BooleanType => castToBoolean(from) - case ByteType => castToByte(from) - case ShortType => castToShort(from) - case IntegerType => castToInt(from) - case FloatType => castToFloat(from) - case LongType => castToLong(from) - case DoubleType => castToDouble(from) - case array: ArrayType => - castArray(from.asInstanceOf[ArrayType].elementType, array.elementType) - case map: MapType => castMap(from.asInstanceOf[MapType], map) - case struct: StructType => castStruct(from.asInstanceOf[StructType], struct) - case udt: UserDefinedType[_] if udt.acceptsType(from) => - identity[Any] - case udt: UserDefinedType[_] => - castInternal(from, udt.sqlType) - case _ => - throw QueryExecutionErrors.cannotCastError(from, to) - } + to match { + case dt if dt == from => identity[Any] + case VariantType => input => variant.VariantExpressionEvalUtils.castToVariant(input, from) + case _: StringType => castToString(from) + case BinaryType => castToBinary(from) + case DateType => castToDate(from) + case decimal: DecimalType => castToDecimal(from, decimal) + case TimestampType => castToTimestamp(from) + case TimestampNTZType => castToTimestampNTZ(from) + case CalendarIntervalType => castToInterval(from) + case it: DayTimeIntervalType => castToDayTimeInterval(from, it) + case it: YearMonthIntervalType => castToYearMonthInterval(from, it) + case BooleanType => castToBoolean(from) + case ByteType => castToByte(from) + case ShortType => castToShort(from) + case IntegerType => castToInt(from) + case FloatType => castToFloat(from) + case LongType => castToLong(from) + case DoubleType => castToDouble(from) + case array: ArrayType => + castArray(from.asInstanceOf[ArrayType].elementType, array.elementType) + case map: MapType => castMap(from.asInstanceOf[MapType], map) + case struct: StructType => castStruct(from.asInstanceOf[StructType], struct) + case udt: UserDefinedType[_] if udt.acceptsType(from) => + identity[Any] + case _: UserDefinedType[_] => + throw QueryExecutionErrors.cannotCastError(from, to) } } } @@ -1228,64 +1211,54 @@ case class Cast( private[this] def nullSafeCastFunction( from: DataType, to: DataType, - ctx: CodegenContext): CastFunction = { - from match { - // `castToStringCode` has special handling for `UserDefinedType` - case udt: UserDefinedType[_] if !to.isInstanceOf[StringType] => - nullSafeCastFunction(udt.sqlType, to, ctx) - case _ => - to match { - - case _ if from == NullType => (c, evPrim, evNull) => code"$evNull = true;" - case _ if to == from => (c, evPrim, evNull) => code"$evPrim = $c;" - case _ if from.isInstanceOf[VariantType] => (c, evPrim, evNull) => - val tmp = ctx.freshVariable("tmp", classOf[Object]) - val dataTypeArg = ctx.addReferenceObj("dataType", to) - val zoneStrArg = ctx.addReferenceObj("zoneStr", timeZoneId) - val zoneIdArg = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) - val 
failOnError = evalMode != EvalMode.TRY - val cls = classOf[variant.VariantGet].getName - code""" - Object $tmp = $cls.cast($c, $dataTypeArg, $failOnError, $zoneStrArg, $zoneIdArg); - if ($tmp == null) { - $evNull = true; - } else { - $evPrim = (${CodeGenerator.boxedType(to)})$tmp; - } - """ - case VariantType => - val cls = variant.VariantExpressionEvalUtils.getClass.getName.stripSuffix("$") - val fromArg = ctx.addReferenceObj("from", from) - (c, evPrim, evNull) => code"$evPrim = $cls.castToVariant($c, $fromArg);" - case _: StringType => (c, evPrim, _) => castToStringCode(from, ctx).apply(c, evPrim) - case BinaryType => castToBinaryCode(from) - case DateType => castToDateCode(from, ctx) - case decimal: DecimalType => castToDecimalCode(from, decimal, ctx) - case TimestampType => castToTimestampCode(from, ctx) - case TimestampNTZType => castToTimestampNTZCode(from, ctx) - case CalendarIntervalType => castToIntervalCode(from) - case it: DayTimeIntervalType => castToDayTimeIntervalCode(from, it) - case it: YearMonthIntervalType => castToYearMonthIntervalCode(from, it) - case BooleanType => castToBooleanCode(from, ctx) - case ByteType => castToByteCode(from, ctx) - case ShortType => castToShortCode(from, ctx) - case IntegerType => castToIntCode(from, ctx) - case FloatType => castToFloatCode(from, ctx) - case LongType => castToLongCode(from, ctx) - case DoubleType => castToDoubleCode(from, ctx) - - case array: ArrayType => - castArrayCode(from.asInstanceOf[ArrayType].elementType, array.elementType, ctx) - case map: MapType => castMapCode(from.asInstanceOf[MapType], map, ctx) - case struct: StructType => castStructCode(from.asInstanceOf[StructType], struct, ctx) - case udt: UserDefinedType[_] if udt.acceptsType(from) => - (c, evPrim, evNull) => code"$evPrim = $c;" - case udt: UserDefinedType[_] => - nullSafeCastFunction(from, udt.sqlType, ctx) - case _ => - throw QueryExecutionErrors.cannotCastError(from, to) + ctx: CodegenContext): CastFunction = to match { + + case _ if from == NullType => (c, evPrim, evNull) => code"$evNull = true;" + case _ if to == from => (c, evPrim, evNull) => code"$evPrim = $c;" + case _ if from.isInstanceOf[VariantType] => (c, evPrim, evNull) => + val tmp = ctx.freshVariable("tmp", classOf[Object]) + val dataTypeArg = ctx.addReferenceObj("dataType", to) + val zoneStrArg = ctx.addReferenceObj("zoneStr", timeZoneId) + val zoneIdArg = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) + val failOnError = evalMode != EvalMode.TRY + val cls = classOf[variant.VariantGet].getName + code""" + Object $tmp = $cls.cast($c, $dataTypeArg, $failOnError, $zoneStrArg, $zoneIdArg); + if ($tmp == null) { + $evNull = true; + } else { + $evPrim = (${CodeGenerator.boxedType(to)})$tmp; } - } + """ + case VariantType => + val cls = variant.VariantExpressionEvalUtils.getClass.getName.stripSuffix("$") + val fromArg = ctx.addReferenceObj("from", from) + (c, evPrim, evNull) => code"$evPrim = $cls.castToVariant($c, $fromArg);" + case _: StringType => (c, evPrim, _) => castToStringCode(from, ctx).apply(c, evPrim) + case BinaryType => castToBinaryCode(from) + case DateType => castToDateCode(from, ctx) + case decimal: DecimalType => castToDecimalCode(from, decimal, ctx) + case TimestampType => castToTimestampCode(from, ctx) + case TimestampNTZType => castToTimestampNTZCode(from, ctx) + case CalendarIntervalType => castToIntervalCode(from) + case it: DayTimeIntervalType => castToDayTimeIntervalCode(from, it) + case it: YearMonthIntervalType => castToYearMonthIntervalCode(from, it) + case 
BooleanType => castToBooleanCode(from, ctx) + case ByteType => castToByteCode(from, ctx) + case ShortType => castToShortCode(from, ctx) + case IntegerType => castToIntCode(from, ctx) + case FloatType => castToFloatCode(from, ctx) + case LongType => castToLongCode(from, ctx) + case DoubleType => castToDoubleCode(from, ctx) + + case array: ArrayType => + castArrayCode(from.asInstanceOf[ArrayType].elementType, array.elementType, ctx) + case map: MapType => castMapCode(from.asInstanceOf[MapType], map, ctx) + case struct: StructType => castStructCode(from.asInstanceOf[StructType], struct, ctx) + case udt: UserDefinedType[_] if udt.acceptsType(from) => + (c, evPrim, evNull) => code"$evPrim = $c;" + case _: UserDefinedType[_] => + throw QueryExecutionErrors.cannotCastError(from, to) } // Since we need to cast input expressions recursively inside ComplexTypes, such as Map's diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 1448d8165f57b..b874cb53cb31f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -443,53 +443,47 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { override def eval(input: InternalRow): Any = value override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - def gen(ctx: CodegenContext, ev: ExprCode, dataType: DataType): ExprCode = { - val javaType = CodeGenerator.javaType(dataType) - if (value == null) { - ExprCode.forNullValue(dataType) - } else { - def toExprCode(code: String): ExprCode = { - ExprCode.forNonNullValue(JavaCode.literal(code, dataType)) - } - - dataType match { - case BooleanType | IntegerType | DateType | _: YearMonthIntervalType => - toExprCode(value.toString) - case FloatType => - value.asInstanceOf[Float] match { - case v if v.isNaN => - toExprCode("Float.NaN") - case Float.PositiveInfinity => - toExprCode("Float.POSITIVE_INFINITY") - case Float.NegativeInfinity => - toExprCode("Float.NEGATIVE_INFINITY") - case _ => - toExprCode(s"${value}F") - } - case DoubleType => - value.asInstanceOf[Double] match { - case v if v.isNaN => - toExprCode("Double.NaN") - case Double.PositiveInfinity => - toExprCode("Double.POSITIVE_INFINITY") - case Double.NegativeInfinity => - toExprCode("Double.NEGATIVE_INFINITY") - case _ => - toExprCode(s"${value}D") - } - case ByteType | ShortType => - ExprCode.forNonNullValue(JavaCode.expression(s"($javaType)$value", dataType)) - case TimestampType | TimestampNTZType | LongType | _: DayTimeIntervalType => - toExprCode(s"${value}L") - case udt: UserDefinedType[_] => - gen(ctx, ev, udt.sqlType) - case _ => - val constRef = ctx.addReferenceObj("literal", value, javaType) - ExprCode.forNonNullValue(JavaCode.global(constRef, dataType)) - } + val javaType = CodeGenerator.javaType(dataType) + if (value == null) { + ExprCode.forNullValue(dataType) + } else { + def toExprCode(code: String): ExprCode = { + ExprCode.forNonNullValue(JavaCode.literal(code, dataType)) + } + dataType match { + case BooleanType | IntegerType | DateType | _: YearMonthIntervalType => + toExprCode(value.toString) + case FloatType => + value.asInstanceOf[Float] match { + case v if v.isNaN => + toExprCode("Float.NaN") + case Float.PositiveInfinity => + toExprCode("Float.POSITIVE_INFINITY") + case Float.NegativeInfinity => + 
toExprCode("Float.NEGATIVE_INFINITY") + case _ => + toExprCode(s"${value}F") + } + case DoubleType => + value.asInstanceOf[Double] match { + case v if v.isNaN => + toExprCode("Double.NaN") + case Double.PositiveInfinity => + toExprCode("Double.POSITIVE_INFINITY") + case Double.NegativeInfinity => + toExprCode("Double.NEGATIVE_INFINITY") + case _ => + toExprCode(s"${value}D") + } + case ByteType | ShortType => + ExprCode.forNonNullValue(JavaCode.expression(s"($javaType)$value", dataType)) + case TimestampType | TimestampNTZType | LongType | _: DayTimeIntervalType => + toExprCode(s"${value}L") + case _ => + val constRef = ctx.addReferenceObj("literal", value, javaType) + ExprCode.forNonNullValue(JavaCode.global(constRef, dataType)) } } - gen(ctx, ev, dataType) } override def sql: String = (value, dataType) match { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala index f915d6efeb827..e87b54339821f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.{Date, Timestamp} -import java.time.{Duration, LocalDate, LocalDateTime, Period, Year => JYear} +import java.time.{Duration, LocalDate, LocalDateTime, Period} import java.time.temporal.ChronoUnit import java.util.{Calendar, Locale, TimeZone} @@ -37,7 +37,6 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.types.DataTypeTestUtils.{dayTimeIntervalTypes, yearMonthIntervalTypes} import org.apache.spark.sql.types.DayTimeIntervalType.{DAY, HOUR, MINUTE, SECOND} -import org.apache.spark.sql.types.TestUDT._ import org.apache.spark.sql.types.UpCastRule.numericPrecedence import org.apache.spark.sql.types.YearMonthIntervalType.{MONTH, YEAR} import org.apache.spark.unsafe.types.UTF8String @@ -1410,43 +1409,4 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { assert(!Cast(timestampLiteral, TimestampNTZType).resolved) assert(!Cast(timestampNTZLiteral, TimestampType).resolved) } - - test("SPARK-49787: Cast between UDT and other types") { - val value = new MyDenseVector(Array(1.0, 2.0, -1.0)) - val udtType = new MyDenseVectorUDT() - val targetType = ArrayType(DoubleType, containsNull = false) - - val serialized = udtType.serialize(value) - - checkEvaluation(Cast(new Literal(serialized, udtType), targetType), serialized) - checkEvaluation(Cast(new Literal(serialized, targetType), udtType), serialized) - - val year = JYear.parse("2024") - val yearUDTType = new YearUDT() - - val yearSerialized = yearUDTType.serialize(year) - - checkEvaluation(Cast(new Literal(yearSerialized, yearUDTType), IntegerType), 2024) - checkEvaluation(Cast(new Literal(2024, IntegerType), yearUDTType), yearSerialized) - - val yearString = UTF8String.fromString("2024") - checkEvaluation(Cast(new Literal(yearSerialized, yearUDTType), StringType), yearString) - checkEvaluation(Cast(new Literal(yearString, StringType), yearUDTType), yearSerialized) - } -} - -private[sql] class YearUDT extends UserDefinedType[JYear] { - override def sqlType: DataType = IntegerType - - override def serialize(obj: JYear): Int = { - obj.getValue - } - - def deserialize(datum: Any): JYear = datum match { - case value: Int => JYear.of(value) - } - - override 
def userClass: Class[JYear] = classOf[JYear] - - private[spark] override def asNullable: YearUDT = this } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/conditional-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/conditional-functions.sql.out new file mode 100644 index 0000000000000..5effa73c413a6 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/conditional-functions.sql.out @@ -0,0 +1,142 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +CREATE TABLE conditional_t USING PARQUET AS SELECT c1, c2 FROM VALUES(1d, 0),(2d, 1),(null, 1),(CAST('NaN' AS DOUBLE), 0) AS t(c1, c2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`conditional_t`, ErrorIfExists, [c1, c2] + +- Project [c1#x, c2#x] + +- SubqueryAlias t + +- LocalRelation [c1#x, c2#x] + + +-- !query +SELECT nanvl(c2, c1/c2 + c1/c2) FROM conditional_t +-- !query analysis +Project [nanvl(cast(c2#x as double), ((c1#x / cast(c2#x as double)) + (c1#x / cast(c2#x as double)))) AS nanvl(c2, ((c1 / c2) + (c1 / c2)))#x] ++- SubqueryAlias spark_catalog.default.conditional_t + +- Relation spark_catalog.default.conditional_t[c1#x,c2#x] parquet + + +-- !query +SELECT nanvl(c2, 1/0) FROM conditional_t +-- !query analysis +Project [nanvl(cast(c2#x as double), (cast(1 as double) / cast(0 as double))) AS nanvl(c2, (1 / 0))#x] ++- SubqueryAlias spark_catalog.default.conditional_t + +- Relation spark_catalog.default.conditional_t[c1#x,c2#x] parquet + + +-- !query +SELECT nanvl(1-0, 1/0) FROM conditional_t +-- !query analysis +Project [nanvl(cast((1 - 0) as double), (cast(1 as double) / cast(0 as double))) AS nanvl((1 - 0), (1 / 0))#x] ++- SubqueryAlias spark_catalog.default.conditional_t + +- Relation spark_catalog.default.conditional_t[c1#x,c2#x] parquet + + +-- !query +SELECT if(c2 >= 0, 1-0, 1/0) from conditional_t +-- !query analysis +Project [if ((c2#x >= 0)) cast((1 - 0) as double) else (cast(1 as double) / cast(0 as double)) AS (IF((c2 >= 0), (1 - 0), (1 / 0)))#x] ++- SubqueryAlias spark_catalog.default.conditional_t + +- Relation spark_catalog.default.conditional_t[c1#x,c2#x] parquet + + +-- !query +SELECT if(1 == 1, 1, 1/0) +-- !query analysis +Project [if ((1 = 1)) cast(1 as double) else (cast(1 as double) / cast(0 as double)) AS (IF((1 = 1), 1, (1 / 0)))#x] ++- OneRowRelation + + +-- !query +SELECT if(1 != 1, 1/0, 1) +-- !query analysis +Project [if (NOT (1 = 1)) (cast(1 as double) / cast(0 as double)) else cast(1 as double) AS (IF((NOT (1 = 1)), (1 / 0), 1))#x] ++- OneRowRelation + + +-- !query +SELECT coalesce(c2, 1/0) from conditional_t +-- !query analysis +Project [coalesce(cast(c2#x as double), (cast(1 as double) / cast(0 as double))) AS coalesce(c2, (1 / 0))#x] ++- SubqueryAlias spark_catalog.default.conditional_t + +- Relation spark_catalog.default.conditional_t[c1#x,c2#x] parquet + + +-- !query +SELECT coalesce(1, 1/0) +-- !query analysis +Project [coalesce(cast(1 as double), (cast(1 as double) / cast(0 as double))) AS coalesce(1, (1 / 0))#x] ++- OneRowRelation + + +-- !query +SELECT coalesce(null, 1, 1/0) +-- !query analysis +Project [coalesce(cast(null as double), cast(1 as double), (cast(1 as double) / cast(0 as double))) AS coalesce(NULL, 1, (1 / 0))#x] ++- OneRowRelation + + +-- !query +SELECT case when c2 >= 0 then 1 else 1/0 end from conditional_t +-- !query analysis +Project [CASE WHEN (c2#x >= 0) THEN cast(1 as double) ELSE (cast(1 as double) / cast(0 as double)) END AS CASE WHEN (c2 >= 0) THEN 1 ELSE (1 / 
0) END#x] ++- SubqueryAlias spark_catalog.default.conditional_t + +- Relation spark_catalog.default.conditional_t[c1#x,c2#x] parquet + + +-- !query +SELECT case when 1 < 2 then 1 else 1/0 end +-- !query analysis +Project [CASE WHEN (1 < 2) THEN cast(1 as double) ELSE (cast(1 as double) / cast(0 as double)) END AS CASE WHEN (1 < 2) THEN 1 ELSE (1 / 0) END#x] ++- OneRowRelation + + +-- !query +SELECT case when 1 > 2 then 1/0 else 1 end +-- !query analysis +Project [CASE WHEN (1 > 2) THEN (cast(1 as double) / cast(0 as double)) ELSE cast(1 as double) END AS CASE WHEN (1 > 2) THEN (1 / 0) ELSE 1 END#x] ++- OneRowRelation + + +-- !query +SELECT nullifzero(0), + nullifzero(cast(0 as tinyint)), + nullifzero(cast(0 as bigint)), + nullifzero('0'), + nullifzero(0.0), + nullifzero(1), + nullifzero(null) +-- !query analysis +Project [nullifzero(0) AS nullifzero(0)#x, nullifzero(cast(0 as tinyint)) AS nullifzero(CAST(0 AS TINYINT))#x, nullifzero(cast(0 as bigint)) AS nullifzero(CAST(0 AS BIGINT))#xL, nullifzero(0) AS nullifzero(0)#x, nullifzero(0.0) AS nullifzero(0.0)#x, nullifzero(1) AS nullifzero(1)#x, nullifzero(null) AS nullifzero(NULL)#x] ++- OneRowRelation + + +-- !query +SELECT nullifzero('abc') +-- !query analysis +Project [nullifzero(abc) AS nullifzero(abc)#x] ++- OneRowRelation + + +-- !query +SELECT zeroifnull(null), + zeroifnull(1), + zeroifnull(cast(1 as tinyint)), + zeroifnull(cast(1 as bigint)) +-- !query analysis +Project [zeroifnull(null) AS zeroifnull(NULL)#x, zeroifnull(1) AS zeroifnull(1)#x, zeroifnull(cast(1 as tinyint)) AS zeroifnull(CAST(1 AS TINYINT))#x, zeroifnull(cast(1 as bigint)) AS zeroifnull(CAST(1 AS BIGINT))#xL] ++- OneRowRelation + + +-- !query +SELECT zeroifnull('abc') +-- !query analysis +Project [zeroifnull(abc) AS zeroifnull(abc)#xL] ++- OneRowRelation + + +-- !query +DROP TABLE conditional_t +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.conditional_t diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/double-quoted-identifiers-enabled.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers-enabled.sql.out similarity index 56% rename from sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/double-quoted-identifiers-enabled.sql.out rename to sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers-enabled.sql.out index a02bf525f947d..22dfeac5fd0b6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/double-quoted-identifiers-enabled.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers-enabled.sql.out @@ -2,27 +2,32 @@ -- !query SELECT 1 FROM "not_exist" -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 25, + "fragment" : "\"not_exist\"" + } ] } -- !query USE SCHEMA "not_exist" -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "SCHEMA_NOT_FOUND", + "sqlState" : 
"42704", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" + "schemaName" : "`spark_catalog`.`not_exist`" } } @@ -30,84 +35,122 @@ org.apache.spark.sql.catalyst.parser.ParseException -- !query ALTER TABLE "not_exist" ADD COLUMN not_exist int -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 13, + "stopIndex" : 23, + "fragment" : "\"not_exist\"" + } ] } -- !query ALTER TABLE not_exist ADD COLUMN "not_exist" int -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 13, + "stopIndex" : 21, + "fragment" : "not_exist" + } ] } -- !query SELECT 1 AS "not_exist" FROM not_exist -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 30, + "stopIndex" : 38, + "fragment" : "not_exist" + } ] } -- !query SELECT 1 FROM not_exist AS X("hello") -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"hello\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 23, + "fragment" : "not_exist" + } ] } -- !query SELECT "not_exist"() -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.AnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "UNRESOLVED_ROUTINE", + "sqlState" : "42883", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "routineName" : "`not_exist`", + "searchPath" : "[`system`.`builtin`, `system`.`session`, `spark_catalog`.`default`]" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 20, + "fragment" : "\"not_exist\"()" + } ] } -- !query SELECT "not_exist".not_exist() -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.AnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "UNRESOLVED_ROUTINE", + "sqlState" : "42883", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "routineName" : "`not_exist`.`not_exist`", + "searchPath" : "[`system`.`builtin`, `system`.`session`, 
`spark_catalog`.`default`]" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 30, + "fragment" : "\"not_exist\".not_exist()" + } ] } @@ -269,29 +312,62 @@ org.apache.spark.sql.AnalysisException -- !query SELECT "hello" -- !query analysis -Project [hello AS hello#x] -+- OneRowRelation +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`hello`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 14, + "fragment" : "\"hello\"" + } ] +} -- !query CREATE TEMPORARY VIEW v(c1 COMMENT "hello") AS SELECT 1 -- !query analysis -CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, UNSUPPORTED, true - +- Project [1 AS 1#x] - +- OneRowRelation +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'\"hello\"'", + "hint" : "" + } +} -- !query DROP VIEW v -- !query analysis -DropTempViewCommand v +org.apache.spark.sql.catalyst.analysis.NoSuchTableException +{ + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`spark_catalog`.`default`.`v`" + } +} -- !query SELECT INTERVAL "1" YEAR -- !query analysis -Project [INTERVAL '1' YEAR AS INTERVAL '1' YEAR#x] -+- OneRowRelation +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'\"1\"'", + "hint" : "" + } +} -- !query @@ -325,69 +401,54 @@ Project [INTERVAL '1' YEAR AS INTERVAL '1' YEAR#x] -- !query CREATE SCHEMA "myschema" -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : "'\"myschema\"'", - "hint" : "" - } -} +CreateNamespace false ++- ResolvedNamespace V2SessionCatalog(spark_catalog), [myschema] -- !query CREATE TEMPORARY VIEW "myview"("c1") AS WITH "v"("a") AS (SELECT 1) SELECT "a" FROM "v" -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : "'\"myview\"'", - "hint" : "" - } -} +CreateViewCommand `myview`, [(c1,None)], WITH "v"("a") AS (SELECT 1) SELECT "a" FROM "v", false, false, LocalTempView, UNSUPPORTED, true + +- WithCTE + :- CTERelationDef xxxx, false + : +- SubqueryAlias v + : +- Project [1#x AS a#x] + : +- Project [1 AS 1#x] + : +- OneRowRelation + +- Project [a#x] + +- SubqueryAlias v + +- CTERelationRef xxxx, true, [a#x], false -- !query SELECT "a1" AS "a2" FROM "myview" AS "atab"("a1") -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : "'\"a2\"'", - "hint" : "" - } -} +Project [a1#x AS a2#x] ++- SubqueryAlias atab + +- Project [c1#x AS a1#x] + +- SubqueryAlias myview + +- View (`myview`, [c1#x]) + +- Project [cast(a#x as int) AS c1#x] + +- WithCTE + :- CTERelationDef xxxx, false + : +- SubqueryAlias v + : +- Project [1#x AS a#x] + : +- Project [1 AS 1#x] + : +- OneRowRelation + +- Project [a#x] + +- SubqueryAlias v + +- CTERelationRef xxxx, true, [a#x], false -- !query DROP TABLE "myview" -- !query analysis 
-org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : "'\"myview\"'", - "hint" : "" - } -} +DropTempViewCommand myview -- !query DROP SCHEMA "myschema" -- !query analysis -org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : "'\"myschema\"'", - "hint" : "" - } -} +DropNamespace false, false ++- ResolvedNamespace V2SessionCatalog(spark_catalog), [myschema] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-mosha.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-mosha.sql.out index b7dd089ba86a8..34d22ad6c6176 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-mosha.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-mosha.sql.out @@ -56,11 +56,11 @@ Sort [i#x ASC NULLS FIRST, i#x ASC NULLS FIRST, ci#xL ASC NULLS FIRST, ci#xL ASC -- !query -SELECT i + 1, f / i, substring(s, 2, 3), extract(year from t), d / 2, size(a) FROM stuff +SELECT i + 1, f / i, substring(s, 2, 3), extract(year from t), octet_length(d), size(a) FROM stuff GROUP BY ALL ORDER BY 1, 3, 4, 5, 6, 2 -- !query analysis -Sort [(i + 1)#x ASC NULLS FIRST, substring(s, 2, 3)#x ASC NULLS FIRST, extract(year FROM t)#x ASC NULLS FIRST, (d / 2)#x ASC NULLS FIRST, size(a)#x ASC NULLS FIRST, (f / i)#x ASC NULLS FIRST], true -+- Aggregate [(i#x + 1), (f#x / cast(i#x as decimal(10,0))), substring(s#x, 2, 3), extract(year, t#x), (cast(cast(d#x as bigint) as double) / cast(cast(2 as bigint) as double)), size(a#x, false)], [(i#x + 1) AS (i + 1)#x, (f#x / cast(i#x as decimal(10,0))) AS (f / i)#x, substring(s#x, 2, 3) AS substring(s, 2, 3)#x, extract(year, t#x) AS extract(year FROM t)#x, (cast(cast(d#x as bigint) as double) / cast(cast(2 as bigint) as double)) AS (d / 2)#x, size(a#x, false) AS size(a)#x] +Sort [(i + 1)#x ASC NULLS FIRST, substring(s, 2, 3)#x ASC NULLS FIRST, extract(year FROM t)#x ASC NULLS FIRST, octet_length(d)#x ASC NULLS FIRST, size(a)#x ASC NULLS FIRST, (f / i)#x ASC NULLS FIRST], true ++- Aggregate [(i#x + 1), (f#x / cast(i#x as decimal(10,0))), substring(s#x, 2, 3), extract(year, t#x), octet_length(d#x), size(a#x, false)], [(i#x + 1) AS (i + 1)#x, (f#x / cast(i#x as decimal(10,0))) AS (f / i)#x, substring(s#x, 2, 3) AS substring(s, 2, 3)#x, extract(year, t#x) AS extract(year FROM t)#x, octet_length(d#x) AS octet_length(d)#x, size(a#x, false) AS size(a)#x] +- SubqueryAlias stuff +- View (`stuff`, [i#x, f#x, s#x, t#x, d#x, a#x]) +- Project [cast(i#x as int) AS i#x, cast(f#x as decimal(6,4)) AS f#x, cast(s#x as string) AS s#x, cast(t#x as string) AS t#x, cast(d#x as string) AS d#x, cast(a#x as array) AS a#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/keywords-enforced.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/keywords-enforced.sql.out new file mode 100644 index 0000000000000..a549a03316bce --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/keywords-enforced.sql.out @@ -0,0 +1,16 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT * from SQL_KEYWORDS() +-- !query analysis +Project [keyword#x, reserved#x] ++- Generate sql_keywords(), false, [keyword#x, reserved#x] + +- OneRowRelation + + +-- !query +SELECT keyword from SQL_KEYWORDS() WHERE reserved +-- !query analysis +Project [keyword#x] ++- Filter reserved#x: boolean 
+ +- Generate sql_keywords(), false, [keyword#x, reserved#x] + +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/decimalArithmeticOperations.sql.out index bcaa991ddae04..d75f4d41bd425 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/decimalArithmeticOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/decimalArithmeticOperations.sql.out @@ -1,4 +1,45 @@ -- Automatically generated by SQLQueryTestSuite +-- !query +CREATE TEMPORARY VIEW t AS SELECT 1.0 as a, 0.0 as b +-- !query analysis +CreateViewCommand `t`, SELECT 1.0 as a, 0.0 as b, false, false, LocalTempView, UNSUPPORTED, true + +- Project [1.0 AS a#x, 0.0 AS b#x] + +- OneRowRelation + + +-- !query +select a / b from t +-- !query analysis +Project [(a#x / b#x) AS (a / b)#x] ++- SubqueryAlias t + +- View (`t`, [a#x, b#x]) + +- Project [cast(a#x as decimal(2,1)) AS a#x, cast(b#x as decimal(1,1)) AS b#x] + +- Project [1.0 AS a#x, 0.0 AS b#x] + +- OneRowRelation + + +-- !query +select a % b from t +-- !query analysis +Project [(a#x % b#x) AS (a % b)#x] ++- SubqueryAlias t + +- View (`t`, [a#x, b#x]) + +- Project [cast(a#x as decimal(2,1)) AS a#x, cast(b#x as decimal(1,1)) AS b#x] + +- Project [1.0 AS a#x, 0.0 AS b#x] + +- OneRowRelation + + +-- !query +select pmod(a, b) from t +-- !query analysis +Project [pmod(a#x, b#x) AS pmod(a, b)#x] ++- SubqueryAlias t + +- View (`t`, [a#x, b#x]) + +- Project [cast(a#x as decimal(2,1)) AS a#x, cast(b#x as decimal(1,1)) AS b#x] + +- Project [1.0 AS a#x, 0.0 AS b#x] + +- OneRowRelation + + -- !query create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet -- !query analysis @@ -14,6 +55,15 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d +- LocalRelation [col1#x, col2#x, col3#x] +-- !query +select id, a+b, a-b, a*b, a/b from decimals_test order by id +-- !query analysis +Sort [id#x ASC NULLS FIRST], true ++- Project [id#x, (a#x + b#x) AS (a + b)#x, (a#x - b#x) AS (a - b)#x, (a#x * b#x) AS (a * b)#x, (a#x / b#x) AS (a / b)#x] + +- SubqueryAlias spark_catalog.default.decimals_test + +- Relation spark_catalog.default.decimals_test[id#x,a#x,b#x] parquet + + -- !query select id, a*10, b/10 from decimals_test order by id -- !query analysis @@ -58,6 +108,13 @@ Project [(10.300000000000000000 * 3.0000000000000000000) AS (10.3000000000000000 +- OneRowRelation +-- !query +select 2.35E10 * 1.0 +-- !query analysis +Project [(2.35E10 * cast(1.0 as double)) AS (2.35E10 * 1.0)#x] ++- OneRowRelation + + -- !query select (5e36BD + 0.1) + 5e36BD -- !query analysis @@ -86,6 +143,20 @@ Project [(100000000000000000000000000000000000 / 0.1) AS (1000000000000000000000 +- OneRowRelation +-- !query +select 1.2345678901234567890E30BD * 1.2345678901234567890E25BD +-- !query analysis +Project [(1234567890123456789000000000000 * 12345678901234567890000000) AS (1234567890123456789000000000000 * 12345678901234567890000000)#x] ++- OneRowRelation + + +-- !query +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 +-- !query analysis +Project [(12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345) AS (12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345)#x] ++- OneRowRelation + + -- !query select 123456789123456789.1234567890 * 1.123456789123456789 -- !query analysis @@ -93,6 +164,212 @@ 
Project [(123456789123456789.1234567890 * 1.123456789123456789) AS (123456789123 +- OneRowRelation +-- !query +select 12345678912345.123456789123 / 0.000000012345678 +-- !query analysis +Project [(12345678912345.123456789123 / 1.2345678E-8) AS (12345678912345.123456789123 / 1.2345678E-8)#x] ++- OneRowRelation + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(3, 1)) + CAST(90 AS DECIMAL(3, 1)) +-- !query analysis +Union false, false +:- Project [cast(20 as decimal(4,1)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- OneRowRelation ++- Project [(cast(10 as decimal(3,1)) + cast(90 as decimal(3,1))) AS (CAST(10 AS DECIMAL(3,1)) + CAST(90 AS DECIMAL(3,1)))#x] + +- OneRowRelation + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(3, 1)) - CAST(-90 AS DECIMAL(3, 1)) +-- !query analysis +Union false, false +:- Project [cast(20 as decimal(4,1)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- OneRowRelation ++- Project [(cast(10 as decimal(3,1)) - cast(-90 as decimal(3,1))) AS (CAST(10 AS DECIMAL(3,1)) - CAST(-90 AS DECIMAL(3,1)))#x] + +- OneRowRelation + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(3, 1)) * CAST(10 AS DECIMAL(3, 1)) +-- !query analysis +Union false, false +:- Project [cast(CAST(20 AS DECIMAL(4,1))#x as decimal(7,2)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- Project [cast(20 as decimal(4,1)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- OneRowRelation ++- Project [(cast(10 as decimal(3,1)) * cast(10 as decimal(3,1))) AS (CAST(10 AS DECIMAL(3,1)) * CAST(10 AS DECIMAL(3,1)))#x] + +- OneRowRelation + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(3, 1)) / CAST(10 AS DECIMAL(3, 1)) +-- !query analysis +Union false, false +:- Project [cast(CAST(20 AS DECIMAL(4,1))#x as decimal(9,6)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- Project [cast(20 as decimal(4,1)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- OneRowRelation ++- Project [(cast(10 as decimal(3,1)) / cast(10 as decimal(3,1))) AS (CAST(10 AS DECIMAL(3,1)) / CAST(10 AS DECIMAL(3,1)))#x] + +- OneRowRelation + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(10, 2)) % CAST(3 AS DECIMAL(5, 1)) +-- !query analysis +Union false, false +:- Project [cast(CAST(20 AS DECIMAL(4,1))#x as decimal(6,2)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- Project [cast(20 as decimal(4,1)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- OneRowRelation ++- Project [(cast(10 as decimal(10,2)) % cast(3 as decimal(5,1))) AS (CAST(10 AS DECIMAL(10,2)) % CAST(3 AS DECIMAL(5,1)))#x] + +- OneRowRelation + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT pmod(CAST(10 AS DECIMAL(10, 2)), CAST(3 AS DECIMAL(5, 1))) +-- !query analysis +Union false, false +:- Project [cast(CAST(20 AS DECIMAL(4,1))#x as decimal(6,2)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- Project [cast(20 as decimal(4,1)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- OneRowRelation ++- Project [pmod(cast(10 as decimal(10,2)), cast(3 as decimal(5,1))) AS pmod(CAST(10 AS DECIMAL(10,2)), CAST(3 AS DECIMAL(5,1)))#x] + +- OneRowRelation + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(10, 2)) div CAST(3 AS DECIMAL(5, 1)) +-- !query analysis +Union false, false +:- Project [cast(CAST(20 AS DECIMAL(4,1))#x as decimal(21,1)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- Project [cast(20 as decimal(4,1)) AS CAST(20 AS DECIMAL(4,1))#x] +: +- OneRowRelation ++- Project [cast((CAST(10 AS DECIMAL(10,2)) div CAST(3 AS DECIMAL(5,1)))#xL as decimal(21,1)) AS 
(CAST(10 AS DECIMAL(10,2)) div CAST(3 AS DECIMAL(5,1)))#x] + +- Project [(cast(10 as decimal(10,2)) div cast(3 as decimal(5,1))) AS (CAST(10 AS DECIMAL(10,2)) div CAST(3 AS DECIMAL(5,1)))#xL] + +- OneRowRelation + + +-- !query +set spark.sql.decimalOperations.allowPrecisionLoss=false +-- !query analysis +SetCommand (spark.sql.decimalOperations.allowPrecisionLoss,Some(false)) + + +-- !query +select /*+ COALESCE(1) */ id, a+b, a-b, a*b, a/b from decimals_test order by id +-- !query analysis +Sort [id#x ASC NULLS FIRST], true ++- Repartition 1, false + +- Project [id#x, (a#x + b#x) AS (a + b)#x, (a#x - b#x) AS (a - b)#x, (a#x * b#x) AS (a * b)#x, (a#x / b#x) AS (a / b)#x] + +- SubqueryAlias spark_catalog.default.decimals_test + +- Relation spark_catalog.default.decimals_test[id#x,a#x,b#x] parquet + + +-- !query +select id, a*10, b/10 from decimals_test order by id +-- !query analysis +Sort [id#x ASC NULLS FIRST], true ++- Project [id#x, (a#x * cast(10 as decimal(2,0))) AS (a * 10)#x, (b#x / cast(10 as decimal(2,0))) AS (b / 10)#x] + +- SubqueryAlias spark_catalog.default.decimals_test + +- Relation spark_catalog.default.decimals_test[id#x,a#x,b#x] parquet + + +-- !query +select 10.3 * 3.0 +-- !query analysis +Project [(10.3 * 3.0) AS (10.3 * 3.0)#x] ++- OneRowRelation + + +-- !query +select 10.3000 * 3.0 +-- !query analysis +Project [(10.3000 * 3.0) AS (10.3000 * 3.0)#x] ++- OneRowRelation + + +-- !query +select 10.30000 * 30.0 +-- !query analysis +Project [(10.30000 * 30.0) AS (10.30000 * 30.0)#x] ++- OneRowRelation + + +-- !query +select 10.300000000000000000 * 3.000000000000000000 +-- !query analysis +Project [(10.300000000000000000 * 3.000000000000000000) AS (10.300000000000000000 * 3.000000000000000000)#x] ++- OneRowRelation + + +-- !query +select 10.300000000000000000 * 3.0000000000000000000 +-- !query analysis +Project [(10.300000000000000000 * 3.0000000000000000000) AS (10.300000000000000000 * 3.0000000000000000000)#x] ++- OneRowRelation + + +-- !query +select 2.35E10 * 1.0 +-- !query analysis +Project [(2.35E10 * cast(1.0 as double)) AS (2.35E10 * 1.0)#x] ++- OneRowRelation + + +-- !query +select (5e36BD + 0.1) + 5e36BD +-- !query analysis +Project [((5000000000000000000000000000000000000 + 0.1) + 5000000000000000000000000000000000000) AS ((5000000000000000000000000000000000000 + 0.1) + 5000000000000000000000000000000000000)#x] ++- OneRowRelation + + +-- !query +select (-4e36BD - 0.1) - 7e36BD +-- !query analysis +Project [((-4000000000000000000000000000000000000 - 0.1) - 7000000000000000000000000000000000000) AS ((-4000000000000000000000000000000000000 - 0.1) - 7000000000000000000000000000000000000)#x] ++- OneRowRelation + + +-- !query +select 12345678901234567890.0 * 12345678901234567890.0 +-- !query analysis +Project [(12345678901234567890.0 * 12345678901234567890.0) AS (12345678901234567890.0 * 12345678901234567890.0)#x] ++- OneRowRelation + + +-- !query +select 1e35BD / 0.1 +-- !query analysis +Project [(100000000000000000000000000000000000 / 0.1) AS (100000000000000000000000000000000000 / 0.1)#x] ++- OneRowRelation + + +-- !query +select 1.2345678901234567890E30BD * 1.2345678901234567890E25BD +-- !query analysis +Project [(1234567890123456789000000000000 * 12345678901234567890000000) AS (1234567890123456789000000000000 * 12345678901234567890000000)#x] ++- OneRowRelation + + +-- !query +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 +-- !query analysis +Project [(12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345) 
AS (12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345)#x] ++- OneRowRelation + + -- !query select 123456789123456789.1234567890 * 1.123456789123456789 -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/double-quoted-identifiers-disabled.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/double-quoted-identifiers.sql.out similarity index 100% rename from sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/double-quoted-identifiers-disabled.sql.out rename to sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/double-quoted-identifiers.sql.out diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/union.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/union.sql.out index 93456003254b8..dfba3688f0b7d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/union.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/union.sql.out @@ -40,7 +40,7 @@ Project [c1#x, c2#x] -- !query SELECT * -FROM (SELECT * FROM t1 +FROM (SELECT * FROM t1 where c1 = 1 UNION ALL SELECT * FROM t2 UNION ALL @@ -52,11 +52,12 @@ Project [c1#x, c2#xL] :- Union false, false : :- Project [cast(c1#x as decimal(11,1)) AS c1#x, cast(c2#x as bigint) AS c2#xL] : : +- Project [c1#x, c2#x] - : : +- SubqueryAlias t1 - : : +- View (`t1`, [c1#x, c2#x]) - : : +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as string) AS c2#x] - : : +- SubqueryAlias tbl - : : +- LocalRelation [c1#x, c2#x] + : : +- Filter (c1#x = 1) + : : +- SubqueryAlias t1 + : : +- View (`t1`, [c1#x, c2#x]) + : : +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as string) AS c2#x] + : : +- SubqueryAlias tbl + : : +- LocalRelation [c1#x, c2#x] : +- Project [cast(c1#x as decimal(11,1)) AS c1#x, cast(c2#x as bigint) AS c2#xL] : +- Project [c1#x, c2#x] : +- SubqueryAlias t2 diff --git a/sql/core/src/test/resources/sql-tests/inputs/conditional-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/conditional-functions.sql new file mode 100644 index 0000000000000..c7a4b055f024e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/conditional-functions.sql @@ -0,0 +1,38 @@ +-- Tests for conditional functions + +CREATE TABLE conditional_t USING PARQUET AS SELECT c1, c2 FROM VALUES(1d, 0),(2d, 1),(null, 1),(CAST('NaN' AS DOUBLE), 0) AS t(c1, c2); + +SELECT nanvl(c2, c1/c2 + c1/c2) FROM conditional_t; +SELECT nanvl(c2, 1/0) FROM conditional_t; +SELECT nanvl(1-0, 1/0) FROM conditional_t; + +SELECT if(c2 >= 0, 1-0, 1/0) from conditional_t; +SELECT if(1 == 1, 1, 1/0); +SELECT if(1 != 1, 1/0, 1); + +SELECT coalesce(c2, 1/0) from conditional_t; +SELECT coalesce(1, 1/0); +SELECT coalesce(null, 1, 1/0); + +SELECT case when c2 >= 0 then 1 else 1/0 end from conditional_t; +SELECT case when 1 < 2 then 1 else 1/0 end; +SELECT case when 1 > 2 then 1/0 else 1 end; + +SELECT nullifzero(0), + nullifzero(cast(0 as tinyint)), + nullifzero(cast(0 as bigint)), + nullifzero('0'), + nullifzero(0.0), + nullifzero(1), + nullifzero(null); + +SELECT nullifzero('abc'); + +SELECT zeroifnull(null), + zeroifnull(1), + zeroifnull(cast(1 as tinyint)), + zeroifnull(cast(1 as bigint)); + +SELECT zeroifnull('abc'); + +DROP TABLE conditional_t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/nonansi/double-quoted-identifiers-enabled.sql b/sql/core/src/test/resources/sql-tests/inputs/double-quoted-identifiers-enabled.sql similarity index 100% rename from 
sql/core/src/test/resources/sql-tests/inputs/nonansi/double-quoted-identifiers-enabled.sql rename to sql/core/src/test/resources/sql-tests/inputs/double-quoted-identifiers-enabled.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by-all-mosha.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by-all-mosha.sql index 4dc6d3d0189cc..451f745a97ee6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by-all-mosha.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by-all-mosha.sql @@ -12,7 +12,7 @@ SELECT i + 1 AS i1, COUNT(i - 2) ci, f / i AS fi, SUM(i + f) sif FROM stuff GROU SELECT i AS i, COUNT(i) ci, f AS f, SUM(i + f) sif FROM stuff GROUP BY ALL ORDER BY 1, i, 2, ci, 3, f, 4, sif; -SELECT i + 1, f / i, substring(s, 2, 3), extract(year from t), d / 2, size(a) FROM stuff +SELECT i + 1, f / i, substring(s, 2, 3), extract(year from t), octet_length(d), size(a) FROM stuff GROUP BY ALL ORDER BY 1, 3, 4, 5, 6, 2; -- unlike Mosha, I'm failing this case because IMO it is too implicit to automatically group by i. diff --git a/sql/core/src/test/resources/sql-tests/inputs/keywords-enforced.sql b/sql/core/src/test/resources/sql-tests/inputs/keywords-enforced.sql new file mode 100644 index 0000000000000..b0c6e5929b18d --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/keywords-enforced.sql @@ -0,0 +1,2 @@ +--SET spark.sql.ansi.enforceReservedKeywords = true +--IMPORT keywords.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/nonansi/conditional-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/nonansi/conditional-functions.sql index c7a4b055f024e..c4b3418cc6d89 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/nonansi/conditional-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/nonansi/conditional-functions.sql @@ -1,38 +1 @@ --- Tests for conditional functions - -CREATE TABLE conditional_t USING PARQUET AS SELECT c1, c2 FROM VALUES(1d, 0),(2d, 1),(null, 1),(CAST('NaN' AS DOUBLE), 0) AS t(c1, c2); - -SELECT nanvl(c2, c1/c2 + c1/c2) FROM conditional_t; -SELECT nanvl(c2, 1/0) FROM conditional_t; -SELECT nanvl(1-0, 1/0) FROM conditional_t; - -SELECT if(c2 >= 0, 1-0, 1/0) from conditional_t; -SELECT if(1 == 1, 1, 1/0); -SELECT if(1 != 1, 1/0, 1); - -SELECT coalesce(c2, 1/0) from conditional_t; -SELECT coalesce(1, 1/0); -SELECT coalesce(null, 1, 1/0); - -SELECT case when c2 >= 0 then 1 else 1/0 end from conditional_t; -SELECT case when 1 < 2 then 1 else 1/0 end; -SELECT case when 1 > 2 then 1/0 else 1 end; - -SELECT nullifzero(0), - nullifzero(cast(0 as tinyint)), - nullifzero(cast(0 as bigint)), - nullifzero('0'), - nullifzero(0.0), - nullifzero(1), - nullifzero(null); - -SELECT nullifzero('abc'); - -SELECT zeroifnull(null), - zeroifnull(1), - zeroifnull(cast(1 as tinyint)), - zeroifnull(cast(1 as bigint)); - -SELECT zeroifnull('abc'); - -DROP TABLE conditional_t; +--IMPORT conditional-functions.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/nonansi/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/nonansi/decimalArithmeticOperations.sql index c447511ba6055..7d6c336df4528 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/nonansi/decimalArithmeticOperations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/nonansi/decimalArithmeticOperations.sql @@ -1,41 +1 @@ --- SPARK-23179: SQL ANSI 2011 states that in case of overflow during arithmetic operations, --- an exception should be thrown instead of returning NULL. 
--- This is what most of the SQL DBs do (e.g. SQLServer, DB2). - --- tests for decimals handling in operations -create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet; - -insert into decimals_test values(1, 100.0, 999.0), (2, 12345.123, 12345.123), - (3, 0.1234567891011, 1234.1), (4, 123456789123456789.0, 1.123456789123456789); - --- test operations between decimals and constants -select id, a*10, b/10 from decimals_test order by id; - --- test operations on constants -select 10.3 * 3.0; -select 10.3000 * 3.0; -select 10.30000 * 30.0; -select 10.300000000000000000 * 3.000000000000000000; -select 10.300000000000000000 * 3.0000000000000000000; - --- arithmetic operations causing an overflow throw exception -select (5e36BD + 0.1) + 5e36BD; -select (-4e36BD - 0.1) - 7e36BD; -select 12345678901234567890.0 * 12345678901234567890.0; -select 1e35BD / 0.1; - --- arithmetic operations causing a precision loss throw exception -select 123456789123456789.1234567890 * 1.123456789123456789; -select 123456789123456789.1234567890 * 1.123456789123456789; -select 12345678912345.123456789123 / 0.000000012345678; - -select 1.0123456789012345678901234567890123456e36BD / 0.1; -select 1.0123456789012345678901234567890123456e35BD / 1.0; -select 1.0123456789012345678901234567890123456e34BD / 1.0; -select 1.0123456789012345678901234567890123456e33BD / 1.0; -select 1.0123456789012345678901234567890123456e32BD / 1.0; -select 1.0123456789012345678901234567890123456e31BD / 1.0; -select 1.0123456789012345678901234567890123456e31BD / 0.1; -select 1.0123456789012345678901234567890123456e31BD / 10.0; - -drop table decimals_test; +--IMPORT decimalArithmeticOperations.sql \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/nonansi/double-quoted-identifiers-disabled.sql b/sql/core/src/test/resources/sql-tests/inputs/nonansi/double-quoted-identifiers-disabled.sql deleted file mode 100644 index b8ff8cdb81376..0000000000000 --- a/sql/core/src/test/resources/sql-tests/inputs/nonansi/double-quoted-identifiers-disabled.sql +++ /dev/null @@ -1,2 +0,0 @@ ---SET spark.sql.ansi.doubleQuotedIdentifiers=false ---IMPORT double-quoted-identifiers.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/nonansi/double-quoted-identifiers.sql b/sql/core/src/test/resources/sql-tests/inputs/nonansi/double-quoted-identifiers.sql new file mode 100644 index 0000000000000..7ccd294e709b6 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/nonansi/double-quoted-identifiers.sql @@ -0,0 +1 @@ +--IMPORT double-quoted-identifiers.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/nonansi/keywords.sql b/sql/core/src/test/resources/sql-tests/inputs/nonansi/keywords.sql index b0c6e5929b18d..ebc9345c3a738 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/nonansi/keywords.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/nonansi/keywords.sql @@ -1,2 +1 @@ ---SET spark.sql.ansi.enforceReservedKeywords = true --IMPORT keywords.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/union.sql b/sql/core/src/test/resources/sql-tests/inputs/union.sql index 8a5b6c50fc1e3..ab81cc7bbabb6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/union.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/union.sql @@ -9,7 +9,7 @@ FROM (SELECT * FROM t1 -- Type Coerced Union SELECT * -FROM (SELECT * FROM t1 +FROM (SELECT * FROM t1 where c1 = 1 UNION ALL SELECT * FROM t2 UNION ALL diff --git 
a/sql/core/src/test/resources/sql-tests/results/conditional-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/conditional-functions.sql.out new file mode 100644 index 0000000000000..aa8a600f87560 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/conditional-functions.sql.out @@ -0,0 +1,202 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +CREATE TABLE conditional_t USING PARQUET AS SELECT c1, c2 FROM VALUES(1d, 0),(2d, 1),(null, 1),(CAST('NaN' AS DOUBLE), 0) AS t(c1, c2) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT nanvl(c2, c1/c2 + c1/c2) FROM conditional_t +-- !query schema +struct +-- !query output +0.0 +0.0 +1.0 +1.0 + + +-- !query +SELECT nanvl(c2, 1/0) FROM conditional_t +-- !query schema +struct +-- !query output +0.0 +0.0 +1.0 +1.0 + + +-- !query +SELECT nanvl(1-0, 1/0) FROM conditional_t +-- !query schema +struct +-- !query output +1.0 +1.0 +1.0 +1.0 + + +-- !query +SELECT if(c2 >= 0, 1-0, 1/0) from conditional_t +-- !query schema +struct<(IF((c2 >= 0), (1 - 0), (1 / 0))):double> +-- !query output +1.0 +1.0 +1.0 +1.0 + + +-- !query +SELECT if(1 == 1, 1, 1/0) +-- !query schema +struct<(IF((1 = 1), 1, (1 / 0))):double> +-- !query output +1.0 + + +-- !query +SELECT if(1 != 1, 1/0, 1) +-- !query schema +struct<(IF((NOT (1 = 1)), (1 / 0), 1)):double> +-- !query output +1.0 + + +-- !query +SELECT coalesce(c2, 1/0) from conditional_t +-- !query schema +struct +-- !query output +0.0 +0.0 +1.0 +1.0 + + +-- !query +SELECT coalesce(1, 1/0) +-- !query schema +struct +-- !query output +1.0 + + +-- !query +SELECT coalesce(null, 1, 1/0) +-- !query schema +struct +-- !query output +1.0 + + +-- !query +SELECT case when c2 >= 0 then 1 else 1/0 end from conditional_t +-- !query schema +struct= 0) THEN 1 ELSE (1 / 0) END:double> +-- !query output +1.0 +1.0 +1.0 +1.0 + + +-- !query +SELECT case when 1 < 2 then 1 else 1/0 end +-- !query schema +struct +-- !query output +1.0 + + +-- !query +SELECT case when 1 > 2 then 1/0 else 1 end +-- !query schema +struct 2) THEN (1 / 0) ELSE 1 END:double> +-- !query output +1.0 + + +-- !query +SELECT nullifzero(0), + nullifzero(cast(0 as tinyint)), + nullifzero(cast(0 as bigint)), + nullifzero('0'), + nullifzero(0.0), + nullifzero(1), + nullifzero(null) +-- !query schema +struct +-- !query output +NULL NULL NULL NULL NULL 1 NULL + + +-- !query +SELECT nullifzero('abc') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkNumberFormatException +{ + "errorClass" : "CAST_INVALID_INPUT", + "sqlState" : "22018", + "messageParameters" : { + "expression" : "'abc'", + "sourceType" : "\"STRING\"", + "targetType" : "\"BIGINT\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 24, + "fragment" : "nullifzero('abc')" + } ] +} + + +-- !query +SELECT zeroifnull(null), + zeroifnull(1), + zeroifnull(cast(1 as tinyint)), + zeroifnull(cast(1 as bigint)) +-- !query schema +struct +-- !query output +0 1 1 1 + + +-- !query +SELECT zeroifnull('abc') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkNumberFormatException +{ + "errorClass" : "CAST_INVALID_INPUT", + "sqlState" : "22018", + "messageParameters" : { + "expression" : "'abc'", + "sourceType" : "\"STRING\"", + "targetType" : "\"BIGINT\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 24, + "fragment" : "zeroifnull('abc')" + } ] +} + + +-- !query +DROP TABLE conditional_t +-- !query schema +struct<> +-- !query 
output + diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/double-quoted-identifiers-enabled.sql.out b/sql/core/src/test/resources/sql-tests/results/double-quoted-identifiers-enabled.sql.out similarity index 66% rename from sql/core/src/test/resources/sql-tests/results/nonansi/double-quoted-identifiers-enabled.sql.out rename to sql/core/src/test/resources/sql-tests/results/double-quoted-identifiers-enabled.sql.out index 81a98a60590f0..2444c399a87ec 100644 --- a/sql/core/src/test/resources/sql-tests/results/nonansi/double-quoted-identifiers-enabled.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/double-quoted-identifiers-enabled.sql.out @@ -4,14 +4,20 @@ SELECT 1 FROM "not_exist" -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 25, + "fragment" : "\"not_exist\"" + } ] } @@ -20,13 +26,12 @@ USE SCHEMA "not_exist" -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "SCHEMA_NOT_FOUND", + "sqlState" : "42704", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" + "schemaName" : "`spark_catalog`.`not_exist`" } } @@ -36,14 +41,20 @@ ALTER TABLE "not_exist" ADD COLUMN not_exist int -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 13, + "stopIndex" : 23, + "fragment" : "\"not_exist\"" + } ] } @@ -52,14 +63,20 @@ ALTER TABLE not_exist ADD COLUMN "not_exist" int -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 13, + "stopIndex" : 21, + "fragment" : "not_exist" + } ] } @@ -68,14 +85,20 @@ SELECT 1 AS "not_exist" FROM not_exist -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 30, + "stopIndex" : 38, + "fragment" : "not_exist" + } ] } @@ -84,14 +107,20 @@ SELECT 1 
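-- Not part of the generated golden file: a minimal sketch of the behavior this
-- result file captures. With spark.sql.ansi.doubleQuotedIdentifiers=true (the
-- setting whose disabled variant is deleted earlier in this patch), double quotes
-- delimit identifiers rather than string literals, so unknown names now fail at
-- analysis time (TABLE_OR_VIEW_NOT_FOUND) instead of at parse time, and string
-- literals must be written with single quotes.
SET spark.sql.ansi.doubleQuotedIdentifiers=true;
CREATE TEMPORARY VIEW "myview"("c1") AS SELECT 1;
SELECT "c1" FROM "myview";   -- "c1" and "myview" resolve as identifiers, returning 1
SELECT 'hello';              -- string literal; SELECT "hello" would look for a column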
FROM not_exist AS X("hello") -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", "messageParameters" : { - "error" : "'\"hello\"'", - "hint" : "" - } + "relationName" : "`not_exist`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 23, + "fragment" : "not_exist" + } ] } @@ -100,14 +129,21 @@ SELECT "not_exist"() -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.AnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "UNRESOLVED_ROUTINE", + "sqlState" : "42883", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "routineName" : "`not_exist`", + "searchPath" : "[`system`.`builtin`, `system`.`session`, `spark_catalog`.`default`]" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 20, + "fragment" : "\"not_exist\"()" + } ] } @@ -116,14 +152,21 @@ SELECT "not_exist".not_exist() -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +org.apache.spark.sql.AnalysisException { - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", + "errorClass" : "UNRESOLVED_ROUTINE", + "sqlState" : "42883", "messageParameters" : { - "error" : "'\"not_exist\"'", - "hint" : "" - } + "routineName" : "`not_exist`.`not_exist`", + "searchPath" : "[`system`.`builtin`, `system`.`session`, `spark_catalog`.`default`]" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 30, + "fragment" : "\"not_exist\".not_exist()" + } ] } @@ -301,9 +344,23 @@ org.apache.spark.sql.AnalysisException -- !query SELECT "hello" -- !query schema -struct +struct<> -- !query output -hello +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`hello`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 14, + "fragment" : "\"hello\"" + } ] +} -- !query @@ -311,7 +368,15 @@ CREATE TEMPORARY VIEW v(c1 COMMENT "hello") AS SELECT 1 -- !query schema struct<> -- !query output - +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'\"hello\"'", + "hint" : "" + } +} -- !query @@ -319,15 +384,30 @@ DROP VIEW v -- !query schema struct<> -- !query output - +org.apache.spark.sql.catalyst.analysis.NoSuchTableException +{ + "errorClass" : "TABLE_OR_VIEW_NOT_FOUND", + "sqlState" : "42P01", + "messageParameters" : { + "relationName" : "`spark_catalog`.`default`.`v`" + } +} -- !query SELECT INTERVAL "1" YEAR -- !query schema -struct +struct<> -- !query output -1-0 +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'\"1\"'", + "hint" : "" + } +} -- !query @@ -367,15 +447,7 @@ CREATE SCHEMA "myschema" -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : 
"'\"myschema\"'", - "hint" : "" - } -} + -- !query @@ -384,31 +456,15 @@ CREATE TEMPORARY VIEW "myview"("c1") AS -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : "'\"myview\"'", - "hint" : "" - } -} + -- !query SELECT "a1" AS "a2" FROM "myview" AS "atab"("a1") -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : "'\"a2\"'", - "hint" : "" - } -} +1 -- !query @@ -416,15 +472,7 @@ DROP TABLE "myview" -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : "'\"myview\"'", - "hint" : "" - } -} + -- !query @@ -432,12 +480,4 @@ DROP SCHEMA "myschema" -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException -{ - "errorClass" : "PARSE_SYNTAX_ERROR", - "sqlState" : "42601", - "messageParameters" : { - "error" : "'\"myschema\"'", - "hint" : "" - } -} + diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-all-mosha.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-all-mosha.sql.out index b97596df66cc1..f0708c56224fe 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-all-mosha.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-all-mosha.sql.out @@ -41,28 +41,14 @@ struct -- !query -SELECT i + 1, f / i, substring(s, 2, 3), extract(year from t), d / 2, size(a) FROM stuff +SELECT i + 1, f / i, substring(s, 2, 3), extract(year from t), octet_length(d), size(a) FROM stuff GROUP BY ALL ORDER BY 1, 3, 4, 5, 6, 2 -- !query schema -struct<> +struct<(i + 1):int,(f / i):decimal(17,15),substring(s, 2, 3):string,extract(year FROM t):int,octet_length(d):int,size(a):int> -- !query output -org.apache.spark.SparkNumberFormatException -{ - "errorClass" : "CAST_INVALID_INPUT", - "sqlState" : "22018", - "messageParameters" : { - "expression" : "'13.37'", - "sourceType" : "\"STRING\"", - "targetType" : "\"BIGINT\"" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 64, - "stopIndex" : 68, - "fragment" : "d / 2" - } ] -} +43 0.232142857142857 ell 1970 5 3 +43 0.318333333333333 est 1970 10 3 +1338 0.000923335826477 h n 2000 4 3 -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out new file mode 100644 index 0000000000000..b2331ec4ab804 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out @@ -0,0 +1,453 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT * from SQL_KEYWORDS() +-- !query schema +struct +-- !query output +ADD false +AFTER false +AGGREGATE false +ALL true +ALTER false +ALWAYS false +ANALYZE false +AND true +ANTI false +ANY true +ANY_VALUE false +ARCHIVE false +ARRAY false +AS true +ASC false +AT false +AUTHORIZATION true +BEGIN false +BETWEEN false +BIGINT false +BINARY false +BINDING false +BOOLEAN false +BOTH true +BUCKET false +BUCKETS false +BY false +BYTE false +CACHE false +CALL true +CALLED false +CASCADE false +CASE true +CAST true +CATALOG false +CATALOGS false +CHANGE false +CHAR false +CHARACTER false +CHECK true +CLEAR false +CLUSTER false +CLUSTERED 
false +CODEGEN false +COLLATE true +COLLATION true +COLLECTION false +COLUMN true +COLUMNS false +COMMENT false +COMMIT false +COMPACT false +COMPACTIONS false +COMPENSATION false +COMPUTE false +CONCATENATE false +CONSTRAINT true +CONTAINS false +COST false +CREATE true +CROSS true +CUBE false +CURRENT false +CURRENT_DATE true +CURRENT_TIME true +CURRENT_TIMESTAMP true +CURRENT_USER true +DATA false +DATABASE false +DATABASES false +DATE false +DATEADD false +DATEDIFF false +DATE_ADD false +DATE_DIFF false +DAY false +DAYOFYEAR false +DAYS false +DBPROPERTIES false +DEC false +DECIMAL false +DECLARE false +DEFAULT false +DEFINED false +DEFINER false +DELETE false +DELIMITED false +DESC false +DESCRIBE false +DETERMINISTIC false +DFS false +DIRECTORIES false +DIRECTORY false +DISTINCT true +DISTRIBUTE false +DIV false +DO false +DOUBLE false +DROP false +ELSE true +END true +ESCAPE true +ESCAPED false +EVOLUTION false +EXCEPT true +EXCHANGE false +EXCLUDE false +EXECUTE true +EXISTS false +EXPLAIN false +EXPORT false +EXTENDED false +EXTERNAL false +EXTRACT false +FALSE true +FETCH true +FIELDS false +FILEFORMAT false +FILTER true +FIRST false +FLOAT false +FOLLOWING false +FOR true +FOREIGN true +FORMAT false +FORMATTED false +FROM true +FULL true +FUNCTION false +FUNCTIONS false +GENERATED false +GLOBAL false +GRANT true +GROUP true +GROUPING false +HAVING true +HOUR false +HOURS false +IDENTIFIER false +IDENTITY false +IF false +IGNORE false +ILIKE false +IMMEDIATE false +IMPORT false +IN true +INCLUDE false +INCREMENT false +INDEX false +INDEXES false +INNER true +INPATH false +INPUT false +INPUTFORMAT false +INSERT false +INT false +INTEGER false +INTERSECT true +INTERVAL false +INTO true +INVOKER false +IS true +ITEMS false +ITERATE false +JOIN true +KEYS false +LANGUAGE false +LAST false +LATERAL true +LAZY false +LEADING true +LEAVE false +LEFT true +LIKE false +LIMIT false +LINES false +LIST false +LOAD false +LOCAL false +LOCATION false +LOCK false +LOCKS false +LOGICAL false +LONG false +LOOP false +MACRO false +MAP false +MATCHED false +MERGE false +MICROSECOND false +MICROSECONDS false +MILLISECOND false +MILLISECONDS false +MINUS false +MINUTE false +MINUTES false +MODIFIES false +MONTH false +MONTHS false +MSCK false +NAME false +NAMESPACE false +NAMESPACES false +NANOSECOND false +NANOSECONDS false +NATURAL true +NO false +NONE false +NOT true +NULL true +NULLS false +NUMERIC false +OF false +OFFSET true +ON true +ONLY true +OPTION false +OPTIONS false +OR true +ORDER true +OUT false +OUTER true +OUTPUTFORMAT false +OVER false +OVERLAPS true +OVERLAY false +OVERWRITE false +PARTITION false +PARTITIONED false +PARTITIONS false +PERCENT false +PIVOT false +PLACING false +POSITION false +PRECEDING false +PRIMARY true +PRINCIPALS false +PROPERTIES false +PURGE false +QUARTER false +QUERY false +RANGE false +READS false +REAL false +RECORDREADER false +RECORDWRITER false +RECOVER false +REDUCE false +REFERENCES true +REFRESH false +RENAME false +REPAIR false +REPEAT false +REPEATABLE false +REPLACE false +RESET false +RESPECT false +RESTRICT false +RETURN false +RETURNS false +REVOKE false +RIGHT true +ROLE false +ROLES false +ROLLBACK false +ROLLUP false +ROW false +ROWS false +SCHEMA false +SCHEMAS false +SECOND false +SECONDS false +SECURITY false +SELECT true +SEMI false +SEPARATED false +SERDE false +SERDEPROPERTIES false +SESSION_USER true +SET false +SETS false +SHORT false +SHOW false +SINGLE false +SKEWED false +SMALLINT false +SOME true +SORT false +SORTED false 
+SOURCE false +SPECIFIC false +SQL true +START false +STATISTICS false +STORED false +STRATIFY false +STRING false +STRUCT false +SUBSTR false +SUBSTRING false +SYNC false +SYSTEM_TIME false +SYSTEM_VERSION false +TABLE true +TABLES false +TABLESAMPLE false +TARGET false +TBLPROPERTIES false +TERMINATED false +THEN true +TIME true +TIMEDIFF false +TIMESTAMP false +TIMESTAMPADD false +TIMESTAMPDIFF false +TIMESTAMP_LTZ false +TIMESTAMP_NTZ false +TINYINT false +TO true +TOUCH false +TRAILING true +TRANSACTION false +TRANSACTIONS false +TRANSFORM false +TRIM false +TRUE false +TRUNCATE false +TRY_CAST false +TYPE false +UNARCHIVE false +UNBOUNDED false +UNCACHE false +UNION true +UNIQUE true +UNKNOWN true +UNLOCK false +UNPIVOT false +UNSET false +UNTIL false +UPDATE false +USE false +USER true +USING true +VALUES false +VAR false +VARCHAR false +VARIABLE false +VARIANT false +VERSION false +VIEW false +VIEWS false +VOID false +WEEK false +WEEKS false +WHEN true +WHERE true +WHILE false +WINDOW false +WITH true +WITHIN true +X false +YEAR false +YEARS false +ZONE false + + +-- !query +SELECT keyword from SQL_KEYWORDS() WHERE reserved +-- !query schema +struct +-- !query output +ALL +AND +ANY +AS +AUTHORIZATION +BOTH +CALL +CASE +CAST +CHECK +COLLATE +COLLATION +COLUMN +CONSTRAINT +CREATE +CROSS +CURRENT_DATE +CURRENT_TIME +CURRENT_TIMESTAMP +CURRENT_USER +DISTINCT +ELSE +END +ESCAPE +EXCEPT +EXECUTE +FALSE +FETCH +FILTER +FOR +FOREIGN +FROM +FULL +GRANT +GROUP +HAVING +IN +INNER +INTERSECT +INTO +IS +JOIN +LATERAL +LEADING +LEFT +NATURAL +NOT +NULL +OFFSET +ON +ONLY +OR +ORDER +OUTER +OVERLAPS +PRIMARY +REFERENCES +RIGHT +SELECT +SESSION_USER +SOME +SQL +TABLE +THEN +TIME +TO +TRAILING +UNION +UNIQUE +UNKNOWN +USER +USING +WHEN +WHERE +WITH +WITHIN diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/nonansi/decimalArithmeticOperations.sql.out index 8074a7bf2ac6d..8276168d8bb87 100644 --- a/sql/core/src/test/resources/sql-tests/results/nonansi/decimalArithmeticOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/nonansi/decimalArithmeticOperations.sql.out @@ -1,4 +1,36 @@ -- Automatically generated by SQLQueryTestSuite +-- !query +CREATE TEMPORARY VIEW t AS SELECT 1.0 as a, 0.0 as b +-- !query schema +struct<> +-- !query output + + + +-- !query +select a / b from t +-- !query schema +struct<(a / b):decimal(8,6)> +-- !query output +NULL + + +-- !query +select a % b from t +-- !query schema +struct<(a % b):decimal(1,1)> +-- !query output +NULL + + +-- !query +select pmod(a, b) from t +-- !query schema +struct +-- !query output +NULL + + -- !query create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet -- !query schema @@ -16,6 +48,17 @@ struct<> +-- !query +select id, a+b, a-b, a*b, a/b from decimals_test order by id +-- !query schema +struct +-- !query output +1 1099.00000000000000000 -899.00000000000000000 99900.000000 0.100100 +2 24690.24600000000000000 0.00000000000000000 152402061.885129 1.000000 +3 1234.22345678910110000 -1233.97654321089890000 152.358023 0.000100 +4 123456789123456790.12345678912345679 123456789123456787.87654321087654321 138698367904130467.515623 109890109097814272.043109 + + -- !query select id, a*10, b/10 from decimals_test order by id -- !query schema @@ -67,6 +110,14 @@ struct<(10.300000000000000000 * 3.0000000000000000000):decimal(38,34)> 30.9000000000000000000000000000000000 +-- !query +select 2.35E10 * 1.0 +-- !query 
schema +struct<(2.35E10 * 1.0):double> +-- !query output +2.35E10 + + -- !query select (5e36BD + 0.1) + 5e36BD -- !query schema @@ -100,11 +151,19 @@ NULL -- !query -select 123456789123456789.1234567890 * 1.123456789123456789 +select 1.2345678901234567890E30BD * 1.2345678901234567890E25BD -- !query schema -struct<(123456789123456789.1234567890 * 1.123456789123456789):decimal(38,18)> +struct<(1234567890123456789000000000000 * 12345678901234567890000000):decimal(38,0)> -- !query output -138698367904130467.654320988515622621 +NULL + + +-- !query +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 +-- !query schema +struct<(12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345):decimal(38,6)> +-- !query output +10012345678912345678912345678911.246907 -- !query @@ -123,10 +182,229 @@ struct<(12345678912345.123456789123 / 1.2345678E-8):decimal(38,9)> 1000000073899961059796.725866332 +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(3, 1)) + CAST(90 AS DECIMAL(3, 1)) +-- !query schema +struct +-- !query output +100.0 +20.0 + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(3, 1)) - CAST(-90 AS DECIMAL(3, 1)) +-- !query schema +struct +-- !query output +100.0 +20.0 + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(3, 1)) * CAST(10 AS DECIMAL(3, 1)) +-- !query schema +struct +-- !query output +100.00 +20.00 + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(3, 1)) / CAST(10 AS DECIMAL(3, 1)) +-- !query schema +struct +-- !query output +1.000000 +20.000000 + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(10, 2)) % CAST(3 AS DECIMAL(5, 1)) +-- !query schema +struct +-- !query output +1.00 +20.00 + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT pmod(CAST(10 AS DECIMAL(10, 2)), CAST(3 AS DECIMAL(5, 1))) +-- !query schema +struct +-- !query output +1.00 +20.00 + + +-- !query +SELECT CAST(20 AS DECIMAL(4, 1)) +UNION ALL +SELECT CAST(10 AS DECIMAL(10, 2)) div CAST(3 AS DECIMAL(5, 1)) +-- !query schema +struct +-- !query output +20.0 +3.0 + + +-- !query +set spark.sql.decimalOperations.allowPrecisionLoss=false +-- !query schema +struct +-- !query output +spark.sql.decimalOperations.allowPrecisionLoss false + + +-- !query +select /*+ COALESCE(1) */ id, a+b, a-b, a*b, a/b from decimals_test order by id +-- !query schema +struct +-- !query output +1 1099.000000000000000000 -899.000000000000000000 NULL 0.100100100100100100 +2 24690.246000000000000000 0.000000000000000000 NULL 1.000000000000000000 +3 1234.223456789101100000 -1233.976543210898900000 NULL 0.000100037913541123 +4 123456789123456790.123456789123456789 123456789123456787.876543210876543211 NULL 109890109097814272.043109406191131436 + + +-- !query +select id, a*10, b/10 from decimals_test order by id +-- !query schema +struct +-- !query output +1 1000.000000000000000000 99.9000000000000000000 +2 123451.230000000000000000 1234.5123000000000000000 +3 1.234567891011000000 123.4100000000000000000 +4 1234567891234567890.000000000000000000 0.1123456789123456789 + + +-- !query +select 10.3 * 3.0 +-- !query schema +struct<(10.3 * 3.0):decimal(6,2)> +-- !query output +30.90 + + +-- !query +select 10.3000 * 3.0 +-- !query schema +struct<(10.3000 * 3.0):decimal(9,5)> +-- !query output +30.90000 + + +-- !query +select 10.30000 * 30.0 +-- !query schema +struct<(10.30000 * 30.0):decimal(11,6)> +-- !query output 
+309.000000 + + +-- !query +select 10.300000000000000000 * 3.000000000000000000 +-- !query schema +struct<(10.300000000000000000 * 3.000000000000000000):decimal(38,36)> +-- !query output +30.900000000000000000000000000000000000 + + +-- !query +select 10.300000000000000000 * 3.0000000000000000000 +-- !query schema +struct<(10.300000000000000000 * 3.0000000000000000000):decimal(38,37)> +-- !query output +NULL + + +-- !query +select 2.35E10 * 1.0 +-- !query schema +struct<(2.35E10 * 1.0):double> +-- !query output +2.35E10 + + +-- !query +select (5e36BD + 0.1) + 5e36BD +-- !query schema +struct<((5000000000000000000000000000000000000 + 0.1) + 5000000000000000000000000000000000000):decimal(38,1)> +-- !query output +NULL + + +-- !query +select (-4e36BD - 0.1) - 7e36BD +-- !query schema +struct<((-4000000000000000000000000000000000000 - 0.1) - 7000000000000000000000000000000000000):decimal(38,1)> +-- !query output +NULL + + +-- !query +select 12345678901234567890.0 * 12345678901234567890.0 +-- !query schema +struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +-- !query output +NULL + + +-- !query +select 1e35BD / 0.1 +-- !query schema +struct<(100000000000000000000000000000000000 / 0.1):decimal(38,3)> +-- !query output +NULL + + +-- !query +select 1.2345678901234567890E30BD * 1.2345678901234567890E25BD +-- !query schema +struct<(1234567890123456789000000000000 * 12345678901234567890000000):decimal(38,0)> +-- !query output +NULL + + +-- !query +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 +-- !query schema +struct<(12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345):decimal(38,7)> +-- !query output +NULL + + +-- !query +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query schema +struct<(123456789123456789.1234567890 * 1.123456789123456789):decimal(38,28)> +-- !query output +NULL + + +-- !query +select 12345678912345.123456789123 / 0.000000012345678 +-- !query schema +struct<(12345678912345.123456789123 / 1.2345678E-8):decimal(38,18)> +-- !query output +NULL + + -- !query select 1.0123456789012345678901234567890123456e36BD / 0.1 -- !query schema -struct<(1012345678901234567890123456789012345.6 / 0.1):decimal(38,6)> +struct<(1012345678901234567890123456789012345.6 / 0.1):decimal(38,2)> -- !query output NULL @@ -134,7 +412,7 @@ NULL -- !query select 1.0123456789012345678901234567890123456e35BD / 1.0 -- !query schema -struct<(101234567890123456789012345678901234.56 / 1.0):decimal(38,6)> +struct<(101234567890123456789012345678901234.56 / 1.0):decimal(38,3)> -- !query output NULL @@ -142,25 +420,25 @@ NULL -- !query select 1.0123456789012345678901234567890123456e34BD / 1.0 -- !query schema -struct<(10123456789012345678901234567890123.456 / 1.0):decimal(38,6)> +struct<(10123456789012345678901234567890123.456 / 1.0):decimal(38,3)> -- !query output -NULL +10123456789012345678901234567890123.456 -- !query select 1.0123456789012345678901234567890123456e33BD / 1.0 -- !query schema -struct<(1012345678901234567890123456789012.3456 / 1.0):decimal(38,6)> +struct<(1012345678901234567890123456789012.3456 / 1.0):decimal(38,4)> -- !query output -NULL +1012345678901234567890123456789012.3456 -- !query select 1.0123456789012345678901234567890123456e32BD / 1.0 -- !query schema -struct<(101234567890123456789012345678901.23456 / 1.0):decimal(38,6)> +struct<(101234567890123456789012345678901.23456 / 1.0):decimal(38,5)> -- !query output -NULL +101234567890123456789012345678901.23456 -- !query @@ -182,9 +460,9 @@ NULL 
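-- Not part of the generated golden file: a minimal sketch of the precision-loss
-- switch exercised by the results above, assuming the same non-ANSI session.
-- With allowPrecisionLoss=false Spark keeps the full result scale, so a product
-- or quotient whose exact value cannot fit in decimal(38, s) becomes NULL here
-- instead of having its scale relaxed.
set spark.sql.decimalOperations.allowPrecisionLoss=false;
select 10.300000000000000000 * 3.0000000000000000000;  -- needs decimal(38,37), does not fit: NULL
set spark.sql.decimalOperations.allowPrecisionLoss=true;
select 10.300000000000000000 * 3.0000000000000000000;  -- result type relaxed to decimal(38,34): 30.9000...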
-- !query select 1.0123456789012345678901234567890123456e31BD / 10.0 -- !query schema -struct<(10123456789012345678901234567890.123456 / 10.0):decimal(38,6)> +struct<(10123456789012345678901234567890.123456 / 10.0):decimal(38,7)> -- !query output -1012345678901234567890123456789.012346 +1012345678901234567890123456789.0123456 -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/double-quoted-identifiers-disabled.sql.out b/sql/core/src/test/resources/sql-tests/results/nonansi/double-quoted-identifiers.sql.out similarity index 100% rename from sql/core/src/test/resources/sql-tests/results/nonansi/double-quoted-identifiers-disabled.sql.out rename to sql/core/src/test/resources/sql-tests/results/nonansi/double-quoted-identifiers.sql.out diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/try_aggregates.sql.out.java21 b/sql/core/src/test/resources/sql-tests/results/nonansi/try_aggregates.sql.out.java21 new file mode 100644 index 0000000000000..7affe568234f9 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/nonansi/try_aggregates.sql.out.java21 @@ -0,0 +1,299 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT try_sum(col) FROM VALUES (5), (10), (15) AS tab(col) +-- !query schema +struct +-- !query output +30 + + +-- !query +SELECT try_sum(col) FROM VALUES (5.0), (10.0), (15.0) AS tab(col) +-- !query schema +struct +-- !query output +30.0 + + +-- !query +SELECT try_sum(col) FROM VALUES (NULL), (10), (15) AS tab(col) +-- !query schema +struct +-- !query output +25 + + +-- !query +SELECT try_sum(col) FROM VALUES (NULL), (NULL) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_sum(col) FROM VALUES (9223372036854775807L), (1L) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_sum(col) FROM VALUES (98765432109876543210987654321098765432BD), (98765432109876543210987654321098765432BD) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_sum(col) FROM VALUES (interval '1 months'), (interval '1 months') AS tab(col) +-- !query schema +struct +-- !query output +0-2 + + +-- !query +SELECT try_sum(col) FROM VALUES (interval '2147483647 months'), (interval '1 months') AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_sum(col) FROM VALUES (interval '1 seconds'), (interval '1 seconds') AS tab(col) +-- !query schema +struct +-- !query output +0 00:00:02.000000000 + + +-- !query +SELECT try_sum(col) FROM VALUES (interval '106751991 DAYS'), (interval '1 DAYS') AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_sum(col / 0) FROM VALUES (5), (10), (15) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_sum(col / 0) FROM VALUES (5.0), (10.0), (15.0) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_sum(col / 0) FROM VALUES (NULL), (10), (15) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_sum(col + 1L) FROM VALUES (9223372036854775807L), (1L) AS tab(col) +-- !query schema +struct +-- !query output +-9223372036854775806 + + +-- !query +SELECT try_sum(col / 0) FROM VALUES (interval '1 months'), (interval '1 months') AS tab(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "INTERVAL_DIVIDED_BY_ZERO", + "sqlState" : "22012", + "queryContext" : [ { + "objectType" : "", + "objectName" 
: "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} + + +-- !query +SELECT try_sum(col / 0) FROM VALUES (interval '1 seconds'), (interval '1 seconds') AS tab(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "INTERVAL_DIVIDED_BY_ZERO", + "sqlState" : "22012", + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} + + +-- !query +SELECT try_avg(col) FROM VALUES (5), (10), (15) AS tab(col) +-- !query schema +struct +-- !query output +10.0 + + +-- !query +SELECT try_avg(col) FROM VALUES (5.0), (10.0), (15.0) AS tab(col) +-- !query schema +struct +-- !query output +10.00000 + + +-- !query +SELECT try_avg(col) FROM VALUES (NULL), (10), (15) AS tab(col) +-- !query schema +struct +-- !query output +12.5 + + +-- !query +SELECT try_avg(col) FROM VALUES (NULL), (NULL) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_avg(col) FROM VALUES (9223372036854775807L), (1L) AS tab(col) +-- !query schema +struct +-- !query output +4.611686018427388E18 + + +-- !query +SELECT try_avg(col) FROM VALUES (98765432109876543210987654321098765432BD), (98765432109876543210987654321098765432BD) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_avg(col) FROM VALUES (interval '1 months'), (interval '1 months') AS tab(col) +-- !query schema +struct +-- !query output +0-1 + + +-- !query +SELECT try_avg(col) FROM VALUES (interval '2147483647 months'), (interval '1 months') AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_avg(col) FROM VALUES (interval '1 seconds'), (interval '1 seconds') AS tab(col) +-- !query schema +struct +-- !query output +0 00:00:01.000000000 + + +-- !query +SELECT try_avg(col) FROM VALUES (interval '106751991 DAYS'), (interval '1 DAYS') AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_avg(col / 0) FROM VALUES (5), (10), (15) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_avg(col / 0) FROM VALUES (5.0), (10.0), (15.0) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_avg(col / 0) FROM VALUES (NULL), (10), (15) AS tab(col) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_avg(col + 1L) FROM VALUES (9223372036854775807L), (1L) AS tab(col) +-- !query schema +struct +-- !query output +-4.611686018427388E18 + + +-- !query +SELECT try_avg(col / 0) FROM VALUES (interval '1 months'), (interval '1 months') AS tab(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "INTERVAL_DIVIDED_BY_ZERO", + "sqlState" : "22012", + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} + + +-- !query +SELECT try_avg(col / 0) FROM VALUES (interval '1 seconds'), (interval '1 seconds') AS tab(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "INTERVAL_DIVIDED_BY_ZERO", + "sqlState" : "22012", + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/try_arithmetic.sql.out.java21 
b/sql/core/src/test/resources/sql-tests/results/nonansi/try_arithmetic.sql.out.java21 new file mode 100644 index 0000000000000..002a0dfcf37ef --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/nonansi/try_arithmetic.sql.out.java21 @@ -0,0 +1,569 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT try_add(1, 1) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT try_add(2147483647, 1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_add(2147483647, decimal(1)) +-- !query schema +struct +-- !query output +2147483648 + + +-- !query +SELECT try_add(2147483647, "1") +-- !query schema +struct +-- !query output +2.147483648E9 + + +-- !query +SELECT try_add(-2147483648, -1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_add(9223372036854775807L, 1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_add(-9223372036854775808L, -1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_add(1, (2147483647 + 1)) +-- !query schema +struct +-- !query output +-2147483647 + + +-- !query +SELECT try_add(1L, (9223372036854775807L + 1L)) +-- !query schema +struct +-- !query output +-9223372036854775807 + + +-- !query +SELECT try_add(1, 1.0 / 0.0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_add(date'2021-01-01', 1) +-- !query schema +struct +-- !query output +2021-01-02 + + +-- !query +SELECT try_add(1, date'2021-01-01') +-- !query schema +struct +-- !query output +2021-01-02 + + +-- !query +SELECT try_add(date'2021-01-01', interval 2 year) +-- !query schema +struct +-- !query output +2023-01-01 + + +-- !query +SELECT try_add(date'2021-01-01', interval 2 second) +-- !query schema +struct +-- !query output +2021-01-01 00:00:02 + + +-- !query +SELECT try_add(interval 2 year, date'2021-01-01') +-- !query schema +struct +-- !query output +2023-01-01 + + +-- !query +SELECT try_add(interval 2 second, date'2021-01-01') +-- !query schema +struct +-- !query output +2021-01-01 00:00:02 + + +-- !query +SELECT try_add(timestamp_ltz'2021-01-01 00:00:00', interval 2 year) +-- !query schema +struct +-- !query output +2023-01-01 00:00:00 + + +-- !query +SELECT try_add(timestamp_ntz'2021-01-01 00:00:00', interval 2 second) +-- !query schema +struct +-- !query output +2021-01-01 00:00:02 + + +-- !query +SELECT try_add(interval 2 year, timestamp_ltz'2021-01-01 00:00:00') +-- !query schema +struct +-- !query output +2023-01-01 00:00:00 + + +-- !query +SELECT try_add(interval 2 second, timestamp_ntz'2021-01-01 00:00:00') +-- !query schema +struct +-- !query output +2021-01-01 00:00:02 + + +-- !query +SELECT try_add(interval 2 year, interval 2 year) +-- !query schema +struct +-- !query output +4-0 + + +-- !query +SELECT try_add(interval 2 second, interval 2 second) +-- !query schema +struct +-- !query output +0 00:00:04.000000000 + + +-- !query +SELECT try_add(interval 2 year, interval 2 second) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"INTERVAL '2' YEAR\"", + "inputType" : "\"INTERVAL YEAR\"", + "paramIndex" : "first", + "requiredType" : "\"(TIMESTAMP OR TIMESTAMP WITHOUT TIME ZONE)\"", + "sqlExpr" : "\"INTERVAL '2' YEAR + INTERVAL '02' SECOND\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 50, 
+ "fragment" : "try_add(interval 2 year, interval 2 second)" + } ] +} + + +-- !query +SELECT try_add(interval 2147483647 month, interval 2 month) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_add(interval 106751991 day, interval 3 day) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(1, 0.5) +-- !query schema +struct +-- !query output +2.000000 + + +-- !query +SELECT try_divide(1, 0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(0, 0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(1, (2147483647 + 1)) +-- !query schema +struct +-- !query output +-4.656612873077393E-10 + + +-- !query +SELECT try_divide(1L, (9223372036854775807L + 1L)) +-- !query schema +struct +-- !query output +-1.0842021724855044E-19 + + +-- !query +SELECT try_divide(1, 1.0 / 0.0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(1, decimal(0)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(1, "0") +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(interval 2 year, 2) +-- !query schema +struct +-- !query output +1-0 + + +-- !query +SELECT try_divide(interval 2 second, 2) +-- !query schema +struct +-- !query output +0 00:00:01.000000000 + + +-- !query +SELECT try_divide(interval 2 year, 0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(interval 2 second, 0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(interval 2147483647 month, 0.5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(interval 106751991 day, 0.5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_subtract(1, 1) +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT try_subtract(2147483647, -1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_subtract(2147483647, decimal(-1)) +-- !query schema +struct +-- !query output +2147483648 + + +-- !query +SELECT try_subtract(2147483647, "-1") +-- !query schema +struct +-- !query output +2.147483648E9 + + +-- !query +SELECT try_subtract(-2147483648, 1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_subtract(9223372036854775807L, -1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_subtract(-9223372036854775808L, 1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_subtract(1, (2147483647 + 1)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_subtract(1L, (9223372036854775807L + 1L)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_subtract(1, 1.0 / 0.0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_subtract(interval 2 year, interval 3 year) +-- !query schema +struct +-- !query output +-1-0 + + +-- !query +SELECT try_subtract(interval 3 second, interval 2 second) +-- !query schema +struct +-- !query output +0 00:00:01.000000000 + + +-- !query +SELECT try_subtract(interval 2147483647 month, interval -2 month) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_subtract(interval 106751991 day, interval -3 day) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_multiply(2, 3) +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT try_multiply(2147483647, -2) +-- !query 
schema +struct +-- !query output +NULL + + +-- !query +SELECT try_multiply(2147483647, decimal(-2)) +-- !query schema +struct +-- !query output +-4294967294 + + +-- !query +SELECT try_multiply(2147483647, "-2") +-- !query schema +struct +-- !query output +-4.294967294E9 + + +-- !query +SELECT try_multiply(-2147483648, 2) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_multiply(9223372036854775807L, 2) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_multiply(-9223372036854775808L, -2) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_multiply(1, (2147483647 + 1)) +-- !query schema +struct +-- !query output +-2147483648 + + +-- !query +SELECT try_multiply(1L, (9223372036854775807L + 1L)) +-- !query schema +struct +-- !query output +-9223372036854775808 + + +-- !query +SELECT try_multiply(1, 1.0 / 0.0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_multiply(interval 2 year, 2) +-- !query schema +struct +-- !query output +4-0 + + +-- !query +SELECT try_multiply(interval 2 second, 2) +-- !query schema +struct +-- !query output +0 00:00:04.000000000 + + +-- !query +SELECT try_multiply(interval 2 year, 0) +-- !query schema +struct +-- !query output +0-0 + + +-- !query +SELECT try_multiply(interval 2 second, 0) +-- !query schema +struct +-- !query output +0 00:00:00.000000000 + + +-- !query +SELECT try_multiply(interval 2147483647 month, 2) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_multiply(interval 106751991 day, 2) +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/try_aggregates.sql.out.java21 b/sql/core/src/test/resources/sql-tests/results/try_aggregates.sql.out.java21 index 7affe568234f9..9d3c97baecabd 100644 --- a/sql/core/src/test/resources/sql-tests/results/try_aggregates.sql.out.java21 +++ b/sql/core/src/test/resources/sql-tests/results/try_aggregates.sql.out.java21 @@ -82,33 +82,91 @@ NULL -- !query SELECT try_sum(col / 0) FROM VALUES (5), (10), (15) AS tab(col) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} -- !query SELECT try_sum(col / 0) FROM VALUES (5.0), (10.0), (15.0) AS tab(col) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} -- !query SELECT try_sum(col / 0) FROM VALUES (NULL), (10), (15) AS tab(col) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} -- !query SELECT try_sum(col + 1L) FROM VALUES (9223372036854775807L), (1L) AS tab(col) -- !query schema -struct +struct<> -- !query output --9223372036854775806 
+org.apache.spark.SparkArithmeticException +{ + "errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "long overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 23, + "fragment" : "col + 1L" + } ] +} -- !query @@ -232,33 +290,91 @@ NULL -- !query SELECT try_avg(col / 0) FROM VALUES (5), (10), (15) AS tab(col) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} -- !query SELECT try_avg(col / 0) FROM VALUES (5.0), (10.0), (15.0) AS tab(col) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} -- !query SELECT try_avg(col / 0) FROM VALUES (NULL), (10), (15) AS tab(col) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 22, + "fragment" : "col / 0" + } ] +} -- !query SELECT try_avg(col + 1L) FROM VALUES (9223372036854775807L), (1L) AS tab(col) -- !query schema -struct +struct<> -- !query output --4.611686018427388E18 +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "long overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 16, + "stopIndex" : 23, + "fragment" : "col + 1L" + } ] +} -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out.java21 b/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out.java21 index 002a0dfcf37ef..acf6e70a50dea 100644 --- a/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out.java21 +++ b/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out.java21 @@ -26,9 +26,9 @@ struct -- !query SELECT try_add(2147483647, "1") -- !query schema -struct +struct -- !query output -2.147483648E9 +2147483648 -- !query @@ -58,25 +58,71 @@ NULL -- !query SELECT try_add(1, (2147483647 + 1)) -- !query schema -struct +struct<> -- !query output --2147483647 +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "integer overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 20, + "stopIndex" : 33, + "fragment" : "2147483647 + 1" + } ] +} -- 
!query SELECT try_add(1L, (9223372036854775807L + 1L)) -- !query schema -struct +struct<> -- !query output --9223372036854775807 +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "long overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 21, + "stopIndex" : 45, + "fragment" : "9223372036854775807L + 1L" + } ] +} -- !query SELECT try_add(1, 1.0 / 0.0) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 19, + "stopIndex" : 27, + "fragment" : "1.0 / 0.0" + } ] +} -- !query @@ -244,25 +290,71 @@ NULL -- !query SELECT try_divide(1, (2147483647 + 1)) -- !query schema -struct +struct<> -- !query output --4.656612873077393E-10 +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "integer overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 23, + "stopIndex" : 36, + "fragment" : "2147483647 + 1" + } ] +} -- !query SELECT try_divide(1L, (9223372036854775807L + 1L)) -- !query schema -struct +struct<> -- !query output --1.0842021724855044E-19 +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "long overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 24, + "stopIndex" : 48, + "fragment" : "9223372036854775807L + 1L" + } ] +} -- !query SELECT try_divide(1, 1.0 / 0.0) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 22, + "stopIndex" : 30, + "fragment" : "1.0 / 0.0" + } ] +} -- !query @@ -356,9 +448,9 @@ struct -- !query SELECT try_subtract(2147483647, "-1") -- !query schema -struct +struct -- !query output -2.147483648E9 +2147483648 -- !query @@ -388,25 +480,71 @@ NULL -- !query SELECT try_subtract(1, (2147483647 + 1)) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "integer overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 25, + "stopIndex" : 38, + "fragment" : "2147483647 + 1" + } ] +} -- !query SELECT try_subtract(1L, (9223372036854775807L + 1L)) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + 
"errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "long overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 50, + "fragment" : "9223372036854775807L + 1L" + } ] +} -- !query SELECT try_subtract(1, 1.0 / 0.0) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 24, + "stopIndex" : 32, + "fragment" : "1.0 / 0.0" + } ] +} -- !query @@ -468,9 +606,9 @@ struct -- !query SELECT try_multiply(2147483647, "-2") -- !query schema -struct +struct -- !query output --4.294967294E9 +-4294967294 -- !query @@ -500,25 +638,71 @@ NULL -- !query SELECT try_multiply(1, (2147483647 + 1)) -- !query schema -struct +struct<> -- !query output --2147483648 +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "integer overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 25, + "stopIndex" : 38, + "fragment" : "2147483647 + 1" + } ] +} -- !query SELECT try_multiply(1L, (9223372036854775807L + 1L)) -- !query schema -struct +struct<> -- !query output --9223372036854775808 +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "ARITHMETIC_OVERFLOW", + "sqlState" : "22003", + "messageParameters" : { + "alternative" : " Use 'try_add' to tolerate overflow and return NULL instead.", + "config" : "\"spark.sql.ansi.enabled\"", + "message" : "long overflow" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 50, + "fragment" : "9223372036854775807L + 1L" + } ] +} -- !query SELECT try_multiply(1, 1.0 / 0.0) -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkArithmeticException +{ + "errorClass" : "DIVIDE_BY_ZERO", + "sqlState" : "22012", + "messageParameters" : { + "config" : "\"spark.sql.ansi.enabled\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 24, + "stopIndex" : 32, + "fragment" : "1.0 / 0.0" + } ] +} -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/union.sql.out b/sql/core/src/test/resources/sql-tests/results/union.sql.out index 3825470777bdb..d7db2163f8b87 100644 --- a/sql/core/src/test/resources/sql-tests/results/union.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/union.sql.out @@ -31,7 +31,7 @@ struct -- !query SELECT * -FROM (SELECT * FROM t1 +FROM (SELECT * FROM t1 where c1 = 1 UNION ALL SELECT * FROM t2 UNION ALL @@ -52,8 +52,8 @@ org.apache.spark.SparkNumberFormatException "objectType" : "", "objectName" : "", "startIndex" : 1, - "stopIndex" : 120, - "fragment" : "SELECT *\nFROM (SELECT * FROM t1\n UNION ALL\n SELECT * FROM t2\n UNION ALL\n SELECT * FROM t2)" + "stopIndex" : 133, + "fragment" : "SELECT *\nFROM (SELECT * FROM t1 where c1 = 1\n UNION ALL\n SELECT * FROM t2\n UNION ALL\n SELECT * FROM t2)" } ] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala index 170105200f1d1..a4e39df6c632d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} +import org.apache.spark.sql.types.{DataType, IntegerType, SQLUserDefinedType, UserDefinedType} import org.apache.spark.unsafe.types.UTF8String /** @@ -549,6 +550,56 @@ trait SQLInsertTestSuite extends QueryTest with SQLTestUtils with AdaptiveSparkP assert(reusedExchanges.size == 1) } } + + test("SPARK-50340: unwrap UDT before insertion") { + withTempView("v") { + Seq((1, MyInt(2))).toDF("c1", "c2").createTempView("v") + + withTable("t") { + createTable("t", Seq("c1", "c2"), Seq("int", "int")) + sql("INSERT INTO t SELECT * FROM v") + checkAnswer(spark.table("t"), Row(1, 2)) + } + + // can upcast UDT input + withTable("t") { + createTable("t", Seq("c1", "c2"), Seq("int", "long")) + sql("INSERT INTO t SELECT * FROM v") + checkAnswer(spark.table("t"), Row(1, 2L)) + } + + // Array of UDT + withTable("t") { + createTable("t", Seq("c1", "c2"), Seq("int", "array")) + sql("INSERT INTO t SELECT c1, array(c2) FROM v") + checkAnswer(spark.table("t"), Row(1, Seq(2))) + } + + // Map of UDT + withTable("t") { + createTable("t", Seq("c1", "c2"), Seq("int", "map")) + sql("INSERT INTO t SELECT c1, map(c2, c2) FROM v") + checkAnswer(spark.table("t"), Row(1, Map(2 -> 2))) + } + + // Struct of UDT + withTable("t") { + createTable("t", Seq("c1", "c2"), Seq("int", "struct")) + sql("INSERT INTO t SELECT c1, struct(c2 as f1, c2 as f2) FROM v") + checkAnswer(spark.table("t"), Row(1, Row(2, 2))) + } + } + } +} + +@SQLUserDefinedType(udt = classOf[MyIntUDT]) +private case class MyInt(i: Int) + +private class MyIntUDT extends UserDefinedType[MyInt] { + override def sqlType: DataType = IntegerType + override def serialize(obj: MyInt): Any = obj.i + override def deserialize(datum: Any): MyInt = MyInt(datum.asInstanceOf[Int]) + override def userClass: Class[MyInt] = classOf[MyInt] } class FileSourceSQLInsertTestSuite extends SQLInsertTestSuite with SharedSparkSession {