diff --git a/DEPS.bzl b/DEPS.bzl index 186347a3af101..8ad9f50eaa4f7 100644 --- a/DEPS.bzl +++ b/DEPS.bzl @@ -3420,6 +3420,14 @@ def go_deps(): sum = "h1:CZ7eSOd3kZoaYDLbXnmzgQI5RlciuXBMA+18HwHRfZQ=", version = "v1.12.0", ) + go_repository( + name = "com_github_spkg_bom", + build_file_proto_mode = "disable", + importpath = "github.com/spkg/bom", + sum = "h1:S939THe0ukL5WcTGiGqkgtaW5JW+O6ITaIlpJXTYY64=", + version = "v1.0.0", + ) + go_repository( name = "com_github_ssgreg_nlreturn_v2", build_file_proto_mode = "disable", diff --git a/br/pkg/lightning/mydump/BUILD.bazel b/br/pkg/lightning/mydump/BUILD.bazel index dccd93f84e7ce..6bf020e2feb6b 100644 --- a/br/pkg/lightning/mydump/BUILD.bazel +++ b/br/pkg/lightning/mydump/BUILD.bazel @@ -31,6 +31,7 @@ go_library( "//util/slice", "//util/table-filter", "@com_github_pingcap_errors//:errors", + "@com_github_spkg_bom//:bom", "@com_github_xitongsys_parquet_go//parquet", "@com_github_xitongsys_parquet_go//reader", "@com_github_xitongsys_parquet_go//source", diff --git a/br/pkg/lightning/mydump/parser.go b/br/pkg/lightning/mydump/parser.go index 512c3789cfa7f..0ac82ce189d71 100644 --- a/br/pkg/lightning/mydump/parser.go +++ b/br/pkg/lightning/mydump/parser.go @@ -32,6 +32,7 @@ import ( "github.com/pingcap/tidb/br/pkg/lightning/worker" "github.com/pingcap/tidb/parser/mysql" "github.com/pingcap/tidb/types" + "github.com/spkg/bom" "go.uber.org/zap" "go.uber.org/zap/zapcore" ) @@ -285,7 +286,13 @@ func (parser *blockParser) readBlock() error { parser.remainBuf.Write(parser.buf) parser.appendBuf.Reset() parser.appendBuf.Write(parser.remainBuf.Bytes()) - parser.appendBuf.Write(parser.blockBuf[:n]) + blockData := parser.blockBuf[:n] + if parser.pos == 0 { + bomCleanedData := bom.Clean(blockData) + parser.pos += int64(n - len(bomCleanedData)) + blockData = bomCleanedData + } + parser.appendBuf.Write(blockData) parser.buf = parser.appendBuf.Bytes() if parser.metrics != nil { parser.metrics.ChunkParserReadBlockSecondsHistogram.Observe(time.Since(startTime).Seconds()) diff --git a/br/pkg/lightning/mydump/reader.go b/br/pkg/lightning/mydump/reader.go index 4837b35aceab2..3735e97cb48ee 100644 --- a/br/pkg/lightning/mydump/reader.go +++ b/br/pkg/lightning/mydump/reader.go @@ -26,6 +26,7 @@ import ( "github.com/pingcap/tidb/br/pkg/lightning/log" "github.com/pingcap/tidb/br/pkg/lightning/worker" "github.com/pingcap/tidb/br/pkg/storage" + "github.com/spkg/bom" "go.uber.org/zap" "golang.org/x/text/encoding/simplifiedchinese" ) @@ -83,7 +84,7 @@ func ExportStatement(ctx context.Context, store storage.ExternalStorage, sqlFile } defer fd.Close() - br := bufio.NewReader(fd) + br := bufio.NewReader(bom.NewReader(fd)) data := make([]byte, 0, sqlFile.FileMeta.FileSize+1) buffer := make([]byte, 0, sqlFile.FileMeta.FileSize+1) diff --git a/br/tests/lightning_bom_file/config.toml b/br/tests/lightning_bom_file/config.toml new file mode 100644 index 0000000000000..291d1b166103a --- /dev/null +++ b/br/tests/lightning_bom_file/config.toml @@ -0,0 +1,2 @@ +[mydumper.csv] +header = true diff --git a/br/tests/lightning_bom_file/data/mytest.testtbl-schema.sql b/br/tests/lightning_bom_file/data/mytest.testtbl-schema.sql new file mode 100644 index 0000000000000..4232788898790 --- /dev/null +++ b/br/tests/lightning_bom_file/data/mytest.testtbl-schema.sql @@ -0,0 +1,5 @@ +CREATE TABLE testtbl ( + id INTEGER, + val1 VARCHAR(40) NOT NULL, + INDEX `idx_val1` (`val1`) +); diff --git a/br/tests/lightning_bom_file/data/mytest.testtbl.csv b/br/tests/lightning_bom_file/data/mytest.testtbl.csv new file mode 100644 index 0000000000000..e0931cce2a480 --- /dev/null +++ b/br/tests/lightning_bom_file/data/mytest.testtbl.csv @@ -0,0 +1,6 @@ +id,val1 +1,"aaa01" +2,"aaa01" +3,"aaa02" +4,"aaa02" +5,"aaa05" diff --git a/br/tests/lightning_bom_file/original_data/mytest.testtbl-schema.sql b/br/tests/lightning_bom_file/original_data/mytest.testtbl-schema.sql new file mode 100644 index 0000000000000..dc1e032a16618 --- /dev/null +++ b/br/tests/lightning_bom_file/original_data/mytest.testtbl-schema.sql @@ -0,0 +1,5 @@ +CREATE TABLE testtbl ( + id INTEGER, + val1 VARCHAR(40) NOT NULL, + INDEX `idx_val1` (`val1`) +); diff --git a/br/tests/lightning_bom_file/original_data/mytest.testtbl.csv b/br/tests/lightning_bom_file/original_data/mytest.testtbl.csv new file mode 100644 index 0000000000000..270c410cd79fd --- /dev/null +++ b/br/tests/lightning_bom_file/original_data/mytest.testtbl.csv @@ -0,0 +1,6 @@ +id,val1 +1,"aaa01" +2,"aaa01" +3,"aaa02" +4,"aaa02" +5,"aaa05" diff --git a/br/tests/lightning_bom_file/run.sh b/br/tests/lightning_bom_file/run.sh new file mode 100755 index 0000000000000..88eada54c74a9 --- /dev/null +++ b/br/tests/lightning_bom_file/run.sh @@ -0,0 +1,56 @@ +#!/bin/sh +# +# Copyright 2023 PingCAP, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eux + +mydir=$(dirname "${BASH_SOURCE[0]}") + +original_schema_file="${mydir}/original_data/mytest.testtbl-schema.sql" +original_data_file="${mydir}/original_data/mytest.testtbl.csv" +schema_file="${original_schema_file/original_data/data}" +data_file="${original_data_file/original_data/data}" + +# add the BOM header +printf '\xEF\xBB\xBF' | cat - <( sed '1s/^\xEF\xBB\xBF//' "${original_schema_file}" ) > "${schema_file}" +printf '\xEF\xBB\xBF' | cat - <( sed '1s/^\xEF\xBB\xBF//' "${original_data_file}" ) > "${data_file}" + +# verify the BOM header +if ! grep -q $'^\xEF\xBB\xBF' "${schema_file}"; then + echo "schema file doesn't contain the BOM header" >&2 + exit 1 +fi + +if ! grep -q $'^\xEF\xBB\xBF' "${data_file}"; then + echo "data file doesn't contain the BOM header" >&2 + exit 1 +fi + +row_count=$( sed '1d' "${data_file}" | wc -l | xargs echo ) + +run_lightning --backend tidb + +# Check that everything is correctly imported +run_sql 'SELECT count(*) FROM mytest.testtbl' +check_contains "count(*): ${row_count}" + +check_cluster_version 4 0 0 'local backend' || exit 0 +run_sql "DROP TABLE mytest.testtbl" + +run_lightning --backend local + +# Check that everything is correctly imported +run_sql 'SELECT count(*) FROM mytest.testtbl' +check_contains "count(*): ${row_count}" diff --git a/go.mod b/go.mod index 8babb1b852f2a..afcf4f493ad0a 100644 --- a/go.mod +++ b/go.mod @@ -84,7 +84,13 @@ require ( github.com/soheilhy/cmux v0.1.5 github.com/spf13/cobra v1.6.1 github.com/spf13/pflag v1.0.5 +<<<<<<< HEAD github.com/stretchr/testify v1.8.0 +======= + github.com/spkg/bom v1.0.0 + github.com/stathat/consistent v1.0.0 + github.com/stretchr/testify v1.8.1 +>>>>>>> e11cbec12b (lightning: support data files with bom header (#40813)) github.com/tdakkota/asciicheck v0.1.1 github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2 github.com/tikv/client-go/v2 v2.0.3 diff --git a/go.sum b/go.sum index 81137a64e2707..dc1c08c1dbb01 100644 --- a/go.sum +++ b/go.sum @@ -906,6 +906,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= github.com/spf13/viper v1.7.0/go.mod h1:8WkrPz2fc9jxqZNCJI/76HCieCp4Q8HaLFoCha5qpdg= +github.com/spkg/bom v1.0.0 h1:S939THe0ukL5WcTGiGqkgtaW5JW+O6ITaIlpJXTYY64= +github.com/spkg/bom v1.0.0/go.mod h1:lAz2VbTuYNcvs7iaFF8WW0ufXrHShJ7ck1fYFFbVXJs= github.com/stathat/consistent v1.0.0 h1:ZFJ1QTRn8npNBKW065raSZ8xfOqhpb8vLOkfp4CcL/U= github.com/stathat/consistent v1.0.0/go.mod h1:uajTPbgSygZBJ+V+0mY7meZ8i0XAcZs7AQ6V121XSxw= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=