Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lightning: support data files with bom header #40813

Merged
merged 5 commits into from
Jan 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions DEPS.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -3896,6 +3896,14 @@ def go_deps():
sum = "h1:CZ7eSOd3kZoaYDLbXnmzgQI5RlciuXBMA+18HwHRfZQ=",
version = "v1.12.0",
)
go_repository(
name = "com_github_spkg_bom",
build_file_proto_mode = "disable",
importpath = "github.com/spkg/bom",
sum = "h1:S939THe0ukL5WcTGiGqkgtaW5JW+O6ITaIlpJXTYY64=",
version = "v1.0.0",
)

go_repository(
name = "com_github_ssgreg_nlreturn_v2",
build_file_proto_mode = "disable",
Expand Down
1 change: 1 addition & 0 deletions br/pkg/lightning/mydump/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ go_library(
"//util/slice",
"//util/table-filter",
"@com_github_pingcap_errors//:errors",
"@com_github_spkg_bom//:bom",
"@com_github_xitongsys_parquet_go//parquet",
"@com_github_xitongsys_parquet_go//reader",
"@com_github_xitongsys_parquet_go//source",
Expand Down
9 changes: 8 additions & 1 deletion br/pkg/lightning/mydump/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/pingcap/tidb/br/pkg/lightning/worker"
"github.com/pingcap/tidb/parser/mysql"
"github.com/pingcap/tidb/types"
"github.com/spkg/bom"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
Expand Down Expand Up @@ -285,7 +286,13 @@ func (parser *blockParser) readBlock() error {
parser.remainBuf.Write(parser.buf)
parser.appendBuf.Reset()
parser.appendBuf.Write(parser.remainBuf.Bytes())
parser.appendBuf.Write(parser.blockBuf[:n])
blockData := parser.blockBuf[:n]
if parser.pos == 0 {
bomCleanedData := bom.Clean(blockData)
parser.pos += int64(n - len(bomCleanedData))
blockData = bomCleanedData
}
parser.appendBuf.Write(blockData)
parser.buf = parser.appendBuf.Bytes()
if parser.metrics != nil {
parser.metrics.ChunkParserReadBlockSecondsHistogram.Observe(time.Since(startTime).Seconds())
Expand Down
3 changes: 2 additions & 1 deletion br/pkg/lightning/mydump/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/pingcap/tidb/br/pkg/lightning/log"
"github.com/pingcap/tidb/br/pkg/lightning/worker"
"github.com/pingcap/tidb/br/pkg/storage"
"github.com/spkg/bom"
"go.uber.org/zap"
"golang.org/x/text/encoding/simplifiedchinese"
)
Expand Down Expand Up @@ -83,7 +84,7 @@ func ExportStatement(ctx context.Context, store storage.ExternalStorage, sqlFile
}
defer fd.Close()

br := bufio.NewReader(fd)
br := bufio.NewReader(bom.NewReader(fd))

data := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
buffer := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
Expand Down
2 changes: 2 additions & 0 deletions br/tests/lightning_bom_file/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[mydumper.csv]
header = true
5 changes: 5 additions & 0 deletions br/tests/lightning_bom_file/data/mytest.testtbl-schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
CREATE TABLE testtbl (
id INTEGER,
val1 VARCHAR(40) NOT NULL,
INDEX `idx_val1` (`val1`)
);
6 changes: 6 additions & 0 deletions br/tests/lightning_bom_file/data/mytest.testtbl.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,val1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe check the BOM really exists in run.sh 😂

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I remember the BOM header cannot be presented properly in the text editor...Maybe it's just ignored similarly by GitHub

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it has no visual output, but we can use xxd or something tools to check it. I'm afraid that the BOM header will be deleted accidentally in future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

addressed in 9749703

1,"aaa01"
2,"aaa01"
3,"aaa02"
4,"aaa02"
5,"aaa05"
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
CREATE TABLE testtbl (
id INTEGER,
val1 VARCHAR(40) NOT NULL,
INDEX `idx_val1` (`val1`)
);
6 changes: 6 additions & 0 deletions br/tests/lightning_bom_file/original_data/mytest.testtbl.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,val1
1,"aaa01"
2,"aaa01"
3,"aaa02"
4,"aaa02"
5,"aaa05"
56 changes: 56 additions & 0 deletions br/tests/lightning_bom_file/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/sh
#
# Copyright 2023 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -eux

mydir=$(dirname "${BASH_SOURCE[0]}")

original_schema_file="${mydir}/original_data/mytest.testtbl-schema.sql"
original_data_file="${mydir}/original_data/mytest.testtbl.csv"
schema_file="${original_schema_file/original_data/data}"
data_file="${original_data_file/original_data/data}"

# add the BOM header
printf '\xEF\xBB\xBF' | cat - <( sed '1s/^\xEF\xBB\xBF//' "${original_schema_file}" ) > "${schema_file}"
printf '\xEF\xBB\xBF' | cat - <( sed '1s/^\xEF\xBB\xBF//' "${original_data_file}" ) > "${data_file}"

# verify the BOM header
if ! grep -q $'^\xEF\xBB\xBF' "${schema_file}"; then
echo "schema file doesn't contain the BOM header" >&2
exit 1
fi

if ! grep -q $'^\xEF\xBB\xBF' "${data_file}"; then
echo "data file doesn't contain the BOM header" >&2
exit 1
fi

row_count=$( sed '1d' "${data_file}" | wc -l | xargs echo )

run_lightning --backend tidb

# Check that everything is correctly imported
run_sql 'SELECT count(*) FROM mytest.testtbl'
check_contains "count(*): ${row_count}"

check_cluster_version 4 0 0 'local backend' || exit 0
run_sql "DROP TABLE mytest.testtbl"

run_lightning --backend local

# Check that everything is correctly imported
run_sql 'SELECT count(*) FROM mytest.testtbl'
check_contains "count(*): ${row_count}"
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ require (
github.com/soheilhy/cmux v0.1.5
github.com/spf13/cobra v1.6.1
github.com/spf13/pflag v1.0.5
github.com/spkg/bom v1.0.0
github.com/stathat/consistent v1.0.0
github.com/stretchr/testify v1.8.1
github.com/tdakkota/asciicheck v0.1.1
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1278,6 +1278,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
github.com/spf13/viper v1.7.0/go.mod h1:8WkrPz2fc9jxqZNCJI/76HCieCp4Q8HaLFoCha5qpdg=
github.com/spkg/bom v1.0.0 h1:S939THe0ukL5WcTGiGqkgtaW5JW+O6ITaIlpJXTYY64=
github.com/spkg/bom v1.0.0/go.mod h1:lAz2VbTuYNcvs7iaFF8WW0ufXrHShJ7ck1fYFFbVXJs=
github.com/stathat/consistent v1.0.0 h1:ZFJ1QTRn8npNBKW065raSZ8xfOqhpb8vLOkfp4CcL/U=
github.com/stathat/consistent v1.0.0/go.mod h1:uajTPbgSygZBJ+V+0mY7meZ8i0XAcZs7AQ6V121XSxw=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
Expand Down