From 8516c81e4a42ae732f921b16ff4dfb8c5e617cc2 Mon Sep 17 00:00:00 2001 From: liningrui Date: Mon, 18 Mar 2019 11:54:32 +0800 Subject: [PATCH] Support config regex to skip specified line Implement #42 Change-Id: Ib27d2cfdaa05599761024ddd5c913e47cccb6eb6 --- .../loader/reader/file/AbstractFileReader.java | 8 ++++---- .../loader/source/file/FileSource.java | 18 ++++++++---------- .../loader/test/functional/FileLoadTest.java | 10 +++++----- .../schema.groovy | 0 .../struct.json | 2 +- 5 files changed, 18 insertions(+), 20 deletions(-) rename src/test/resources/{file_has_comment_line => file_has_skipped_line}/schema.groovy (100%) rename src/test/resources/{file_has_comment_line => file_has_skipped_line}/struct.json (88%) diff --git a/src/main/java/com/baidu/hugegraph/loader/reader/file/AbstractFileReader.java b/src/main/java/com/baidu/hugegraph/loader/reader/file/AbstractFileReader.java index 99407267c..66b0393bf 100644 --- a/src/main/java/com/baidu/hugegraph/loader/reader/file/AbstractFileReader.java +++ b/src/main/java/com/baidu/hugegraph/loader/reader/file/AbstractFileReader.java @@ -123,16 +123,16 @@ protected Line fetch() { return null; } - // Skip the comment line - if (this.isCommentLine(rawLine)) { + // Skip the line matched specified regex + if (this.needSkipLine(rawLine)) { return this.fetch(); } else { return this.parser.parse(rawLine); } } - private boolean isCommentLine(String line) { - return this.source.commentSymbols().stream().anyMatch(line::startsWith); + private boolean needSkipLine(String line) { + return line.matches(this.source.skippedLineRegex()); } private boolean isDuplicateHeader(String line) { diff --git a/src/main/java/com/baidu/hugegraph/loader/source/file/FileSource.java b/src/main/java/com/baidu/hugegraph/loader/source/file/FileSource.java index 843929fe8..7ed1b24c2 100644 --- a/src/main/java/com/baidu/hugegraph/loader/source/file/FileSource.java +++ b/src/main/java/com/baidu/hugegraph/loader/source/file/FileSource.java @@ -20,9 +20,7 @@ package com.baidu.hugegraph.loader.source.file; import java.util.Collections; -import java.util.HashSet; import java.util.List; -import java.util.Set; import com.baidu.hugegraph.loader.source.InputSource; import com.baidu.hugegraph.loader.source.SourceType; @@ -32,6 +30,7 @@ public class FileSource implements InputSource { private static final String DEFAULT_CHARSET = "UTF-8"; private static final String DEFAULT_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss"; + private static final String DEFAULT_SKIPPED_LINE_REGEX = ""; @JsonProperty("path") private String path; @@ -45,16 +44,16 @@ public class FileSource implements InputSource { private String charset; @JsonProperty("date_format") private String dateFormat; + @JsonProperty("skipped_line_regex") + private String skippedLineRegex; @JsonProperty("compression") private Compression compression; - @JsonProperty("comment_symbols") - private Set commentSymbols; public FileSource() { this.charset = DEFAULT_CHARSET; this.dateFormat = DEFAULT_DATE_FORMAT; + this.skippedLineRegex = DEFAULT_SKIPPED_LINE_REGEX; this.compression = Compression.NONE; - this.commentSymbols = new HashSet<>(); } @Override @@ -90,13 +89,12 @@ public String dateFormat() { return this.dateFormat; } - public Compression compression() { - return this.compression; + public String skippedLineRegex() { + return this.skippedLineRegex; } - public Set commentSymbols() { - assert this.commentSymbols != null; - return Collections.unmodifiableSet(this.commentSymbols); + public Compression compression() { + return this.compression; } @Override diff --git a/src/test/java/com/baidu/hugegraph/loader/test/functional/FileLoadTest.java b/src/test/java/com/baidu/hugegraph/loader/test/functional/FileLoadTest.java index 2c3e4a953..01fda9cb1 100644 --- a/src/test/java/com/baidu/hugegraph/loader/test/functional/FileLoadTest.java +++ b/src/test/java/com/baidu/hugegraph/loader/test/functional/FileLoadTest.java @@ -628,18 +628,18 @@ public void testMultiFilesHaveHeader() { } @Test - public void testFileHasCommentLine() { + public void testFileHasSkipLine() { ioUtil.write("vertex_person.csv", "name,age,city", "# This is a comment", - "marko,29,Beijing", + "marko,29,#Beijing", "// This is also a comment", "# This is still a comment", - "vadas,27,Hongkong"); + "vadas,27,//Hongkong"); String[] args = new String[]{ - "-f", configPath("file_has_comment_line/struct.json"), - "-s", configPath("file_has_comment_line/schema.groovy"), + "-f", configPath("file_has_skipped_line/struct.json"), + "-s", configPath("file_has_skipped_line/schema.groovy"), "-g", GRAPH, "-h", SERVER, "--test-mode", "true" diff --git a/src/test/resources/file_has_comment_line/schema.groovy b/src/test/resources/file_has_skipped_line/schema.groovy similarity index 100% rename from src/test/resources/file_has_comment_line/schema.groovy rename to src/test/resources/file_has_skipped_line/schema.groovy diff --git a/src/test/resources/file_has_comment_line/struct.json b/src/test/resources/file_has_skipped_line/struct.json similarity index 88% rename from src/test/resources/file_has_comment_line/struct.json rename to src/test/resources/file_has_skipped_line/struct.json index 79fdfbb62..20b183778 100644 --- a/src/test/resources/file_has_comment_line/struct.json +++ b/src/test/resources/file_has_skipped_line/struct.json @@ -7,7 +7,7 @@ "path": "${store_path}/vertex_person.csv", "format": "CSV", "charset": "UTF-8", - "comment_symbols": ["#", "//"] + "skipped_line_regex": "(^#|^//).*" }, "mapping": { "name": "name",