Skip to content

Commit

Permalink
Support config regex to skip specified line (#43)
Browse files Browse the repository at this point in the history
Implement #42

Change-Id: I08fd43a65706f9a402b2ac328a3a5c034b8445df
  • Loading branch information
Linary authored and zhoney committed Mar 18, 2019
1 parent c67d89a commit 505d91b
Show file tree
Hide file tree
Showing 9 changed files with 73 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -123,16 +123,16 @@ protected Line fetch() {
return null;
}

// Skip the comment line
if (this.isCommentLine(rawLine)) {
// Skip the line if it matches the specified regex
if (this.needSkipLine(rawLine)) {
return this.fetch();
} else {
return this.parser.parse(rawLine);
}
}

private boolean isCommentLine(String line) {
return this.source.commentSymbols().stream().anyMatch(line::startsWith);
/**
 * Decides whether a raw line should be skipped rather than parsed.
 *
 * @param line the raw line handed over by {@code fetch()}
 * @return {@code true} if the whole line matches the regex returned by
 *         {@code source.skippedLineRegex()}
 */
private boolean needSkipLine(String line) {
    // String#matches tests the entire line against the pattern, not a substring
    final String skipRegex = this.source.skippedLineRegex();
    return line.matches(skipRegex);
}

private boolean isDuplicateHeader(String line) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@
package com.baidu.hugegraph.loader.source.file;

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.baidu.hugegraph.loader.source.InputSource;
import com.baidu.hugegraph.loader.source.SourceType;
Expand All @@ -32,6 +30,7 @@ public class FileSource implements InputSource {

private static final String DEFAULT_CHARSET = "UTF-8";
private static final String DEFAULT_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
private static final String DEFAULT_SKIPPED_LINE_REGEX = "";

@JsonProperty("path")
private String path;
Expand All @@ -45,16 +44,16 @@ public class FileSource implements InputSource {
private String charset;
@JsonProperty("date_format")
private String dateFormat;
@JsonProperty("skipped_line_regex")
private String skippedLineRegex;
@JsonProperty("compression")
private Compression compression;
@JsonProperty("comment_symbols")
private Set<String> commentSymbols;

public FileSource() {
this.charset = DEFAULT_CHARSET;
this.dateFormat = DEFAULT_DATE_FORMAT;
this.skippedLineRegex = DEFAULT_SKIPPED_LINE_REGEX;
this.compression = Compression.NONE;
this.commentSymbols = new HashSet<>();
}

@Override
Expand Down Expand Up @@ -90,13 +89,12 @@ public String dateFormat() {
return this.dateFormat;
}

public Compression compression() {
return this.compression;
/**
 * Returns the regex used to decide which input lines are skipped.
 * The value is bound to the {@code skipped_line_regex} JSON property and is
 * initialized to {@code DEFAULT_SKIPPED_LINE_REGEX} by the no-arg constructor.
 *
 * @return the skipped-line regex
 */
public String skippedLineRegex() {
return this.skippedLineRegex;
}

public Set<String> commentSymbols() {
assert this.commentSymbols != null;
return Collections.unmodifiableSet(this.commentSymbols);
/**
 * Returns the compression setting of this file source.
 * The value is bound to the {@code compression} JSON property and is
 * initialized to {@code Compression.NONE} by the no-arg constructor.
 *
 * @return the compression setting
 */
public Compression compression() {
return this.compression;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -590,12 +590,13 @@ public void testIgnoreNullValueColumns() {
}

@Test
public void testFileOnlyHasAnEmptyLine() {
ioUtil.write("vertex_person_empty.csv", "");
public void testFileNoHeader() {
ioUtil.write("vertex_person.csv",
"marko,29,Beijing");

String[] args = new String[]{
"-f", configPath("file_only_has_empty_line/struct.json"),
"-s", configPath("file_only_has_empty_line/schema.groovy"),
"-f", configPath("file_no_header/struct.json"),
"-s", configPath("file_no_header/schema.groovy"),
"-g", GRAPH,
"-h", SERVER,
"--test-mode", "true"
Expand Down Expand Up @@ -628,18 +629,39 @@ public void testMultiFilesHaveHeader() {
}

@Test
public void testFileHasCommentLine() {
public void testFileHasEmptyLine() {
    // Header row, two data rows, and one blank line between the data rows
    ioUtil.write("vertex_person.csv",
                 "name,age,city",
                 "marko,29,#Beijing",
                 "",
                 "vadas,27,//Hongkong");

    String structPath = configPath("file_has_empty_line/struct.json");
    String schemaPath = configPath("file_has_empty_line/schema.groovy");

    HugeGraphLoader.main(new String[]{
            "-f", structPath,
            "-s", schemaPath,
            "-g", GRAPH,
            "-h", SERVER,
            "--test-mode", "true"
    });

    // Only the two data rows are expected to be loaded as vertices
    int loadedCount = CLIENT.graph().listVertices().size();
    Assert.assertEquals(2, loadedCount);
}

@Test
public void testFileHasSkippedLine() {
ioUtil.write("vertex_person.csv",
"name,age,city",
"# This is a comment",
"marko,29,Beijing",
"marko,29,#Beijing",
"// This is also a comment",
"# This is still a comment",
"vadas,27,Hongkong");
"vadas,27,//Hongkong");

String[] args = new String[]{
"-f", configPath("file_has_comment_line/struct.json"),
"-s", configPath("file_has_comment_line/schema.groovy"),
"-f", configPath("file_has_skipped_line/struct.json"),
"-s", configPath("file_has_skipped_line/schema.groovy"),
"-g", GRAPH,
"-h", SERVER,
"--test-mode", "true"
Expand Down
18 changes: 18 additions & 0 deletions src/test/resources/file_has_empty_line/struct.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"vertices": [
{
"label": "person",
"input": {
"type": "${source_type}",
"path": "${store_path}/vertex_person.csv",
"format": "CSV",
"charset": "UTF-8"
},
"mapping": {
"name": "name",
"age": "age",
"city": "city"
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"path": "${store_path}/vertex_person.csv",
"format": "CSV",
"charset": "UTF-8",
"comment_symbols": ["#", "//"]
"skipped_line_regex": "(^#|^//).*"
},
"mapping": {
"name": "name",
Expand Down
10 changes: 10 additions & 0 deletions src/test/resources/file_no_header/schema.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Schema for the file_no_header test case.

// Property keys
schema.propertyKey("name").asText().ifNotExist().create();
schema.propertyKey("age").asInt().ifNotExist().create();
schema.propertyKey("city").asText().ifNotExist().create();
schema.propertyKey("weight").asDouble().ifNotExist().create();
schema.propertyKey("date").asText().ifNotExist().create();

// Vertex label "person", keyed by (name, city)
schema.vertexLabel("person")
      .properties("name", "age", "city")
      .primaryKeys("name", "city")
      .ifNotExist()
      .create();

// Edge label "knows" linking person -> person
schema.edgeLabel("knows")
      .sourceLabel("person")
      .targetLabel("person")
      .properties("date", "weight")
      .ifNotExist()
      .create();
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"label": "person",
"input": {
"type": "${source_type}",
"path": "src/test/resources/vertex_person_empty.csv",
"path": "src/test/resources/vertex_person.csv",
"format": "CSV",
"charset": "UTF-8"
}
Expand Down

0 comments on commit 505d91b

Please sign in to comment.