Implemented CSVCodec for S3 Source, config & unit tests #1644
Merged
dlvenable merged 4 commits into opensearch-project:main from finnroblin:csv-codec-implementation-2 on Aug 9, 2022

Commits
eea0d1f  Implemented CSVCodec for S3 Source, config & unit tests (finnroblin)
cfb92fa  Addressed Travis's feedback (finnroblin)
8fd9473  Addressed David's feedback & fixed NPE if header is null (finnroblin)
89ee226  Renamed all instances of CSVProcessor to CsvProcessor (and offshoots …) (finnroblin)
143 changes: 143 additions & 0 deletions
...plugins/s3-source/src/main/java/com/amazon/dataprepper/plugins/source/codec/CsvCodec.java
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package com.amazon.dataprepper.plugins.source.codec;

import com.amazon.dataprepper.model.annotations.DataPrepperPlugin;
import com.amazon.dataprepper.model.annotations.DataPrepperPluginConstructor;
import com.amazon.dataprepper.model.event.Event;
import com.amazon.dataprepper.model.log.JacksonLog;
import com.amazon.dataprepper.model.record.Record;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvReadException;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Consumer;

@DataPrepperPlugin(name = "csv", pluginType = Codec.class, pluginConfigurationType = CsvCodecConfig.class)
public class CsvCodec implements Codec {
    private static final Logger LOG = LoggerFactory.getLogger(CsvCodec.class);
    private final CsvCodecConfig config;

    @DataPrepperPluginConstructor
    public CsvCodec(final CsvCodecConfig config) {
        Objects.requireNonNull(config);
        this.config = config;
    }

    @Override
    public void parse(final InputStream inputStream, final Consumer<Record<Event>> eventConsumer) throws IOException {
        try (final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
            parseBufferedReader(reader, eventConsumer);
        }
    }

    private void parseBufferedReader(final BufferedReader reader, final Consumer<Record<Event>> eventConsumer) throws IOException {
        final CsvMapper mapper = createCsvMapper();
        final CsvSchema schema;
        if (config.isDetectHeader()) {
            schema = createAutodetectHeaderCsvSchema();
        }
        else {
            final int numberColumnsFirstLine = getNumberOfColumnsByMarkingBeginningOfInputStreamAndResettingReaderAfter(reader);
            schema = createCsvSchemaFromConfig(numberColumnsFirstLine);
        }

        MappingIterator<Map<String, String>> parsingIterator = mapper.readerFor(Map.class).with(schema).readValues(reader);
        try {
            while (parsingIterator.hasNextValue()) {
                readCsvLine(parsingIterator, eventConsumer);
            }
        } catch (Exception jsonExceptionOnHasNextLine) {
            LOG.error("An Exception occurred while determining if file has next line ", jsonExceptionOnHasNextLine);
        }
    }

    private int getNumberOfColumnsByMarkingBeginningOfInputStreamAndResettingReaderAfter(BufferedReader reader) throws IOException {
        final int defaultBufferSize = 8192; // sufficiently large to hold even a thousand-column header line
        reader.mark(defaultBufferSize); // calling reader.readLine() will consume the first line, so mark the initial position to reset to afterward
        final int firstLineNumberColumns = extractNumberOfColumnsFromFirstLine(reader.readLine());
        reader.reset(); // move the reader pointer back to the beginning of the file so the first line is reread as data
        return firstLineNumberColumns;
    }

    private void readCsvLine(final MappingIterator<Map<String, String>> parsingIterator, final Consumer<Record<Event>> eventConsumer) {
        try {
            final Map<String, String> parsedLine = parsingIterator.nextValue();

            final Event event = JacksonLog.builder()
                    .withData(parsedLine)
                    .build();
            eventConsumer.accept(new Record<>(event));
        } catch (final CsvReadException csvException) {
            LOG.error("Invalid CSV row, skipping this line. This typically means the row has too many columns. Consider using the CSV " +
                    "Processor if there might be inconsistencies in the number of columns because it is more flexible. Error ",
                    csvException);
        } catch (JsonParseException jsonException) {
            // review comment (nit): missing `final`
            LOG.error("A JsonParseException occurred on a row of the CSV file, skipping line. This typically means a quote character was " +
                    "not properly closed. Error ", jsonException);
        } catch (final Exception e) {
            LOG.error("An Exception occurred while reading a row of the CSV file. Error ", e);
        }
    }

    private int extractNumberOfColumnsFromFirstLine(final String firstLine) {
        int numberOfSeparators = 0;
        for (int charPointer = 0; charPointer < firstLine.length(); charPointer++) {
            if (firstLine.charAt(charPointer) == config.getDelimiter().charAt(0)) {
                numberOfSeparators++;
            }
        }
        return numberOfSeparators + 1;
    }

    private CsvSchema createCsvSchemaFromConfig(final int firstLineSize) {
        final List<String> userSpecifiedHeader = Objects.isNull(config.getHeader()) ? new ArrayList<>() : config.getHeader();
        final List<String> actualHeader = new ArrayList<>();
        final char delimiter = config.getDelimiter().charAt(0);
        final char quoteCharacter = config.getQuoteCharacter().charAt(0);
        int providedHeaderColIdx = 0;
        // Use as many user-specified column names as fit the first line's width...
        for (; providedHeaderColIdx < userSpecifiedHeader.size() && providedHeaderColIdx < firstLineSize; providedHeaderColIdx++) {
            actualHeader.add(userSpecifiedHeader.get(providedHeaderColIdx));
        }
        // ...then autogenerate names for any remaining columns.
        for (int remainingColIdx = providedHeaderColIdx; remainingColIdx < firstLineSize; remainingColIdx++) {
            actualHeader.add(generateColumnHeader(remainingColIdx));
        }
        CsvSchema.Builder headerBuilder = CsvSchema.builder();
        for (String columnName : actualHeader) {
            headerBuilder = headerBuilder.addColumn(columnName);
        }
        CsvSchema schema = headerBuilder.build().withColumnSeparator(delimiter).withQuoteChar(quoteCharacter);
        return schema;
    }

    private String generateColumnHeader(final int columnNumber) {
        final int displayColumnNumber = columnNumber + 1; // autogenerated column names are numbered from 1, not 0
        return "column" + displayColumnNumber;
    }

    private CsvMapper createCsvMapper() {
        final CsvMapper mapper = new CsvMapper();
        return mapper;
    }

    private CsvSchema createAutodetectHeaderCsvSchema() {
        final CsvSchema schema = CsvSchema.emptySchema().withColumnSeparator(config.getDelimiter().charAt(0))
                .withQuoteChar(config.getQuoteCharacter().charAt(0)).withHeader();
        return schema;
    }
}
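
For reviewers who want to exercise the codec locally, here is a minimal usage sketch, not part of this PR. It assumes CsvCodecConfig deserializes from JSON via Jackson (it is a plain @JsonProperty-annotated POJO) and that Record#getData() and Event#toJsonString() are available from the Data Prepper model; the demo class name and sample data are hypothetical.

package com.amazon.dataprepper.plugins.source.codec;

import com.amazon.dataprepper.model.event.Event;
import com.amazon.dataprepper.model.record.Record;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class CsvCodecDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical config: explicit two-column header, so header autodetection is off.
        final String configJson = "{\"detect_header\": false, \"header\": [\"name\", \"age\"]}";
        final CsvCodecConfig config = new ObjectMapper().readValue(configJson, CsvCodecConfig.class);

        final CsvCodec codec = new CsvCodec(config);
        final InputStream csv = new ByteArrayInputStream(
                "alice,30\nbob,25\n".getBytes(StandardCharsets.UTF_8));

        // Each CSV row becomes one Record<Event> handed to the consumer.
        final List<Record<Event>> records = new ArrayList<>();
        codec.parse(csv, records::add);

        for (final Record<Event> record : records) {
            // Expected shape: {"name":"alice","age":"30"} then {"name":"bob","age":"25"}
            System.out.println(record.getData().toJsonString());
        }
    }
}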
67 changes: 67 additions & 0 deletions
...s/s3-source/src/main/java/com/amazon/dataprepper/plugins/source/codec/CsvCodecConfig.java
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package com.amazon.dataprepper.plugins.source.codec;

import com.fasterxml.jackson.annotation.JsonProperty;
import jakarta.validation.constraints.AssertTrue;

import java.util.List;
import java.util.Objects;

public class CsvCodecConfig {
    static final String DEFAULT_DELIMITER = ",";
    static final String DEFAULT_QUOTE_CHARACTER = "\""; // double quote
    static final Boolean DEFAULT_DETECT_HEADER = true;

    @JsonProperty("delimiter")
    private String delimiter = DEFAULT_DELIMITER;

    @JsonProperty("quote_character")
    private String quoteCharacter = DEFAULT_QUOTE_CHARACTER;

    @JsonProperty("header")
    private List<String> header;

    @JsonProperty("detect_header")
    private Boolean detectHeader = DEFAULT_DETECT_HEADER;

    public String getDelimiter() {
        return delimiter;
    }

    public String getQuoteCharacter() {
        return quoteCharacter;
    }

    public List<String> getHeader() {
        return header;
    }

    public Boolean isDetectHeader() {
        return detectHeader;
    }

    @AssertTrue(message = "delimiter must be exactly one character.")
    boolean isValidDelimiter() {
        return delimiter.length() == 1;
    }

    @AssertTrue(message = "quote_character must be exactly one character.")
    boolean isValidQuoteCharacter() {
        return quoteCharacter.length() == 1;
    }

    @AssertTrue(message = "quote_character and delimiter cannot be the same character.")
    boolean areDelimiterAndQuoteCharacterDifferent() {
        return !(delimiter.equals(quoteCharacter));
    }

    @AssertTrue(message = "header must not be an empty list. To autogenerate columns, set detect_header: false and delete header " +
            "from config.")
    boolean isValidHeader() {
        return Objects.isNull(header) || header.size() > 0;
    }
}
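
The @AssertTrue checks above only fire when the config object goes through a Bean Validation pass, which the Data Prepper plugin framework presumably applies to pluginConfigurationType instances at startup. As a sketch of triggering them directly, assuming a jakarta.validation implementation such as Hibernate Validator is on the classpath and picks up the package-private is*() getter-style methods; the demo class and sample values are hypothetical.

package com.amazon.dataprepper.plugins.source.codec;

import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.validation.ConstraintViolation;
import jakarta.validation.Validation;
import jakarta.validation.Validator;

import java.util.Set;

public class CsvCodecConfigValidationDemo {
    public static void main(String[] args) throws Exception {
        // Deliberately invalid: delimiter and quote_character are the same character.
        final String configJson = "{\"delimiter\": \",\", \"quote_character\": \",\"}";
        final CsvCodecConfig config = new ObjectMapper().readValue(configJson, CsvCodecConfig.class);

        final Validator validator = Validation.buildDefaultValidatorFactory().getValidator();
        final Set<ConstraintViolation<CsvCodecConfig>> violations = validator.validate(config);

        for (final ConstraintViolation<CsvCodecConfig> violation : violations) {
            // Expected: "quote_character and delimiter cannot be the same character."
            System.out.println(violation.getMessage());
        }
    }
}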
Review comment on getNumberOfColumnsByMarkingBeginningOfInputStreamAndResettingReaderAfter:
minor: I think you could just call this getNumberOfColumns, since the caller of this function doesn't need to care how it works.