Skip to content

Commit

Permalink
refactor ProblemAggregator
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd committed May 25, 2022
1 parent a90b869 commit 1d0df66
Show file tree
Hide file tree
Showing 7 changed files with 165 additions and 83 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregatorImpl;
import org.enso.table.read.WithProblems;

/**
Expand Down Expand Up @@ -31,7 +31,7 @@ public abstract class IncrementalDatatypeParser extends DatatypeParser {
*/
public WithProblems<Storage> parseColumn(String columnName, StringStorage sourceStorage) {
Builder builder = makeBuilderWithCapacity(sourceStorage.size());
var aggregator = new ProblemAggregator(columnName);
var aggregator = new ProblemAggregatorImpl(columnName);

for (int i = 0; i < sourceStorage.size(); ++i) {
String cell = sourceStorage.getItem(i);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregatorImpl;
import org.enso.table.parsing.problems.SimplifiedProblemAggregator;
import org.enso.table.read.WithProblems;

/**
Expand All @@ -26,8 +28,9 @@ public TypeInferringParser(

@Override
public Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
SimplifiedProblemAggregator internal = new SimplifiedProblemAggregator();
for (IncrementalDatatypeParser parser : baseParsers) {
ProblemAggregator internal = new ProblemAggregator(null);
internal.reset();
Object result = parser.parseSingleValue(text, internal);
if (!internal.hasProblems()) {
return result;
Expand All @@ -42,7 +45,7 @@ public WithProblems<Storage> parseColumn(String columnName, StringStorage source
parsers:
for (IncrementalDatatypeParser parser : baseParsers) {
Builder builder = parser.makeBuilderWithCapacity(sourceStorage.size());
var aggregator = new ProblemAggregator(columnName);
var aggregator = new ProblemAggregatorImpl(columnName);

for (int i = 0; i < sourceStorage.size(); ++i) {
String cell = sourceStorage.getItem(i);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package org.enso.table.parsing.problems;

import java.util.List;

/**
 * A {@link ProblemAggregator} which silently discards every report it receives.
 *
 * <p>Because nothing is recorded, the query methods ({@link #hasProblems()} and {@link
 * #getAggregatedProblems()}) cannot give an answer and throw an {@link IllegalStateException}
 * instead.
 */
public class NoOpProblemAggregator implements ProblemAggregator {

  /** Message used by every query method that this implementation cannot answer. */
  private static final String NO_INFORMATION_MESSAGE =
      "This implementation does not provide problem information.";

  /** Ignores the reported cell. */
  @Override
  public void reportInvalidFormat(String cell) {
    // Intentionally empty: reports are dropped.
  }

  /** Ignores the reported cell. */
  @Override
  public void reportLeadingZeroes(String cell) {
    // Intentionally empty: reports are dropped.
  }

  /** Ignores the report. */
  @Override
  public void reportMismatchedQuote() {
    // Intentionally empty: reports are dropped.
  }

  /**
   * Always fails, as no reports are tracked.
   *
   * @throws IllegalStateException on every call
   */
  @Override
  public boolean hasProblems() {
    throw new IllegalStateException(NO_INFORMATION_MESSAGE);
  }

  /**
   * Always fails, as no reports are tracked.
   *
   * @throws IllegalStateException on every call
   */
  @Override
  public List<ParsingProblem> getAggregatedProblems() {
    throw new IllegalStateException(NO_INFORMATION_MESSAGE);
  }
}
Original file line number Diff line number Diff line change
@@ -1,71 +1,32 @@
package org.enso.table.parsing.problems;

import java.util.ArrayList;
import java.util.List;

/**
* An aggregator for parsing problems.
*
* <p>Each strategy exposes a method that returns a summary of the problems. The particular methods
* for reporting each problem are defined in particular subclasses.
*/
public class ProblemAggregator {

private final List<String> invalidFormatCells = new ArrayList<>();
private final List<String> leadingZerosCells = new ArrayList<>();
private int mismatchedQuotes = 0;
public final String relatedColumnName;

public ProblemAggregator(String relatedColumnName) {
this.relatedColumnName = relatedColumnName;
}
/** An aggregator for parsing problems. */
public interface ProblemAggregator {

/**
* Reports a cell with an invalid format.
*
* <p>The reports are aggregated and finally a single problem containing all invalid cell for the
* <p>The reports are aggregated and finally a single problem containing all invalid cells for the
* given column is reported.
*/
public void reportInvalidFormat(String cell) {
invalidFormatCells.add(cell);
}
void reportInvalidFormat(String cell);

public void reportLeadingZeroes(String cell) {
leadingZerosCells.add(cell);
}
/** Reports a cell containing unexpected leading zeros. */
void reportLeadingZeroes(String cell);

public void reportMismatchedQuote() {
mismatchedQuotes++;
}
/** Reports that a mismatched quote has been encountered. */
void reportMismatchedQuote();

/**
* Checks if there are any problems already reported.
*
* <p>This method returns true if and only if {@code getAggregatedProblems} would return a
* non-empty list.
*/
public boolean hasProblems() {
return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty() || mismatchedQuotes > 0;
}
boolean hasProblems();

/** Return an aggregated summary of problems that have been reported. */
public List<ParsingProblem> getAggregatedProblems() {
List<ParsingProblem> problems = new ArrayList<>();

if (!invalidFormatCells.isEmpty()) {
problems.add(new InvalidFormat(relatedColumnName, invalidFormatCells));
}

if (!leadingZerosCells.isEmpty()) {
problems.add(new LeadingZeros(relatedColumnName, leadingZerosCells));
}

for (int i = 0; i < mismatchedQuotes; ++i) {
problems.add(new MismatchedQuote());
}

assert problems.isEmpty() == !hasProblems();

return problems;
}
List<ParsingProblem> getAggregatedProblems();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package org.enso.table.parsing.problems;

import java.util.ArrayList;
import java.util.List;

/**
 * A {@link ProblemAggregator} which records every report and can summarize them as a list of
 * {@link ParsingProblem} instances.
 */
public class ProblemAggregatorImpl implements ProblemAggregator {
  /** The column name passed on to the created invalid-format and leading-zeros problems. */
  public final String relatedColumnName;

  /** Cells reported through {@link #reportInvalidFormat(String)}, in reporting order. */
  private final List<String> invalidFormatCells = new ArrayList<>();

  /** Cells reported through {@link #reportLeadingZeroes(String)}, in reporting order. */
  private final List<String> leadingZerosCells = new ArrayList<>();

  /** Number of calls to {@link #reportMismatchedQuote()} so far. */
  private int mismatchedQuotes = 0;

  /**
   * Creates an empty aggregator.
   *
   * @param relatedColumnName the column name to attach to the aggregated problems
   */
  public ProblemAggregatorImpl(String relatedColumnName) {
    this.relatedColumnName = relatedColumnName;
  }

  /** Records the cell as having an invalid format. */
  @Override
  public void reportInvalidFormat(String cell) {
    invalidFormatCells.add(cell);
  }

  /** Records the cell as containing leading zeros. */
  @Override
  public void reportLeadingZeroes(String cell) {
    leadingZerosCells.add(cell);
  }

  /** Counts one more mismatched quote. */
  @Override
  public void reportMismatchedQuote() {
    mismatchedQuotes++;
  }

  /**
   * Checks whether anything has been reported so far.
   *
   * @return {@code true} if at least one report of any kind has been recorded
   */
  @Override
  public boolean hasProblems() {
    if (mismatchedQuotes > 0) {
      return true;
    }
    return !(invalidFormatCells.isEmpty() && leadingZerosCells.isEmpty());
  }

  /**
   * Builds a summary of everything reported so far.
   *
   * @return a new list holding at most one invalid-format problem, at most one leading-zeros
   *     problem, and one mismatched-quote problem per reported mismatched quote, in that order
   */
  @Override
  public List<ParsingProblem> getAggregatedProblems() {
    List<ParsingProblem> summary = new ArrayList<>();

    boolean anyInvalidFormat = !invalidFormatCells.isEmpty();
    if (anyInvalidFormat) {
      summary.add(new InvalidFormat(relatedColumnName, invalidFormatCells));
    }

    boolean anyLeadingZeros = !leadingZerosCells.isEmpty();
    if (anyLeadingZeros) {
      summary.add(new LeadingZeros(relatedColumnName, leadingZerosCells));
    }

    // Each mismatched quote is reported as a separate problem.
    for (int remaining = mismatchedQuotes; remaining > 0; --remaining) {
      summary.add(new MismatchedQuote());
    }

    // The summary must be non-empty exactly when hasProblems() says so.
    assert summary.isEmpty() != hasProblems();

    return summary;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package org.enso.table.parsing.problems;

import java.util.List;

/**
 * A lightweight {@link ProblemAggregator} which only remembers whether any problem has been
 * reported, without keeping the details of individual reports.
 *
 * <p>An instance can be reused after calling {@link #reset()}.
 */
public class SimplifiedProblemAggregator implements ProblemAggregator {

  /** Set by any report; cleared by {@link #reset()}. */
  private boolean problemSeen = false;

  /** Notes that a problem occurred; the cell itself is not stored. */
  @Override
  public void reportInvalidFormat(String cell) {
    markProblem();
  }

  /** Notes that a problem occurred; the cell itself is not stored. */
  @Override
  public void reportLeadingZeroes(String cell) {
    markProblem();
  }

  /** Notes that a problem occurred. */
  @Override
  public void reportMismatchedQuote() {
    markProblem();
  }

  /**
   * Checks whether anything has been reported since construction or the last {@link #reset()}.
   *
   * @return {@code true} if any report has been received
   */
  @Override
  public boolean hasProblems() {
    return problemSeen;
  }

  /** Forgets any previously reported problems. */
  public void reset() {
    problemSeen = false;
  }

  /**
   * Not supported, because report details are not retained.
   *
   * @throws IllegalStateException on every call
   */
  @Override
  public List<ParsingProblem> getAggregatedProblems() {
    throw new IllegalStateException("Problem aggregation is not available in this implementation.");
  }

  /** Shared implementation of all report methods. */
  private void markProblem() {
    problemSeen = true;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,29 +19,16 @@
import org.enso.table.parsing.problems.AdditionalInvalidRows;
import org.enso.table.parsing.problems.InvalidRow;
import org.enso.table.parsing.problems.MismatchedQuote;
import org.enso.table.parsing.problems.NoOpProblemAggregator;
import org.enso.table.parsing.problems.ParsingProblem;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.util.NameDeduplicator;

/** A helper for reading delimited (CSV-like) files. */
public class DelimitedReader {

/** Specifies how to set the headers for the returned table. */
public enum HeaderBehavior {
/** Tries to infer if the headers are present in the file. */
INFER,

/** Uses the first row in the file as headers. Duplicate names will be appended suffixes. */
USE_FIRST_ROW_AS_HEADERS,

/**
* Treats the first row as data and generates header names starting with {@code COLUMN_NAME}.
*/
GENERATE_HEADERS
}

private static final String COLUMN_NAME = "Column";

private static final char noQuoteCharacter = '\0';
private static final long invalidRowsLimit = 10;
private final char delimiter;
private final char quoteCharacter;
private final char quoteEscapeCharacter;
Expand All @@ -55,8 +42,12 @@ public enum HeaderBehavior {
private final TypeInferringParser cellTypeGuesser;
private final boolean keepInvalidRows;
private final boolean warningsAsErrors;

private static final char noQuoteCharacter = '\0';
private final NoOpProblemAggregator noOpProblemAggregator = new NoOpProblemAggregator();
private long invalidRowsCount = 0;
private long targetTableIndex = 0;
/** The line number of the start of the current row in the input file. */
private long currentLine = 0;
private StringStorageBuilder[] builders = null;

/**
* Creates a new reader.
Expand Down Expand Up @@ -181,9 +172,6 @@ private void reportMismatchedQuote() {
reportProblem(new MismatchedQuote());
}

private long invalidRowsCount = 0;
private static final long invalidRowsLimit = 10;

private void reportInvalidRow(long source_row, Long table_index, String[] row) {
if (invalidRowsCount < invalidRowsLimit) {
reportProblem(new InvalidRow(source_row, table_index, row));
Expand All @@ -210,13 +198,6 @@ private void reportProblem(ParsingProblem problem) {
}
}

private long targetTableIndex = 0;

/** The line number of the start of the current row in the input file. */
private long currentLine = 0;

private StringStorageBuilder[] builders = null;

/**
* Reads the next row and updates the current line accordingly.
*
Expand Down Expand Up @@ -281,10 +262,14 @@ private List<String> generateDefaultHeaders(int columnCount) {
return headerNames;
}

/** Checks if the given cell contains just plain text that is not null and is not convertible to any more specific type according to the {@code cellTypeGuesser}. This is used for checking the types when inferring the headers. */
/**
* Checks if the given cell contains just plain text that is not null and is not convertible to
* any more specific type according to the {@code cellTypeGuesser}. This is used for checking the
* types when inferring the headers.
*/
private boolean isPlainText(String cell) {
if (cell == null) return false;
Object parsed = cellTypeGuesser.parseSingleValue(cell, new ProblemAggregator(null));
Object parsed = cellTypeGuesser.parseSingleValue(cell, noOpProblemAggregator);
return parsed instanceof String;
}

Expand Down Expand Up @@ -373,4 +358,18 @@ private void initBuilders(int count) {
builders[i] = new StringStorageBuilder();
}
}

/** Specifies how to set the headers for the returned table. */
public enum HeaderBehavior {
/** Tries to infer if the headers are present in the file. */
INFER,

/** Uses the first row in the file as headers. Duplicate names will have suffixes appended. */
USE_FIRST_ROW_AS_HEADERS,

/**
* Treats the first row as data and generates header names starting with {@code COLUMN_NAME}.
*/
GENERATE_HEADERS
}
}

0 comments on commit 1d0df66

Please sign in to comment.