Skip to content

Commit

Permalink
Add name problems to DelimitedReader
Browse files Browse the repository at this point in the history
  • Loading branch information
jdunkerley committed May 31, 2022
1 parent b2a0226 commit d627378
Show file tree
Hide file tree
Showing 26 changed files with 111 additions and 52 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import Standard.Table

import Standard.Base.Error.Common as Errors
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior
from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows
from Standard.Table.Error as Table_Errors import Duplicate_Output_Column_Names, Invalid_Output_Column_Names, Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
from Standard.Table.Io.File_Format import Infer
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
Expand All @@ -13,6 +13,8 @@ polyglot java import org.enso.table.read.ParsingFailedException
polyglot java import org.enso.table.parsing.problems.InvalidRow
polyglot java import org.enso.table.parsing.problems.MismatchedQuote
polyglot java import org.enso.table.parsing.problems.AdditionalInvalidRows
polyglot java import org.enso.table.util.problems.DuplicateNames
polyglot java import org.enso.table.util.problems.InvalidNames
polyglot java import java.lang.IllegalArgumentException
polyglot java import java.io.IOException
polyglot java import com.univocity.parsers.common.TextParsingException
Expand Down Expand Up @@ -105,7 +107,9 @@ read_from_reader format java_reader on_problems max_columns=4096 =
if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else
if Java.is_instance java_problem MismatchedQuote then Mismatched_Quote else
if Java.is_instance java_problem AdditionalInvalidRows then Additional_Invalid_Rows java_problem.count else
java_problem
if Java.is_instance java_problem DuplicateNames then Duplicate_Output_Column_Names java_problem.duplicatedNames else
if Java.is_instance java_problem InvalidNames then Invalid_Output_Column_Names java_problem.invalidNames else
java_problem

translate_illegal_argument caught_panic =
Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage)
Expand All @@ -132,6 +136,6 @@ read_from_reader format java_reader on_problems max_columns=4096 =
formatter = format.value_formatter.if_nothing Data_Formatter
TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new
reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
result = Table.Table reader.read
parsing_problems = Vector.Vector reader.getReportedProblems . map translate_parsing_problem
on_problems.attach_problems_after result parsing_problems
result_with_problems = reader.read
parsing_problems = Vector.Vector (result_with_problems.problems) . map translate_parsing_problem
on_problems.attach_problems_after (Table.Table result_with_problems.value) parsing_problems
7 changes: 6 additions & 1 deletion distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,12 @@ report_pending_group name reason config =
Spec.print_report : Suite_Config -> Nothing
Spec.print_report config =
if config.should_output_junit then
config.builder.append ('<testsuite name="' + this.name + '" timestamp="' + (Time.now.format "yyyy-MM-dd'T'HH:mm:ss") + '">')
config.builder.append ('<testsuite name="' + this.name + '" timestamp="' + (Time.now.format "yyyy-MM-dd'T'HH:mm:ss") + '"')
config.builder.append (' tests="' + this.behaviors.length.to_text + '"')
config.builder.append (' disabled="' + this.behaviors.filter (x->(x.is_a Pending)) . length . to_text + '"')
config.builder.append (' errors="' + this.behaviors.filter (x->(x.is_a Failure)) . length . to_text + '"')
config.builder.append ('>')

this.behaviors.reverse.each behavior->
config.builder.append ('<testcase name="' + behavior.name + '">')
case behavior.result of
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.read.WithProblems;
import org.enso.table.problems.WithProblems;

/** A base type for a parser capable of parsing a column of text values into some other type. */
public abstract class DatatypeParser {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.read.WithProblems;
import org.enso.table.problems.WithProblems;

/** A parser that just returns its input. Useful as a fallback. */
public class IdentityParser extends IncrementalDatatypeParser {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregatorImpl;
import org.enso.table.read.WithProblems;
import org.enso.table.problems.WithProblems;

/**
* A base type for a datatype parsing strategy which relies on a method parsing a single value.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregatorImpl;
import org.enso.table.parsing.problems.SimplifiedProblemAggregator;
import org.enso.table.read.WithProblems;
import org.enso.table.problems.WithProblems;

/**
* The type inferring parser tries to parse the given column using a set of provided parsers. It
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
package org.enso.table.parsing.problems;

import org.enso.table.problems.Problem;

/** A problem which indicates how many additional invalid rows were encountered. */
public record AdditionalInvalidRows(long count) implements ParsingProblem {}
public record AdditionalInvalidRows(long count) implements Problem {}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.enso.table.parsing.problems;

import org.enso.table.problems.Problem;

import java.util.List;

/** Indicates that a text value did not match the format expected of a datatype. */
public record InvalidFormat(String column, List<String> cells) implements ParsingProblem {}
public record InvalidFormat(String column, List<String> cells) implements Problem {}
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
package org.enso.table.parsing.problems;

import org.enso.table.problems.Problem;

/** A problem indicating that a row contained more or fewer columns than expected. */
public record InvalidRow(long source_row, Long table_index, String[] row) implements ParsingProblem {}
public record InvalidRow(long source_row, Long table_index, String[] row) implements Problem {}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.enso.table.parsing.problems;

import org.enso.table.problems.Problem;

import java.util.List;

/** Indicates that some values contained leading zeros when leading zeros were not allowed in the given numeric conversion. */
public record LeadingZeros(String column, List<String> cells) implements ParsingProblem {}
public record LeadingZeros(String column, List<String> cells) implements Problem {}
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
package org.enso.table.parsing.problems;

import org.enso.table.problems.Problem;

/** A problem indicating that a quote has been opened and never closed. */
public record MismatchedQuote() implements ParsingProblem {}
public record MismatchedQuote() implements Problem {}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.enso.table.parsing.problems;

import org.enso.table.problems.Problem;

import java.util.List;

/** A problem aggregator which ignores problems. */
Expand All @@ -20,7 +22,7 @@ public boolean hasProblems() {
}

@Override
public List<ParsingProblem> getAggregatedProblems() {
public List<Problem> getAggregatedProblems() {
throw new IllegalStateException("This implementation does not provide problem information.");
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.enso.table.parsing.problems;

import org.enso.table.problems.Problem;

import java.util.List;

/** An aggregator for parsing problems. */
Expand Down Expand Up @@ -28,5 +30,5 @@ public interface ProblemAggregator {
boolean hasProblems();

/** Return an aggregated summary of problems that have been reported. */
List<ParsingProblem> getAggregatedProblems();
List<Problem> getAggregatedProblems();
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.enso.table.parsing.problems;

import org.enso.table.problems.Problem;

import java.util.ArrayList;
import java.util.List;

Expand Down Expand Up @@ -34,8 +36,8 @@ public boolean hasProblems() {
}

@Override
public List<ParsingProblem> getAggregatedProblems() {
List<ParsingProblem> problems = new ArrayList<>();
public List<Problem> getAggregatedProblems() {
List<Problem> problems = new ArrayList<>();

if (!invalidFormatCells.isEmpty()) {
problems.add(new InvalidFormat(relatedColumnName, invalidFormatCells));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.enso.table.parsing.problems;

import org.enso.table.problems.Problem;

import java.util.List;

public class SimplifiedProblemAggregator implements ProblemAggregator {
Expand Down Expand Up @@ -27,7 +29,7 @@ public boolean hasProblems() {
}

@Override
public List<ParsingProblem> getAggregatedProblems() {
public List<Problem> getAggregatedProblems() {
throw new IllegalStateException("Problem aggregation is not available in this implementation.");
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package org.enso.table.parsing.problems;
package org.enso.table.problems;

/**
* A parent interface for problems (such as parsing problems) which may be reported as warnings or
* errors, depending on the setup.
*/
public interface ParsingProblem {}
public interface Problem {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package org.enso.table.problems;

import java.util.List;

/**
 * A value annotated with problems that occurred when it was being computed.
 *
 * @param <T> the type of the computed value
 * @param value the computed value
 * @param problems the problems that occurred while {@code value} was being computed
 */
public record WithProblems<T>(T value, List<Problem> problems) {}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.enso.table.data.column.builder.string.StringStorageBuilder;
Expand All @@ -14,13 +15,14 @@
import org.enso.table.data.index.DefaultIndex;
import org.enso.table.data.table.Column;
import org.enso.table.data.table.Table;
import org.enso.table.problems.WithProblems;
import org.enso.table.parsing.DatatypeParser;
import org.enso.table.parsing.TypeInferringParser;
import org.enso.table.parsing.problems.AdditionalInvalidRows;
import org.enso.table.parsing.problems.InvalidRow;
import org.enso.table.parsing.problems.MismatchedQuote;
import org.enso.table.parsing.problems.NoOpProblemAggregator;
import org.enso.table.parsing.problems.ParsingProblem;
import org.enso.table.problems.Problem;
import org.enso.table.util.NameDeduplicator;

/** A helper for reading delimited (CSV-like) files. */
Expand All @@ -36,7 +38,7 @@ public class DelimitedReader {
private final long skipRows;
private final long rowLimit;
private final int maxColumns;
private final List<ParsingProblem> warnings = new ArrayList<>();
private final List<Problem> warnings = new ArrayList<>();
private final CsvParser parser;
private final DatatypeParser valueParser;
private final TypeInferringParser cellTypeGuesser;
Expand Down Expand Up @@ -181,16 +183,18 @@ private void reportInvalidRow(long source_row, Long table_index, String[] row) {
}

/** Returns a list of currently reported problems encountered when parsing the input. */
public List<ParsingProblem> getReportedProblems() {
List<ParsingProblem> result = new ArrayList<>(warnings);
private List<Problem> getReportedProblems(List<Problem> nameProblems) {
List<Problem> result = new ArrayList<>(nameProblems.size() + warnings.size() + 1);
result.addAll(nameProblems);
result.addAll(warnings);
if (invalidRowsCount > invalidRowsLimit) {
long additionalInvalidRows = invalidRowsCount - invalidRowsLimit;
result.add(new AdditionalInvalidRows(additionalInvalidRows));
}
return result;
}

private void reportProblem(ParsingProblem problem) {
private void reportProblem(Problem problem) {
if (warningsAsErrors) {
throw new ParsingFailedException(problem);
} else {
Expand Down Expand Up @@ -220,7 +224,7 @@ private void appendRow(String[] row) {
builders[i] = builders[i].parseAndAppend(row[i]);
}

// If the current row had less columns than expected, nulls are inserted for the missing
// If the current row had fewer columns than expected, nulls are inserted for the missing
// values.
// If it had more columns, the excess columns are discarded.
for (int i = row.length; i < builders.length; i++) {
Expand Down Expand Up @@ -248,18 +252,21 @@ private void appendRowIfLimitPermits(String[] row) {
}
}

private List<String> headersFromRow(String[] row) {
private WithProblems<List<String>> headersFromRow(String[] row) {
List<String> preprocessedHeaders =
Arrays.stream(row).map(this::parseHeader).collect(Collectors.toList());
return new NameDeduplicator().makeUnique(preprocessedHeaders);

NameDeduplicator deduplicator = new NameDeduplicator();
List<String> names = deduplicator.makeUnique(preprocessedHeaders);
return new WithProblems<>(names, deduplicator.getProblems());
}

private List<String> generateDefaultHeaders(int columnCount) {
ArrayList<String> headerNames = new ArrayList<>(columnCount);
private WithProblems<List<String>> generateDefaultHeaders(int columnCount) {
List<String> headerNames = new ArrayList<>(columnCount);
for (int i = 0; i < columnCount; ++i) {
headerNames.add(COLUMN_NAME + "_" + (i + 1));
}
return headerNames;
return new WithProblems<>(headerNames, Collections.emptyList());
}

/**
Expand All @@ -274,8 +281,8 @@ private boolean isPlainText(String cell) {
}

/** Reads the input stream and returns a Table. */
public Table read() {
List<String> headerNames;
public WithProblems<Table> read() {
WithProblems<List<String>> headerNames;
String[] currentRow = readNextRow();

// Skip the first N rows.
Expand All @@ -285,7 +292,7 @@ public Table read() {

// If there are no rows to even infer the headers, we return an empty table.
if (currentRow == null) {
return new Table(new Column[0]);
return new WithProblems<>(new Table(new Column[0]), Collections.emptyList());
}

int expectedColumnCount = currentRow.length;
Expand Down Expand Up @@ -335,7 +342,7 @@ public Table read() {

Column[] columns = new Column[builders.length];
for (int i = 0; i < builders.length; i++) {
String columnName = headerNames.get(i);
String columnName = headerNames.value().get(i);
StringStorage col = builders[i].seal();

WithProblems<Storage> parseResult = valueParser.parseColumn(columnName, col);
Expand All @@ -346,7 +353,7 @@ public Table read() {

columns[i] = new Column(columnName, new DefaultIndex(storage.size()), storage);
}
return new Table(columns);
return new WithProblems<>(new Table(columns), getReportedProblems(headerNames.problems()));
}

private void initBuilders(int count) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
package org.enso.table.read;

import org.enso.table.parsing.problems.ParsingProblem;
import org.enso.table.problems.Problem;

/**
* An exception thrown when a problem occurred during parsing and the parser is running in a mode
* that does not try recovering, so the parsing is stopped.
*/
public class ParsingFailedException extends RuntimeException {
public final ParsingProblem problem;
public final Problem problem;

public ParsingFailedException(ParsingProblem problem) {
public ParsingFailedException(Problem problem) {
this.problem = problem;
}
}

This file was deleted.

Loading

0 comments on commit d627378

Please sign in to comment.