-
Notifications
You must be signed in to change notification settings - Fork 1.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Kernel] Assign base row ID to AddFile actions #3894
base: master
Are you sure you want to change the base?
Changes from all commits
94260ef
7612613
2ac3985
807c896
84d4ae6
509bc27
83f3b1e
7d7032e
dadc5a6
0c188c7
7948fdf
6195e7f
fd06f6d
3e30a41
cec85cf
8edcc9a
baca1c5
7e0a172
127de8c
c5f3672
b2d6546
cd7ddeb
68c77d8
5afec36
e358878
4f56a2f
189ec75
a4e3104
b0e4a65
f89199c
a5c5726
a893fa0
73faf8f
9ce9b56
60a29dd
b32c56a
75c0005
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,11 +32,9 @@ | |
import io.delta.kernel.internal.fs.Path; | ||
import io.delta.kernel.internal.replay.ConflictChecker; | ||
import io.delta.kernel.internal.replay.ConflictChecker.TransactionRebaseState; | ||
import io.delta.kernel.internal.util.Clock; | ||
import io.delta.kernel.internal.util.ColumnMapping; | ||
import io.delta.kernel.internal.util.FileNames; | ||
import io.delta.kernel.internal.util.InCommitTimestampUtils; | ||
import io.delta.kernel.internal.util.VectorUtils; | ||
import io.delta.kernel.internal.rowtracking.RowIDAssignmentResult; | ||
import io.delta.kernel.internal.rowtracking.RowTracking; | ||
import io.delta.kernel.internal.util.*; | ||
import io.delta.kernel.types.StructType; | ||
import io.delta.kernel.utils.CloseableIterable; | ||
import io.delta.kernel.utils.CloseableIterator; | ||
|
@@ -76,6 +74,12 @@ public class TransactionImpl implements Transaction { | |
private Metadata metadata; | ||
private boolean shouldUpdateMetadata; | ||
|
||
// Contains domain metadata actions known prior to iterating and writing the data actions | ||
private final List<DomainMetadata> domainMetadatas = new ArrayList<>(); | ||
// Contains domain metadata actions generated on the fly while writing the data actions | ||
private CloseableIterator<DomainMetadata> domainMetadataIter = | ||
toCloseableIterator(Collections.emptyIterator()); | ||
Comment on lines
+77
to
+81
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I probably will change this. Currently the list is only used in tests, and iterator is only used for row tracking's domain metadata. I’ll look for more use cases of domain metadata in Delta-Spark to see if there is a better way for managing |
||
|
||
private boolean closed; // To avoid trying to commit the same transaction again. | ||
|
||
public TransactionImpl( | ||
|
@@ -120,6 +124,23 @@ public StructType getSchema(Engine engine) { | |
return readSnapshot.getSchema(engine); | ||
} | ||
|
||
public Optional<SetTransaction> getSetTxnOpt() { | ||
return setTxnOpt; | ||
} | ||
|
||
/** | ||
* Internal API to add domain metadata actions for this transaction. Visible for testing. | ||
* | ||
* @param domainMetadatas List of domain metadata to be added to the transaction. | ||
*/ | ||
public void addDomainMetadatas(List<DomainMetadata> domainMetadatas) { | ||
this.domainMetadatas.addAll(domainMetadatas); | ||
} | ||
|
||
public List<DomainMetadata> getDomainMetadatas() { | ||
return domainMetadatas; | ||
} | ||
|
||
@Override | ||
public TransactionCommitResult commit(Engine engine, CloseableIterable<Row> dataActions) | ||
throws ConcurrentWriteException { | ||
|
@@ -131,6 +152,16 @@ public TransactionCommitResult commit(Engine engine, CloseableIterable<Row> data | |
CommitInfo attemptCommitInfo = generateCommitAction(engine); | ||
updateMetadataWithICTIfRequired( | ||
engine, attemptCommitInfo.getInCommitTimestamp(), readSnapshot.getVersion(engine)); | ||
|
||
// For Row Tracking, we assign base row IDs to all AddFiles inside dataActions that | ||
// do not have it yet. If the high water mark has changed, we also emit a | ||
// DomainMetadata action with the new high water mark. | ||
RowIDAssignmentResult rowIDAssignmentResult = | ||
RowTracking.assignBaseRowId(protocol, readSnapshot, FULL_SCHEMA, dataActions); | ||
dataActions = rowIDAssignmentResult.getDataActions(); | ||
domainMetadataIter = | ||
domainMetadataIter.combine(rowIDAssignmentResult.getDomainMetadatasIter()); | ||
Comment on lines
+159
to
+163
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is done in this way because I remember we discussed that we want to keep |
||
|
||
int numRetries = 0; | ||
do { | ||
logger.info("Committing transaction as version = {}.", commitAsVersion); | ||
|
@@ -223,9 +254,12 @@ private TransactionCommitResult doCommit( | |
|
||
try (CloseableIterator<Row> stageDataIter = dataActions.iterator()) { | ||
// Create a new CloseableIterator that will return the metadata actions followed by the | ||
// data actions. | ||
// data actions, and then DomainMetadata actions. The order is crucial as DomainMetadata | ||
// actions may depend on the data actions. | ||
CloseableIterator<Row> dataAndMetadataActions = | ||
toCloseableIterator(metadataActions.iterator()).combine(stageDataIter); | ||
toCloseableIterator(metadataActions.iterator()) | ||
.combine(stageDataIter) | ||
.combine(getDomainMetadataActions()); | ||
|
||
if (commitAsVersion == 0) { | ||
// New table, create a delta log directory | ||
|
@@ -265,10 +299,6 @@ public boolean isBlindAppend() { | |
return true; | ||
} | ||
|
||
public Optional<SetTransaction> getSetTxnOpt() { | ||
return setTxnOpt; | ||
} | ||
|
||
/** | ||
* Generates a timestamp which is greater than the commit timestamp of the readSnapshot. This can | ||
* result in an additional file read and that this will only happen if ICT is enabled. | ||
|
@@ -313,6 +343,29 @@ private Map<String, String> getOperationParameters() { | |
return Collections.emptyMap(); | ||
} | ||
|
||
/** | ||
* Returns an iterator of all domain metadata actions to be committed. Domain metadata actions | ||
* exist in two places: | ||
* | ||
* <ol> | ||
* <li>{@code List<DomainMetadata> domainMetadatas}: Contains domain metadata actions known | ||
* prior to iterating and writing the data actions. | ||
* <li>{@code CloseableIterator<DomainMetadata> domainMetadataIter}`: Contains domain metadata | ||
* actions generated on the fly while writing the data actions. | ||
* </ol> | ||
* | ||
* This method combines both sources of domain metadata actions and returns an iterator of all | ||
* domain metadata action rows to be committed. | ||
* | ||
* @return a {@link CloseableIterator} of domain metadata action rows | ||
*/ | ||
private CloseableIterator<Row> getDomainMetadataActions() { | ||
// TODO: Implement the check for duplicate domain metadata and if the protocol supports here | ||
return toCloseableIterator(domainMetadatas.listIterator()) | ||
.combine(domainMetadataIter) | ||
.map(domainMetadata -> createDomainMetadataSingleAction(domainMetadata.toRow())); | ||
} | ||
|
||
/** | ||
* Get the part of the schema of the table that needs the statistics to be collected per file. | ||
* | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As we discussed, this can be an issue: if connectors don't populate
numRecords
stats in the addFile action that are committed, the commit will fail if row tracking is supported (note that this is still better than today where we always fail in that case since we don't support row tracking.Question more for kernel folks: do we some guarantee or requirement that connectors populate
numRecords
? Are connectors that implement writes today (if any) populatingnumRecords
?In any case, I would word the exception so that it puts the burden more on the connector, for example:
"All add actions must have statistics that include the number of records when writing to a Delta table with the RowTracking table feature enabled."