-
Notifications
You must be signed in to change notification settings - Fork 62
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add S3 upload integrity check #26
Changes from all commits
3b224ee
05bba4b
f2162b4
fa81144
9c8a355
45c99ff
6e23595
40fa47e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package alex.mojaki.s3upload; | ||
|
||
/**
 * Thrown when the final integrity check fails. It suggests that the multipart upload failed
 * due to data corruption. See {@link StreamTransferManager#checkIntegrity(boolean)} for details.
 */
public class IntegrityCheckException extends RuntimeException {

    // Exceptions are Serializable; declaring the version explicitly keeps the serialized form
    // stable instead of relying on a compiler-dependent computed default.
    private static final long serialVersionUID = 1L;

    /**
     * Creates the exception with a description of the failed check.
     *
     * @param message human-readable detail message, available through {@link #getMessage()}
     */
    public IntegrityCheckException(String message) {
        super(message);
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,12 +2,16 @@ | |
|
||
import com.amazonaws.services.s3.AmazonS3; | ||
import com.amazonaws.services.s3.model.*; | ||
import com.amazonaws.util.BinaryUtils; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.ByteArrayInputStream; | ||
import java.math.BigInteger; | ||
import java.security.MessageDigest; | ||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.Comparator; | ||
import java.util.List; | ||
import java.util.concurrent.*; | ||
|
||
|
@@ -109,6 +113,7 @@ public class StreamTransferManager { | |
protected int numUploadThreads = 1; | ||
protected int queueCapacity = 1; | ||
protected int partSize = 5 * MB; | ||
protected boolean checkIntegrity = false; | ||
private final List<PartETag> partETags = Collections.synchronizedList(new ArrayList<PartETag>()); | ||
private List<MultiPartOutputStream> multiPartOutputStreams; | ||
private ExecutorServiceResultsHandler<Void> executorServiceResultsHandler; | ||
|
@@ -241,6 +246,39 @@ public StreamTransferManager partSize(long partSize) { | |
return this; | ||
} | ||
|
||
/** | ||
* Sets whether data integrity check should be performed during upload. | ||
* <p> | ||
* By default integrity check is disabled. | ||
* <p> | ||
* Essentially, data integrity check consists of two steps. First, each upload part integrity | ||
* is verified. To ensure that data is not corrupted traversing the network, <b>Content-MD5</b> | ||
* header is used. When the header is provided, Amazon S3 checks the object against | ||
* the provided MD5 value and, if they do not match, returns an error. The header value is the | ||
* base64-encoded 128-bit MD5 digest of the request body. | ||
* <p> | ||
* The second step is to ensure integrity of the final object merged from the uploaded parts. | ||
* This is achieved by comparing the expected ETag value with the actual returned by S3. | ||
* However, the ETag value is not a MD5 hash. When S3 combines the parts of a multipart upload | ||
* into the final object, the ETag value is set to the hex-encoded MD5 hash of the concatenated | ||
* binary-encoded MD5 hashes of each part followed by "-" and the number of parts, for instance: | ||
* <pre>57f456164b0e5f365aaf9bb549731f32-95</pre> | ||
* <b>Please note that the final check is based on undocumented behaviour of S3.</b> | ||
* | ||
* @param checkIntegrity <code>true</code> if data integrity should be checked | ||
* @return this {@code StreamTransferManager} for chaining. | ||
* @throws IllegalStateException if {@link StreamTransferManager#getMultiPartOutputStreams} has already | ||
* been called, initiating the upload. | ||
*/ | ||
public StreamTransferManager checkIntegrity(boolean checkIntegrity) { | ||
ensureCanSet(); | ||
if (checkIntegrity) { | ||
Utils.md5(); // check that algorithm is available | ||
} | ||
this.checkIntegrity = checkIntegrity; | ||
return this; | ||
} | ||
|
||
private void ensureCanSet() { | ||
if (queue != null) { | ||
abort(); | ||
|
@@ -322,7 +360,7 @@ public void complete() { | |
executorServiceResultsHandler.awaitCompletion(); | ||
log.debug("{}: Pool terminated", this); | ||
if (leftoverStreamPart != null) { | ||
log.info("{}: Uploading leftover stream {}", leftoverStreamPart); | ||
log.info("{}: Uploading leftover stream {}", this, leftoverStreamPart); | ||
uploadStreamPart(leftoverStreamPart); | ||
log.debug("{}: Leftover uploaded", this); | ||
} | ||
|
@@ -343,14 +381,43 @@ public void complete() { | |
uploadId, | ||
partETags); | ||
customiseCompleteRequest(completeRequest); | ||
s3Client.completeMultipartUpload(completeRequest); | ||
CompleteMultipartUploadResult completeMultipartUploadResult = s3Client.completeMultipartUpload(completeRequest); | ||
if (checkIntegrity) { | ||
checkCompleteFileIntegrity(completeMultipartUploadResult.getETag()); | ||
} | ||
} | ||
log.info("{}: Completed", this); | ||
} catch (IntegrityCheckException e) { | ||
// Nothing to abort. Upload has already finished. | ||
throw e; | ||
} catch (Throwable e) { | ||
throw abort(e); | ||
} | ||
} | ||
|
||
private void checkCompleteFileIntegrity(String s3ObjectETag) { | ||
List<PartETag> parts = new ArrayList<PartETag>(partETags); | ||
Collections.sort(parts, new PartNumberComparator()); | ||
String expectedETag = computeCompleteFileETag(parts); | ||
if (!expectedETag.equals(s3ObjectETag)) { | ||
throw new IntegrityCheckException(String.format( | ||
"File upload completed, but integrity check failed. Expected ETag: %s but actual is %s", | ||
expectedETag, s3ObjectETag)); | ||
} | ||
} | ||
|
||
private String computeCompleteFileETag(List<PartETag> parts) { | ||
// When S3 combines the parts of a multipart upload into the final object, the ETag value is set to the | ||
// hex-encoded MD5 hash of the concatenated binary-encoded (raw bytes) MD5 hashes of each part followed by | ||
// "-" and the number of parts. | ||
MessageDigest md = Utils.md5(); | ||
for (PartETag partETag : parts) { | ||
md.update(BinaryUtils.fromHex(partETag.getETag())); | ||
} | ||
// Represent byte array as a 32-digit number hexadecimal format followed by "-<partCount>". | ||
return String.format("%032x-%d", new BigInteger(1, md.digest()), parts.size()); | ||
} | ||
|
||
/** | ||
* Aborts the upload and rethrows the argument, wrapped in a RuntimeException if necessary. | ||
* Write {@code throw abort(e)} to make it clear to the compiler and readers that the code | ||
|
@@ -470,6 +537,9 @@ private void uploadStreamPart(StreamPart part) { | |
.withUploadId(uploadId).withPartNumber(part.getPartNumber()) | ||
.withInputStream(part.getInputStream()) | ||
.withPartSize(part.size()); | ||
if (checkIntegrity) { | ||
uploadRequest.setMd5Digest(part.getMD5Digest()); | ||
} | ||
customiseUploadPartRequest(uploadRequest); | ||
|
||
UploadPartResult uploadPartResult = s3Client.uploadPart(uploadRequest); | ||
|
@@ -502,4 +572,16 @@ public void customiseCompleteRequest(CompleteMultipartUploadRequest request) { | |
public void customisePutEmptyObjectRequest(PutObjectRequest request) { | ||
} | ||
|
||
private static class PartNumberComparator implements Comparator<PartETag> { | ||
@Override | ||
public int compare(PartETag o1, PartETag o2) { | ||
int partNumber1 = o1.getPartNumber(); | ||
int partNumber2 = o2.getPartNumber(); | ||
|
||
if (partNumber1 == partNumber2) { | ||
return 0; | ||
} | ||
return partNumber1 > partNumber2 ? 1 : -1; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Replace body with: `return Integer.compare(o1.getPartNumber(), o2.getPartNumber());` There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is available only since Java 1.7. This project uses 1.6. |
||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
s3proxy.endpoint=https://127.0.0.1:0 | ||
s3proxy.endpoint=http://127.0.0.1:0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When I leave |
||
# authorization must be aws-v2 or none | ||
s3proxy.authorization=aws-v2 | ||
s3proxy.identity=local-identity | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add some documentation explaining the two ways that integrity is checked, and that the final check is based on undocumented behaviour of S3 that may change, so they may get a false alarm exception. Also just note how this may lead to exceptions in general.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
I added
IntegrityCheckException
to improve error handling. But I've also noticed that when a part upload fails, the logs do not contain any related information. Sample logs when I deliberately make the request fail: There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you elaborate? What did you do in this case to make the request fail?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
.withPartSize(-1)
or .setMd5Digest("123");
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean I modify
UploadPartRequest
in alex.mojaki.s3upload.StreamTransferManager#uploadStreamPart
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I reproduced this and ensured that a message shows in d3e4e20
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cool! Thanks!