Skip to content

Commit

Permalink
PARQUET-2328: Add overwrite option to the parquet-cli's rewrite subco…
Browse files Browse the repository at this point in the history
…mmand (#1125)
  • Loading branch information
sekikn authored Aug 3, 2023
1 parent 169c624 commit 928d35b
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import com.beust.jcommander.Parameters;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
Expand All @@ -44,16 +45,25 @@ public class RewriteCommand extends BaseCommand {
description = "<comma-separated text of input parquet file paths>",
required = true)
List<String> inputs;

@Parameter(
names = {"-o", "--output"},
description = "<output parquet file path>",
required = true)
String output;

@Parameter(
names={"--overwrite"},
description="Overwrite the output file if it exists",
required = false)
boolean overwrite;

@Parameter(
names = {"--mask-mode"},
description = "<mask mode: nullify>",
required = false)
String maskMode;

@Parameter(
names = {"--mask-columns"},
description = "<columns to be replaced with masked value>",
Expand All @@ -76,7 +86,7 @@ public RewriteCommand(Logger console) {
super(console);
}

private RewriteOptions buildOptionsOrFail() {
private RewriteOptions buildOptionsOrFail() throws IOException {
Preconditions.checkArgument(inputs != null && !inputs.isEmpty() && output != null,
"Both input and output parquet file paths are required.");

Expand Down Expand Up @@ -108,7 +118,16 @@ private RewriteOptions buildOptionsOrFail() {
builder.transform(codecName);
}

return builder.build();
RewriteOptions options = builder.build();

// If RewriteOptions are successfully built and the overwrite option is specified, remove the output path
FileSystem outFS = outputPath.getFileSystem(getConf());
if (overwrite && outFS.exists(outputPath)) {
console.debug("Deleting output file {} (already exists)", outputPath);
outFS.delete(outputPath);
}

return options;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@
package org.apache.parquet.cli.commands;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.junit.Assert;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Arrays;

public class RewriteCommandTest extends ParquetFileTest {
Expand All @@ -38,4 +40,32 @@ public void testRewriteCommand() throws IOException {
Assert.assertEquals(0, command.run());
Assert.assertTrue(output.exists());
}

@Test(expected = FileAlreadyExistsException.class)
public void testRewriteCommandWithoutOverwrite() throws IOException {
File file = parquetFile();
RewriteCommand command = new RewriteCommand(createLogger());
command.inputs = Arrays.asList(file.getAbsolutePath());
File output = new File(getTempFolder(), "converted.parquet");
command.output = output.getAbsolutePath();
command.setConf(new Configuration());

Files.createFile(output.toPath());
command.run();
}

@Test
public void testRewriteCommandWithOverwrite() throws IOException {
File file = parquetFile();
RewriteCommand command = new RewriteCommand(createLogger());
command.inputs = Arrays.asList(file.getAbsolutePath());
File output = new File(getTempFolder(), "converted.parquet");
command.output = output.getAbsolutePath();
command.overwrite = true;
command.setConf(new Configuration());

Files.createFile(output.toPath());
Assert.assertEquals(0, command.run());
Assert.assertTrue(output.exists());
}
}

0 comments on commit 928d35b

Please sign in to comment.