broadinstitute · droazen · Jul 21, 2022 · Jun 29, 2022 · Jun 29, 2022 · Jun 30, 2022
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
@@ -697,6 +697,10 @@ public final <T extends Feature> Object getHeaderForFeatures( final FeatureInput
         return hasFeatures() ? features.getHeader(featureDescriptor) : null;
     }
 
+    /**
+     * Exposes the reference specifier held by this tool's reference arguments.
+     *
+     * @return the value of {@code referenceArguments.getReferenceSpecifier()};
+     *         NOTE(review): presumably null when no reference was supplied -- confirm
+     */
+    public final GATKPath getReferencePath() {
+        final GATKPath referenceSpecifier = referenceArguments.getReferenceSpecifier();
+        return referenceSpecifier;
+    }
+
     /**
      * Initialize our data sources, make sure that all tool requirements for input data have been satisfied
      * and start the progress meter.

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/reference/CompareReferences.java b/src/main/java/org/broadinstitute/hellbender/tools/reference/CompareReferences.java
@@ -0,0 +1,217 @@
+package org.broadinstitute.hellbender.tools.reference;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.engine.GATKTool;
+import org.broadinstitute.hellbender.engine.ReferenceDataSource;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+import org.broadinstitute.hellbender.utils.tsv.TableWriter;
+import picard.cmdline.programgroups.ReferenceProgramGroup;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.*;
+
+/**
+ * Compares the sequences of the engine reference (-R) with those of one or more additional references.
+ *
+ * <p>The tool builds a {@link ReferenceSequenceTable} over all of the references and writes it to the output
+ * file when one is given, or to standard output otherwise. It can additionally print a listing organized by
+ * sequence name, and it always prints every {@link ReferencePair} produced by the table's analysis to
+ * standard output.</p>
+ */
+@CommandLineProgramProperties(
+        summary = "Display reference comparison as a tab-delimited table and summarize reference differences.",
+        oneLineSummary = "Display reference comparison as a tab-delimited table and summarize reference differences.",
+        programGroup = ReferenceProgramGroup.class
+)
+public class CompareReferences extends GATKTool {
+
+    /**
+     * Index of the first row entry inspected when listing the references that carry a sequence name.
+     * NOTE(review): presumably entries 0 and 1 are the MD5 and length columns -- confirm against
+     * ReferenceSequenceTable.
+     */
+    private static final int FIRST_REFERENCE_COLUMN = 2;
+
+    @Argument(fullName = "references-to-compare", shortName = "refcomp", doc = "Reference sequence file(s) to compare.")
+    private List<GATKPath> references;
+
+    /**
+     * Output file will be written here.
+     *
+     * Note: If no output file provided, table will print to standard output.
+     */
+    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "If provided, file to output table.", optional = true)
+    private GATKPath output;
+
+    @Argument(fullName = "md5-calculation-mode", shortName = "md5-calculation-mode", doc = "MD5CalculationMode indicating method of MD5 calculation.", optional = true)
+    private MD5CalculationMode md5CalculationMode = MD5CalculationMode.USE_DICT;
+
+    @Argument(fullName = "display-sequences-by-name", doc = "If provided, the table by sequence name will be printed.", optional = true)
+    private boolean displaySequencesByName = false;
+
+    /**
+     * Only consulted while printing the by-sequence-name listing, i.e. it has no effect unless
+     * display-sequences-by-name is also set.
+     */
+    @Argument(fullName = "display-only-differing-sequences", doc = "If provided, only display sequence names that differ in their actual sequence.", optional = true)
+    private boolean onlyDisplayDifferingSequences = false;
+
+    /**
+     * Selects how sequence MD5s are obtained. The chosen mode is handed unchanged to
+     * {@link ReferenceSequenceTable}, which implements the actual behavior.
+     */
+    public enum MD5CalculationMode {
+        USE_DICT,
+        RECALCULATE_IF_MISSING,
+        ALWAYS_RECALCULATE
+    }
+
+    /**
+     * Every reference under comparison, in insertion order: the engine reference first, then the
+     * references-to-compare in the order given. Created eagerly so that {@link #closeTool()} never sees a
+     * null map, even if it runs without {@link #onTraversalStart()} having been reached.
+     */
+    private final Map<GATKPath, ReferenceDataSource> referenceSources = new LinkedHashMap<>();
+
+    @Override
+    public boolean requiresReference() {
+        return true;
+    }
+
+    /**
+     * Registers the data source of the -R reference and opens one data source per additional reference.
+     */
+    @Override
+    public void onTraversalStart() {
+        // the -R data source is obtained from the engine rather than reopened here
+        referenceSources.put(getReferencePath(), directlyAccessEngineReferenceDataSource());
+
+        // open a dedicated data source for each additional reference; these are released in closeTool()
+        // NOTE(review): a path equal to the -R path would replace the engine entry above, and closeTool()
+        // would then skip closing the newly opened source -- confirm whether duplicates need rejecting
+        for (final GATKPath path : references) {
+            referenceSources.put(path, ReferenceDataSource.of(path.toPath()));
+        }
+    }
+
+    /**
+     * Builds the comparison table, writes it to the output file (or standard output when no output was
+     * given), optionally prints the by-sequence-name listing, and prints the analysis of each reference pair.
+     */
+    @Override
+    public void traverse() {
+        final ReferenceSequenceTable table = new ReferenceSequenceTable(referenceSources, md5CalculationMode);
+        table.build();
+
+        if (output == null) {
+            writeTableToStdOutput(table);
+        } else {
+            writeTableToFileOutput(table);
+        }
+
+        if (displaySequencesByName) {
+            tableBySequenceName(table);
+        }
+
+        for (final ReferencePair pair : table.analyzeTable()) {
+            System.out.println(pair);
+        }
+    }
+
+    /**
+     * Given a table, write table to standard output as tab-separated lines: first the column names, then
+     * one line per row. Neither the header nor the rows end in a trailing tab.
+     *
+     * @param table the built table to print
+     */
+    private void writeTableToStdOutput(ReferenceSequenceTable table) {
+        // print header
+        System.out.println(String.join("\t", table.getColumnNames()));
+
+        // print each row with the same separator handling as the header
+        for (final ReferenceSequenceTable.TableRow row : table) {
+            final StringJoiner line = new StringJoiner("\t");
+            for (final ReferenceSequenceTable.TableEntry entry : row.getEntries()) {
+                line.add(String.valueOf(entry.getColumnValue()));
+            }
+            System.out.println(line);
+        }
+    }
+
+    /**
+     * Given a table, write table to the file named by the output argument.
+     *
+     * @param table the built table to write
+     * @throws UserException.CouldNotCreateOutputFile if creating the writer or writing to it fails with an
+     *         {@link IOException}
+     */
+    private void writeTableToFileOutput(ReferenceSequenceTable table) {
+        final TableColumnCollection columns = new TableColumnCollection(table.getColumnNames());
+        try (CompareReferencesOutputTableWriter writer = new CompareReferencesOutputTableWriter(output.toPath(), columns)) {
+            writer.writeHeaderIfApplies();
+            for (final ReferenceSequenceTable.TableRow row : table) {
+                writer.writeRecord(row);
+            }
+        } catch (final IOException exception) {
+            throw new UserException.CouldNotCreateOutputFile(output, "Failed to write output table.", exception);
+        }
+    }
+
+    @Override
+    public Object onTraversalSuccess() {
+        return null;
+    }
+
+    /**
+     * Given a table, write table by sequence name to standard output.
+     *
+     * <p>When display-only-differing-sequences is set, a sequence name is listed only if the table returns
+     * more than one row for it.</p>
+     *
+     * @param table the built table to list
+     */
+    public void tableBySequenceName(ReferenceSequenceTable table) {
+        final StringBuilder listing = new StringBuilder("Sequence \tMD5 \tReference\n");
+
+        for (final String sequenceName : table.getAllSequenceNames()) {
+            final Set<ReferenceSequenceTable.TableRow> rows = table.queryBySequenceName(sequenceName);
+            if (!onlyDisplayDifferingSequences || rows.size() > 1) {
+                for (final String fragment : displayBySequenceName(rows, sequenceName)) {
+                    listing.append(fragment);
+                }
+            }
+        }
+
+        System.out.println(listing);
+    }
+
+    /**
+     * Produces the listing fragments for one sequence name: the name itself, then for each row a new
+     * indented line with the row's MD5 followed by the column name of every entry (from
+     * {@link #FIRST_REFERENCE_COLUMN} onward) whose value equals the sequence name.
+     *
+     * @param rows the table rows associated with the sequence name
+     * @param sequenceName the sequence name being listed
+     * @return the fragments, in print order, ending with a newline fragment
+     */
+    private List<String> displayBySequenceName(Set<ReferenceSequenceTable.TableRow> rows, String sequenceName) {
+        final List<String> fragments = new ArrayList<>();
+        fragments.add(sequenceName);
+        for (final ReferenceSequenceTable.TableRow row : rows) {
+            final ReferenceSequenceTable.TableEntry[] entries = row.getEntries();
+            fragments.add("\n\t" + row.getMd5() + "\t");
+
+            for (int i = FIRST_REFERENCE_COLUMN; i < entries.length; i++) {
+                // null-safe: an entry without a value simply does not match
+                if (Objects.equals(entries[i].getColumnValue(), sequenceName)) {
+                    fragments.add(entries[i].getColumnName() + "\t");
+                }
+            }
+        }
+        fragments.add("\n");
+        return fragments;
+    }
+
+    /**
+     * Closes every data source this tool opened itself. The entry keyed by the -R reference path is
+     * skipped because that data source was obtained from the engine, not opened here.
+     */
+    @Override
+    public void closeTool() {
+        final GATKPath engineReference = getReferencePath();
+        for (final Map.Entry<GATKPath, ReferenceDataSource> entry : referenceSources.entrySet()) {
+            if (!entry.getKey().equals(engineReference)) {
+                entry.getValue().close();
+            }
+        }
+    }
+
+    /**
+     * Table writer that emits one line per {@link ReferenceSequenceTable.TableRow}.
+     */
+    public static class CompareReferencesOutputTableWriter extends TableWriter<ReferenceSequenceTable.TableRow> {
+
+        /**
+         * @param table path of the file to write
+         * @param columns the columns of the output table
+         * @throws IOException if the superclass constructor fails
+         */
+        public CompareReferencesOutputTableWriter(final Path table, TableColumnCollection columns) throws IOException {
+            super(table, columns);
+        }
+
+        /**
+         * Copies the row's entries into the data line, keyed by each entry's column name. The number of
+         * entries copied is the number of column names reported by the row.
+         */
+        @Override
+        protected void composeLine(final ReferenceSequenceTable.TableRow record, final DataLine dataLine) {
+            final List<String> columnNames = record.getColumnNames();
+            final ReferenceSequenceTable.TableEntry[] entries = record.getEntries();
+            for (int i = 0; i < columnNames.size(); i++) {
+                dataLine.set(entries[i].getColumnName(), entries[i].getColumnValue());
+            }
+        }
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/reference/ReferencePair.java b/src/main/java/org/broadinstitute/hellbender/tools/reference/ReferencePair.java
@@ -0,0 +1,90 @@
+package org.broadinstitute.hellbender.tools.reference;
+
+import org.broadinstitute.hellbender.engine.GATKPath;
+
+import java.util.*;
+
+/**
+ * Two references taken from a {@link ReferenceSequenceTable}, together with the set of {@link Status}
+ * values recorded for the pair.
+ *
+ * <p>Equality and hashing consider only the two reference paths, in order; the status set and the column
+ * indices are ignored.</p>
+ */
+public class ReferencePair {
+
+    /**
+     * Outcomes that can be recorded for a pair. This class only stores them; which values apply is decided
+     * by the code calling {@link #addStatus(Status)} and {@link #removeStatus(Status)}.
+     */
+    public enum Status {
+        EXACT_MATCH,
+        DIFFER_IN_SEQUENCE_NAMES,
+        DIFFER_IN_SEQUENCE,
+        DIFFER_IN_SEQUENCES_PRESENT,
+        SUPERSET,
+        SUBSET
+    }
+
+    private final GATKPath ref1;
+    private final GATKPath ref2;
+    private final int ref1ColumnIndex;
+    private final int ref2ColumnIndex;
+    private final EnumSet<Status> analysis;
+
+    /**
+     * Creates a pair whose status set initially contains only {@link Status#EXACT_MATCH}.
+     *
+     * @param table table whose column indices are looked up by each reference's file name
+     * @param reference1 first reference of the pair
+     * @param reference2 second reference of the pair
+     * @throws IllegalArgumentException if the table has no column index for one of the references
+     */
+    public ReferencePair(ReferenceSequenceTable table, GATKPath reference1, GATKPath reference2){
+        ref1 = reference1;
+        ref2 = reference2;
+        ref1ColumnIndex = columnIndexOf(table, reference1);
+        ref2ColumnIndex = columnIndexOf(table, reference2);
+        analysis = EnumSet.of(Status.EXACT_MATCH);
+    }
+
+    /** Returns the last element of the reference's path as a String. */
+    private static String fileNameOf(final GATKPath reference) {
+        return reference.toPath().getFileName().toString();
+    }
+
+    /** Looks up the table column of a reference, failing with a descriptive message instead of an unboxing NPE. */
+    private static int columnIndexOf(final ReferenceSequenceTable table, final GATKPath reference) {
+        final String columnName = fileNameOf(reference);
+        final Integer index = table.getColumnIndices().get(columnName);
+        if (index == null) {
+            throw new IllegalArgumentException("No column found in the table for reference " + columnName);
+        }
+        return index;
+    }
+
+    /**
+     * @param status status to add to this pair's status set
+     */
+    public void addStatus(Status status) {
+        analysis.add(status);
+    }
+
+    /**
+     * @param status status to remove from this pair's status set
+     */
+    public void removeStatus(Status status){
+        analysis.remove(status);
+    }
+
+    /**
+     * @return the table column index looked up for the first reference
+     */
+    public int getRef1ColumnIndex() {
+        return ref1ColumnIndex;
+    }
+
+    /**
+     * @return the table column index looked up for the second reference
+     */
+    public int getRef2ColumnIndex() {
+        return ref2ColumnIndex;
+    }
+
+    /**
+     * @return the file name of the first reference
+     */
+    public String getRef1(){
+        return fileNameOf(ref1);
+    }
+
+    /**
+     * @return the file name of the second reference
+     */
+    public String getRef2(){
+        return fileNameOf(ref2);
+    }
+
+    /**
+     * Displays a ReferencePair's status set
+     *
+     * @return the status set as a formatted String: one tab-indented, newline-terminated line per status
+     */
+    public String statusAsString(){
+        final StringBuilder formatted = new StringBuilder();
+        for (final Status status : analysis) {
+            formatted.append('\t').append(status.name()).append('\n');
+        }
+        return formatted.toString();
+    }
+
+    /**
+     * @return this pair's status set itself (not a copy), so changes made through it are visible here
+     */
+    public EnumSet<Status> getStatus(){
+        return analysis;
+    }
+
+    @Override
+    public String toString(){
+        return String.format("REFERENCE PAIR: %s, %s\nStatus:\n%s",
+                ReferenceSequenceTable.getReferenceDisplayName(ref1),
+                ReferenceSequenceTable.getReferenceDisplayName(ref2),
+                statusAsString());
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        ReferencePair that = (ReferencePair) o;
+        return Objects.equals(ref1, that.ref1) && Objects.equals(ref2, that.ref2);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(ref1, ref2);
+    }
+}