-
Notifications
You must be signed in to change notification settings - Fork 594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CompareReferences tool #7930
CompareReferences tool #7930
Changes from 18 commits
92bc20a
85bc87a
4335f4f
aea17a5
7869a86
41ce2f6
fa26001
2671b43
8f74dd7
006ada6
b344c64
b2507a1
e3b1aac
ee0db1a
6c00ada
233bc97
9a5cf45
437eb84
5c887a1
dc59744
70ec346
a736923
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -697,6 +697,10 @@ public final <T extends Feature> Object getHeaderForFeatures( final FeatureInput | |
return hasFeatures() ? features.getHeader(featureDescriptor) : null; | ||
} | ||
|
||
public final GATKPath getReferencePath(){ | ||
return referenceArguments.getReferenceSpecifier(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since |
||
} | ||
|
||
/** | ||
* Initialize our data sources, make sure that all tool requirements for input data have been satisfied | ||
* and start the progress meter. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
package org.broadinstitute.hellbender.tools.reference; | ||
|
||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.engine.GATKPath; | ||
import org.broadinstitute.hellbender.engine.GATKTool; | ||
import org.broadinstitute.hellbender.engine.ReferenceDataSource; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.utils.tsv.DataLine; | ||
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; | ||
import org.broadinstitute.hellbender.utils.tsv.TableWriter; | ||
import picard.cmdline.programgroups.ReferenceProgramGroup; | ||
|
||
import java.io.IOException; | ||
import java.nio.file.Path; | ||
import java.util.*; | ||
|
||
/** | ||
* | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
*/ | ||
@CommandLineProgramProperties( | ||
summary = "", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fill these summaries in. |
||
oneLineSummary = "", | ||
programGroup = ReferenceProgramGroup.class | ||
) | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
public class CompareReferences extends GATKTool { | ||
|
||
@Argument(fullName = "references-to-compare", shortName = "refcomp", doc = "Reference sequence file(s) to compare.") | ||
private List<GATKPath> references; | ||
|
||
/** | ||
* Output file will be written here. | ||
* | ||
* Note: If no output file provided, table will print to standard output. | ||
*/ | ||
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "If provided, file to output table.", optional = true) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @orlicohen looks like you missed this |
||
private GATKPath output; | ||
|
||
@Argument(fullName = "md5-calculation-mode", shortName = "md5-calculation-mode", doc = "MD5CalculationMode indicating method of MD5 calculation.", optional = true) | ||
private MD5CalculationMode md5CalculationMode = MD5CalculationMode.USE_DICT; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Think James is correct that we probably want to default to |
||
@Argument(fullName = "display-sequences-by-name", doc = "If provided, the table by sequence name will be printed.", optional = true) | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
private boolean displaySequencesByName = false; | ||
|
||
@Argument(fullName = "display-only-differing-sequences", doc = "If provided, only display sequence names ", optional = true) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "only display sequence names that differ in their actual sequence" |
||
private boolean onlyDisplayDifferingSequences = false; | ||
|
||
public enum MD5CalculationMode { | ||
USE_DICT, | ||
RECALCULATE_IF_MISSING, | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ALWAYS_RECALCULATE; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a one-line comment above each enum value explaining what each mode does. |
||
|
||
private Map<GATKPath, ReferenceDataSource> referenceSources; | ||
|
||
@Override | ||
public boolean requiresReference() { | ||
return true; | ||
} | ||
|
||
@Override | ||
public void onTraversalStart() { | ||
// add data source for -R reference | ||
referenceSources = new LinkedHashMap<>(); | ||
referenceSources.put(getReferencePath(), directlyAccessEngineReferenceDataSource()); | ||
|
||
// add data sources for remaining references | ||
for(GATKPath path : references){ | ||
referenceSources.put(path, ReferenceDataSource.of(path.toPath())); | ||
} | ||
} | ||
|
||
@Override | ||
public void traverse(){ | ||
ReferenceSequenceTable table = new ReferenceSequenceTable(referenceSources, md5CalculationMode); | ||
table.build(); | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if(output == null){ | ||
writeTableToStdOutput(table); | ||
} | ||
else{ | ||
writeTableToFileOutput(table); | ||
} | ||
|
||
if(displaySequencesByName){ | ||
tableBySequenceName(table); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We might need to add a way to capture this optional second table to a file. We can discuss at one of our planning meetings. |
||
} | ||
|
||
List<GATKPath> refs = new ArrayList<>(); | ||
refs.addAll(referenceSources.keySet()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
List<ReferencePair> referencePairs = table.analyzeTable(); | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for(ReferencePair pair : referencePairs){ | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
System.out.println(pair); | ||
} | ||
} | ||
|
||
/** | ||
* Given a table, write table to standard output. | ||
* | ||
* @param table | ||
*/ | ||
private void writeTableToStdOutput(ReferenceSequenceTable table){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To unify the table writing as James suggests, you'll need to add a second constructor to your
|
||
// print header | ||
List<String> columnNames = table.getColumnNames(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think now is the time to go back and merge these two writers. They are both printing the same data (and are both also writing the header for example). I would advocate simply providing the table writer an output stream that prints to stdout here. I think that will be much cleaner rather than having these two slightly incongruent methods. |
||
for(int i = 0 ; i < columnNames.size(); i++){ | ||
if(i == 0){ | ||
System.out.print(columnNames.get(i)); | ||
} | ||
else{ | ||
System.out.print("\t" + columnNames.get(i)); | ||
} | ||
} | ||
System.out.println(); | ||
|
||
// use string format to output as a table | ||
for(ReferenceSequenceTable.TableRow row : table){ | ||
for(ReferenceSequenceTable.TableEntry currEntry : row.getEntries()){ | ||
System.out.printf("%s\t", currEntry.getColumnValue()); | ||
} | ||
System.out.println(); | ||
} | ||
} | ||
|
||
/** | ||
* Given a table, write table to file output. | ||
* | ||
* @param table | ||
*/ | ||
private void writeTableToFileOutput(ReferenceSequenceTable table) { | ||
TableColumnCollection columns = new TableColumnCollection(table.getColumnNames()); | ||
try(CompareReferences.CompareReferencesOutputTableWriter writer = new CompareReferences.CompareReferencesOutputTableWriter(output.toPath(), columns)){ | ||
writer.writeHeaderIfApplies(); | ||
for(ReferenceSequenceTable.TableRow row : table){ | ||
writer.writeRecord(row); | ||
} | ||
} | ||
catch(IOException exception){ | ||
throw new UserException.CouldNotCreateOutputFile(output, "Failed to write output table.", exception); | ||
} | ||
} | ||
|
||
@Override | ||
public Object onTraversalSuccess() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you aren't using it you can get rid of this override. |
||
return null; | ||
} | ||
|
||
/** | ||
* Given a table, write table by sequence name to standard output | ||
* | ||
* @param table | ||
*/ | ||
public void tableBySequenceName(ReferenceSequenceTable table){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
List<String> output = new ArrayList<>(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since it turns out we didn't need to save the output after all, you can remove this |
||
output.add("Sequence \tMD5 \tReference\n"); | ||
|
||
for(String sequenceName : table.getAllSequenceNames()){ | ||
Set<ReferenceSequenceTable.TableRow> rows = table.queryBySequenceName(sequenceName); | ||
if(onlyDisplayDifferingSequences) { | ||
if(rows.size() > 1) { | ||
output.addAll(displayBySequenceName(rows, sequenceName)); | ||
} | ||
} else { | ||
output.addAll(displayBySequenceName(rows, sequenceName)); | ||
} | ||
} | ||
|
||
for(String str : output){ | ||
System.out.print(str); | ||
} | ||
System.out.println(); | ||
} | ||
|
||
private List<String> displayBySequenceName(Set<ReferenceSequenceTable.TableRow> rows, String sequenceName){ | ||
List<String> output = new ArrayList<>(); | ||
output.add(sequenceName); | ||
for(ReferenceSequenceTable.TableRow row : rows) { | ||
ReferenceSequenceTable.TableEntry[] entries = row.getEntries(); | ||
output.add("\n\t" + row.getMd5() + "\t"); | ||
|
||
for(int i = 2; i < entries.length; i++) { | ||
if(entries[i].getColumnValue().equals(sequenceName)) { | ||
output.add(entries[i].getColumnName() + "\t"); | ||
} | ||
} | ||
} | ||
output.add("\n"); | ||
return output; | ||
} | ||
|
||
@Override | ||
public void closeTool() { | ||
for(Map.Entry<GATKPath, ReferenceDataSource> entry : referenceSources.entrySet()){ | ||
if(!entry.getKey().equals(getReferencePath())){ | ||
entry.getValue().close(); | ||
} | ||
} | ||
} | ||
|
||
public static class CompareReferencesOutputTableWriter extends TableWriter<ReferenceSequenceTable.TableRow> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a quick javadoc here explaining this is necessary to write the table output |
||
private TableColumnCollection columnCollection; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
|
||
public CompareReferencesOutputTableWriter(final Path table, TableColumnCollection columns) throws IOException { | ||
super(table, columns); | ||
columnCollection = columns; | ||
} | ||
|
||
@Override | ||
protected void composeLine(final ReferenceSequenceTable.TableRow record, final DataLine dataLine) { | ||
List<String> columnNames = record.getColumnNames(); | ||
ReferenceSequenceTable.TableEntry[] entries = record.getEntries(); | ||
for(int i = 0; i < columnNames.size(); i++){ | ||
dataLine.set(entries[i].getColumnName(), entries[i].getColumnValue()); | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
package org.broadinstitute.hellbender.tools.reference; | ||
|
||
import org.broadinstitute.hellbender.engine.GATKPath; | ||
|
||
import java.util.*; | ||
|
||
public class ReferencePair { | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
private final GATKPath ref1; | ||
private final GATKPath ref2; | ||
private final int ref1ColumnIndex; | ||
private final int ref2ColumnIndex; | ||
public enum Status{ | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
EXACT_MATCH, | ||
DIFFER_IN_SEQUENCE_NAMES, | ||
DIFFER_IN_SEQUENCE, | ||
DIFFER_IN_SEQUENCES_PRESENT, | ||
SUPERSET, | ||
SUBSET; | ||
} | ||
private EnumSet<Status> analysis; | ||
|
||
public ReferencePair(ReferenceSequenceTable table, GATKPath reference1, GATKPath reference2){ | ||
ref1 = reference1; | ||
ref2 = reference2; | ||
ref1ColumnIndex = table.getColumnIndices().get(ref1.toPath().getFileName().toString()); | ||
ref2ColumnIndex = table.getColumnIndices().get(ref2.toPath().getFileName().toString()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When looking up the reference column indices, call the |
||
analysis = EnumSet.of(Status.EXACT_MATCH); | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
public void addStatus(Status status) { | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
analysis.add(status); | ||
} | ||
|
||
public void removeStatus(Status status){ | ||
analysis.remove(status); | ||
} | ||
|
||
public int getRef1ColumnIndex() { | ||
return ref1ColumnIndex; | ||
} | ||
|
||
public int getRef2ColumnIndex() { | ||
return ref2ColumnIndex; | ||
} | ||
|
||
public String getRef1(){ | ||
return ref1.toPath().getFileName().toString(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Call |
||
} | ||
public String getRef2(){ | ||
return ref2.toPath().getFileName().toString(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Call |
||
} | ||
|
||
/** | ||
* Displays a ReferencePair's status set | ||
* | ||
* @return the status set as a formatted String | ||
*/ | ||
public String statusAsString(){ | ||
String output = ""; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm... this currently is missing any indication of WHAT lines the status' are being generated by. I don't see where those are getting printed and it looks like we forgot to do that. I think we need to point the user at the specific lines that are problemeatic somehow for example "chr3 mismatches with reference A and B" for example. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jamesemery We said that we would print just the statuses per reference pair in a compact tabular form -- linking the statuses to individual TableRows is not a feature that has been implemented yet, and is probably out of scope for this PR. |
||
for(Status status : analysis){ | ||
output += String.format("\t%s\n", status.name()); | ||
} | ||
return output; | ||
} | ||
|
||
public EnumSet<Status> getStatus(){ | ||
return analysis; | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
public String toString(){ | ||
return String.format("REFERENCE PAIR: %s, %s\nStatus:\n%s", | ||
ReferenceSequenceTable.getReferenceDisplayName(ref1), | ||
ReferenceSequenceTable.getReferenceDisplayName(ref2), | ||
statusAsString()); | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (o == null || getClass() != o.getClass()) return false; | ||
ReferencePair that = (ReferencePair) o; | ||
return Objects.equals(ref1, that.ref1) && Objects.equals(ref2, that.ref2); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(ref1, ref2); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Public methods in GATKTool definitely need javadocs explaining themselves.