-
Notifications
You must be signed in to change notification settings - Fork 594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CompareReferences tool #7930
CompareReferences tool #7930
Changes from all commits
92bc20a
85bc87a
4335f4f
aea17a5
7869a86
41ce2f6
fa26001
2671b43
8f74dd7
006ada6
b344c64
b2507a1
e3b1aac
ee0db1a
6c00ada
233bc97
9a5cf45
437eb84
5c887a1
dc59744
70ec346
a736923
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,238 @@ | ||
package org.broadinstitute.hellbender.tools.reference; | ||
|
||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.argparser.ExperimentalFeature; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.engine.GATKPath; | ||
import org.broadinstitute.hellbender.engine.GATKTool; | ||
import org.broadinstitute.hellbender.engine.ReferenceDataSource; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.utils.tsv.DataLine; | ||
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; | ||
import org.broadinstitute.hellbender.utils.tsv.TableWriter; | ||
import picard.cmdline.programgroups.ReferenceProgramGroup; | ||
|
||
import java.io.IOException; | ||
import java.io.OutputStreamWriter; | ||
import java.io.Writer; | ||
import java.nio.file.Path; | ||
import java.util.*; | ||
|
||
/** | ||
* Display reference comparison as a tab-delimited table and summarize reference differences. | ||
* | ||
* <p>This tool generates a MD5-keyed table comparing specified references and does an analysis to summarize the differences | ||
* between the references provided. Comparisons are made against a "special" reference, specified with the -R | ||
* argument. Subsequent references to be compared may be specified using the -references-to-compare argument. | ||
* The table can be directed to a file or standard output using provided command-line arguments. | ||
* A supplementary table keyed by sequence name can be displayed using the -display-sequences-by-name argument; | ||
* to display only sequence names for which the references are not consistent, run with the -display-only-differing-sequences | ||
* argument as well.</p> | ||
* | ||
* <h3>Input</h3> | ||
* <p> | ||
* The references and MD5 calculation mode. | ||
* </p> | ||
* | ||
* <h3>Output</h3> | ||
* <p> | ||
* A TSV file or output stream displaying the MD5-keyed table comparison, and a table analysis displayed to standard output. | ||
* <pre> | ||
* MD5 Length hg19mini.fasta hg19mini_1renamed.fasta | ||
* 8c0c38e352d8f3309eabe4845456f274 16000 1 chr1 | ||
* 5f8388fe3fb34aa38375ae6cf5e45b89 16000 2 2 | ||
* 94de808a3a2203dbb02434a47bd8184f 16000 3 3 | ||
* 7d397ee919e379328d8f52c57a54c778 16000 4 4 | ||
* | ||
* REFERENCE PAIR: hg19mini.fasta, hg19mini_1renamed.fasta | ||
* Status: | ||
* DIFFER_IN_SEQUENCE_NAMES | ||
* </pre> | ||
* </p> | ||
* | ||
* <h3>Usage example</h3> | ||
* <pre> | ||
* gatk CompareReferences \ | ||
* -R reference1.fasta \ | ||
* -refcomp reference2.fasta | ||
* -O output.fasta \ | ||
* -md5-calculation-mode USE_DICT | ||
* </pre> | ||
* | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
*/ | ||
|
||
@CommandLineProgramProperties( | ||
summary = "Compare multiple references and output a tab-delimited table detailing what the differences are and a summarized analysis of each pair of references.", | ||
oneLineSummary = "Display reference comparison as a tab-delimited table and summarize reference differences.", | ||
programGroup = ReferenceProgramGroup.class | ||
) | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
@DocumentedFeature | ||
@ExperimentalFeature | ||
public class CompareReferences extends GATKTool { | ||
|
||
@Argument(fullName = "references-to-compare", shortName = "refcomp", doc = "Reference sequence file(s) to compare.") | ||
private List<GATKPath> references; | ||
|
||
/** | ||
* Output file will be written here. | ||
* | ||
* Note: If no output file provided, table will print to standard output. | ||
*/ | ||
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "If provided, file to output table.", optional = true) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @orlicohen looks like you missed this |
||
private GATKPath output; | ||
|
||
@Argument(fullName = "md5-calculation-mode", shortName = "md5-calculation-mode", doc = "MD5CalculationMode indicating method of MD5 calculation.", optional = true) | ||
private MD5CalculationMode md5CalculationMode = MD5CalculationMode.RECALCULATE_IF_MISSING; | ||
|
||
@Argument(fullName = "display-sequences-by-name", doc = "If provided, the table by sequence name will be printed.", optional = true) | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
private boolean displaySequencesByName = false; | ||
|
||
@Argument(fullName = "display-only-differing-sequences", doc = "If provided, only display sequence names that differ in their actual sequence.", optional = true) | ||
private boolean onlyDisplayDifferingSequences = false; | ||
|
||
public enum MD5CalculationMode { | ||
// use only MD5s found in dictionary; if MD5 missing, crashes | ||
USE_DICT, | ||
// use any MD5s found in dictionary and recalculate any missing MD5s | ||
RECALCULATE_IF_MISSING, | ||
// recalculate all MD5s, regardless of presence in dictionary | ||
ALWAYS_RECALCULATE; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a one-line comment above each enum value explaining what each mode does. |
||
|
||
private Map<GATKPath, ReferenceDataSource> referenceSources; | ||
|
||
@Override | ||
public boolean requiresReference() { | ||
return true; | ||
} | ||
|
||
@Override | ||
public void onTraversalStart() { | ||
// add data source for -R reference | ||
referenceSources = new LinkedHashMap<>(); | ||
referenceSources.put(referenceArguments.getReferenceSpecifier(), directlyAccessEngineReferenceDataSource()); | ||
|
||
// add data sources for remaining references | ||
for(GATKPath path : references){ | ||
referenceSources.put(path, ReferenceDataSource.of(path.toPath())); | ||
} | ||
} | ||
|
||
@Override | ||
public void traverse(){ | ||
ReferenceSequenceTable table = new ReferenceSequenceTable(referenceSources, md5CalculationMode); | ||
logger.info("Building reference sequence table."); | ||
table.build(); | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
logger.info("Finished building table."); | ||
|
||
writeTable(table); | ||
|
||
if(displaySequencesByName){ | ||
writeTableBySequenceName(table); | ||
} | ||
|
||
logger.info("Analyzing table."); | ||
List<ReferencePair> referencePairs = table.analyzeTable(); | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
logger.info("Finished analyzing table."); | ||
System.out.println("*********************************************************"); | ||
for(ReferencePair pair : referencePairs){ | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
System.out.println(pair); | ||
} | ||
} | ||
|
||
|
||
/** | ||
* Given a table, write table to output. | ||
* | ||
* Note: if no output file specified, displays to standard output. | ||
* | ||
* @param table | ||
*/ | ||
private void writeTable(ReferenceSequenceTable table){ | ||
TableColumnCollection columns = new TableColumnCollection(table.getColumnNames()); | ||
try(CompareReferences.CompareReferencesOutputTableWriter writer = output == null | ||
? new CompareReferences.CompareReferencesOutputTableWriter(new OutputStreamWriter(System.out), columns) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since there is no test coverage for the stdout case, have you run the tool manually without |
||
: new CompareReferences.CompareReferencesOutputTableWriter(output.toPath(), columns) | ||
){ | ||
writer.writeHeaderIfApplies(); | ||
for(ReferenceSequenceTable.TableRow row : table){ | ||
writer.writeRecord(row); | ||
} | ||
} | ||
catch(IOException exception){ | ||
throw (output == null) | ||
? new UserException.CouldNotCreateOutputFile("System.out", "Failed to write output table.", exception) | ||
: new UserException.CouldNotCreateOutputFile(output, "Failed to write output table.", exception); | ||
} | ||
} | ||
|
||
/** | ||
* Given a table, write table by sequence name to standard output | ||
* | ||
* @param table | ||
*/ | ||
public void writeTableBySequenceName(ReferenceSequenceTable table){ | ||
System.out.print("*********************************************************\n"); | ||
System.out.print("Name \tMD5 \tReference\n"); | ||
for(String sequenceName : table.getAllSequenceNames()){ | ||
Set<ReferenceSequenceTable.TableRow> rows = table.queryBySequenceName(sequenceName); | ||
if(onlyDisplayDifferingSequences) { | ||
if(rows.size() > 1) { | ||
displayBySequenceName(rows, sequenceName); | ||
} | ||
} else { | ||
displayBySequenceName(rows, sequenceName); | ||
} | ||
} | ||
} | ||
|
||
private void displayBySequenceName(Set<ReferenceSequenceTable.TableRow> rows, String sequenceName){ | ||
System.out.print(sequenceName); | ||
for(ReferenceSequenceTable.TableRow row : rows) { | ||
ReferenceSequenceTable.TableEntry[] entries = row.getEntries(); | ||
System.out.print("\n\t" + row.getMd5() + "\t"); | ||
|
||
for(int i = 2; i < entries.length; i++) { | ||
if(entries[i].getColumnValue().equals(sequenceName)) { | ||
System.out.print(entries[i].getColumnName() + "\t"); | ||
} | ||
} | ||
} | ||
System.out.println(); | ||
} | ||
|
||
@Override | ||
public void closeTool() { | ||
for(Map.Entry<GATKPath, ReferenceDataSource> entry : referenceSources.entrySet()){ | ||
if(!entry.getKey().equals(referenceArguments.getReferenceSpecifier())){ | ||
entry.getValue().close(); | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* TableWriter to format and write the table output. | ||
*/ | ||
public static class CompareReferencesOutputTableWriter extends TableWriter<ReferenceSequenceTable.TableRow> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a quick javadoc here explaining this is necessary to write the table output |
||
|
||
public CompareReferencesOutputTableWriter(final Path table, TableColumnCollection columns) throws IOException { | ||
super(table, columns); | ||
} | ||
|
||
public CompareReferencesOutputTableWriter(final Writer writer, TableColumnCollection columns) throws IOException { | ||
super(writer, columns); | ||
} | ||
|
||
@Override | ||
protected void composeLine(final ReferenceSequenceTable.TableRow record, final DataLine dataLine) { | ||
List<String> columnNames = record.getColumnNames(); | ||
ReferenceSequenceTable.TableEntry[] entries = record.getEntries(); | ||
for(int i = 0; i < columnNames.size(); i++){ | ||
dataLine.set(entries[i].getColumnName(), entries[i].getColumnValue()); | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
package org.broadinstitute.hellbender.tools.reference; | ||
|
||
import org.broadinstitute.hellbender.engine.GATKPath; | ||
|
||
import java.util.*; | ||
|
||
/** | ||
* Class representing a pair of references and their differences. | ||
* | ||
*/ | ||
public class ReferencePair { | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
private final GATKPath ref1; | ||
|
||
private final GATKPath ref2; | ||
|
||
private final int ref1ColumnIndex; | ||
|
||
private final int ref2ColumnIndex; | ||
|
||
public enum Status{ | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// references match exactly (table entries identical across references) | ||
EXACT_MATCH, | ||
// found sequences with same MD5s but inconsistent names between the 2 references | ||
DIFFER_IN_SEQUENCE_NAMES, | ||
// found sequences with same names but inconsistent MD5s between the 2 references | ||
DIFFER_IN_SEQUENCE, | ||
// at least one sequence is present in one reference but not the other | ||
DIFFER_IN_SEQUENCES_PRESENT, | ||
// ref1 is a superset of ref2 | ||
SUPERSET, | ||
// ref1 is a subset of ref2 | ||
SUBSET; | ||
} | ||
|
||
private EnumSet<Status> analysis; | ||
|
||
public ReferencePair(ReferenceSequenceTable table, GATKPath reference1, GATKPath reference2){ | ||
ref1 = reference1; | ||
ref2 = reference2; | ||
ref1ColumnIndex = table.getColumnIndices().get(ReferenceSequenceTable.getReferenceColumnName(ref1)); | ||
ref2ColumnIndex = table.getColumnIndices().get(ReferenceSequenceTable.getReferenceColumnName(ref2)); | ||
// assume EXACT_MATCH until proven otherwise - if any discrepancies found, EXACT_MATCH status removed | ||
analysis = EnumSet.of(Status.EXACT_MATCH); | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
/** | ||
* Given a Status, add it to the ReferencePair's analysis | ||
* | ||
* @param status | ||
*/ | ||
public void addStatus(Status status) { | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
analysis.add(status); | ||
} | ||
|
||
/** | ||
* Given a Status, remove it from the ReferencePair's analysis, if present | ||
* | ||
* @param status | ||
*/ | ||
public void removeStatus(Status status){ | ||
analysis.remove(status); | ||
} | ||
|
||
public int getRef1ColumnIndex() { | ||
return ref1ColumnIndex; | ||
} | ||
|
||
public int getRef2ColumnIndex() { | ||
return ref2ColumnIndex; | ||
} | ||
|
||
public String getRef1(){ | ||
return ReferenceSequenceTable.getReferenceColumnName(ref1); | ||
} | ||
public String getRef2(){ | ||
return ReferenceSequenceTable.getReferenceColumnName(ref2); | ||
} | ||
|
||
/** | ||
* Displays a ReferencePair's status set | ||
* | ||
* @return the status set as a formatted String | ||
*/ | ||
public String statusAsString(){ | ||
String output = ""; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm... this currently is missing any indication of WHAT lines the status' are being generated by. I don't see where those are getting printed and it looks like we forgot to do that. I think we need to point the user at the specific lines that are problemeatic somehow for example "chr3 mismatches with reference A and B" for example. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jamesemery We said that we would print just the statuses per reference pair in a compact tabular form -- linking the statuses to individual TableRows is not a feature that has been implemented yet, and is probably out of scope for this PR. |
||
for(Status status : analysis){ | ||
output += String.format("\t%s\n", status.name()); | ||
} | ||
return output; | ||
} | ||
|
||
public EnumSet<Status> getAnalysis(){ | ||
return analysis; | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
public String toString(){ | ||
return String.format("REFERENCE PAIR: %s, %s\nStatus:\n%s", | ||
ReferenceSequenceTable.getReferenceColumnName(ref1), | ||
ReferenceSequenceTable.getReferenceColumnName(ref2), | ||
statusAsString()); | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (o == null || getClass() != o.getClass()) return false; | ||
ReferencePair that = (ReferencePair) o; | ||
return Objects.equals(ref1, that.ref1) && Objects.equals(ref2, that.ref2); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(ref1, ref2); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a good javadoc for the class