Skip to content

Commit

Permalink
finished table analysis and added documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
orlicohen committed Jul 18, 2022
1 parent 8df6e3d commit a0e00bb
Show file tree
Hide file tree
Showing 8 changed files with 211 additions and 64 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,20 @@ public class CompareReferences extends GATKTool {
@Argument(fullName = "references-to-compare", shortName = "refcomp", doc = "Reference sequence file(s) to compare.")
private List<GATKPath> references;

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "", optional = true)
/**
* File to which the output table will be written.
*
* Note: If no output file is provided, the table is printed to standard output.
*/
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "If provided, file to output table.", optional = true)
private GATKPath output;

@Argument(fullName = "md5-calculation-mode", shortName = "md5-calculation-mode", doc = "", optional = true)
@Argument(fullName = "md5-calculation-mode", shortName = "md5-calculation-mode", doc = "MD5CalculationMode indicating method of MD5 calculation.", optional = true)
private MD5CalculationMode md5CalculationMode = MD5CalculationMode.USE_DICT;
@Argument(fullName = "display-sequences-by-name", doc = "If provided, the table by sequence name will be printed.", optional = true)
private boolean displaySequencesByName = false;

@Argument(fullName = "display-only-differing-sequences", shortName = "", doc = "", optional = true)
// NOTE(review): the previous help text stopped mid-sentence ("only display sequence names ");
// the completion below is inferred from the argument name - confirm the wording against the tool's behavior.
@Argument(fullName = "display-only-differing-sequences", doc = "If provided, only display sequence names that differ in their actual sequence.", optional = true)
private boolean onlyDisplayDifferingSequences = false;

public enum MD5CalculationMode {
Expand Down Expand Up @@ -75,19 +82,24 @@ public void traverse(){
writeTableToFileOutput(table);
}

//displayMissingEntries(table);
listDifferentSequenceSameName(table);
//writeTableToStdOutput(table);
if(displaySequencesByName){
tableBySequenceName(table);
}

List<GATKPath> refs = new ArrayList<>();
refs.addAll(referenceSources.keySet());
//table.generateReferencePairs(refs);

List<ReferencePair> referencePairs = table.analyzeTable();
for(ReferencePair pair : referencePairs){
System.out.println(pair);
}
}

/**
* Given a table, write the table to standard output.
*
* @param table the reference sequence table to print
*/
private void writeTableToStdOutput(ReferenceSequenceTable table){
// print header
List<String> columnNames = table.getColumnNames();
Expand All @@ -110,6 +122,11 @@ private void writeTableToStdOutput(ReferenceSequenceTable table){
}
}

/**
* Given a table, write the table to the output file.
*
* @param table the reference sequence table to write
*/
private void writeTableToFileOutput(ReferenceSequenceTable table) {
TableColumnCollection columns = new TableColumnCollection(table.getColumnNames());
try(CompareReferences.CompareReferencesOutputTableWriter writer = new CompareReferences.CompareReferencesOutputTableWriter(output.toPath(), columns)){
Expand All @@ -128,33 +145,12 @@ public Object onTraversalSuccess() {
return null;
}

/**
 * Prints to standard output one line for every empty entry found in the table, followed by a
 * summary line stating whether any entries were missing.
 *
 * @param table the table whose rows are scanned for empty entries
 */
public void displayMissingEntries(ReferenceSequenceTable table) {
    // Accumulate in a StringBuilder instead of repeated String concatenation inside the loops.
    // The local is deliberately not called "output" so it does not shadow the tool's output field.
    final StringBuilder report = new StringBuilder();
    boolean noMissingEntries = true;
    int currRow = 0;

    for(ReferenceSequenceTable.TableRow row : table){
        currRow++; // rows are reported 1-based
        for(ReferenceSequenceTable.TableEntry entry : row.getEntries()){
            if(entry.isEmpty()){
                report.append(String.format("Row %d: Missing entry in %s column.\n", currRow, entry.getColumnName()));
                noMissingEntries = false;
            }
        }
    }

    report.append(noMissingEntries
            ? "No missing entries.\n"
            : "References are not an exact match. See table output for details.\n");
    System.out.println(report.toString());
}

public void listDifferentSequenceSameName(ReferenceSequenceTable table){
/**
* Given a table, write the table organized by sequence name to standard output.
*
* @param table the reference sequence table to display
*/
public void tableBySequenceName(ReferenceSequenceTable table){
List<String> output = new ArrayList<>();
output.add("Sequence \tMD5 \tReference\n");

Expand All @@ -175,7 +171,7 @@ public void listDifferentSequenceSameName(ReferenceSequenceTable table){
System.out.println();
}

public List<String> displayBySequenceName(Set<ReferenceSequenceTable.TableRow> rows, String sequenceName){
private List<String> displayBySequenceName(Set<ReferenceSequenceTable.TableRow> rows, String sequenceName){
List<String> output = new ArrayList<>();
output.add(sequenceName);
for(ReferenceSequenceTable.TableRow row : rows) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ public String getRef2(){
return ref2.toPath().getFileName().toString();
}

/**
* Formats this ReferencePair's status set as a String.
*
* @return the status set as a formatted String
*/
public String statusAsString(){
String output = "";
for(Status status : analysis){
Expand All @@ -59,11 +64,27 @@ public String statusAsString(){
return output;
}

/**
 * Returns the set of statuses currently assigned to this pair. The set held by this pair is
 * returned directly, not a copy.
 *
 * @return this pair's status set
 */
public EnumSet<Status> getStatus(){
    return this.analysis;
}

/**
 * Builds a multi-line description of this pair: the display names of both references on the
 * first line, followed by the formatted status set.
 *
 * @return the formatted description of this pair
 */
@Override
public String toString(){
    return String.format("REFERENCE PAIR: %s, %s\nStatus:\n%s",
            ReferenceSequenceTable.getReferenceDisplayName(ref1),
            ReferenceSequenceTable.getReferenceDisplayName(ref2),
            statusAsString());
}

/**
 * Two ReferencePairs are equal when they have the same runtime class and their ref1 and ref2
 * are respectively equal. The status set is not part of the comparison.
 *
 * @param o the object to compare against
 * @return true if {@code o} is a ReferencePair with equal ref1 and ref2
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null) {
        return false;
    }
    if (getClass() != o.getClass()) {
        return false;
    }
    final ReferencePair other = (ReferencePair) o;
    if (!Objects.equals(ref1, other.ref1)) {
        return false;
    }
    return Objects.equals(ref2, other.ref2);
}

/**
 * Hash code derived from ref1 and ref2, in that order.
 *
 * @return the combined hash of ref1 and ref2
 */
@Override
public int hashCode() {
    // Standard 31-based combination, seeded with 1; a null reference contributes 0.
    final int ref1Hash = Objects.hashCode(ref1);
    final int ref2Hash = Objects.hashCode(ref2);
    return 31 * (31 + ref1Hash) + ref2Hash;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ public List<String> getColumnNames() {
return columnNames;
}

/**
 * Produces the display name of a reference: the final (file name) component of its path.
 *
 * @param reference the path to a reference
 * @return the file name of the reference as a String
 */
public static String getReferenceDisplayName(GATKPath reference){
    final String displayName = reference.toPath().getFileName().toString();
    return displayName;
}
Expand All @@ -68,6 +74,9 @@ public Map<String, Integer> getColumnIndices() {
return columnIndices;
}

/**
* Construct 2 tables of references: one keyed by MD5, one keyed by sequence name.
*/
public void build() {
tableByMD5 = new LinkedHashMap<>();
tableBySequenceName = new LinkedHashMap<>();
Expand Down Expand Up @@ -102,19 +111,22 @@ public Set<String> getAllSequenceNames(){
return tableBySequenceName.keySet();
}

/**
 * Counts the rows of this table by iterating over it
 * (the number of rows in the MD5-keyed table).
 *
 * @return the number of rows visited when iterating over this table
 */
public int size(){
    int rowCount = 0;
    for(final TableRow ignored : this){
        rowCount += 1;
    }
    return rowCount;
}

/**
 * Looks up the row stored under the given MD5 in the MD5-keyed table.
 *
 * @param md5 the MD5 as a String
 * @return the TableRow stored under {@code md5}, or null if the table has no row for it
 */
public TableRow queryByMD5(String md5){
    final TableRow matchingRow = tableByMD5.get(md5);
    return matchingRow;
}

/**
 * Looks up every row stored under the given sequence name in the name-keyed table.
 *
 * @param sequenceName the sequence name as a String
 * @return the set of TableRows stored under the sequence name, or an empty set if there are none
 */
public Set<TableRow> queryBySequenceName(String sequenceName){
    final Set<TableRow> matchingRows = tableBySequenceName.get(sequenceName);
    if(matchingRows == null){
        return Collections.emptySet();
    }
    return matchingRows;
}
Expand Down Expand Up @@ -146,7 +158,12 @@ private String calculateMD5(SAMSequenceRecord record, ReferenceDataSource source
return md5;
}

public List<ReferencePair> generateReferencePairs(List<GATKPath> references){
/**
* Generate ReferencePairs for pairwise comparison of all references present in the table
*
* @return the list of ReferencePairs for every pair of references
*/
public List<ReferencePair> generateReferencePairs(){
List<ReferencePair> referencePairs = new ArrayList<>();
for(int i = 0; i < references.size(); i++){
for(int j = i + 1; j < references.size(); j++){
Expand All @@ -156,8 +173,20 @@ public List<ReferencePair> generateReferencePairs(List<GATKPath> references){
return referencePairs;
}

/**
* Analyze the table by doing a pairwise comparison for all table references. Generates all ReferencePairs, then analyzes
* each pair and assigns it an analysis as a set of the following statuses:
* EXACT_MATCH,
* DIFFER_IN_SEQUENCE_NAMES,
* DIFFER_IN_SEQUENCE,
* DIFFER_IN_SEQUENCES_PRESENT,
* SUPERSET,
* SUBSET
*
* @return list of ReferencePairs with updated status sets
*/
public List<ReferencePair> analyzeTable(){
List<ReferencePair> refPairs = generateReferencePairs(references);
List<ReferencePair> refPairs = generateReferencePairs();

for(TableRow row : tableByMD5.values()) {
for(ReferencePair pair : refPairs) {
Expand Down Expand Up @@ -198,7 +227,7 @@ public List<ReferencePair> analyzeTable(){
if(ref1Value.getColumnValue().equals(MISSING_ENTRY)){
subset = true;
}
else{
else if(ref2Value.getColumnValue().equals(MISSING_ENTRY)){
superset = true;
}
}
Expand All @@ -215,12 +244,12 @@ public List<ReferencePair> analyzeTable(){

if(superset ^ subset){
pair.removeStatus(ReferencePair.Status.DIFFER_IN_SEQUENCES_PRESENT);
}

if(superset && !subset){
pair.addStatus(ReferencePair.Status.SUPERSET);
} else if(subset && !superset) {
pair.addStatus(ReferencePair.Status.SUBSET);
if(superset && !subset){
pair.addStatus(ReferencePair.Status.SUPERSET);
} else if(subset && !superset) {
pair.addStatus(ReferencePair.Status.SUBSET);
}
}
}
return refPairs;
Expand Down Expand Up @@ -263,9 +292,6 @@ public int hashCode() {
}

public class TableRow {

// private static final int MD5_COLUMN_INDEX = 0;
// private static final int LENGTH_COLUMN_INDEX = 1;
private final String md5;
private final TableEntry[] entries;
private final int length;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,33 +99,36 @@ public void testCompareReferencesUseDictMD5MissingValue() throws IOException{
runCommandLine(args);
}

@Test
// The following three tests run the tool on different combinations of reference files
// and print their output to stdout so that it can be inspected manually.
// They are disabled because they make no actual assertions.

@Test(enabled = false)
public void testCompareReferencesToStdOutput() throws IOException{
final File ref1 = new File(getToolTestDataDir() + "hg19mini.fasta");
final File ref2 = new File(getToolTestDataDir() + "hg19mini_missingsequence.fasta");
final File ref2 = new File(getToolTestDataDir() + "hg19mini_missingchr1.fasta");

final String[] args = new String[] {"-R", ref1.getAbsolutePath() , "-refcomp", ref2.getAbsolutePath()};
runCommandLine(args);
}

@Test
@Test(enabled = false)
public void testCompareReferencesMultipleReferencesStdOut() throws IOException{
final File ref1 = new File(getToolTestDataDir() + "hg19mini.fasta");
final File ref2 = new File(getToolTestDataDir() + "hg19mini_1renamed.fasta");
final File ref3 = new File(getToolTestDataDir() + "hg19mini_chr2snp.fasta");
final File ref4 = new File(getToolTestDataDir() + "hg19mini_missingsequence.fasta");
final File ref4 = new File(getToolTestDataDir() + "hg19mini_missingchr1.fasta");

final String[] args = new String[] {"-R", ref1.getAbsolutePath() , "-refcomp", ref2.getAbsolutePath(), "-refcomp", ref3.getAbsolutePath(),
"-refcomp", ref4.getAbsolutePath(),
};
"-refcomp", ref4.getAbsolutePath()};
runCommandLine(args);
}

@Test
@Test(enabled = false)
public void testCompareReferencesMissingSequencesStdOut() throws IOException{
final File ref1 = new File(getToolTestDataDir() + "hg19mini.fasta");
final File ref2 = new File(getToolTestDataDir() + "hg19mini_missingchr3.fasta");
final File ref3 = new File(getToolTestDataDir() + "hg19mini_missingsequence.fasta");
final File ref3 = new File(getToolTestDataDir() + "hg19mini_missingchr1.fasta");

final String[] args = new String[] {"-R", ref1.getAbsolutePath() , "-refcomp", ref2.getAbsolutePath(), "-refcomp", ref3.getAbsolutePath()};
runCommandLine(args);
Expand Down
Loading

0 comments on commit a0e00bb

Please sign in to comment.