-
Notifications
You must be signed in to change notification settings - Fork 33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix Confidence Adjustment for Larger Shingle Sizes #407
Merged
Merged
Changes from 1 commit
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -80,44 +80,6 @@ public float[] getScaledShingledInput(double[] inputPoint, long timestamp, int[] | |
return point; | ||
} | ||
|
||
/** | ||
* the timestamps are now used to calculate the number of imputed tuples in the | ||
* shingle | ||
* | ||
* @param timestamp the timestamp of the current input | ||
*/ | ||
@Override | ||
protected void updateTimestamps(long timestamp) { | ||
/* | ||
* For imputations done on timestamps other than the current one (specified by | ||
* the timestamp parameter), the timestamp of the imputed tuple matches that of | ||
* the input tuple, and we increment numberOfImputed. For imputations done at | ||
* the current timestamp (if all input values are missing), the timestamp of the | ||
* imputed tuple is the current timestamp, and we increment numberOfImputed. | ||
* | ||
* To check if imputed values are still present in the shingle, we use the first | ||
* condition (previousTimeStamps[0] == previousTimeStamps[1]). This works | ||
* because previousTimeStamps has a size equal to the shingle size and is filled | ||
* with the current timestamp. However, there are scenarios where we might miss | ||
* decrementing numberOfImputed: | ||
* | ||
* 1. Not all values in the shingle are imputed. 2. We accumulated | ||
* numberOfImputed when the current timestamp had missing values. | ||
* | ||
* As a result, this could cause the data quality measure to decrease | ||
* continuously since we are always counting missing values that should | ||
* eventually be reset to zero. The second condition <pre> timestamp > | ||
* previousTimeStamps[previousTimeStamps.length-1] && numberOfImputed > 0 </pre> | ||
* will decrement numberOfImputed when we move to a new timestamp, provided | ||
* numberOfImputed is greater than zero. | ||
*/ | ||
if (previousTimeStamps[0] == previousTimeStamps[1] | ||
|| (timestamp > previousTimeStamps[previousTimeStamps.length - 1] && numberOfImputed > 0)) { | ||
numberOfImputed = numberOfImputed - 1; | ||
} | ||
super.updateTimestamps(timestamp); | ||
} | ||
|
||
/** | ||
* decides if the forest should be updated, this is needed for imputation on the | ||
* fly. The main goal of this function is to avoid runaway sequences where a | ||
|
@@ -128,7 +90,10 @@ protected void updateTimestamps(long timestamp) { | |
*/ | ||
protected boolean updateAllowed() { | ||
double fraction = numberOfImputed * 1.0 / (shingleSize); | ||
if (numberOfImputed == shingleSize - 1 && previousTimeStamps[0] != previousTimeStamps[1] | ||
if (fraction > 1) { | ||
fraction = 1; | ||
} | ||
if (numberOfImputed >= shingleSize - 1 && previousTimeStamps[0] != previousTimeStamps[1] | ||
&& (transformMethod == DIFFERENCE || transformMethod == NORMALIZE_DIFFERENCE)) { | ||
// this shingle is disconnected from the previously seen values | ||
// these transformations will have little meaning | ||
|
@@ -144,10 +109,19 @@ protected boolean updateAllowed() { | |
// two different points). | ||
return false; | ||
} | ||
|
||
dataQuality[0].update(1 - fraction); | ||
return (fraction < useImputedFraction && internalTimeStamp >= shingleSize); | ||
} | ||
|
||
@Override | ||
protected void updateTimestamps(long timestamp) { | ||
if (previousTimeStamps[0] == previousTimeStamps[1]) { | ||
numberOfImputed = numberOfImputed - 1; | ||
} | ||
super.updateTimestamps(timestamp); | ||
} | ||
|
||
/** | ||
* the following function mutates the forest, the lastShingledPoint, | ||
* lastShingledInput as well as previousTimeStamps, and adds the shingled input | ||
|
@@ -168,7 +142,9 @@ void updateForest(boolean changeForest, double[] input, long timestamp, RandomCu | |
updateShingle(input, scaledInput); | ||
updateTimestamps(timestamp); | ||
if (isFullyImputed) { | ||
numberOfImputed = numberOfImputed + 1; | ||
numberOfImputed = Math.min(numberOfImputed + 1, shingleSize); | ||
} else if (numberOfImputed > 0) { | ||
numberOfImputed = numberOfImputed - 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. at this point are we guaranteed to have seen a new value that is not imputed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes. |
||
} | ||
if (changeForest) { | ||
if (forest.isInternalShinglingEnabled()) { | ||
|
@@ -190,7 +166,9 @@ public void update(double[] point, float[] rcfPoint, long timestamp, int[] missi | |
return; | ||
} | ||
generateShingle(point, timestamp, missing, getTimeFactor(timeStampDeviations[1]), true, forest); | ||
++valuesSeen; | ||
if (missing == null || missing.length != point.length) { | ||
++valuesSeen; | ||
} | ||
} | ||
|
||
protected double getTimeFactor(Deviation deviation) { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At least this part of the comment is still valid right? Would be good to have as I was pretty confused what
previousTimeStamps[0] != previousTimeStamps[1]
really meant to do. or maybe exaplain that part inAdditionally so I am understaind correct, we are saying if the last 10 values were imputed and shingle size is 8, we will most likely return false here and not allow an update until we get an actual value?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let me add the comment back.
yes, your understanding is correct.