Merge pull request #28 from AndrewRook/validation_fix
Validation improve
AndrewRook authored Aug 14, 2016
2 parents 406000a + 0e3236a commit 504e9cb
Showing 4 changed files with 51 additions and 18 deletions.
20 changes: 14 additions & 6 deletions doc/source/model.rst
@@ -24,12 +24,17 @@ seasons and validate on the 2011 and 2012 playoffs, you would do the following:
>>> new_data_model = WPModel()
>>> new_data_model.train_model(training_seasons=[2009, 2010], training_season_types=["Regular"])
>>> new_data_model.validate_model(validation_seasons=[2011, 2012], validation_season_types=["Postseason"])
0.14963235412213988
(21.355462918011327, 565.56909036318007)
If you want to supply your own data, that's easy too - simply set the
`source_data` kwarg of :meth:`~nflwin.model.WPModel.train_model` and
:meth:`~nflwin.model.WPModel.validate_model` to be a Pandas DataFrame of your training and validation data (respectively):

..
   from nflwin.utilities import get_nfldb_play_data
   training_data = get_nfldb_play_data(season_years=[2012, 2013])
   validation_data = get_nfldb_play_data(season_years=[2014])
.. code-block:: python
>>> from nflwin.model import WPModel
@@ -78,7 +83,7 @@ If you want to supply your own data, that's easy too - simply set the
3 0
4 0
>>> new_data_model.validate_model(source_data=validation_data)
6.8222808634589248e-35
(8.9344062502671591, 265.7971863696315)
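
Putting those pieces together, a minimal sketch of the ``source_data`` workflow looks roughly like this (assuming ``get_nfldb_play_data`` can reach a populated nfldb database; any Pandas DataFrame with the same play-by-play columns works the same way):

.. code-block:: python

   from nflwin.model import WPModel
   from nflwin.utilities import get_nfldb_play_data

   # Build training and validation DataFrames (here from nfldb, but any
   # DataFrame with the expected play-by-play columns is fine).
   training_data = get_nfldb_play_data(season_years=[2012, 2013])
   validation_data = get_nfldb_play_data(season_years=[2014])

   model = WPModel()
   model.train_model(source_data=training_data)

   # validate_model now returns (max_deviation, residual_area) rather than a p-value.
   max_deviation, residual_area = model.validate_model(source_data=validation_data)
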
Building a New Model
--------------------
@@ -176,8 +181,11 @@ it for yourself by calling the

.. image:: _static/validation_plot.png

From there NFLWin estimates if the deviations between predicted and actual WP are
statistically significant - that's what is actually returned by :meth:`~nflwin.model.WPModel.validate_model`. This is decidedly not awesome - even if the deviations aren't
significant you can't prove that your model is correct; rather all you can
say that it is "not inconsistent" with the validation data. If anyone
From there NFLWin computes both the maximum deviation at any given
percentage and the total area between the estimated WP from the model
and what would be expected if the model was perfect - that's what is
actually returned by
:meth:`~nflwin.model.WPModel.validate_model`. This is obviously not
ideal given that it's not directly estimating uncertainties in
the model, but it's the best I've been able to come up with so far. If anyone
has an idea for how to do this better I would welcome it enthusiastically.
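
Concretely, the two numbers returned by :meth:`~nflwin.model.WPModel.validate_model` boil down to something like the following sketch (mirroring the ``_compute_prediction_statistics`` helper added to ``nflwin/model.py`` below; both inputs are expressed as percentages):

.. code-block:: python

   import numpy as np
   from scipy import integrate

   def prediction_statistics(sample_probabilities, predicted_win_percents):
       # Gap between the WP the model predicts and the WP actually observed
       # at each point along the (percentage-scaled) probability axis.
       abs_deviations = np.abs(predicted_win_percents - sample_probabilities)
       # Worst single-point disagreement between model and observation...
       max_deviation = np.max(abs_deviations)
       # ...and the total area under |observed - predicted|, via Simpson's rule.
       residual_area = integrate.simps(abs_deviations, sample_probabilities)
       return max_deviation, residual_area
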
5 changes: 3 additions & 2 deletions make_default_model.py
@@ -12,8 +12,9 @@ def main():
print("Took {0:.2f}s to build model".format(time.time() - start))

start = time.time()
combined_pvalue = win_probability_model.validate_model(validation_seasons=[2015])
print("Took {0:.2f}s to validate model, with combined p_value of {1:.2f}".format(time.time() - start, combined_pvalue))
max_deviation, residual_area = win_probability_model.validate_model(validation_seasons=[2015])
print("Took {0:.2f}s to validate model, with a max residual of {1:.2f} and a residual area of {2:.2f}"
.format(time.time() - start, max_deviation, residual_area))

win_probability_model.save_model()

2 changes: 1 addition & 1 deletion nflwin/_version.py
@@ -1 +1 @@
__version__ = u"0.1.0"
__version__ = u"0.2.0"
42 changes: 33 additions & 9 deletions nflwin/model.py
Expand Up @@ -4,6 +4,7 @@
import os

import numpy as np
from scipy import integrate
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
@@ -254,10 +255,10 @@ def validate_model(self,
self._validation_season_types = []
if isinstance(source_data, basestring):
if source_data == "nfldb":
source_data = utilities.get_nfldb_play_data(season_years=training_seasons,
season_types=training_season_types)
self._training_seasons = training_seasons
self._training_season_types = training_season_types
source_data = utilities.get_nfldb_play_data(season_years=validation_seasons,
season_types=validation_season_types)
self._validation_seasons = validation_seasons
self._validation_season_types = validation_season_types
else:
raise ValueError("WPModel: if source_data is a string, it must be 'nfldb'")

@@ -268,13 +269,36 @@ def validate_model(self,
self._sample_probabilities, self._predicted_win_percents, self._num_plays_used = (
WPModel._compute_predicted_percentages(target_col.values, predicted_probabilities))

#Compute the maximal deviation from a perfect prediction as well as the area under the
#curve of the residual between |predicted - perfect|:
max_deviation, residual_area = self._compute_prediction_statistics(self.sample_probabilities,
self.predicted_win_percents)
return max_deviation, residual_area

#Compute p-values for each where null hypothesis is that distributions are same, then combine
#them all to make sure data is not inconsistent with accurate predictions.
combined_pvalue = self._test_distribution(self.sample_probabilities,
self.predicted_win_percents,
self.num_plays_used)
# combined_pvalue = self._test_distribution(self.sample_probabilities,
# self.predicted_win_percents,
# self.num_plays_used)

return combined_pvalue
# return combined_pvalue

@staticmethod
def _compute_prediction_statistics(sample_probabilities, predicted_win_percents):
    """Take the KDE'd model estimates, then compute statistics.

    Returns
    -------
    A tuple of (``max_deviation``, ``residual_area``), where ``max_deviation``
    is the largest discrepancy between the model and expectation at any WP,
    and ``residual_area`` is the total area under the curve of
    |predicted WP - expected WP|.
    """
    abs_deviations = np.abs(predicted_win_percents - sample_probabilities)
    max_deviation = np.max(abs_deviations)
    residual_area = integrate.simps(abs_deviations,
                                    sample_probabilities)
    return (max_deviation, residual_area)


def predict_wp(self, plays):
"""Estimate the win probability for a set of plays.
@@ -369,7 +393,7 @@ def _compute_predicted_percentages(actual_results, predicted_win_probabilities):
number_total = number_density_total * len(actual_results) / np.sum(number_density_total)
predicted_win_percents = number_offense_won / number_total

return sample_probabilities, predicted_win_percents, number_total
return 100.*sample_probabilities, 100.*predicted_win_percents, number_total

def create_default_pipeline(self):
"""Create the default win probability estimation pipeline.
