QuantEcon · doctor-phil · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -16,7 +16,7 @@ jobs:
           auto-update-conda: true
           auto-activate-base: true
           miniconda-version: 'latest'
-          python-version: 3.9
+          python-version: 3.12
           environment-file: environment.yml
           activate-environment: lecture-datascience
       - name: Display Conda Environment Versions

diff --git a/environment.yml b/environment.yml
@@ -32,7 +32,7 @@ dependencies:
     - xgboost
     - graphviz
     - bokeh
-    # - nltk
+    - nltk
     - pandas-datareader
     - seaborn
     - patsy

diff --git a/lectures/_data/avalanche_forecasts.zip b/lectures/_data/avalanche_forecasts.zip
diff --git a/lectures/applications/ml_in_economics.md b/lectures/applications/ml_in_economics.md
@@ -13,6 +13,7 @@ kernelspec:
 
 **Author**
 > - [Paul Schrimpf *UBC*](https://economics.ubc.ca/faculty-and-staff/paul-schrimpf/)
+> - [Philip Solimine *UBC*](https://www.psolimine.net/)
 
 **Prerequisites**
 
@@ -259,11 +260,11 @@ tags: [hide-output]
 ---
 cps["female"] = (cps.sex==2)
 cps["log_earn"] = np.log(cps.earnwke)
-cps["log_earn"][np.isinf(cps.log_earn)] = np.nan
+cps.loc[np.isinf(cps.log_earn),"log_earn"] = np.nan
 cps["log_uhours"] = np.log(cps.uhourse)
-cps["log_uhours"][np.isinf(cps.log_uhours)] = np.nan
+cps.loc[np.isinf(cps.log_uhours),"log_uhours"] = np.nan
 cps["log_hourslw"] = np.log(cps.hourslw)
-cps["log_hourslw"][np.isinf(cps.log_hourslw)] = np.nan
+cps.loc[np.isinf(cps.log_hourslw),"log_hourslw"] = np.nan
 cps["log_wageu"] = cps.log_earn - cps.log_uhours
 cps["log_wagelw"] = cps.log_earn - cps.log_hourslw
 
@@ -394,12 +395,8 @@ def plotpredictions(pl) :
     plt.title("Prediction Errors")
 
     plt.figure()
-    sns.distplot(pl[2][female==0], hist = True, kde = False,
-                 kde_kws = {'shade': True, 'linewidth': 3},
-                 label = "Male")
-    sns.distplot(pl[2][female==1], hist = True, kde = False,
-                 kde_kws = {'shade': True, 'linewidth': 3},
-                 label = "Female")
+    sns.histplot(pl[2][female == 0], bins=30, label="Male", kde=False)
+    sns.histplot(pl[2][female == 1], bins=30, label="Female", kde=False)
     plt.title('P(female|x)')
 plotpredictions(pl_lasso)
 ```

diff --git a/lectures/applications/networks.md b/lectures/applications/networks.md
@@ -540,7 +540,7 @@ def truncate(f): # define a function that "rounds" a number to 0 if it is lower
         return 1
 
 # we already know that every stock is perfectly correlated with itself, so the ones on the diagonal are not really useful information. Let's get rid of them.
-adj = corr.applymap(truncate) - np.identity(10)
+adj = corr.map(truncate) - np.identity(10)
 adj
 ```
 

diff --git a/lectures/applications/recidivism.md b/lectures/applications/recidivism.md
@@ -138,7 +138,7 @@ Let's look at how the dataset is broken down into age, sex, and race.
 ```{code-cell} python
 def create_groupcount_barplot(df, group_col, figsize, **kwargs):
     "call df.groupby(group_col), then count number of records and plot"
-    counts = df.groupby(group_col)["name"].count().sort_index()
+    counts = df.groupby(group_col,observed=True)["name"].count().sort_index()
 
     fig, ax = plt.subplots(figsize=figsize)
     counts.plot(kind="bar", **kwargs)
@@ -177,7 +177,7 @@ is mostly African-American or Caucasian.
 We now look into how recidivism is split across groups.
 
 ```{code-cell} python
-recid = df.groupby(["age_cat", "sex", "race"])["two_year_recid"].mean().unstack(level="race")
+recid = df.groupby(["age_cat", "sex", "race"], observed=True)["two_year_recid"].mean().unstack(level="race")
 recid
 ```
 
@@ -201,8 +201,8 @@ create_groupcount_barplot(df, "decile_score", (12, 8), color="DarkBlue", rot=0)
 How do these scores differ by race?
 
 ```{code-cell} python
-dfgb = df.groupby("race")
-race_count = df.groupby("race")["name"].count()
+dfgb = df.groupby("race", observed=True)
+race_count = df.groupby("race", observed=True)["name"].count()
 
 fig, ax = plt.subplots(3, figsize=(14, 8))
 
@@ -253,7 +253,7 @@ One of the key critiques from Pro Publica, though, was that the inaccuracies wer
 Let's now separate the correlations by race and see what happens.
 
 ```{code-cell} python
-recid_rates = df.pivot_table(index="decile_score", columns="race", values="two_year_recid")
+recid_rates = df.pivot_table(index="decile_score", columns="race", values="two_year_recid", observed=True)
 
 recid_rates
 ```
@@ -789,10 +789,10 @@ def balance_hist_plot(pred, y, df, bins=20):
         _ax = ax[np.unravel_index(g, ax.shape)]
         y_sub = y[subset]
         pred_sub = pred[subset]
-        sns.distplot(pred_sub[y_sub==0], hist=True, bins=bins, kde=False, ax=_ax,
-                     label="No recidivate", norm_hist=True, axlabel="Predicted Probability")
-        sns.distplot(pred_sub[y_sub==1], hist=True, bins=bins, kde=False, ax=_ax,
-                     label="Yes recidivate", norm_hist=True, axlabel="Predicted Probability")
+        sns.histplot(pred_sub[y_sub==0], bins=bins, kde=False, ax=_ax,
+                     label="No recidivate")
+        sns.histplot(pred_sub[y_sub==1], bins=bins, kde=False, ax=_ax,
+                     label="Yes recidivate")
         _ax.set_title(group)
 
     plt.legend()
@@ -1021,7 +1021,7 @@ def balance_scorer(y_true, prob, df, weights):
            -weights[2]*(metrics.log_loss(y_true, prob, normalize=True)))
 
 score_params = {"df": df_train, "weights": [10.0, 1.0, 0.0]}
-scorer = metrics.make_scorer(balance_scorer, **score_params, needs_proba=True)
+scorer = metrics.make_scorer(balance_scorer, **score_params, response_method="predict_proba")
 grid_cv = model_selection.GridSearchCV(
     estimator=linear_model.LogisticRegression(penalty="l1",
                                               max_iter=100,
@@ -1059,22 +1059,38 @@ Unfortunately, this makes all the predictions identical, so these predictions
 are not so useful.
 
 ```{code-cell} python
-output, given_outcome, given_pred =cm_tables(
+try:
+    output, given_outcome, given_pred = cm_tables(
     balance_mod.best_estimator_.predict(X_test),
     y_test,
     df_test
 )
-display(output)
-display(given_pred)
-display(given_outcome)
+
+    # Display only the tables that cm_tables actually returned; a ZeroDivisionError raised inside cm_tables is handled by the except clause below.
+
+    if output is not None:
+        display(output)
+        display(given_pred)
+    else:
+        print("Predicted values are None or invalid.")
+
+    if given_outcome is not None:
+        display(given_outcome)
+    else:
+        print("Outcome values are None or invalid.")
+
+except ZeroDivisionError:
+    print("Caught a division by zero error in cm_tables. Please check inputs or calculations.")
+except Exception as e:
+    print(f"An unexpected error occurred: {e}")
 ```
 
 What if we change our CV scoring function to care about both
 prediction and balance?
 
 ```{code-cell} python
 score_params = {"df": df_train, "weights": [10.0, 1.0, 5.0]}
-grid_cv.set_params(scoring=metrics.make_scorer(balance_scorer, **score_params, needs_proba=True))
+grid_cv.set_params(scoring=metrics.make_scorer(balance_scorer, **score_params, response_method="predict_proba"))
 bf_mod=grid_cv.fit(X_train,y_train)
 grid_cv_plot(bf_mod,"CV balance & fit")
 

diff --git a/lectures/applications/working_with_text.md b/lectures/applications/working_with_text.md
@@ -13,6 +13,7 @@ kernelspec:
 
 **Author**
 > - [Paul Schrimpf *UBC*](https://economics.ubc.ca/faculty-and-staff/paul-schrimpf/)
+> - [Phil Solimine *UBC*](https://www.psolimine.net/)
 
 **Prerequisites**
 
@@ -126,17 +127,18 @@ def get_incident_details(id):
     return(result)
 
 
-incidentsfile = "https://datascience.quantecon.org/assets/data/avalanche_incidents.csv"
+incidentsfile = "http://datascience.quantecon.org/assets/data/avalanche_incidents.csv"
 
 # To avoid loading the avalanche Canada servers, we save the incident details locally.
-if (not os.path.isfile(incidentsfile)):
+# To rebuild the data locally, set incidentsfile to a different (not yet existing) file name.
+
+try:
+    incidents = pd.read_csv(incidentsfile)
+except Exception as e:
     incident_detail_list = incidents_brief.id.apply(get_incident_details).to_list()
     incidents = pd.DataFrame.from_dict(incident_detail_list, orient="columns")
     incidents.to_csv(incidentsfile)
-else:
-    incidents = pd.read_csv(incidentsfile)
-
-incidents
+incidents.head()
 ```
 
 Many incidents include coordinates, but others do not. Most
@@ -317,10 +319,9 @@ You may have to uncomment the second line below if  folium is not installed.
 import folium
 import matplotlib
 
-cmap = matplotlib.cm.get_cmap('Set1')
+cmap = matplotlib.colormaps["Set1"]
 fmap = folium.Map(location=[60, -98],
-                            zoom_start=3,
-                            tiles='Stamen Terrain')
+                            zoom_start=3)
 with urllib.request.urlopen(req) as response:
     regions_tmp = json.loads(response.read().decode('utf-8'))
 folium.GeoJson(regions_tmp,
@@ -411,6 +412,7 @@ def download_cached_forecasts():
                     warnings.warn(f"'File $f exists and is larger than version in cache. Not replacing.")
                 else :
                     z.extract(f)
+                    print("Downloaded and extracted", f)
 
 download_cached_forecasts()
 ```
@@ -443,7 +445,7 @@ def get_forecasts(start, end, region):
         #print("working on {}, {}".format(region,day))
         forecasts = forecasts + [get_forecast(day, region)]
         #print("sleeping")
-        time.sleep(0.1) # to avoid too much load on Avalanche Canada servers
+        time.sleep(0.01) # to avoid too much load on Avalanche Canada servers
         day = day + pd.Timedelta(1,"D")
     return(forecasts)
 
@@ -456,11 +458,13 @@ def get_season(year, region):
         os.mkdir("avalanche_forecasts")
     seasonfile = "avalanche_forecasts/{}_{}-{}.json".format(region, year, year+1)
     if (not os.path.isfile(seasonfile)):
-        startdate = pd.to_datetime("{}-{}-{} 12:00".format(year, start_month, start_day))
-        lastdate = pd.to_datetime("{}-{}-{} 12:00".format(year+1, last_month, last_day))
-        season = get_forecasts(startdate,lastdate,region)
-        with open(seasonfile, 'w') as outfile:
-            json.dump(season, outfile, ensure_ascii=False)
+        print(f"Season file {seasonfile} not found. Uncomment code here to update cached data")
+        season = []
+        #startdate = pd.to_datetime("{}-{}-{} 12:00".format(year, start_month, start_day))
+        #lastdate = pd.to_datetime("{}-{}-{} 12:00".format(year+1, last_month, last_day))
+        #season = get_forecasts(startdate,lastdate,region)
+        #with open(seasonfile, 'w') as outfile:
+        #    json.dump(season, outfile, ensure_ascii=False)
     else:
         with open(seasonfile, "rb") as json_data:
             season = json.load(json_data)
@@ -481,7 +485,7 @@ for year in range(2011,2019):
 forecasts = pd.DataFrame.from_dict([f for f in forecastlist if not f==None],orient="columns")
 
 forecasts["danger_date"] = forecasts.dangerRatings.apply(lambda r: r[0]["date"])
-forecasts["danger_date"] = pd.to_datetime(forecasts.danger_date, utc=True).dt.date
+forecasts["danger_date"] = pd.to_datetime(forecasts.danger_date, format='ISO8601').dt.date
 forecasts["danger_alpine"]=forecasts.dangerRatings.apply(lambda r: r[0]["dangerRating"]["alp"])
 forecasts["danger_treeline"]=forecasts.dangerRatings.apply(lambda r: r[0]["dangerRating"]["tln"])
 forecasts["danger_belowtree"]=forecasts.dangerRatings.apply(lambda r: r[0]["dangerRating"]["btl"])
@@ -532,6 +536,7 @@ import nltk
 import string
 nltk.download('omw-1.4')
 nltk.download('punkt')
+nltk.download('punkt_tab')
 nltk.download('stopwords')
 nltk.download('wordnet')
 # Remove stopwords (the, a, is, etc)
@@ -783,7 +788,7 @@ dimensional space or that the t-SNE algorithm parameters were
 chosen poorly.
 
 ```{code-cell} python
-cmap = matplotlib.cm.get_cmap('Paired')
+cmap = matplotlib.colormaps["Paired"]
 fig, ax = plt.subplots(1,2,figsize=(16,6))
 n_topics=len(svd_model.components_)
 lsa_keys = np.argmax(lsa_topic_sample, axis=1)

diff --git a/lectures/tools/matplotlib.md b/lectures/tools/matplotlib.md
@@ -318,7 +318,7 @@ def scale_by_middle(df):
 ```
 
 ```{code-cell} python
-to_plot = prices.groupby("Model").apply(scale_by_middle).T
+to_plot = prices.groupby("Model").apply(scale_by_middle, include_groups=False).T
 to_plot
 ```
 

diff --git a/lectures/tools/regression.md b/lectures/tools/regression.md
@@ -783,7 +783,7 @@ This improves predictions and reduces the variance of the predictions.
 
 from sklearn.ensemble import RandomForestRegressor
 forest = RandomForestRegressor(n_estimators = 10).fit(Xsim,ysim)
-fig=surface_scatter_plot(Xsim,ysim,lambda x: forest.predict([x]),
+fig=surface_scatter_plot(Xsim,ysim,lambda x: forest.predict([x])[0],
                          show_f0=True)
 fig
 ```
@@ -892,7 +892,7 @@ from sklearn import neural_network
 nn = neural_network.MLPRegressor((6,), activation="logistic",
                                  verbose=True, solver="lbfgs",
                                  alpha=0.0).fit(Xsim,ysim)
-fig=surface_scatter_plot(Xsim,ysim,lambda x: nn.predict([x]), show_f0=True)
+fig=surface_scatter_plot(Xsim,ysim,lambda x: nn.predict([x])[0], show_f0=True)
 fig
 ```
-Original file line number
+Diff line change
@@ Expand Up / @@ -32,7 +32,7 @@ dependencies: @@
         - xgboost
         - graphviz
         - bokeh
-        # - nltk
+        - nltk
         - pandas-datareader
         - seaborn
         - patsy
@@ Expand Down @@