experimentalist working with pandas dataframes

blinodelka · Aug 18, 2023 · 1610edc · 1610edc
1 parent 236bf8b
commit 1610edc
Show file tree

Hide file tree

Showing 9 changed files with 110 additions and 60 deletions.
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/autora-experimentalist-sampler-mixture_experimentalist.iml b/.idea/autora-experimentalist-sampler-mixture_experimentalist.iml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/docs/basic-usage.ipynb b/docs/basic-usage.ipynb
@@ -25,11 +25,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'autora'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-1-5feb8d753802>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mautora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexperimentalist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msampler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmixture\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmixture_sample\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'autora'"
+     ]
+    }
+   ],
    "source": [
     "from autora.experimentalist.sampler.mixture import mixture_sample"
    ]
@@ -432,7 +442,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -446,7 +456,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.16"
+   "version": "3.7.4"
   }
  },
  "nbformat": 4,

diff --git a/pyproject.toml b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "autora-core",
     "typing",
     "numpy",
+    "pandas"
 ]
 
 [project.optional-dependencies]

diff --git a/src/autora/experimentalist/sampler/mixture/__init__.py b/src/autora/experimentalist/sampler/mixture/__init__.py
@@ -22,72 +22,64 @@ def adjust_distribution(p, temperature):
 
 
 
-def mixture_sample(condition_pool: np.ndarray, temperature: float, samplers: list, params: dict, num_samples: Optional[int] = None) -> np.ndarray:
+def mixture_sample(conditions: Union[pd.DataFrame, np.ndarray], temperature: float,
+                   samplers: list, params: dict,
+                   num_samples: Optional[int] = None) -> pd.DataFrame:
     """
 
     Args:
-        condition_pool: pool of experimental conditions to evaluate
+        conditions: pool of experimental conditions to evaluate (pd.DataFrame or np.ndarray)
         temperature: how random is selection of conditions (cannot be 0; (0:1) - the choices are more deterministic than the choices made wrt
-        samplers: tuple containing sampler functions, their names, and weights 
+        samplers: tuple containing sampler functions, their names, and weights
         for sampler functions that return both positive and negative scores, user can provide a list with two weights: the first one will be applied to positive scores, the second one -- to the negative
         params: nested dictionary. keys correspond to the sampler function names (same as provided in samplers),
         values correspond to the dictionaries of function arguments (argument name: its value)
         num_samples: number of experimental conditions to select
-        
+
     Returns:
-        Sampled pool of experimental conditions
+        Sampled pool of experimental conditions with the scores attached to them
     """
-
-    rankings = []
-    scores = []
-
+
+    condition_pool = pd.DataFrame(conditions)
+
+    rankings = pd.DataFrame()
+    mixture_scores = np.zeros(len(condition_pool))
     ## getting rankings and weighted scores from each function
     for (function, name, weight) in samplers:
-        sampler_params = params[name]
-        cur_ranking, cur_scores = function(condition_pool=condition_pool, **sampler_params)
-        cur_indices = np.argsort(cur_ranking, axis=None)
-        cur_ranking_sorted = cur_ranking[cur_indices]
-        rankings.append(cur_ranking_sorted) # for checking: all elements should be the same & same order
-        ## if function scores can be negative, then create a reversed dimension for them
-        if np.sum(cur_scores<0)>0:
-
-            cur_scores_positive = np.copy(cur_scores)
-            cur_scores_positive[cur_scores<0]=0
-            cur_scores_negative = -np.copy(cur_scores)
-            cur_scores_negative[cur_scores>0]=0
-
-            # aligning scores
-            cur_scores_positive_sorted = cur_scores_positive[cur_indices]
-            cur_scores_negative_sorted = cur_scores_negative[cur_indices]
-
-            # if only one weight is provided, use it for both negative and positive dimensions
-            if isinstance(weight, int):
-                cur_scores_positive_weighted = cur_scores_positive_sorted * weight
-                cur_scores_negative_weighted = cur_scores_negative_sorted * weight
-            else:
-                cur_scores_positive_weighted = cur_scores_positive_sorted * weight[0] # positive dimension gets the first weight
-                cur_scores_negative_weighted = cur_scores_negative_sorted * weight[1] # negative dimension gets the second weight
-
-            scores.append(cur_scores_positive_weighted)
-            scores.append(cur_scores_negative_weighted)
-
+        try:
+            sampler_params = params[name]
+            pd_ranking = function(conditions=condition_pool, **sampler_params)
+        except:
+            pd_ranking = function(conditions=condition_pool)
+        # sorting by index
+        pd_ranking = pd_ranking.sort_index()
+
+        # if only one weight is provided, use it for both negative and positive dimensions
+        if isinstance(weight, float) or isinstance(weight, int):
+            pd_ranking["score"] = pd_ranking["score"] * weight
         else:
-            cur_scores_sorted = cur_scores[cur_indices]
-            if isinstance(weight, int):
-                cur_scores_weighted = cur_scores_sorted * weight
-            else: 
-                cur_scores_weighted = cur_scores_sorted * weight[0]
-            scores.append(cur_scores_weighted)
-
-    weighted_mixture_scores = np.sum(scores, axis = 0)
-
+            if len(pd_ranking["score"] < 0) > 0 and len(pd_ranking["score"] > 0) > 0:  # there are both positive and negative values
+
+                pd_ranking.loc[pd_ranking["score"] > 0]["score"] = pd_ranking.loc[pd_ranking["score"] > 0]["score"] * weight[0]  # positive dimension gets the first weight
+                pd_ranking.loc[pd_ranking["score"] < 0]["score"] = pd_ranking.loc[pd_ranking["score"] < 0]["score"] * weight[1]  # negative dimension gets the second weight
+            else:
+                pd_ranking["score"] = pd_ranking["score"] * weight[0]
+
+        pd_ranking.rename(columns={"score": f"{name}_score"}, inplace=True)
+        # mixture_scores follow the index order of condition_pool (pd_ranking was sorted by index above)
+        mixture_scores = mixture_scores + pd_ranking[f"{name}_score"]
+
+        rankings = pd.merge(rankings, pd_ranking, left_index=True, right_index=True, how="outer")
+
     # adjust mixture scores wrt temperature
-    weighted_mixture_scores_adjusted = adjust_distribution(weighted_mixture_scores, temperature)
-    
+    weighted_mixture_scores_adjusted = adjust_distribution(mixture_scores, temperature)
+
     if num_samples is None:
         num_samples = condition_pool.shape[0]
-
-    conditions = np.random.choice(cur_ranking_sorted.T.squeeze(), num_samples,
-              p=weighted_mixture_scores_adjusted, replace = False)
-
-    return conditions
+
+    condition_indices = np.random.choice(np.arange(len(condition_pool)), num_samples,
+                                         p=weighted_mixture_scores_adjusted, replace=False)
+    conditions_ = condition_pool.iloc[condition_indices]
+    conditions_["score"] = mixture_scores
+
+    return conditions_