NNPDF · RoyStegeman · Apr 11, 2024 · Apr 3, 2024 · Apr 11, 2024 · Apr 11, 2024
diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py
@@ -4,6 +4,7 @@
 Providers which prepare the data ready for
 :py:func:`n3fit.performfit.performfit`.
 """
+
 from collections import defaultdict
 import functools
 import hashlib
@@ -70,8 +71,7 @@ def __init__(self, group_name, seed, masks=None):
         super().__init__(group_name, seed)
 
     def __iter__(self):
-        for m in self.masks:
-            yield m
+        yield from self.masks
 
 
 def tr_masks(data, replica_trvlseed, parallel_models=False, replica=1, replicas=(1,)):
@@ -343,7 +343,7 @@ def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nn
 
 replicas_nnseed_fitting_data_dict = collect("replica_nnseed_fitting_data_dict", ("replicas",))
 groups_replicas_indexed_make_replica = collect(
-    "indexed_make_replica", ("group_dataset_inputs_by_experiment", "replicas")
+    "indexed_make_replica", ("replicas", "group_dataset_inputs_by_experiment")
 )
 
 
@@ -359,10 +359,24 @@ def pseudodata_table(groups_replicas_indexed_make_replica, replicas):
     `fitting::savepseudodata` is `true` (as per the default setting) and
     replicas are fitted one at a time. The table can be found in the replica
     folder i.e. <fit dir>/nnfit/replica_*/
-
     """
-    # Concatenate over replicas
-    df = pd.concat(groups_replicas_indexed_make_replica)
+    # groups_replicas_indexed_make_replica is collected over both replicas and dataset_input groups,
+    # in that order. This means that groups_replicas_indexed_make_replica is a flat list of length
+    # number_of_replicas * number_of_data_groups, ordered replica by replica:
+    # [data1_rep1, data2_rep1, ..., datan_rep1, ..., data1_repm, data2_repm, ..., datan_repm].
+
+    # To correctly put this into a single dataframe, we first need the number of
+    # dataset_input groups belonging to each replica
+    groups_per_replica = len(groups_replicas_indexed_make_replica) // len(replicas)
+    # then we make a list of pandas dataframes, each containing the pseudodata of all datasets
+    # generated for a single replica
+    df = [
+        pd.concat(groups_replicas_indexed_make_replica[i : i + groups_per_replica])
+        for i in range(0, len(groups_replicas_indexed_make_replica), groups_per_replica)
+    ]
+    # then we concatenate the pseudodata of all replicas into a single dataframe
+    df = pd.concat(df, axis=1)
+    # and finally we add as column titles the replica name
     df.columns = [f"replica {rep}" for rep in replicas]
     return df