diff --git a/include/maths/CBoostedTree.h b/include/maths/CBoostedTree.h index f6556779e7..fb770b1d53 100644 --- a/include/maths/CBoostedTree.h +++ b/include/maths/CBoostedTree.h @@ -35,7 +35,8 @@ class CEncodedDataFrameRowRef; namespace boosted_tree_detail { class MATHS_EXPORT CArgMinLossImpl { public: - using TDouble1Vec = core::CSmallVector; + using TDoubleVector = CDenseVector; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; public: CArgMinLossImpl(double lambda); @@ -43,9 +44,11 @@ class MATHS_EXPORT CArgMinLossImpl { virtual std::unique_ptr clone() const = 0; virtual bool nextPass() = 0; - virtual void add(TDouble1Vec prediction, double actual, double weight = 1.0) = 0; + virtual void add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) = 0; virtual void merge(const CArgMinLossImpl& other) = 0; - virtual TDouble1Vec value() const = 0; + virtual TDoubleVector value() const = 0; protected: double lambda() const; @@ -61,9 +64,9 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl { CArgMinMseImpl(double lambda); std::unique_ptr clone() const override; bool nextPass() override; - void add(TDouble1Vec prediction, double actual, double weight = 1.0) override; + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; void merge(const CArgMinLossImpl& other) override; - TDouble1Vec value() const override; + TDoubleVector value() const override; private: using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; @@ -79,14 +82,14 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { CArgMinLogisticImpl(double lambda); std::unique_ptr clone() const override; bool nextPass() override; - void add(TDouble1Vec prediction, double actual, double weight = 1.0) override; + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; void merge(const CArgMinLossImpl& other) override; - TDouble1Vec value() const override; + TDoubleVector value() const override; private: using TMinMaxAccumulator = CBasicStatistics::CMinMax; - using TVector = CVectorNx1; - using TVectorVec = std::vector; + using TDoubleVector2x1 = CVectorNx1; + using TDoubleVector2x1Vec = std::vector; private: std::size_t bucket(double prediction) const { @@ -108,8 +111,8 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { private: std::size_t m_CurrentPass = 0; TMinMaxAccumulator m_PredictionMinMax; - TVector m_CategoryCounts; - TVectorVec m_BucketCategoryCounts; + TDoubleVector2x1 m_CategoryCounts; + TDoubleVector2x1Vec m_BucketCategoryCounts; }; } @@ -118,7 +121,8 @@ namespace boosted_tree { //! \brief Computes the leaf value which minimizes the loss function. class MATHS_EXPORT CArgMinLoss { public: - using TDouble1Vec = core::CSmallVector; + using TDoubleVector = CDenseVector; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; public: CArgMinLoss(const CArgMinLoss& other); @@ -133,7 +137,7 @@ class MATHS_EXPORT CArgMinLoss { bool nextPass() const; //! Update with a point prediction and actual value. - void add(TDouble1Vec prediction, double actual, double weight = 1.0); + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0); //! Get the minimiser over the predictions and actual values added to both //! this and \p other. @@ -144,7 +148,7 @@ class MATHS_EXPORT CArgMinLoss { //! //! Formally, returns \f$x^* = arg\min_x\{\sum_i{L(p_i + x, a_i)}\}\f$ //! 
for predictions and actuals \f$p_i\f$ and \f$a_i\f$, respectively. - TDouble1Vec value() const; + TDoubleVector value() const; private: using TArgMinLossImplUPtr = std::unique_ptr; @@ -161,7 +165,9 @@ class MATHS_EXPORT CArgMinLoss { //! \brief Defines the loss function for the regression problem. class MATHS_EXPORT CLoss { public: - using TDouble1Vec = core::CSmallVector; + using TDoubleVector = CDenseVector; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; + using TWriter = std::function; public: virtual ~CLoss() = default; @@ -170,17 +176,23 @@ class MATHS_EXPORT CLoss { //! The number of parameters to the loss function. virtual std::size_t numberParameters() const = 0; //! The value of the loss function. - virtual double value(TDouble1Vec prediction, double actual, double weight = 1.0) const = 0; + virtual double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const = 0; //! The gradient of the loss function. - virtual TDouble1Vec - gradient(TDouble1Vec prediction, double actual, double weight = 1.0) const = 0; + virtual void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const = 0; //! The Hessian of the loss function (flattened). - virtual TDouble1Vec - curvature(TDouble1Vec prediction, double actual, double weight = 1.0) const = 0; + virtual void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const = 0; //! Returns true if the loss curvature is constant. virtual bool isCurvatureConstant() const = 0; //! Transforms a prediction from the forest to the target space. - virtual TDouble1Vec transform(TDouble1Vec prediction) const = 0; + virtual TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const = 0; //! Get an object which computes the leaf value that minimises loss. virtual CArgMinLoss minimizer(double lambda) const = 0; //! 
Get the name of the loss function @@ -198,11 +210,19 @@ class MATHS_EXPORT CMse final : public CLoss { public: std::unique_ptr clone() const override; std::size_t numberParameters() const override; - double value(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec gradient(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec curvature(TDouble1Vec prediction, double actual, double weight = 1.0) const override; + double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const override; + void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; bool isCurvatureConstant() const override; - TDouble1Vec transform(TDouble1Vec prediction) const override; + TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; CArgMinLoss minimizer(double lambda) const override; const std::string& name() const override; }; @@ -223,11 +243,19 @@ class MATHS_EXPORT CBinomialLogistic final : public CLoss { public: std::unique_ptr clone() const override; std::size_t numberParameters() const override; - double value(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec gradient(TDouble1Vec prediction, double actual, double weight = 1.0) const override; - TDouble1Vec curvature(TDouble1Vec prediction, double actual, double weight = 1.0) const override; + double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const override; + void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; bool isCurvatureConstant() const override; - TDouble1Vec transform(TDouble1Vec prediction) const override; + TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; CArgMinLoss minimizer(double lambda) const override; const std::string& name() const override; }; diff --git a/include/maths/CBoostedTreeLeafNodeStatistics.h b/include/maths/CBoostedTreeLeafNodeStatistics.h index 74f6588510..1373bd422e 100644 --- a/include/maths/CBoostedTreeLeafNodeStatistics.h +++ b/include/maths/CBoostedTreeLeafNodeStatistics.h @@ -8,17 +8,25 @@ #define INCLUDED_ml_maths_CBoostedTreeLeafNodeStatistics_h #include +#include #include #include #include #include +#include +#include +#include #include +#include +#include #include #include +#include #include +#include #include namespace ml { @@ -37,14 +45,310 @@ class CEncodedDataFrameRowRef; //! The regression tree is grown top down by greedily selecting the split with //! the maximum gain (in the loss). This finds and scores the maximum gain split //! of a single leaf of the tree. 
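The class comment above describes finding and scoring the maximum-gain split of a single leaf. As a point of reference, here is a small self-contained sketch of that scoring for a single-parameter loss, using the standard gradient boosting gain g_l^2/(h_l + lambda) + g_r^2/(h_r + lambda) - g^2/(h + lambda); the factor of 1/2 from the loss expansion is dropped because it does not change which split wins. The toy data and the standalone main() are illustrative only and are not part of this header.

// Illustrative sketch: score every candidate split of one sorted feature and
// keep the best, the same quantity this class computes per leaf.
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

int main() {
    std::vector<double> feature{0.1, 0.4, 0.5, 0.9};     // one feature, sorted
    std::vector<double> gradient{-2.0, -1.0, 1.5, 2.5};  // per-row loss gradients
    std::vector<double> curvature{1.0, 1.0, 1.0, 1.0};   // per-row loss curvatures
    double lambda{0.1};                                   // leaf weight penalty

    double gTotal{0.0};
    double hTotal{0.0};
    for (std::size_t i = 0; i < gradient.size(); ++i) {
        gTotal += gradient[i];
        hTotal += curvature[i];
    }

    double bestGain{-std::numeric_limits<double>::infinity()};
    double bestSplit{feature[0]};
    double gLeft{0.0};
    double hLeft{0.0};
    for (std::size_t i = 0; i + 1 < feature.size(); ++i) {
        gLeft += gradient[i];
        hLeft += curvature[i];
        double gRight{gTotal - gLeft};
        double hRight{hTotal - hLeft};
        double gain{gLeft * gLeft / (hLeft + lambda) +
                    gRight * gRight / (hRight + lambda) -
                    gTotal * gTotal / (hTotal + lambda)};
        if (gain > bestGain) {
            bestGain = gain;
            bestSplit = 0.5 * (feature[i] + feature[i + 1]);
        }
    }
    std::cout << "best split = " << bestSplit << ", gain = " << bestGain << '\n';
    return 0;
}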
-class CBoostedTreeLeafNodeStatistics final { +class MATHS_EXPORT CBoostedTreeLeafNodeStatistics final { public: + using TDoubleVec = std::vector; using TSizeVec = std::vector; using TSizeDoublePr = std::pair; using TRegularization = CBoostedTreeRegularization; - using TImmutableRadixSetVec = std::vector>; + using TImmutableRadixSet = core::CImmutableRadixSet; + using TImmutableRadixSetVec = std::vector; using TPtr = std::shared_ptr; using TPtrPtrPr = std::pair; + using TMemoryMappedFloatVector = CMemoryMappedDenseVector; + using TMemoryMappedDoubleVector = CMemoryMappedDenseVector; + using TMemoryMappedDoubleMatrix = CMemoryMappedDenseMatrix; + + //! \brief Accumulates aggregate derivatives. + class MATHS_EXPORT CDerivatives { + public: + //! Bounds the minimum diagonal of the Hessian. + static constexpr double SMALLEST_RELATIVE_CURVATURE{1e-20}; + + //! See core::CMemory. + static bool dynamicSizeAlwaysZero() { return true; } + + public: + CDerivatives(std::size_t numberLossParameters, double* storage) + : m_Count{0}, m_Gradient{storage, static_cast(numberLossParameters)}, + m_Curvature{storage + numberLossParameters, + static_cast(numberLossParameters), + static_cast(numberLossParameters)} {} + + //! Get the accumulated count. + std::size_t count() const { return m_Count; } + + //! Get the accumulated gradient. + const TMemoryMappedDoubleVector& gradient() const { return m_Gradient; } + + //! Get the accumulated curvature. + const TMemoryMappedDoubleMatrix& curvature() const { + return m_Curvature; + } + + //! Add \p count, \p gradient and \p curvature to the accumulator. + void add(std::size_t count, + const TMemoryMappedFloatVector& gradient, + const TMemoryMappedFloatVector& curvature) { + m_Count += count; + m_Gradient += gradient; + this->curvatureTriangleView() += curvature; + } + + //! Compute the accumulation of both collections of derivatives. + void add(const CDerivatives& other) { + m_Count += other.m_Count; + m_Gradient += other.m_Gradient; + m_Curvature += other.m_Curvature; + } + + //! Set to the difference of \p lhs and \p rhs. + void subtract(const CDerivatives& rhs) { + m_Count -= rhs.m_Count; + if (m_Count > 0) { + m_Gradient -= rhs.m_Gradient; + m_Curvature -= rhs.m_Curvature; + // None of our loss functions have negative curvature therefore we + // shouldn't allow the cumulative curvature to be negative either. + // In this case we force it to be a very small multiple of the + // magnitude of the gradient since this is the closest feasible + // estimate. + for (int i = 0; i < m_Gradient.size(); ++i) { + m_Curvature(i, i) = std::max(m_Curvature(i, i), + SMALLEST_RELATIVE_CURVATURE * + std::fabs(m_Gradient(i))); + } + } else { + // Numeric errors mean that it's possible the sum curvature for a + // candidate split is identically zero while the gradient is epsilon. + // This can cause the node gain to appear infinite (when there is no + // weight regularisation) which in turns causes problems initialising + // the region we search for optimal hyperparameter values. We can + // safely force the gradient and curvature to be zero if we detect + // that the count is zero. + m_Gradient.setZero(); + m_Curvature.setZero(); + } + } + + //! Remap the accumulated curvature to lower triangle row major format. + void remapCurvature() { + // For performance, we accumulate curvatures into the first n (n + 1) / 2 + // elements of the array backing m_Curvature. 
However, the memory mapped + // matrix class expects them to be stored column major in the lower triangle + // of n x n matrix. This copies them backwards to their correct positions. + for (std::ptrdiff_t j = m_Curvature.cols() - 1, + k = m_Curvature.rows() * (m_Curvature.rows() + 1) / 2 - 1; + j >= 0; --j) { + for (std::ptrdiff_t i = m_Curvature.rows() - 1; i >= j; --i, --k) { + m_Curvature(i, j) = m_Curvature.array()(k); + } + } + } + + //! Get a checksum of this object. + std::uint64_t checksum(std::uint64_t seed = 0) const { + seed = CChecksum::calculate(seed, m_Count); + seed = CChecksum::calculate(seed, m_Gradient); + return CChecksum::calculate(seed, m_Curvature); + } + + private: + TMemoryMappedDoubleVector curvatureTriangleView() { + return {m_Curvature.data(), m_Curvature.rows() * (m_Curvature.rows() + 1) / 2}; + } + + private: + std::size_t m_Count = 0; + TMemoryMappedDoubleVector m_Gradient; + TMemoryMappedDoubleMatrix m_Curvature; + }; + + //! \brief A collection of aggregate derivatives for candidate feature splits. + class MATHS_EXPORT CPerSplitDerivatives { + public: + using TDerivativesVec = std::vector; + + public: + explicit CPerSplitDerivatives(std::size_t numberLossParameters = 0) + : m_NumberLossParameters{numberLossParameters} {} + CPerSplitDerivatives(const TImmutableRadixSetVec& candidateSplits, + std::size_t numberLossParameters) + : m_NumberLossParameters{numberLossParameters} { + this->map(candidateSplits); + } + CPerSplitDerivatives(const CPerSplitDerivatives& other) + : m_NumberLossParameters{other.m_NumberLossParameters} { + this->map(other.m_Derivatives); + this->add(other); + } + CPerSplitDerivatives(CPerSplitDerivatives&&) = default; + + CPerSplitDerivatives& operator=(const CPerSplitDerivatives& other) = delete; + CPerSplitDerivatives& operator=(CPerSplitDerivatives&&) = default; + + //! \return The aggregate count for \p feature and \p split. + std::size_t count(std::size_t feature, std::size_t split) const { + return m_Derivatives[feature][split].count(); + } + + //! \return The aggregate gradient for \p feature and \p split. + const TMemoryMappedDoubleVector& gradient(std::size_t feature, std::size_t split) const { + return m_Derivatives[feature][split].gradient(); + } + + //! \return The aggregate curvature for \p feature and \p split. + const TMemoryMappedDoubleMatrix& curvature(std::size_t feature, std::size_t split) const { + return m_Derivatives[feature][split].curvature(); + } + + //! \return All the split aggregate derivatives for \p feature. + const TDerivativesVec& derivatives(std::size_t feature) const { + return m_Derivatives[feature]; + } + + //! \return The count for missing \p feature. + std::size_t missingCount(std::size_t feature) const { + return m_MissingDerivatives[feature].count(); + } + + //! \return The aggregate gradients for missing \p feature. + const TMemoryMappedDoubleVector& missingGradient(std::size_t feature) const { + return m_MissingDerivatives[feature].gradient(); + } + + //! \return The aggregate curvatures for missing \p feature. + const TMemoryMappedDoubleMatrix& missingCurvature(std::size_t feature) const { + return m_MissingDerivatives[feature].curvature(); + } + + //! Add \p gradient and \p curvature to the accumulated derivatives for + //! the \p split of \p feature. + void addDerivatives(std::size_t feature, + std::size_t split, + const TMemoryMappedFloatVector& gradient, + const TMemoryMappedFloatVector& curvature) { + m_Derivatives[feature][split].add(1, gradient, curvature); + } + + //! 
Add \p gradient and \p curvature to the accumulated derivatives for + //! missing values of \p feature. + void addMissingDerivatives(std::size_t feature, + const TMemoryMappedFloatVector& gradient, + const TMemoryMappedFloatVector& curvature) { + m_MissingDerivatives[feature].add(1, gradient, curvature); + } + + //! Compute the accumulation of both collections of per split derivatives. + void add(const CPerSplitDerivatives& other) { + for (std::size_t i = 0; i < other.m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < other.m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].add(other.m_Derivatives[i][j]); + } + m_MissingDerivatives[i].add(other.m_MissingDerivatives[i]); + } + } + + //! Subtract \p rhs. + void subtract(const CPerSplitDerivatives& rhs) { + for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].subtract(rhs.m_Derivatives[i][j]); + } + m_MissingDerivatives[i].subtract(rhs.m_MissingDerivatives[i]); + } + } + + //! Remap the accumulated curvature to lower triangle row major format. + void remapCurvature() { + for (std::size_t i = 0; i < m_Derivatives.size(); ++i) { + for (std::size_t j = 0; j < m_Derivatives[i].size(); ++j) { + m_Derivatives[i][j].remapCurvature(); + } + m_MissingDerivatives[i].remapCurvature(); + } + } + + //! Get the memory used by this object. + std::size_t memoryUsage() const { + return core::CMemory::dynamicSize(m_Derivatives) + + core::CMemory::dynamicSize(m_MissingDerivatives) + + core::CMemory::dynamicSize(m_Storage); + } + + //! Estimate the split derivatives' memory usage for a data frame with + //! \p numberCols columns using \p numberSplitsPerFeature for a loss + //! function with \p numberLossParameters parameters. + static std::size_t estimateMemoryUsage(std::size_t numberFeatures, + std::size_t numberSplitsPerFeature, + std::size_t numberLossParameters) { + std::size_t derivativesSize{numberFeatures * (numberSplitsPerFeature + 1) * + sizeof(CDerivatives)}; + std::size_t storageSize{numberFeatures * (numberSplitsPerFeature + 1) * numberLossParameters * + (numberLossParameters + 1) * sizeof(double)}; + return sizeof(CPerSplitDerivatives) + derivativesSize + storageSize; + } + + //! Get a checksum of this object. + std::uint64_t checksum(std::uint64_t seed = 0) const { + seed = CChecksum::calculate(seed, m_NumberLossParameters); + seed = CChecksum::calculate(seed, m_Derivatives); + seed = CChecksum::calculate(seed, m_MissingDerivatives); + return seed; + } + + private: + using TDerivativesVecVec = std::vector; + + private: + static std::size_t number(const TDerivativesVec& derivatives) { + return derivatives.size(); + } + static std::size_t number(const TImmutableRadixSet& splits) { + return splits.size() + 1; + } + template + void map(const SPLITS& splits) { + // This function maps the memory in a single presized buffer containing + // enough space to store all gradient vectors and curvatures. For each + // feature the layout in this buffer is as follows: + // + // "split grad" "split hessian" "missing grad" "missing hessian" + // | | | | + // V V V V + // | n | n^2 | ... 
| n | n^2 | + + std::size_t totalNumberSplits{ + std::accumulate(splits.begin(), splits.end(), std::size_t{0}, + [](std::size_t size, const auto& featureSplits) { + return size + number(featureSplits); + })}; + + int numberGradients{static_cast(m_NumberLossParameters)}; + int numberCurvatures{numberGradients * numberGradients}; + int numberDerivatives{numberGradients + numberCurvatures}; + + m_Derivatives.resize(splits.size()); + m_MissingDerivatives.reserve(splits.size()); + m_Storage.resize((totalNumberSplits + splits.size()) * numberDerivatives, 0.0); + + double* storage{&m_Storage[0]}; + for (std::size_t i = 0; i < splits.size(); ++i, storage += numberDerivatives) { + std::size_t size{number(splits[i])}; + m_Derivatives[i].reserve(size); + for (std::size_t j = 0; j < size; ++j, storage += numberDerivatives) { + m_Derivatives[i].emplace_back(m_NumberLossParameters, storage); + } + m_MissingDerivatives.emplace_back(m_NumberLossParameters, storage); + } + } + + private: + std::size_t m_NumberLossParameters = 0; + TDerivativesVecVec m_Derivatives; + TDerivativesVec m_MissingDerivatives; + TDoubleVec m_Storage; + }; public: CBoostedTreeLeafNodeStatistics(std::size_t id, @@ -76,7 +380,7 @@ class CBoostedTreeLeafNodeStatistics final { //! Only called by split but is public so it's accessible to std::make_shared. CBoostedTreeLeafNodeStatistics(std::size_t id, - const CBoostedTreeLeafNodeStatistics& parent, + CBoostedTreeLeafNodeStatistics&& parent, const CBoostedTreeLeafNodeStatistics& sibling, const TRegularization& regularization, const TSizeVec& featureBag, @@ -98,8 +402,7 @@ class CBoostedTreeLeafNodeStatistics final { const TRegularization& regularization, const TImmutableRadixSetVec& candidateSplits, const TSizeVec& featureBag, - const CBoostedTreeNode& split, - bool leftChildHasFewerRows); + const CBoostedTreeNode& split); //! Order two leaves by decreasing gain in splitting them. bool operator<(const CBoostedTreeLeafNodeStatistics& rhs) const; @@ -130,17 +433,17 @@ class CBoostedTreeLeafNodeStatistics final { std::size_t memoryUsage() const; //! Estimate the maximum leaf statistics' memory usage training on a data frame - //! with \p numberRows rows and \p numberCols columns using \p featureBagFraction - //! and \p numberSplitsPerFeature. + //! with \p numberRows rows and \p numberCols columns using \p numberSplitsPerFeature + //! for a loss function with \p numberLossParameters parameters. static std::size_t estimateMemoryUsage(std::size_t numberRows, std::size_t numberCols, - std::size_t numberSplitsPerFeature); + std::size_t numberSplitsPerFeature, + std::size_t numberLossParameters); private: - using TDouble1Vec = core::CSmallVector; - //! \brief Statistics relating to a split of the node. - struct SSplitStatistics : private boost::less_than_comparable { + struct MATHS_EXPORT SSplitStatistics + : private boost::less_than_comparable { SSplitStatistics() = default; SSplitStatistics(double gain, double curvature, @@ -172,61 +475,8 @@ class CBoostedTreeLeafNodeStatistics final { bool s_AssignMissingToLeft = true; }; - //! \brief Aggregate derivatives. 
- struct SAggregateDerivatives { - void add(std::size_t count, const TDouble1Vec& gradient, const TDouble1Vec& curvature) { - s_Count += count; - s_Gradient += gradient[0]; - s_Curvature += curvature[0]; - } - - void merge(const SAggregateDerivatives& other) { - this->add(other.s_Count, {other.s_Gradient}, {other.s_Curvature}); - } - - std::string print() const { - std::ostringstream result; - result << "count = " << s_Count << ", gradient = " << s_Gradient - << ", curvature = " << s_Curvature; - return result.str(); - } - - std::size_t s_Count = 0; - double s_Gradient = 0.0; - double s_Curvature = 0.0; - }; - - using TAggregateDerivativesVec = std::vector; - using TAggregateDerivativesVecVec = std::vector; - - //! \brief A collection of aggregate derivatives for candidate feature splits. - struct SSplitAggregateDerivatives { - SSplitAggregateDerivatives(const TImmutableRadixSetVec& candidateSplits) - : s_Derivatives(candidateSplits.size()), - s_MissingDerivatives(candidateSplits.size()) { - for (std::size_t i = 0; i < candidateSplits.size(); ++i) { - s_Derivatives[i].resize(candidateSplits[i].size() + 1); - } - } - - void merge(const SSplitAggregateDerivatives& other) { - for (std::size_t i = 0; i < s_Derivatives.size(); ++i) { - for (std::size_t j = 0; j < s_Derivatives[i].size(); ++j) { - s_Derivatives[i][j].merge(other.s_Derivatives[i][j]); - } - s_MissingDerivatives[i].merge(other.s_MissingDerivatives[i]); - } - } - - auto move() { - return std::make_pair(std::move(s_Derivatives), std::move(s_MissingDerivatives)); - } - - TAggregateDerivativesVecVec s_Derivatives; - TAggregateDerivativesVec s_MissingDerivatives; - }; - private: + void maybeRecoverMemory(); void computeAggregateLossDerivatives(std::size_t numberThreads, const core::CDataFrame& frame, const CDataFrameCategoryEncoder& encoder); @@ -237,7 +487,7 @@ class CBoostedTreeLeafNodeStatistics final { const CBoostedTreeNode& split, const core::CPackedBitVector& parentRowMask); void addRowDerivatives(const CEncodedDataFrameRowRef& row, - SSplitAggregateDerivatives& splitAggregateDerivatives) const; + CPerSplitDerivatives& splitDerivatives) const; SSplitStatistics computeBestSplitStatistics(const TRegularization& regularization, const TSizeVec& featureBag) const; @@ -248,8 +498,7 @@ class CBoostedTreeLeafNodeStatistics final { std::size_t m_NumberLossParameters; const TImmutableRadixSetVec& m_CandidateSplits; core::CPackedBitVector m_RowMask; - TAggregateDerivativesVecVec m_Derivatives; - TAggregateDerivativesVec m_MissingDerivatives; + CPerSplitDerivatives m_Derivatives; SSplitStatistics m_BestSplit; }; } diff --git a/include/maths/CBoostedTreeUtils.h b/include/maths/CBoostedTreeUtils.h index 57cbac7284..700ee07019 100644 --- a/include/maths/CBoostedTreeUtils.h +++ b/include/maths/CBoostedTreeUtils.h @@ -8,26 +8,27 @@ #define INCLUDED_ml_maths_CBoostedTreeUtils_h #include -#include + +#include +#include +#include #include #include namespace ml { namespace maths { +namespace boosted_tree { +class CLoss; +} namespace boosted_tree_detail { -using TDouble1Vec = core::CSmallVector; using TRowRef = core::CDataFrame::TRowRef; +using TMemoryMappedFloatVector = CMemoryMappedDenseVector; inline std::size_t lossHessianStoredSize(std::size_t numberLossParameters) { return numberLossParameters * (numberLossParameters + 1) / 2; } -inline std::size_t numberLossParametersForHessianStoredSize(std::size_t lossHessianStoredSize) { - return static_cast( - (std::sqrt(8.0 * static_cast(lossHessianStoredSize) + 1.0) - 1.0) / 2.0 + 0.5); -} - inline 
std::size_t predictionColumn(std::size_t numberInputColumns) { return numberInputColumns; } @@ -48,31 +49,50 @@ inline std::size_t exampleWeightColumn(std::size_t numberInputColumns, lossHessianStoredSize(numberLossParameters); } +//! Read the prediction from \p row. +MATHS_EXPORT +TMemoryMappedFloatVector readPrediction(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParamaters); + +//! Zero the prediction of \p row. +MATHS_EXPORT +void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters); + +//! Read the loss gradient from \p row. MATHS_EXPORT -TDouble1Vec readPrediction(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParamaters); +TMemoryMappedFloatVector readLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters); +//! Zero the loss gradient of \p row. MATHS_EXPORT -void writePrediction(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& prediction); +void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); +//! Write the loss gradient to \p row. MATHS_EXPORT -TDouble1Vec readLossGradient(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters); +void writeLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + const boosted_tree::CLoss& loss, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0); MATHS_EXPORT -void writeLossGradient(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& gradient); +TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters); MATHS_EXPORT -TDouble1Vec readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters); +void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters); MATHS_EXPORT void writeLossCurvature(const TRowRef& row, std::size_t numberInputColumns, - const TDouble1Vec& curvature); + const boosted_tree::CLoss& curvature, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0); MATHS_EXPORT double readExampleWeight(const TRowRef& row, diff --git a/include/maths/CLinearAlgebraEigen.h b/include/maths/CLinearAlgebraEigen.h index 0e296f52da..72b51319d8 100644 --- a/include/maths/CLinearAlgebraEigen.h +++ b/include/maths/CLinearAlgebraEigen.h @@ -370,6 +370,16 @@ class CMemoryMappedDenseMatrix } //@} + //! Get a checksum of this object. 
+ std::uint64_t checksum(std::uint64_t seed = 0) const { + for (std::ptrdiff_t i = 0; i < this->rows(); ++i) { + for (std::ptrdiff_t j = 0; j < this->rows(); ++j) { + seed = CChecksum::calculate(seed, (*this)(i, j)); + } + } + return seed; + } + private: void reseat(const CMemoryMappedDenseMatrix& other) { TBase* base{static_cast(this)}; @@ -668,4 +678,16 @@ CVector::CVector(const CDenseVectorInitializer& v) { } } +namespace Eigen { +template +struct ScalarBinaryOpTraits { + using ReturnType = double; +}; + +template +struct ScalarBinaryOpTraits { + using ReturnType = double; +}; +} + #endif // INCLUDED_ml_maths_CLinearAlgebraEigen_h diff --git a/include/maths/CLinearAlgebraShims.h b/include/maths/CLinearAlgebraShims.h index 70c961e902..914d128abf 100644 --- a/include/maths/CLinearAlgebraShims.h +++ b/include/maths/CLinearAlgebraShims.h @@ -335,37 +335,63 @@ outer(const CAnnotatedVector& x) { namespace las_detail { template -struct SEstimateMemoryUsage { +struct SEstimateVectorMemoryUsage { static std::size_t value(std::size_t) { return 0; } }; template -struct SEstimateMemoryUsage> { +struct SEstimateVectorMemoryUsage> { static std::size_t value(std::size_t dimension) { return dimension * sizeof(T); } }; template -struct SEstimateMemoryUsage> { +struct SEstimateVectorMemoryUsage> { static std::size_t value(std::size_t dimension) { // Ignore pad for alignment. return dimension * sizeof(SCALAR); } }; template -struct SEstimateMemoryUsage> { +struct SEstimateVectorMemoryUsage> { static std::size_t value(std::size_t dimension) { // Ignore any dynamic memory used by the annotation: we don't know how to // compute this here. It will be up to the calling code to estimate this // correctly. - return SEstimateMemoryUsage::value(dimension); + return SEstimateVectorMemoryUsage::value(dimension); + } +}; + +template +struct SEstimateMatrixMemoryUsage { + static std::size_t value(std::size_t, std::size_t) { return 0; } +}; +template +struct SEstimateMatrixMemoryUsage> { + static std::size_t value(std::size_t rows, std::size_t) { + return sizeof(T) * rows * (rows + 1) / 2; + } +}; +template +struct SEstimateMatrixMemoryUsage> { + static std::size_t value(std::size_t rows, std::size_t columns) { + // Ignore pad for alignment. + return sizeof(SCALAR) * rows * columns; } }; } -//! Estimate the amount of memory a point of type VECTOR and \p dimension will use. +//! Estimate the amount of memory a vector of type VECTOR and size \p dimension +//! will use. template std::size_t estimateMemoryUsage(std::size_t dimension) { - return las_detail::SEstimateMemoryUsage::value(dimension); + return las_detail::SEstimateVectorMemoryUsage::value(dimension); +} + +//! Estimate the amount of memory a matrix of type MATRIX and size \p rows by +//! \p columns will use. 
+template +std::size_t estimateMemoryUsage(std::size_t rows, std::size_t columns) { + return las_detail::SEstimateMatrixMemoryUsage::value(rows, columns); } } } diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc index c60a6bb4b3..600b99643a 100644 --- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc @@ -136,7 +136,7 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 8000000, 0, 0, {"c1"}, s_Alpha, s_Lambda, + "target", s_Rows, 5, 26000000, 0, 0, {"c1"}, s_Alpha, s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; @@ -169,9 +169,9 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", s_Rows, 5, 8000000, 0, 0, {"target"}, s_Alpha, s_Lambda, - s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, - s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), + "target", s_Rows, 5, 26000000, 0, 0, {"target"}, s_Alpha, + s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, + s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; @@ -197,7 +197,7 @@ struct SFixture { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "target", s_Rows, 5, 8000000, 0, 0, {}, s_Alpha, s_Lambda, + "target", s_Rows, 5, 26000000, 0, 0, {}, s_Alpha, s_Lambda, s_Gamma, s_SoftTreeDepthLimit, s_SoftTreeDepthTolerance, s_Eta, s_MaximumNumberTrees, s_FeatureBagFraction, shapValues), outputWriterFactory}; diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index bb50d962d6..10989bd093 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -426,7 +426,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTraining) { BOOST_TEST_REQUIRE(core::CProgramCounters::counter( counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6000000); - BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1400000); + BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1500000); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration); } @@ -724,7 +724,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierTraining) { << "ms"); BOOST_TEST_REQUIRE(core::CProgramCounters::counter( counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6000000); - BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1400000); + BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1500000); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0); BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration); } @@ -752,7 
+752,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierImbalanced) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", numberExamples, 4, 14000000, 0, 0, {"target"}), + "target", numberExamples, 4, 17000000, 0, 0, {"target"}), outputWriterFactory}; TStrVec actuals; @@ -800,7 +800,7 @@ BOOST_AUTO_TEST_CASE(testCategoricalFields) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), - "x5", 1000, 5, 19000000, 0, 0, {"x1", "x2"}), + "x5", 1000, 5, 26000000, 0, 0, {"x1", "x2"}), outputWriterFactory}; TStrVec x[]{{"x11", "x12", "x13", "x14", "x15"}, @@ -906,7 +906,7 @@ BOOST_AUTO_TEST_CASE(testCategoricalFieldsEmptyAsMissing) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::classification(), - "x5", 1000, 5, 19000000, 0, 0, {"x1", "x2", "x5"}), + "x5", 1000, 5, 26000000, 0, 0, {"x1", "x2", "x5"}), outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index e6ccdabd69..a967808e3c 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -67,8 +67,8 @@ bool CArgMinMseImpl::nextPass() { return false; } -void CArgMinMseImpl::add(TDouble1Vec prediction, double actual, double weight) { - m_MeanError.add(actual - prediction[0], weight); +void CArgMinMseImpl::add(const TMemoryMappedFloatVector& prediction, double actual, double weight) { + m_MeanError.add(actual - prediction(0), weight); } void CArgMinMseImpl::merge(const CArgMinLossImpl& other) { @@ -78,7 +78,7 @@ void CArgMinMseImpl::merge(const CArgMinLossImpl& other) { } } -CArgMinMseImpl::TDouble1Vec CArgMinMseImpl::value() const { +CArgMinMseImpl::TDoubleVector CArgMinMseImpl::value() const { // We searching for the value x which minimises // @@ -89,12 +89,16 @@ CArgMinMseImpl::TDouble1Vec CArgMinMseImpl::value() const { // error m = 1/n sum_i{ a_i - p_i } we have x^* = n / (n + lambda) m. double count{CBasicStatistics::count(m_MeanError)}; - return {count == 0.0 ? 0.0 : count / (count + this->lambda()) * CBasicStatistics::mean(m_MeanError)}; + double meanError{CBasicStatistics::mean(m_MeanError)}; + + TDoubleVector result(1); + result(0) = count == 0.0 ? 
0.0 : count / (count + this->lambda()) * meanError; + return result; } CArgMinLogisticImpl::CArgMinLogisticImpl(double lambda) : CArgMinLossImpl{lambda}, m_CategoryCounts{0}, - m_BucketCategoryCounts(128, TVector{0.0}) { + m_BucketCategoryCounts(128, TDoubleVector2x1{0.0}) { } std::unique_ptr CArgMinLogisticImpl::clone() const { @@ -106,15 +110,17 @@ bool CArgMinLogisticImpl::nextPass() { return m_CurrentPass < 2; } -void CArgMinLogisticImpl::add(TDouble1Vec prediction, double actual, double weight) { +void CArgMinLogisticImpl::add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { switch (m_CurrentPass) { case 0: { - m_PredictionMinMax.add(prediction[0]); + m_PredictionMinMax.add(prediction(0)); m_CategoryCounts(static_cast(actual)) += weight; break; } case 1: { - auto& count = m_BucketCategoryCounts[this->bucket(prediction[0])]; + auto& count = m_BucketCategoryCounts[this->bucket(prediction(0))]; count(static_cast(actual)) += weight; break; } @@ -142,7 +148,7 @@ void CArgMinLogisticImpl::merge(const CArgMinLossImpl& other) { } } -CArgMinLogisticImpl::TDouble1Vec CArgMinLogisticImpl::value() const { +CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { std::function objective; double minWeight; @@ -190,8 +196,11 @@ CArgMinLogisticImpl::TDouble1Vec CArgMinLogisticImpl::value() const { maxWeight = -m_PredictionMinMax.min() + 5.0; } + TDoubleVector result(1); + if (minWeight == maxWeight) { - return {minWeight}; + result(0) = minWeight; + return result; } double minimum; @@ -201,7 +210,8 @@ CArgMinLogisticImpl::TDouble1Vec CArgMinLogisticImpl::value() const { objective, 1e-3, maxIterations, minimum, objectiveAtMinimum); LOG_TRACE(<< "minimum = " << minimum << " objective(minimum) = " << objectiveAtMinimum); - return {minimum}; + result(0) = minimum; + return result; } } @@ -223,7 +233,7 @@ bool CArgMinLoss::nextPass() const { return m_Impl->nextPass(); } -void CArgMinLoss::add(TDouble1Vec prediction, double actual, double weight) { +void CArgMinLoss::add(const TMemoryMappedFloatVector& prediction, double actual, double weight) { return m_Impl->add(prediction, actual, weight); } @@ -231,7 +241,7 @@ void CArgMinLoss::merge(CArgMinLoss& other) { return m_Impl->merge(*other.m_Impl); } -CArgMinLoss::TDouble1Vec CArgMinLoss::value() const { +CArgMinLoss::TDoubleVector CArgMinLoss::value() const { return m_Impl->value(); } @@ -250,25 +260,30 @@ std::size_t CMse::numberParameters() const { return 1; } -double CMse::value(TDouble1Vec prediction, double actual, double weight) const { - return weight * CTools::pow2(prediction[0] - actual); +double CMse::value(const TMemoryMappedFloatVector& prediction, double actual, double weight) const { + return weight * CTools::pow2(prediction(0) - actual); } -CMse::TDouble1Vec CMse::gradient(TDouble1Vec prediction, double actual, double weight) const { - return {2.0 * weight * (prediction[0] - actual)}; +void CMse::gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight) const { + writer(0, 2.0 * weight * (prediction(0) - actual)); } -CMse::TDouble1Vec -CMse::curvature(TDouble1Vec /*prediction*/, double /*actual*/, double weight) const { - return {2.0 * weight}; +void CMse::curvature(const TMemoryMappedFloatVector& /*prediction*/, + double /*actual*/, + TWriter writer, + double weight) const { + writer(0, 2.0 * weight); } bool CMse::isCurvatureConstant() const { return true; } -CMse::TDouble1Vec CMse::transform(TDouble1Vec prediction) const { - return prediction; 
+CMse::TDoubleVector CMse::transform(const TMemoryMappedFloatVector& prediction) const { + return TDoubleVector{prediction}; } CArgMinLoss CMse::minimizer(double lambda) const { @@ -289,35 +304,43 @@ std::size_t CBinomialLogistic::numberParameters() const { return 1; } -double CBinomialLogistic::value(TDouble1Vec prediction, double actual, double weight) const { - // Cross entropy - return -weight * ((1.0 - actual) * logOneMinusLogistic(prediction[0]) + - actual * logLogistic(prediction[0])); +double CBinomialLogistic::value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) const { + return -weight * ((1.0 - actual) * logOneMinusLogistic(prediction(0)) + + actual * logLogistic(prediction(0))); } -CBinomialLogistic::TDouble1Vec -CBinomialLogistic::gradient(TDouble1Vec prediction, double actual, double weight) const { - if (prediction[0] > -LOG_EPSILON && actual == 1.0) { - return {-weight * std::exp(-prediction[0])}; +void CBinomialLogistic::gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight) const { + if (prediction(0) > -LOG_EPSILON && actual == 1.0) { + writer(0, -weight * std::exp(-prediction(0))); } - return {weight * CTools::logisticFunction(prediction[0]) - actual}; + writer(0, weight * (CTools::logisticFunction(prediction(0)) - actual)); } -CBinomialLogistic::TDouble1Vec -CBinomialLogistic::curvature(TDouble1Vec prediction, double /*actual*/, double weight) const { - if (prediction[0] > -LOG_EPSILON) { - return {weight * std::exp(-prediction[0])}; +void CBinomialLogistic::curvature(const TMemoryMappedFloatVector& prediction, + double /*actual*/, + TWriter writer, + double weight) const { + if (prediction(0) > -LOG_EPSILON) { + writer(0, weight * std::exp(-prediction(0))); } - double probability{CTools::logisticFunction(prediction[0])}; - return {weight * probability * (1.0 - probability)}; + double probability{CTools::logisticFunction(prediction(0))}; + writer(0, weight * probability * (1.0 - probability)); } bool CBinomialLogistic::isCurvatureConstant() const { return false; } -CBinomialLogistic::TDouble1Vec CBinomialLogistic::transform(TDouble1Vec prediction) const { - return {CTools::logisticFunction(prediction[0])}; +CBinomialLogistic::TDoubleVector +CBinomialLogistic::transform(const TMemoryMappedFloatVector& prediction) const { + TDoubleVector result{prediction}; + result(0) = CTools::logisticFunction(result(0)); + return result; } CArgMinLoss CBinomialLogistic::minimizer(double lambda) const { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 067eb63061..d8ed7b5ae6 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -40,11 +39,6 @@ namespace { // by only refreshing once every MINIMUM_SPLIT_REFRESH_INTERVAL trees we add. const double MINIMUM_SPLIT_REFRESH_INTERVAL{3.0}; -double lossAtNSigma(double n, const TMeanVarAccumulator& lossMoments) { - return CBasicStatistics::mean(lossMoments) + - n * std::sqrt(CBasicStatistics::variance(lossMoments)); -} - //! \brief Record the memory used by a supplied object using the RAII idiom. 
class CScopeRecordMemoryUsage { public: @@ -122,6 +116,21 @@ class CTrainForestStoppingCondition { TDoubleSizePrMinAccumulator m_BestTestLoss; }; +double lossAtNSigma(double n, const TMeanVarAccumulator& lossMoments) { + return CBasicStatistics::mean(lossMoments) + + n * std::sqrt(CBasicStatistics::variance(lossMoments)); +} + +double trace(std::size_t columns, const TMemoryMappedFloatVector& upperTriangle) { + // This assumes the upper triangle of the matrix is stored row major. + double result{0.0}; + for (int i = 0, j = static_cast(columns); + i < upperTriangle.size() && j > 0; i += j, --j) { + result += upperTriangle(i); + } + return result; +} + CDataFrameAnalysisInstrumentationStub INSTRUMENTATION_STUB; } @@ -254,9 +263,10 @@ void CBoostedTreeImpl::predict(core::CDataFrame& frame) const { bool successful; std::tie(std::ignore, successful) = frame.writeColumns( m_NumberThreads, 0, frame.numberRows(), [&](TRowItr beginRows, TRowItr endRows) { + std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { - writePrediction(*row, m_NumberInputColumns, - {predictRow(m_Encoder->encode(*row), m_BestForest)}); + auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); + prediction(0) = predictRow(m_Encoder->encode(*row), m_BestForest); } }); if (successful == false) { @@ -271,17 +281,21 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, // A binary tree with n + 1 leaves has 2n + 1 nodes in total. std::size_t maximumNumberLeaves{this->maximumTreeSize(numberRows) + 1}; std::size_t maximumNumberNodes{2 * maximumNumberLeaves - 1}; + std::size_t maximumNumberFeatures{std::min(numberColumns - 1, numberRows / m_RowsPerFeature)}; std::size_t forestMemoryUsage{ m_MaximumNumberTrees * (sizeof(TNodeVec) + maximumNumberNodes * sizeof(CBoostedTreeNode))}; std::size_t extraColumnsMemoryUsage{numberExtraColumnsForTrain(m_Loss->numberParameters()) * numberRows * sizeof(CFloatStorage)}; + std::size_t foldRoundLossMemoryUsage{m_NumberFolds * m_NumberRounds * + sizeof(TOptionalDouble)}; std::size_t hyperparametersMemoryUsage{numberColumns * sizeof(double)}; std::size_t leafNodeStatisticsMemoryUsage{ maximumNumberLeaves * CBoostedTreeLeafNodeStatistics::estimateMemoryUsage( - numberRows, numberColumns, m_NumberSplitsPerFeature)}; - std::size_t dataTypeMemoryUsage{numberColumns * sizeof(CDataFrameUtils::SDataType)}; - std::size_t featureSampleProbabilities{numberColumns * sizeof(double)}; + numberRows, maximumNumberFeatures, m_NumberSplitsPerFeature, + m_Loss->numberParameters())}; + std::size_t dataTypeMemoryUsage{maximumNumberFeatures * sizeof(CDataFrameUtils::SDataType)}; + std::size_t featureSampleProbabilities{maximumNumberFeatures * sizeof(double)}; std::size_t missingFeatureMaskMemoryUsage{ numberColumns * numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; std::size_t trainTestMaskMemoryUsage{2 * m_NumberFolds * numberRows / @@ -291,8 +305,8 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, std::size_t shapMemoryUsage{ m_TopShapValues > 0 ? 
numberRows * numberColumns * sizeof(CFloatStorage) : 0}; return sizeof(*this) + forestMemoryUsage + extraColumnsMemoryUsage + - hyperparametersMemoryUsage + - std::max(leafNodeStatisticsMemoryUsage, shapMemoryUsage) + // not current + foldRoundLossMemoryUsage + hyperparametersMemoryUsage + + std::max(leafNodeStatisticsMemoryUsage, shapMemoryUsage) + // not concurrent dataTypeMemoryUsage + featureSampleProbabilities + missingFeatureMaskMemoryUsage + trainTestMaskMemoryUsage + bayesianOptimisationMemoryUsage; } @@ -422,12 +436,10 @@ CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivat m_NumberThreads, 0, frame.numberRows(), [this](TRowItr beginRows, TRowItr endRows) { std::size_t numberLossParameters{m_Loss->numberParameters()}; - TDouble1Vec zero(numberLossParameters, 0.0); - TDouble1Vec zeroHessian(lossHessianStoredSize(numberLossParameters), 0.0); for (auto row = beginRows; row != endRows; ++row) { - writePrediction(*row, m_NumberInputColumns, zero); - writeLossGradient(*row, m_NumberInputColumns, zero); - writeLossCurvature(*row, m_NumberInputColumns, zeroHessian); + zeroPrediction(*row, m_NumberInputColumns, numberLossParameters); + zeroLossGradient(*row, m_NumberInputColumns, numberLossParameters); + zeroLossCurvature(*row, m_NumberInputColumns, numberLossParameters); } }, &updateRowMask); @@ -576,8 +588,9 @@ CBoostedTreeImpl::candidateSplits(const core::CDataFrame& frame, m_Encoder.get(), [this](const TRowRef& row) { // TODO Think about what scalar measure of the Hessian to use here? - return readLossCurvature(row, m_NumberInputColumns, - m_Loss->numberParameters())[0]; + std::size_t numberLossParameters{m_Loss->numberParameters()}; + return trace(numberLossParameters, + readLossCurvature(row, m_NumberInputColumns, numberLossParameters)); }) .first; @@ -690,7 +703,6 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, double splitValue; std::tie(splitFeature, splitValue) = leaf->bestSplit(); - bool leftChildHasFewerRows{leaf->leftChildHasFewerRows()}; bool assignMissingToLeft{leaf->assignMissingToLeft()}; std::size_t leftChildId, rightChildId; @@ -700,10 +712,9 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, TLeafNodeStatisticsPtr leftChild; TLeafNodeStatisticsPtr rightChild; - std::tie(leftChild, rightChild) = - leaf->split(leftChildId, rightChildId, m_NumberThreads, frame, *m_Encoder, - m_Regularization, candidateSplits, this->featureBag(), - tree[leaf->id()], leftChildHasFewerRows); + std::tie(leftChild, rightChild) = leaf->split( + leftChildId, rightChildId, m_NumberThreads, frame, *m_Encoder, + m_Regularization, candidateSplits, this->featureBag(), tree[leaf->id()]); scopeMemoryUsage.add(leftChild); scopeMemoryUsage.add(rightChild); @@ -931,14 +942,13 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr std::size_t numberLossParameters{m_Loss->numberParameters()}; for (auto row = beginRows; row != endRows; ++row) { auto prediction = readPrediction(*row, m_NumberInputColumns, numberLossParameters); - prediction[0] += root(tree).value(m_Encoder->encode(*row), tree); double actual{readActual(*row, m_DependentVariable)}; double weight{readExampleWeight(*row, m_NumberInputColumns, numberLossParameters)}; - writePrediction(*row, m_NumberInputColumns, prediction); - writeLossGradient(*row, m_NumberInputColumns, - m_Loss->gradient(prediction, actual, weight)); - writeLossCurvature(*row, m_NumberInputColumns, - m_Loss->curvature(prediction, actual, weight)); + prediction(0) += root(tree).value(m_Encoder->encode(*row), 
tree); + writeLossGradient(*row, m_NumberInputColumns, *m_Loss, + prediction, actual, weight); + writeLossCurvature(*row, m_NumberInputColumns, *m_Loss, + prediction, actual, weight); } }, &updateRowMask); diff --git a/lib/maths/CBoostedTreeLeafNodeStatistics.cc b/lib/maths/CBoostedTreeLeafNodeStatistics.cc index 9bab4402ca..caf3e02e29 100644 --- a/lib/maths/CBoostedTreeLeafNodeStatistics.cc +++ b/lib/maths/CBoostedTreeLeafNodeStatistics.cc @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -23,7 +22,6 @@ using TRowItr = core::CDataFrame::TRowItr; namespace { const std::size_t ASSIGN_MISSING_TO_LEFT{0}; const std::size_t ASSIGN_MISSING_TO_RIGHT{1}; -const double SMALLEST_RELATIVE_CURVATURE{1e-20}; } CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( @@ -69,58 +67,17 @@ CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( CBoostedTreeLeafNodeStatistics::CBoostedTreeLeafNodeStatistics( std::size_t id, - const CBoostedTreeLeafNodeStatistics& parent, + CBoostedTreeLeafNodeStatistics&& parent, const CBoostedTreeLeafNodeStatistics& sibling, const TRegularization& regularization, const TSizeVec& featureBag, core::CPackedBitVector rowMask) : m_Id{id}, m_Depth{sibling.m_Depth}, m_NumberInputColumns{sibling.m_NumberInputColumns}, m_NumberLossParameters{sibling.m_NumberLossParameters}, - m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)} { - - m_Derivatives.resize(m_CandidateSplits.size()); - m_MissingDerivatives.resize(m_CandidateSplits.size()); - - for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { - std::size_t numberSplits{m_CandidateSplits[i].size() + 1}; - m_Derivatives[i].resize(numberSplits); - for (std::size_t j = 0; j < numberSplits; ++j) { - // Numeric errors mean that it's possible the sum curvature for a candidate - // split is identically zero while the gradient is epsilon. This can cause - // the node gain to appear infinite (when there is no weight regularisation) - // which in turns causes problems initialising the region we search for optimal - // hyperparameter values. We can safely force the gradient and curvature to - // be zero if we detect that the count is zero. Also, none of our loss functions - // have negative curvature therefore we shouldn't allow the cumulative curvature - // to be negative either. In this case we force it to be a v.small multiple - // of the magnitude of the gradient since this is the closest feasible estimate. 
- std::size_t count{parent.m_Derivatives[i][j].s_Count - - sibling.m_Derivatives[i][j].s_Count}; - if (count > 0) { - double gradient{parent.m_Derivatives[i][j].s_Gradient - - sibling.m_Derivatives[i][j].s_Gradient}; - double curvature{parent.m_Derivatives[i][j].s_Curvature - - sibling.m_Derivatives[i][j].s_Curvature}; - curvature = std::max(curvature, SMALLEST_RELATIVE_CURVATURE * - std::fabs(gradient)); - m_Derivatives[i][j] = SAggregateDerivatives{count, gradient, curvature}; - } - } - std::size_t count{parent.m_MissingDerivatives[i].s_Count - - sibling.m_MissingDerivatives[i].s_Count}; - if (count > 0) { - double gradient{parent.m_MissingDerivatives[i].s_Gradient - - sibling.m_MissingDerivatives[i].s_Gradient}; - double curvature{parent.m_MissingDerivatives[i].s_Curvature - - sibling.m_MissingDerivatives[i].s_Curvature}; - curvature = std::max(curvature, SMALLEST_RELATIVE_CURVATURE * std::fabs(gradient)); - m_MissingDerivatives[i] = SAggregateDerivatives{count, gradient, curvature}; - } - } - LOG_TRACE(<< "derivatives = " << core::CContainerPrinter::print(m_Derivatives)); - LOG_TRACE(<< "missing derivatives = " - << core::CContainerPrinter::print(m_MissingDerivatives)); + m_CandidateSplits{sibling.m_CandidateSplits}, m_RowMask{std::move(rowMask)}, + m_Derivatives{std::move(parent.m_Derivatives)} { + m_Derivatives.subtract(sibling.m_Derivatives); m_BestSplit = this->computeBestSplitStatistics(regularization, featureBag); } @@ -133,18 +90,20 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, const TRegularization& regularization, const TImmutableRadixSetVec& candidateSplits, const TSizeVec& featureBag, - const CBoostedTreeNode& split, - bool leftChildHasFewerRows) { - if (leftChildHasFewerRows) { + const CBoostedTreeNode& split) { + + if (this->leftChildHasFewerRows()) { auto leftChild = std::make_shared( leftChildId, m_NumberInputColumns, m_NumberLossParameters, numberThreads, frame, encoder, regularization, candidateSplits, featureBag, true /*is left child*/, m_Depth + 1, split, m_RowMask); - core::CPackedBitVector rightChildRowMask{m_RowMask}; + core::CPackedBitVector rightChildRowMask{std::move(m_RowMask)}; rightChildRowMask ^= leftChild->rowMask(); auto rightChild = std::make_shared( - rightChildId, *this, *leftChild, regularization, featureBag, - std::move(rightChildRowMask)); + rightChildId, std::move(*this), *leftChild, regularization, + featureBag, std::move(rightChildRowMask)); + leftChild->maybeRecoverMemory(); + rightChild->maybeRecoverMemory(); return std::make_pair(leftChild, rightChild); } @@ -153,11 +112,13 @@ CBoostedTreeLeafNodeStatistics::split(std::size_t leftChildId, rightChildId, m_NumberInputColumns, m_NumberLossParameters, numberThreads, frame, encoder, regularization, candidateSplits, featureBag, false /*is left child*/, m_Depth + 1, split, m_RowMask); - core::CPackedBitVector leftChildRowMask{m_RowMask}; + core::CPackedBitVector leftChildRowMask{std::move(m_RowMask)}; leftChildRowMask ^= rightChild->rowMask(); auto leftChild = std::make_shared( - leftChildId, *this, *rightChild, regularization, featureBag, + leftChildId, std::move(*this), *rightChild, regularization, featureBag, std::move(leftChildRowMask)); + leftChild->maybeRecoverMemory(); + rightChild->maybeRecoverMemory(); return std::make_pair(leftChild, rightChild); } @@ -195,25 +156,29 @@ core::CPackedBitVector& CBoostedTreeLeafNodeStatistics::rowMask() { } std::size_t CBoostedTreeLeafNodeStatistics::memoryUsage() const { - std::size_t mem{core::CMemory::dynamicSize(m_RowMask)}; - mem += 
core::CMemory::dynamicSize(m_Derivatives); - mem += core::CMemory::dynamicSize(m_MissingDerivatives); - return mem; + return core::CMemory::dynamicSize(m_RowMask) + core::CMemory::dynamicSize(m_Derivatives); } -std::size_t CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, - std::size_t numberCols, - std::size_t numberSplitsPerFeature) { +std::size_t +CBoostedTreeLeafNodeStatistics::estimateMemoryUsage(std::size_t numberRows, + std::size_t numberFeatures, + std::size_t numberSplitsPerFeature, + std::size_t numberLossParameters) { // We will typically get the close to the best compression for most of the // leaves when the set of splits becomes large, corresponding to the worst // case for memory usage. This is because the rows will be spread over many // rows so the masks will mainly contain 0 bits in this case. std::size_t rowMaskSize{numberRows / PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE}; - std::size_t derivativesSize{(numberCols - 1) * numberSplitsPerFeature * - sizeof(SAggregateDerivatives)}; - std::size_t missingDerivativesSize{(numberCols - 1) * sizeof(SAggregateDerivatives)}; - return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + - derivativesSize + missingDerivativesSize; + std::size_t perSplitDerivativesSize{CPerSplitDerivatives::estimateMemoryUsage( + numberFeatures, numberSplitsPerFeature, numberLossParameters)}; + return sizeof(CBoostedTreeLeafNodeStatistics) + rowMaskSize + perSplitDerivativesSize; +} + +void CBoostedTreeLeafNodeStatistics::maybeRecoverMemory() { + if (this->gain() <= 0.0) { + m_RowMask = core::CPackedBitVector{}; + m_Derivatives = CPerSplitDerivatives{}; + } } void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( @@ -224,26 +189,19 @@ void CBoostedTreeLeafNodeStatistics::computeAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](SSplitAggregateDerivatives& splitAggregateDerivatives, - TRowItr beginRows, TRowItr endRows) { + [&](CPerSplitDerivatives& perSplitDerivatives, TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { - this->addRowDerivatives(encoder.encode(*row), splitAggregateDerivatives); + this->addRowDerivatives(encoder.encode(*row), perSplitDerivatives); } }, - SSplitAggregateDerivatives{m_CandidateSplits}), + CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters}), &m_RowMask); - auto& state = result.first; - SSplitAggregateDerivatives derivatives{std::move(state[0].s_FunctionState)}; - for (std::size_t i = 1; i < state.size(); ++i) { - derivatives.merge(state[i].s_FunctionState); + m_Derivatives = std::move(result.first[0].s_FunctionState); + for (std::size_t i = 1; i < result.first.size(); ++i) { + m_Derivatives.add(result.first[i].s_FunctionState); } - - std::tie(m_Derivatives, m_MissingDerivatives) = derivatives.move(); - - LOG_TRACE(<< "derivatives = " << core::CContainerPrinter::print(m_Derivatives)); - LOG_TRACE(<< "missing derivatives = " - << core::CContainerPrinter::print(m_MissingDerivatives)); + m_Derivatives.remapCurvature(); } void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( @@ -257,59 +215,51 @@ void CBoostedTreeLeafNodeStatistics::computeRowMaskAndAggregateLossDerivatives( auto result = frame.readRows( numberThreads, 0, frame.numberRows(), core::bindRetrievableState( - [&](std::pair& state, + [&](std::pair& state, TRowItr beginRows, TRowItr endRows) { auto& mask = state.first; - auto& splitAggregateDerivatives = state.second; + auto& 
perSplitDerivatives = state.second; for (auto row = beginRows; row != endRows; ++row) { auto encodedRow = encoder.encode(*row); if (split.assignToLeft(encodedRow) == isLeftChild) { std::size_t index{row->index()}; mask.extend(false, index - mask.size()); mask.extend(true); - this->addRowDerivatives(encodedRow, splitAggregateDerivatives); + this->addRowDerivatives(encodedRow, perSplitDerivatives); } } }, - std::make_pair(core::CPackedBitVector{}, SSplitAggregateDerivatives{m_CandidateSplits})), + std::make_pair(core::CPackedBitVector{}, + CPerSplitDerivatives{m_CandidateSplits, m_NumberLossParameters})), &parentRowMask); - auto& state = result.first; - for (auto& mask_ : state) { + for (auto& mask_ : result.first) { auto& mask = mask_.s_FunctionState.first; mask.extend(false, parentRowMask.size() - mask.size()); } - m_RowMask = std::move(state[0].s_FunctionState.first); - SSplitAggregateDerivatives derivatives{std::move(state[0].s_FunctionState.second)}; - for (std::size_t i = 1; i < state.size(); ++i) { - m_RowMask |= state[i].s_FunctionState.first; - derivatives.merge(state[i].s_FunctionState.second); + m_RowMask = std::move(result.first[0].s_FunctionState.first); + m_Derivatives = std::move(result.first[0].s_FunctionState.second); + for (std::size_t i = 1; i < result.first.size(); ++i) { + m_RowMask |= result.first[i].s_FunctionState.first; + m_Derivatives.add(result.first[i].s_FunctionState.second); } - - std::tie(m_Derivatives, m_MissingDerivatives) = derivatives.move(); - - LOG_TRACE(<< "row mask = " << m_RowMask); - LOG_TRACE(<< "derivatives = " << core::CContainerPrinter::print(m_Derivatives)); - LOG_TRACE(<< "missing derivatives = " - << core::CContainerPrinter::print(m_MissingDerivatives)); + m_Derivatives.remapCurvature(); } -void CBoostedTreeLeafNodeStatistics::addRowDerivatives( - const CEncodedDataFrameRowRef& row, - SSplitAggregateDerivatives& splitAggregateDerivatives) const { +void CBoostedTreeLeafNodeStatistics::addRowDerivatives(const CEncodedDataFrameRowRef& row, + CPerSplitDerivatives& perSplitDerivatives) const { const TRowRef& unencodedRow{row.unencodedRow()}; auto gradient = readLossGradient(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); auto curvature = readLossCurvature(unencodedRow, m_NumberInputColumns, m_NumberLossParameters); - - for (std::size_t i = 0; i < m_CandidateSplits.size(); ++i) { - double featureValue{row[i]}; + for (std::size_t feature = 0; feature < m_CandidateSplits.size(); ++feature) { + double featureValue{row[feature]}; if (CDataFrameUtils::isMissing(featureValue)) { - splitAggregateDerivatives.s_MissingDerivatives[i].add(1, gradient, curvature); + perSplitDerivatives.addMissingDerivatives(feature, gradient, curvature); } else { - std::ptrdiff_t j{m_CandidateSplits[i].upperBound(featureValue)}; - splitAggregateDerivatives.s_Derivatives[i][j].add(1, gradient, curvature); + std::ptrdiff_t split{m_CandidateSplits[feature].upperBound(featureValue)}; + perSplitDerivatives.addDerivatives(feature, split, gradient, curvature); } } } @@ -325,54 +275,116 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization SSplitStatistics result; - for (auto i : featureBag) { - std::size_t c{m_MissingDerivatives[i].s_Count}; - double g{m_MissingDerivatives[i].s_Gradient}; - double h{m_MissingDerivatives[i].s_Curvature}; - for (const auto& derivatives : m_Derivatives[i]) { - c += derivatives.s_Count; - g += derivatives.s_Gradient; - h += derivatives.s_Curvature; + // We seek to find the value at the minimum of the quadratic 
expansion of the + // regularized loss function. For a given leaf this expansion is + // + // L(w) = 1/2 w^t H(\lambda) w + g^t w + // + // where H(\lambda) = \sum_i H_i + \lambda I, g = \sum_i g_i and w is the leaf's + // weight. Here, g_i and H_i denote an example's loss gradient and Hessian and i + // ranges over the examples in the leaf. Writing this as the sum of a quadratic + // form and constant, i.e. x(w)^t H(\lambda) x(w) + constant, and noting that H + // is positive definite, we see that we'll minimise loss by choosing w such that + // x is zero, i.e. w^* = arg\min_w(L(w)) satisfies x(w) = 0. This gives + // + // L(w^*) = -1/2 g^t H(\lambda)^{-1} g + + using TDoubleVector = CDenseVector; + using TDoubleMatrix = CDenseMatrix; + using TMinimumLoss = std::function; + + int d{static_cast(m_NumberLossParameters)}; + + TMinimumLoss minimumLoss; + + double lambda{regularization.leafWeightPenaltyMultiplier()}; + Eigen::MatrixXd placeholder{d, d}; + if (m_NumberLossParameters == 1) { + // There is a large fixed overhead for using ldl^t even when g and h are + // scalar so we have special case handling. + minimumLoss = [&](const TDoubleVector& g, const TDoubleMatrix& h) -> double { + return CTools::pow2(g(0)) / (h(0, 0) + lambda); + }; + } else { + // TODO use Cholesky (but need to handle positive semi-definite case). + minimumLoss = [&](const TDoubleVector& g, const TDoubleMatrix& h) -> double { + placeholder = + (h + lambda * TDoubleMatrix::Identity(d, d)).selfadjointView(); + Eigen::LDLT> ldlt{placeholder}; + return g.transpose() * ldlt.solve(g); + }; + } + + TDoubleVector g{d}; + TDoubleMatrix h{d, d}; + TDoubleVector gl[]{TDoubleVector{d}, TDoubleVector{d}}; + TDoubleVector gr[]{TDoubleVector{d}, TDoubleVector{d}}; + TDoubleMatrix hl[]{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}}; + TDoubleMatrix hr[]{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}}; + + for (auto feature : featureBag) { + std::size_t c{m_Derivatives.missingCount(feature)}; + g = m_Derivatives.missingGradient(feature); + h = m_Derivatives.missingCurvature(feature); + for (const auto& derivatives : m_Derivatives.derivatives(feature)) { + c += derivatives.count(); + g += derivatives.gradient(); + h += derivatives.curvature(); } - std::size_t cl[]{m_MissingDerivatives[i].s_Count, 0}; - double gl[]{m_MissingDerivatives[i].s_Gradient, 0.0}; - double hl[]{m_MissingDerivatives[i].s_Curvature, 0.0}; + std::size_t cl[]{m_Derivatives.missingCount(feature), 0}; + gl[ASSIGN_MISSING_TO_LEFT] = m_Derivatives.missingGradient(feature); + gl[ASSIGN_MISSING_TO_RIGHT] = TDoubleVector::Zero(g.rows()); + gr[ASSIGN_MISSING_TO_LEFT] = g - m_Derivatives.missingGradient(feature); + gr[ASSIGN_MISSING_TO_RIGHT] = g; + hl[ASSIGN_MISSING_TO_LEFT] = m_Derivatives.missingCurvature(feature); + hl[ASSIGN_MISSING_TO_RIGHT] = TDoubleMatrix::Zero(h.rows(), h.cols()); + hr[ASSIGN_MISSING_TO_LEFT] = h - m_Derivatives.missingCurvature(feature); + hr[ASSIGN_MISSING_TO_RIGHT] = h; double maximumGain{-INF}; double splitAt{-INF}; bool leftChildHasFewerRows{true}; bool assignMissingToLeft{true}; + std::size_t size{m_Derivatives.derivatives(feature).size()}; + + for (std::size_t split = 0; split + 1 < size; ++split) { - for (std::size_t j = 0; j + 1 < m_Derivatives[i].size(); ++j) { - cl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Count; - gl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Gradient; - hl[ASSIGN_MISSING_TO_LEFT] += m_Derivatives[i][j].s_Curvature; - cl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Count; - gl[ASSIGN_MISSING_TO_RIGHT] += 
m_Derivatives[i][j].s_Gradient; - hl[ASSIGN_MISSING_TO_RIGHT] += m_Derivatives[i][j].s_Curvature; - - double gain[]{CTools::pow2(gl[ASSIGN_MISSING_TO_LEFT]) / - (hl[ASSIGN_MISSING_TO_LEFT] + - regularization.leafWeightPenaltyMultiplier()) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_LEFT]) / - (h - hl[ASSIGN_MISSING_TO_LEFT] + - regularization.leafWeightPenaltyMultiplier()), - CTools::pow2(gl[ASSIGN_MISSING_TO_RIGHT]) / - (hl[ASSIGN_MISSING_TO_RIGHT] + - regularization.leafWeightPenaltyMultiplier()) + - CTools::pow2(g - gl[ASSIGN_MISSING_TO_RIGHT]) / - (h - hl[ASSIGN_MISSING_TO_RIGHT] + - regularization.leafWeightPenaltyMultiplier())}; + std::size_t count{m_Derivatives.count(feature, split)}; + if (count == 0) { + continue; + } + + const TMemoryMappedDoubleVector& gradient{m_Derivatives.gradient(feature, split)}; + const TMemoryMappedDoubleMatrix& curvature{m_Derivatives.curvature(feature, split)}; + + cl[ASSIGN_MISSING_TO_LEFT] += count; + cl[ASSIGN_MISSING_TO_RIGHT] += count; + gl[ASSIGN_MISSING_TO_LEFT] += gradient; + gl[ASSIGN_MISSING_TO_RIGHT] += gradient; + gr[ASSIGN_MISSING_TO_LEFT] -= gradient; + gr[ASSIGN_MISSING_TO_RIGHT] -= gradient; + hl[ASSIGN_MISSING_TO_LEFT] += curvature; + hl[ASSIGN_MISSING_TO_RIGHT] += curvature; + hr[ASSIGN_MISSING_TO_LEFT] -= curvature; + hr[ASSIGN_MISSING_TO_RIGHT] -= curvature; + + double gain[2]; + gain[ASSIGN_MISSING_TO_LEFT] = + minimumLoss(gl[ASSIGN_MISSING_TO_LEFT], hl[ASSIGN_MISSING_TO_LEFT]) + + minimumLoss(gr[ASSIGN_MISSING_TO_LEFT], hr[ASSIGN_MISSING_TO_LEFT]); + gain[ASSIGN_MISSING_TO_RIGHT] = + minimumLoss(gl[ASSIGN_MISSING_TO_RIGHT], hl[ASSIGN_MISSING_TO_RIGHT]) + + minimumLoss(gr[ASSIGN_MISSING_TO_RIGHT], hr[ASSIGN_MISSING_TO_RIGHT]); if (gain[ASSIGN_MISSING_TO_LEFT] > maximumGain) { maximumGain = gain[ASSIGN_MISSING_TO_LEFT]; - splitAt = m_CandidateSplits[i][j]; + splitAt = m_CandidateSplits[feature][split]; leftChildHasFewerRows = (2 * cl[ASSIGN_MISSING_TO_LEFT] < c); assignMissingToLeft = true; } if (gain[ASSIGN_MISSING_TO_RIGHT] > maximumGain) { maximumGain = gain[ASSIGN_MISSING_TO_RIGHT]; - splitAt = m_CandidateSplits[i][j]; + splitAt = m_CandidateSplits[feature][split]; leftChildHasFewerRows = (2 * cl[ASSIGN_MISSING_TO_RIGHT] < c); assignMissingToLeft = false; } @@ -380,13 +392,20 @@ CBoostedTreeLeafNodeStatistics::computeBestSplitStatistics(const TRegularization double penaltyForDepth{regularization.penaltyForDepth(m_Depth)}; double penaltyForDepthPlusOne{regularization.penaltyForDepth(m_Depth + 1)}; - double gain{0.5 * (maximumGain - CTools::pow2(g) / (h + regularization.leafWeightPenaltyMultiplier())) - + + // The gain is the difference between the quadratic minimum for loss with + // no split and the loss with the minimum loss split we found. 
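To make the derivation and the gain computation above concrete, here is a minimal standalone sketch, editorial rather than part of this patch, assuming only Eigen is available; the names minimumLoss, g, h, gl, gr, hl, hr and lambda mirror those used in computeBestSplitStatistics, the numbers are made up, and it evaluates a single candidate split while omitting the tree size and depth penalties.

#include <Eigen/Dense>
#include <iostream>

namespace {
// Returns g^t H(lambda)^{-1} g, i.e. minus twice the minimum of the regularized
// quadratic expansion of the loss for one child or for the unsplit leaf.
double minimumLoss(const Eigen::VectorXd& g, const Eigen::MatrixXd& h, double lambda) {
    // LDL^T copes with a summed Hessian that is only positive semi-definite
    // once the ridge term lambda * I has been added.
    Eigen::MatrixXd regularized =
        h + lambda * Eigen::MatrixXd::Identity(h.rows(), h.cols());
    Eigen::VectorXd solution = regularized.ldlt().solve(g);
    return g.dot(solution);
}
}

int main() {
    double lambda{0.1};
    Eigen::VectorXd g(2), gl(2), gr(2);
    Eigen::MatrixXd h(2, 2), hl(2, 2), hr(2, 2);
    g << 3.0, -1.0;           // summed gradient over the leaf's rows
    h << 4.0, 0.5, 0.5, 2.0;  // summed curvature over the leaf's rows
    gl << 2.5, 0.5;           // totals landing in the left child for one candidate split
    hl << 2.0, 0.2, 0.2, 1.0;
    gr = g - gl;              // the right child gets the remainder
    hr = h - hl;
    // Gain before the tree size and depth penalties: the reduction in the
    // minimized quadratic approximation of the regularized loss achieved by
    // splitting. The factor 0.5 converts the g^t H^{-1} g terms into loss values.
    double gain{0.5 * (minimumLoss(gl, hl, lambda) + minimumLoss(gr, hr, lambda) -
                       minimumLoss(g, h, lambda))};
    std::cout << "split gain = " << gain << "\n";
    return 0;
}

Adding the ridge term lambda * I before the LDL^T factorization is what lets the multi-parameter case tolerate a merely positive semi-definite summed Hessian, while the patch short-circuits to the scalar formula for a single loss parameter because the factorization overhead dominates there.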
+ double gain{0.5 * (maximumGain - minimumLoss(g, h)) - regularization.treeSizePenaltyMultiplier() - regularization.depthPenaltyMultiplier() * (2.0 * penaltyForDepthPlusOne - penaltyForDepth)}; - SSplitStatistics candidate{ - gain, h, i, splitAt, leftChildHasFewerRows, assignMissingToLeft}; + SSplitStatistics candidate{gain, + h.trace() / static_cast(m_NumberLossParameters), + feature, + splitAt, + leftChildHasFewerRows, + assignMissingToLeft}; LOG_TRACE(<< "candidate split: " << candidate.print()); if (candidate > result) { diff --git a/lib/maths/CBoostedTreeUtils.cc b/lib/maths/CBoostedTreeUtils.cc index f204231e65..e138ca6b3d 100644 --- a/lib/maths/CBoostedTreeUtils.cc +++ b/lib/maths/CBoostedTreeUtils.cc @@ -6,57 +6,88 @@ #include +#include + namespace ml { namespace maths { namespace boosted_tree_detail { +using namespace boosted_tree; -TDouble1Vec readPrediction(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParamaters) { - const auto* start = row.data() + predictionColumn(numberInputColumns); - return TDouble1Vec(start, start + numberLossParamaters); +TMemoryMappedFloatVector readPrediction(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParamaters) { + return {row.data() + predictionColumn(numberInputColumns), + static_cast(numberLossParamaters)}; } -void writePrediction(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& prediction) { +void zeroPrediction(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParamaters) { std::size_t offset{predictionColumn(numberInputColumns)}; - for (std::size_t i = 0; i < prediction.size(); ++i) { - row.writeColumn(offset + i, prediction[i]); + for (std::size_t i = 0; i < numberLossParamaters; ++i) { + row.writeColumn(offset + i, 0.0); } } -TDouble1Vec readLossGradient(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters) { - const auto* start = row.data() + lossGradientColumn(numberInputColumns, numberLossParameters); - return TDouble1Vec(start, start + numberLossParameters); +TMemoryMappedFloatVector readLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters) { + return {row.data() + lossGradientColumn(numberInputColumns, numberLossParameters), + static_cast(numberLossParameters)}; } -void writeLossGradient(const TRowRef& row, std::size_t numberInputColumns, const TDouble1Vec& gradient) { - std::size_t offset{lossGradientColumn(numberInputColumns, gradient.size())}; - for (std::size_t i = 0; i < gradient.size(); ++i) { - row.writeColumn(offset + i, gradient[i]); +void zeroLossGradient(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { + std::size_t offset{lossGradientColumn(numberInputColumns, numberLossParameters)}; + for (std::size_t i = 0; i < numberLossParameters; ++i) { + row.writeColumn(offset + i, 0.0); } } -TDouble1Vec readLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, - std::size_t numberLossParameters) { - const auto* start = row.data() + lossCurvatureColumn(numberInputColumns, numberLossParameters); - return TDouble1Vec(start, start + lossHessianStoredSize(numberLossParameters)); +void writeLossGradient(const TRowRef& row, + std::size_t numberInputColumns, + const CLoss& loss, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { + std::size_t offset{lossGradientColumn(numberInputColumns, prediction.size())}; + auto writer = [&row, offset](std::size_t i, double value) 
{ + row.writeColumn(offset + i, value); + }; + // We wrap the writer in another lambda which we know takes advantage + // of std::function small size optimization to avoid heap allocations. + loss.gradient(prediction, actual, + [&writer](std::size_t i, double value) { writer(i, value); }, weight); } -void writeLossCurvature(const TRowRef& row, - std::size_t numberInputColumns, - const TDouble1Vec& curvature) { - // This comes from solving x(x+1)/2 = n. - std::size_t numberLossParameters{ - numberLossParametersForHessianStoredSize(curvature.size())}; +TMemoryMappedFloatVector readLossCurvature(const TRowRef& row, + std::size_t numberInputColumns, + std::size_t numberLossParameters) { + return {row.data() + lossCurvatureColumn(numberInputColumns, numberLossParameters), + static_cast(lossHessianStoredSize(numberLossParameters))}; +} + +void zeroLossCurvature(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { std::size_t offset{lossCurvatureColumn(numberInputColumns, numberLossParameters)}; - for (std::size_t i = 0; i < curvature.size(); ++i) { - row.writeColumn(offset + i, curvature[i]); + for (std::size_t i = 0, size = lossHessianStoredSize(numberLossParameters); + i < size; ++i) { + row.writeColumn(offset + i, 0.0); } } +void writeLossCurvature(const TRowRef& row, + std::size_t numberInputColumns, + const CLoss& loss, + const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { + std::size_t offset{lossCurvatureColumn(numberInputColumns, prediction.size())}; + auto writer = [&row, offset](std::size_t i, double value) { + row.writeColumn(offset + i, value); + }; + // We wrap the writer in another lambda which we know takes advantage + // of std::function small size optimization to avoid heap allocations. + loss.curvature(prediction, actual, + [&writer](std::size_t i, double value) { writer(i, value); }, weight); +} + double readExampleWeight(const TRowRef& row, std::size_t numberInputColumns, std::size_t numberLossParameters) { diff --git a/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc new file mode 100644 index 0000000000..a0a2c28584 --- /dev/null +++ b/lib/maths/unittest/CBoostedTreeLeafNodeStatisticsTest.cc @@ -0,0 +1,302 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +#include + +#include + +#include +#include + +#include + +#include + +BOOST_AUTO_TEST_SUITE(CBoostedTreeLeafNodeStatisticsTest) + +using namespace ml; +using TDoubleVec = std::vector; +using TDoubleVecVec = std::vector; +using TSizeVec = std::vector; +using TSizeVecVec = std::vector; +using TFloatVec = std::vector; +using TVector = maths::CDenseVector; +using TVectorVec = std::vector; +using TVectorVecVec = std::vector; +using TMatrix = maths::CDenseMatrix; +using TMatrixVec = std::vector; +using TMatrixVecVec = std::vector; +using TImmutableRadixSet = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSet; +using TImmutableRadixSetVec = maths::CBoostedTreeLeafNodeStatistics::TImmutableRadixSetVec; +using TDerivatives = maths::CBoostedTreeLeafNodeStatistics::CDerivatives; +using TPerSplitDerivatives = maths::CBoostedTreeLeafNodeStatistics::CPerSplitDerivatives; + +namespace { + +template +maths::CMemoryMappedDenseVector makeGradient(T* storage, std::size_t n) { + return maths::CMemoryMappedDenseVector{storage, static_cast(n)}; +} + +template +maths::CMemoryMappedDenseVector makeCurvature(T* storage, std::size_t n) { + return maths::CMemoryMappedDenseVector(storage, static_cast(n)); +} + +template +TMatrix rowMajorHessian(std::size_t n, const maths::CMemoryMappedDenseVector& curvatures) { + TMatrix result{n, n}; + for (std::size_t i = 0, k = 0; i < n; ++i) { + for (std::size_t j = i; j < n; ++j, ++k) { + result(i, j) = result(j, i) = curvatures(k); + } + } + return result; +} + +void testDerivativesFor(std::size_t numberParameters) { + + LOG_DEBUG(<< "Testing " << numberParameters << " parameters"); + + test::CRandomNumbers rng; + + std::size_t numberGradients{numberParameters}; + std::size_t numberCurvatures{numberParameters * (numberParameters + 1) / 2}; + + TDoubleVecVec gradients(numberGradients); + TDoubleVecVec curvatures(numberCurvatures); + for (std::size_t i = 0; i < numberGradients; ++i) { + rng.generateUniformSamples(-1.0, 1.5, 20, gradients[i]); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + rng.generateUniformSamples(0.1, 0.5, 20, curvatures[i]); + } + + LOG_DEBUG(<< "Accumulate"); + + TDoubleVec storage1(numberGradients * (numberGradients + 1), 0.0); + TDerivatives derivatives1{numberParameters, &storage1[0]}; + + for (std::size_t j = 0; j < 10; ++j) { + TFloatVec storage; + for (std::size_t i = 0; i < numberGradients; ++i) { + storage.push_back(gradients[i][j]); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + storage.push_back(curvatures[i][j]); + } + auto gradient = makeGradient(&storage[0], numberGradients); + auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + derivatives1.add(1, gradient, curvature); + } + derivatives1.remapCurvature(); + + BOOST_REQUIRE_EQUAL(10, derivatives1.count()); + for (std::size_t i = 0; i < numberGradients; ++i) { + BOOST_REQUIRE_CLOSE( + std::accumulate(gradients[i].begin(), gradients[i].begin() + 10, 0.0), + derivatives1.gradient()(i), 1e-4); + } + for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { + for (std::size_t i = j; i < numberGradients; ++i, ++k) { + BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[k].begin(), + curvatures[k].begin() + 10, 0.0), + derivatives1.curvature()(i, j), 1e-4); + } + } + + LOG_DEBUG(<< "Merge"); + + TDoubleVec storage2(numberGradients * (numberGradients + 1), 0.0); + TDerivatives derivatives2{numberParameters, &storage2[0]}; + + for (std::size_t j = 10; j < 20; ++j) { + TFloatVec storage; + for (std::size_t i = 0; i < numberGradients; ++i) { 
+ storage.push_back(gradients[i][j]); + } + for (std::size_t i = 0; i < numberCurvatures; ++i) { + storage.push_back(curvatures[i][j]); + } + auto gradient = makeGradient(&storage[0], numberGradients); + auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + derivatives2.add(1, gradient, curvature); + } + derivatives2.remapCurvature(); + + derivatives1.add(derivatives2); + + BOOST_REQUIRE_EQUAL(20, derivatives1.count()); + for (std::size_t i = 0; i < numberGradients; ++i) { + BOOST_REQUIRE_CLOSE(std::accumulate(gradients[i].begin(), gradients[i].end(), 0.0), + derivatives1.gradient()(i), 1e-4); + } + for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { + for (std::size_t i = j; i < numberGradients; ++i, ++k) { + BOOST_REQUIRE_CLOSE( + std::accumulate(curvatures[k].begin(), curvatures[k].end(), 0.0), + derivatives1.curvature()(i, j), 1e-4); + } + } + + LOG_DEBUG(<< "Difference"); + + derivatives1.subtract(derivatives2); + + BOOST_REQUIRE_EQUAL(10, derivatives1.count()); + for (std::size_t i = 0; i < numberGradients; ++i) { + BOOST_REQUIRE_CLOSE( + std::accumulate(gradients[i].begin(), gradients[i].begin() + 10, 0.0), + derivatives1.gradient()(i), 1e-4); + } + for (std::size_t j = 0, k = 0; j < numberGradients; ++j) { + for (std::size_t i = j; i < numberGradients; ++i, ++k) { + BOOST_REQUIRE_CLOSE(std::accumulate(curvatures[k].begin(), + curvatures[k].begin() + 10, 0.0), + derivatives1.curvature()(i, j), 1e-4); + } + } +} + +void testPerSplitDerivativesFor(std::size_t numberParameters) { + + LOG_DEBUG(<< "Testing " << numberParameters << " parameters"); + + TImmutableRadixSetVec featureSplits; + featureSplits.push_back(TImmutableRadixSet{1.0, 2.0, 3.0}); + featureSplits.push_back(TImmutableRadixSet{0.1, 0.7, 1.1, 1.4}); + + test::CRandomNumbers rng; + + std::size_t numberSamples{20}; + std::size_t numberGradients{numberParameters}; + std::size_t numberCurvatures{numberParameters * (numberParameters + 1) / 2}; + + for (std::size_t t = 0; t < 100; ++t) { + TSizeVec features; + TSizeVec splits[2]; + TDoubleVec uniform01; + TDoubleVec gradients; + TDoubleVec curvatures; + rng.generateUniformSamples(0, 2, numberSamples, features); + rng.generateUniformSamples(0, featureSplits[0].size() + 1, + numberSamples, splits[0]); + rng.generateUniformSamples(0, featureSplits[1].size() + 1, + numberSamples, splits[1]); + rng.generateUniformSamples(0.0, 1.0, numberSamples, uniform01); + rng.generateUniformSamples(-1.5, 1.0, numberSamples * numberGradients, gradients); + rng.generateUniformSamples(0.1, 0.5, numberSamples * numberCurvatures, curvatures); + + TSizeVecVec expectedCounts(2); + TVectorVecVec expectedGradients(2); + TMatrixVecVec expectedCurvatures(2); + TSizeVec expectedMissingCounts(2, 0); + TVectorVec expectedMissingGradients(2, TVector::Zero(numberParameters)); + TMatrixVec expectedMissingCurvatures(2, TMatrix::Zero(numberParameters, numberParameters)); + for (std::size_t i = 0; i < 2; ++i) { + expectedCounts[i].resize(featureSplits[i].size() + 1, 0); + expectedGradients[i].resize(featureSplits[i].size() + 1, + TVector::Zero(numberParameters)); + expectedCurvatures[i].resize(featureSplits[i].size() + 1, + TMatrix::Zero(numberParameters, numberParameters)); + } + + auto addDerivatives = [&](TPerSplitDerivatives& derivatives) { + for (std::size_t i = 0, j = 0, k = 0; i < numberSamples; + ++i, j += numberGradients, k += numberCurvatures) { + + TFloatVec storage; + storage.insert(storage.end(), &gradients[j], &gradients[j + numberGradients]); + 
storage.insert(storage.end(), &curvatures[j], + &curvatures[k + numberCurvatures]); + auto gradient = makeGradient(&storage[0], numberGradients); + auto curvature = makeCurvature(&storage[numberGradients], numberCurvatures); + + if (uniform01[i] < 0.1) { + derivatives.addMissingDerivatives(features[i], gradient, curvature); + ++expectedMissingCounts[features[i]]; + expectedMissingGradients[features[i]] += gradient; + expectedMissingCurvatures[features[i]] += + rowMajorHessian(numberParameters, curvature); + } else { + derivatives.addDerivatives(features[i], splits[features[i]][i], + gradient, curvature); + ++expectedCounts[features[i]][splits[features[i]][i]]; + expectedGradients[features[i]][splits[features[i]][i]] += gradient; + expectedCurvatures[features[i]][splits[features[i]][i]] += + rowMajorHessian(numberParameters, curvature); + } + } + derivatives.remapCurvature(); + }; + + auto validate = [&](const TPerSplitDerivatives& derivatives) { + for (std::size_t i = 0; i < expectedCounts.size(); ++i) { + for (std::size_t j = 0; j < expectedGradients[i].size(); ++j) { + TMatrix curvature{ + derivatives.curvature(i, j).selfadjointView()}; + BOOST_REQUIRE_EQUAL(expectedCounts[i][j], derivatives.count(i, j)); + BOOST_REQUIRE_EQUAL(expectedGradients[i][j], + derivatives.gradient(i, j)); + BOOST_REQUIRE_EQUAL(expectedCurvatures[i][j], curvature); + } + } + for (std::size_t i = 0; i < expectedMissingCounts.size(); ++i) { + TMatrix curvature{ + derivatives.missingCurvature(i).selfadjointView()}; + BOOST_REQUIRE_EQUAL(expectedMissingCounts[i], derivatives.missingCount(i)); + BOOST_REQUIRE_EQUAL(expectedMissingGradients[i], + derivatives.missingGradient(i)); + BOOST_REQUIRE_EQUAL(expectedMissingCurvatures[i], curvature); + } + }; + + LOG_TRACE(<< "Test accumulation"); + + TPerSplitDerivatives derivatives1{featureSplits, numberParameters}; + + addDerivatives(derivatives1); + validate(derivatives1); + + LOG_TRACE(<< "Test merge"); + + rng.generateUniformSamples(0.0, 1.0, numberSamples, uniform01); + rng.generateUniformSamples(-1.5, 1.0, numberSamples * numberGradients, gradients); + rng.generateUniformSamples(0.1, 0.5, numberSamples * numberCurvatures, curvatures); + + TPerSplitDerivatives derivatives2{featureSplits, numberParameters}; + + addDerivatives(derivatives2); + derivatives1.add(derivatives2); + validate(derivatives1); + + LOG_TRACE(<< "Test copy"); + + TPerSplitDerivatives derivatives3{derivatives1}; + BOOST_REQUIRE_EQUAL(derivatives1.checksum(), derivatives3.checksum()); + } +} +} + +BOOST_AUTO_TEST_CASE(testDerivatives) { + + // Test individual derivatives accumulation for single and multi parameter + // loss functions. + + testDerivativesFor(1 /*loss function parameter*/); + testDerivativesFor(3 /*loss function parameters*/); +} + +BOOST_AUTO_TEST_CASE(testPerSplitDerivatives) { + + // Test per split derivatives accumulation for single and multi parameter + // loss functions. 
+ + testPerSplitDerivativesFor(1 /*loss function parameter*/); + testPerSplitDerivativesFor(3 /*loss function parameters*/); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 1f7a37b572..eafe61f1ff 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -42,6 +42,7 @@ using TRowRef = core::CDataFrame::TRowRef; using TRowItr = core::CDataFrame::TRowItr; using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; +using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatVector; namespace { @@ -875,10 +876,12 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { // All predictions equal and zero. { CArgMinLogisticImpl argmin{0.0}; - argmin.add({0.0}, 0.0); - argmin.add({0.0}, 1.0); - argmin.add({0.0}, 1.0); - argmin.add({0.0}, 0.0); + maths::CFloatStorage storage[]{0.0}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, 0.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 0.0); argmin.nextPass(); BOOST_REQUIRE_EQUAL(0.0, argmin.value()[0]); } @@ -902,7 +905,9 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { do { ++numberPasses; for (std::size_t i = 0; i < labels.size(); ++i) { - argmin.add({weights[i]}, labels[i]); + maths::CFloatStorage storage[]{weights[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); ++counts[static_cast(labels[i])]; } } while (argmin.nextPass()); @@ -923,7 +928,9 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { TDoubleVec actuals{1.0, 1.0, 0.0, 1.0}; do { for (std::size_t i = 0; i < predictions.size(); ++i) { - argmin.add({predictions[i]}, actuals[i]); + maths::CFloatStorage storage[]{predictions[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, actuals[i]); } } while (argmin.nextPass()); @@ -935,7 +942,9 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { for (double eps : {-10.0, 0.0, 10.0}) { double lossAtEps{0.0}; for (std::size_t i = 0; i < predictions.size(); ++i) { - lossAtEps += loss.value({predictions[i] + minimizer + eps}, actuals[i]); + maths::CFloatStorage storage[]{predictions[i] + minimizer + eps}; + TMemoryMappedFloatVector probe{storage, 1}; + lossAtEps += loss.value(probe, actuals[i]); } losses.push_back(lossAtEps); } @@ -1011,19 +1020,23 @@ BOOST_AUTO_TEST_CASE(testLogisticMinimizerRandom) { do { for (std::size_t i = 0; i < labels.size() / 2; ++i) { - argmin.add({weights[i]}, labels[i]); - argminPartition[0].add({weights[i]}, labels[i]); + maths::CFloatStorage storage[]{weights[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[0].add(prediction, labels[i]); } for (std::size_t i = labels.size() / 2; i < labels.size(); ++i) { - argmin.add({weights[i]}, labels[i]); - argminPartition[1].add({weights[i]}, labels[i]); + maths::CFloatStorage storage[]{weights[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[1].add(prediction, labels[i]); } argminPartition[0].merge(argminPartition[1]); argminPartition[1] = argminPartition[0]; } while (nextPass()); - double actual{argmin.value()[0]}; - double actualPartition{argminPartition[0].value()[0]}; + double actual{argmin.value()(0)}; + double actualPartition{argminPartition[0].value()(0)}; 
LOG_DEBUG(<< "actual = " << actual << " objective at actual = " << objective(actual)); @@ -1049,13 +1062,17 @@ BOOST_AUTO_TEST_CASE(testLogisticLossForUnderflow) { // Losses should be very nearly linear function of log-odds when they're large. { - TDoubleVec lastLoss{loss.value({1.0 - std::log(eps)}, 0.0), - loss.value({1.0 + std::log(eps)}, 1.0)}; + maths::CFloatStorage predictions[]{1.0 - std::log(eps), 1.0 + std::log(eps)}; + TMemoryMappedFloatVector prediction0{&predictions[0], 1}; + TMemoryMappedFloatVector prediction1{&predictions[1], 1}; + TDoubleVec lastLoss{loss.value(prediction0, 0.0), loss.value(prediction1, 1.0)}; for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - TDoubleVec currentLoss{loss.value({scale - std::log(eps)}, 0.0), - loss.value({scale + std::log(eps)}, 1.0)}; - BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, lastLoss[0] - currentLoss[0], 5e-3); - BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, lastLoss[1] - currentLoss[1], 5e-3); + predictions[0] = scale - std::log(eps); + predictions[1] = scale + std::log(eps); + TDoubleVec currentLoss{loss.value(prediction0, 0.0), + loss.value(prediction1, 1.0)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, lastLoss[0] - currentLoss[0], 0.005); + BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, lastLoss[1] - currentLoss[1], 0.005); lastLoss = currentLoss; } } @@ -1063,24 +1080,42 @@ BOOST_AUTO_TEST_CASE(testLogisticLossForUnderflow) { // The gradient and curvature should be proportional to the exponential of the // log-odds when they're small. { - TDoubleVec lastGradient{loss.gradient({1.0 + std::log(eps)}, 0.0)[0], - loss.gradient({1.0 - std::log(eps)}, 1.0)[0]}; - TDoubleVec lastCurvature{loss.curvature({1.0 + std::log(eps)}, 0.0)[0], - loss.curvature({1.0 - std::log(eps)}, 1.0)[0]}; + auto readDerivatives = [&](double prediction, TDoubleVec& gradients, + TDoubleVec& curvatures) { + maths::CFloatStorage predictions[]{prediction + std::log(eps), + prediction - std::log(eps)}; + TMemoryMappedFloatVector prediction0{&predictions[0], 1}; + TMemoryMappedFloatVector prediction1{&predictions[1], 1}; + loss.gradient(prediction0, 0.0, [&](std::size_t, double value) { + gradients[0] = value; + }); + loss.gradient(prediction1, 1.0, [&](std::size_t, double value) { + gradients[1] = value; + }); + loss.curvature(prediction0, 0.0, [&](std::size_t, double value) { + curvatures[0] = value; + }); + loss.curvature(prediction1, 1.0, [&](std::size_t, double value) { + curvatures[1] = value; + }); + }; + + TDoubleVec lastGradient(2); + TDoubleVec lastCurvature(2); + readDerivatives(1.0, lastGradient, lastCurvature); + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - TDoubleVec currentGradient{loss.gradient({scale + std::log(eps)}, 0.0)[0], - loss.gradient({scale - std::log(eps)}, 1.0)[0]}; - TDoubleVec currentCurvature{ - loss.curvature({scale + std::log(eps)}, 0.0)[0], - loss.curvature({scale - std::log(eps)}, 1.0)[0]}; + TDoubleVec currentGradient(2); + TDoubleVec currentCurvature(2); + readDerivatives(scale, currentGradient, currentCurvature); BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(0.25), - lastGradient[0] / currentGradient[0], 5e-3); + lastGradient[0] / currentGradient[0], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(-0.25), - lastGradient[1] / currentGradient[1], 5e-3); + lastGradient[1] / currentGradient[1], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(0.25), lastCurvature[0] / currentCurvature[0], 5e-3); + std::exp(0.25), lastCurvature[0] / currentCurvature[0], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(-0.25), lastCurvature[1] / 
currentCurvature[1], 5e-3); + std::exp(-0.25), lastCurvature[1] / currentCurvature[1], 0.01); lastGradient = currentGradient; lastCurvature = currentCurvature; } diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index dcab0474aa..a6dfab62a6 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -24,6 +24,7 @@ SRCS=\ CBasicStatisticsTest.cc \ CBayesianOptimisationTest.cc \ CBjkstUniqueValuesTest.cc \ + CBoostedTreeLeafNodeStatisticsTest.cc \ CBoostedTreeTest.cc \ CBootstrapClustererTest.cc \ CBoundingBoxTest.cc \
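The pattern that recurs throughout these changes, reading predictions and derivatives as memory mapped vectors over a row's float storage and writing gradients back through a lightweight callback instead of returning freshly allocated vectors, can be illustrated with plain Eigen types. The sketch below is editorial and uses stand-ins: Eigen::Map<const Eigen::VectorXf> in place of CMemoryMappedDenseVector, a toy single-parameter squared-error gradient in place of CLoss, and made-up column offsets.

#include <Eigen/Core>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

using TWriter = std::function<void(std::size_t, double)>;
using TMappedVector = Eigen::Map<const Eigen::VectorXf>;

// Toy single parameter squared error loss: the gradient of
// 0.5 * (prediction - actual)^2 is written through a callback so the caller
// decides where the value lands, for example straight into the data frame row.
void toyGradient(const TMappedVector& prediction, double actual, const TWriter& writer) {
    writer(0, prediction(0) - actual);
}

int main() {
    // Pretend this is one data frame row: two feature columns followed by the
    // prediction, gradient and curvature columns. For n loss parameters the
    // flattened Hessian needs n * (n + 1) / 2 columns; here n = 1.
    std::vector<float> row{0.3F, 1.7F, 0.5F, 0.0F, 0.0F};
    std::size_t predictionColumn{2};
    std::size_t gradientColumn{3};

    // Map the prediction in place: no copy and no allocation.
    TMappedVector prediction{row.data() + predictionColumn, 1};

    double actual{1.0};
    toyGradient(prediction, actual,
                [&row, gradientColumn](std::size_t i, double value) {
                    row[gradientColumn + i] = static_cast<float>(value);
                });

    std::cout << "stored gradient = " << row[gradientColumn] << "\n";
    return 0;
}

Keeping the callback's capture to a reference and an offset is the same trick the writeLossGradient and writeLossCurvature comments rely on: the std::function typically stays within its small buffer, so no per-row heap allocation is needed.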