lbfgs.cpp

//   MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
//
//   $Id: lbfgs.c 1528 2006-08-07 02:39:50Z taku $;
//
//   lbfgs.c was ported from the FORTRAN code of lbfgs.m to C
//   using f2c converter
//
//   http://www.ece.northwestern.edu/~nocedal/lbfgs.html
//
//   Software for Large-scale Unconstrained Optimization
//   L-BFGS is a limited-memory quasi-Newton code for unconstrained
//   optimization.
//   The code has been developed at the Optimization Technology Center,
//   a joint venture of Argonne National Laboratory and Northwestern University.
//
//   Authors
//   Jorge Nocedal
//
//   References
//   - J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage(1980),
//   Mathematics of Computation 35, pp. 773-782.
//   - D.C. Liu and J. Nocedal. On the Limited Memory Method for
//   Large Scale Optimization(1989),
//   Mathematical Programming B, 45, 3, pp. 503-528.
#include <cmath>
#include <iostream>
#include <numeric>
#include "lbfgs.h"
#include "common.h"

#define min(a, b) ((a) <= (b) ? (a) : (b))
#define max(a, b) ((a) >= (b) ? (a) : (b))

namespace {
static const double ftol = 1e-4;
static const double xtol = 1e-16;
static const double eps  = 1e-7;
static const double lb3_1_gtol = 0.9;
static const double lb3_1_stpmin = 1e-20;
static const double lb3_1_stpmax = 1e20;
static const int lb3_1_mp = 6;
static const int lb3_1_lp = 6;

// inline double pi(double x, double y) {
//   return CRFPP::sigma(x) == CRFPP::sigma(y) ?x : 0.0;
// }

inline void daxpy_(int n, double da, const double *dx, double *dy) {
  for (int i = 0; i < n; ++i)
    dy[i] += da * dx[i];
}

inline double ddot_(int size, const double *dx, const double *dy) {
  return std::inner_product(dx, dx + size, dy, 0.0);
}

void mcstep(double *stx, double *fx, double *dx,
            double *sty, double *fy, double *dy,
            double *stp, double fp, double dp,
            int *brackt,
            double stpmin, double stpmax,
            int *info) {
  bool bound = true;
  double p, q, s, d1, d2, d3, r, gamma, theta, stpq, stpc, stpf;
  *info = 0;

  if (*brackt && ((*stp <= min(*stx, *sty) || *stp >= max(*stx, *sty)) ||
                  *dx * (*stp - *stx) >= 0.0 || stpmax < stpmin)) {
    return;
  }

  double sgnd = dp * (*dx / std::abs(*dx));

  if (fp > *fx) {
    *info = 1;
    bound = true;
    theta =(*fx - fp) * 3 / (*stp - *stx) + *dx + dp;
    d1 = std::abs(theta);
    d2 = std::abs(*dx);
    d1 = max(d1, d2);
    d2 = std::abs(dp);
    s = max(d1, d2);
    d1 = theta / s;
    gamma = s * std::sqrt(d1 * d1 - *dx / s *(dp / s));
    if (*stp < *stx) {
      gamma = -gamma;
    }
    p = gamma - *dx + theta;
    q = gamma - *dx + gamma + dp;
    r = p / q;
    stpc = *stx + r * (*stp - *stx);
    stpq = *stx + *dx / ((*fx - fp) /
                         (*stp - *stx) + *dx) / 2 * (*stp - *stx);
    if ((d1 = stpc - *stx, std::abs(d1)) < (d2 = stpq - *stx, std::abs(d2))) {
      stpf = stpc;
    } else {
      stpf = stpc + (stpq - stpc) / 2;
    }
    *brackt = true;
  } else if (sgnd < 0.0) {
    *info = 2;
    bound = false;
    theta = (*fx - fp) * 3 / (*stp - *stx) + *dx + dp;
    d1 = std::abs(theta);
    d2 = std::abs(*dx);
    d1 = max(d1, d2);
    d2 = std::abs(dp);
    s = max(d1, d2);
    d1 = theta / s;
    gamma = s * std::sqrt(d1 * d1 - *dx / s * (dp / s));
    if (*stp > *stx) {
      gamma = -gamma;
    }
    p = gamma - dp + theta;
    q = gamma - dp + gamma + *dx;
    r = p / q;
    stpc = *stp + r *(*stx - *stp);
    stpq = *stp + dp /(dp - *dx) * (*stx - *stp);
    if ((d1 = stpc - *stp, std::abs(d1)) > (d2 = stpq - *stp, std::abs(d2))) {
      stpf = stpc;
    } else {
      stpf = stpq;
    }
    *brackt = true;
  } else if (std::abs(dp) < std::abs(*dx)) {
    *info = 3;
    bound = true;
    theta = (*fx - fp) * 3 / (*stp - *stx) + *dx + dp;
    d1 = std::abs(theta);
    d2 = std::abs(*dx);
    d1 = max(d1, d2);
    d2 = std::abs(dp);
    s = max(d1, d2);
    d3 = theta / s;
    d1 = 0.0;
    d2 = d3 * d3 - *dx / s *(dp / s);
    gamma = s * std::sqrt((max(d1, d2)));
    if (*stp > *stx) {
      gamma = -gamma;
    }
    p = gamma - dp + theta;
    q = gamma + (*dx - dp) + gamma;
    r = p / q;
    if (r < 0.0 && gamma != 0.0) {
      stpc = *stp + r *(*stx - *stp);
    } else if (*stp > *stx) {
      stpc = stpmax;
    } else {
      stpc = stpmin;
    }
    stpq = *stp + dp /(dp - *dx) * (*stx - *stp);
    if (*brackt) {
      if ((d1 = *stp - stpc, std::abs(d1)) <
          (d2 = *stp - stpq, std::abs(d2))) {
        stpf = stpc;
      } else {
        stpf = stpq;
      }
    } else {
      if ((d1 = *stp - stpc, std::abs(d1)) >
          (d2 = *stp - stpq, std::abs(d2))) {
        stpf = stpc;
      } else {
        stpf = stpq;
      }
    }
  } else {
    *info = 4;
    bound = false;
    if (*brackt) {
      theta =(fp - *fy) * 3 / (*sty - *stp) + *dy + dp;
      d1 = std::abs(theta);
      d2 = std::abs(*dy);
      d1 = max(d1, d2);
      d2 = std::abs(dp);
      s = max(d1, d2);
      d1 = theta / s;
      gamma = s * std::sqrt(d1 * d1 - *dy / s * (dp / s));
      if (*stp > *sty) {
        gamma = -gamma;
      }
      p = gamma - dp + theta;
      q = gamma - dp + gamma + *dy;
      r = p / q;
      stpc = *stp + r * (*sty - *stp);
      stpf = stpc;
    } else if (*stp > *stx) {
      stpf = stpmax;
    } else {
      stpf = stpmin;
    }
  }

  if (fp > *fx) {
    *sty = *stp;
    *fy = fp;
    *dy = dp;
  } else {
    if (sgnd < 0.0) {
      *sty = *stx;
      *fy = *fx;
      *dy = *dx;
    }
    *stx = *stp;
    *fx = fp;
    *dx = dp;
  }

  stpf = min(stpmax, stpf);
  stpf = max(stpmin, stpf);
  *stp = stpf;
  if (*brackt && bound) {
    if (*sty > *stx) {
      d1 = *stx + (*sty - *stx) * 0.66;
      *stp = min(d1, *stp);
    } else {
      d1 = *stx + (*sty - *stx) * 0.66;
      *stp = max(d1, *stp);
    }
  }

  return;
}
}

namespace CRFPP {

class LBFGS::Mcsrch {
 private:
  int infoc, stage1, brackt;
  double finit, dginit, dgtest, width, width1;
  double stx, fx, dgx, sty, fy, dgy, stmin, stmax;

 public:
  Mcsrch():
      infoc(0),
      stage1(0),
      brackt(0),
      finit(0.0), dginit(0.0), dgtest(0.0), width(0.0), width1(0.0),
      stx(0.0), fx(0.0), dgx(0.0), sty(0.0), fy(0.0), dgy(0.0),
      stmin(0.0), stmax(0.0) {}

  void mcsrch(int size,
              double *x,
              double f, const double *g, double *s,
              double *stp,
              int *info, int *nfev, double *wa) {
    static const double p5 = 0.5;
    static const double p66 = 0.66;
    static const double xtrapf = 4.0;
    static const int maxfev = 20;

    /* Parameter adjustments */
    --wa;
    --s;
    --g;
    --x;

    if (*info == -1) goto L45;
    infoc = 1;

    if (size <= 0 || *stp <= 0.0) return;

    dginit = ddot_(size, &g[1], &s[1]);
    if (dginit >= 0.0) return;

    brackt = false;
    stage1 = true;
    *nfev = 0;
    finit = f;
    dgtest = ftol * dginit;
    width = lb3_1_stpmax - lb3_1_stpmin;
    width1 = width / p5;
    for (int j = 1; j <= size; ++j) {
      wa[j] = x[j];
    }

    stx = 0.0;
    fx = finit;
    dgx = dginit;
    sty = 0.0;
    fy = finit;
    dgy = dginit;

    while (true) {
      if (brackt) {
        stmin = min(stx, sty);
        stmax = max(stx, sty);
      } else {
        stmin = stx;
        stmax = *stp + xtrapf * (*stp - stx);
      }

      *stp = max(*stp, lb3_1_stpmin);
      *stp = min(*stp, lb3_1_stpmax);

      if ((brackt && ((*stp <= stmin || *stp >= stmax) ||
                      *nfev >= maxfev - 1 || infoc == 0)) ||
          (brackt && (stmax - stmin <= xtol * stmax))) {
        *stp = stx;
      }

      for (int j = 1; j <= size; ++j) {
        x[j] = wa[j] + *stp * s[j];
      }
      *info = -1;
      return;

   L45:
      *info = 0;
      ++(*nfev);
      double dg = ddot_(size, &g[1], &s[1]);
      double ftest1 = finit + *stp * dgtest;

      if (brackt && ((*stp <= stmin || *stp >= stmax) || infoc == 0)) {
        *info = 6;
      }
      if (*stp == lb3_1_stpmax && f <= ftest1 && dg <= dgtest) {
        *info = 5;
      }
      if (*stp == lb3_1_stpmin && (f > ftest1 || dg >= dgtest)) {
        *info = 4;
      }
      if (*nfev >= maxfev) {
        *info = 3;
      }
      if (brackt && stmax - stmin <= xtol * stmax) {
        *info = 2;
      }
      if (f <= ftest1 && std::abs(dg) <= lb3_1_gtol * (-dginit)) {
        *info = 1;
      }

      if (*info != 0) {
        return;
      }

      if (stage1 && f <= ftest1 && dg >= min(ftol, lb3_1_gtol) * dginit) {
        stage1 = false;
      }

      if (stage1 && f <= fx && f > ftest1) {
        double fm = f - *stp * dgtest;
        double fxm = fx - stx * dgtest;
        double fym = fy - sty * dgtest;
        double dgm = dg - dgtest;
        double dgxm = dgx - dgtest;
        double dgym = dgy - dgtest;
        mcstep(&stx, &fxm, &dgxm, &sty, &fym, &dgym, stp, fm, dgm, &brackt,
               stmin, stmax, &infoc);
        fx = fxm + stx * dgtest;
        fy = fym + sty * dgtest;
        dgx = dgxm + dgtest;
        dgy = dgym + dgtest;
      } else {
        mcstep(&stx, &fx, &dgx, &sty, &fy, &dgy, stp, f, dg, &brackt,
               stmin, stmax, &infoc);
      }

      if (brackt) {
        double d1 = 0.0;
        if ((d1 = sty - stx, std::abs(d1)) >= p66 * width1) {
          *stp = stx + p5 * (sty - stx);
        }
        width1 = width;
        width = (d1 = sty - stx, std::abs(d1));
      }
    }

    return;
  }
};

void LBFGS::clear() {
  iflag_ = iscn = nfev = iycn = point = npt =
      iter = info = ispt = isyt = iypt = 0;
  stp = stp1 = 0.0;
  diag_.clear();
  w_.clear();
  delete mcsrch_;
  mcsrch_ = 0;
}

void LBFGS::pseudo_gradient(int size,
                            double *v,
                            double *x,
                            const double *g,
                            double C) {
  for (int i = 1; i <= size; ++i) {
    if (x[i] == 0) {
      if (g[i] + C < 0) {
        v[i] = g[i] + C;
      } else if (g[i] - C > 0) {
        v[i] = g[i] - C;
      } else {
        v[i] = 0;
      }
    }  else {
      v[i] = g[i] + C * sigma(x[i]);
    }
  }
}


void LBFGS::lbfgs_optimize(int size,
                           int msize,
                           double *x,
                           double f,
                           const double *g,
                           double *diag,
                           double *w,
                           bool orthant,
                           double C,
                           double *v,
                           double *xi,
                           int *iflag) {
  double yy = 0.0;
  double ys = 0.0;
  int bound = 0;
  int cp = 0;

  --diag;
  --g;
  --x;
  --w;
  --v;

  if (orthant) {
    --xi;
    pseudo_gradient(size, v, x, g, C);
  }

  if (!mcsrch_) mcsrch_ = new Mcsrch;

  if (*iflag == 1) goto L172;
  if (*iflag == 2) goto L100;

  // initialization
  if (*iflag == 0) {
    point = 0;
    for (int i = 1; i <= size; ++i) {
      diag[i] = 1.0;
    }
    ispt = size + (msize << 1);
    iypt = ispt + size * msize;
    for (int i = 1; i <= size; ++i) {
      w[ispt + i] = -v[i] * diag[i];
    }
    stp1 = 1.0 / std::sqrt(ddot_(size, &v[1], &v[1]));
  }

  // MAIN ITERATION LOOP
  while (true) {
    ++iter;
    info = 0;
    if (orthant) {
      for (int i = 1; i <= size; ++i) {
        xi[i] = (x[i] != 0 ? sigma(x[i]) : sigma(-v[i]));
      }
    }
    if (iter == 1) goto L165;
    if (iter > size) bound = size;

    // COMPUTE -H*G USING THE FORMULA GIVEN IN: Nocedal, J. 1980,
    // "Updating quasi-Newton matrices with limited storage",
    // Mathematics of Computation, Vol.24, No.151, pp. 773-782.
    ys = ddot_(size, &w[iypt + npt + 1], &w[ispt + npt + 1]);
    yy = ddot_(size, &w[iypt + npt + 1], &w[iypt + npt + 1]);
    for (int i = 1; i <= size; ++i) {
      diag[i] = ys / yy;
    }

 L100:
    cp = point;
    if (point == 0) cp = msize;
    w[size + cp] = 1.0 / ys;

    for (int i = 1; i <= size; ++i) {
      w[i] = -v[i];
    }

    bound = min(iter - 1, msize);

    cp = point;
    for (int i = 1; i <= bound; ++i) {
      --cp;
      if (cp == -1) cp = msize - 1;
      double sq = ddot_(size, &w[ispt + cp * size + 1], &w[1]);
      int inmc = size + msize + cp + 1;
      iycn = iypt + cp * size;
      w[inmc] = w[size + cp + 1] * sq;
      double d = -w[inmc];
      daxpy_(size, d, &w[iycn + 1], &w[1]);
    }

    for (int i = 1; i <= size; ++i) {
      w[i] = diag[i] * w[i];
    }

    for (int i = 1; i <= bound; ++i) {
      double yr = ddot_(size, &w[iypt + cp * size + 1], &w[1]);
      double beta = w[size + cp + 1] * yr;
      int inmc = size + msize + cp + 1;
      beta = w[inmc] - beta;
      iscn = ispt + cp * size;
      daxpy_(size, beta, &w[iscn + 1], &w[1]);
      ++cp;
      if (cp == msize) cp = 0;
    }

    if (orthant) {
      for (int i = 1; i <= size; ++i) {
        w[i] = (sigma(w[i]) == sigma(-v[i]) ? w[i] : 0);
      }
    }
    // STORE THE NEW SEARCH DIRECTION
    for (int i = 1; i <= size; ++i) {
      w[ispt + point * size + i] = w[i];
    }

 L165:
    // OBTAIN THE ONE-DIMENSIONAL MINIMIZER OF THE FUNCTION
    // BY USING THE LINE SEARCH ROUTINE MCSRCH
    nfev = 0;
    stp = 1.0;
    if (iter == 1) {
      stp = stp1;
    }
    for (int i = 1; i <= size; ++i) {
      w[i] = g[i];
    }

 L172:
    mcsrch_->mcsrch(size, &x[1], f, &v[1], &w[ispt + point * size + 1],
                    &stp, &info, &nfev, &diag[1]);
    if (info == -1) {
      if (orthant) {
        for (int i = 1; i <= size; ++i) {
          x[i] = (sigma(x[i]) == sigma(xi[i]) ? x[i] : 0);
        }
      }
      *iflag = 1;  // next value
      return;
    }
    if (info != 1) {
      std::cerr << "The line search routine mcsrch failed: error code:"
                << info << std::endl;
      *iflag = -1;
      return;
    }

    // COMPUTE THE NEW STEP AND GRADIENT CHANGE
    npt = point * size;
    for (int i = 1; i <= size; ++i) {
      w[ispt + npt + i] = stp * w[ispt + npt + i];
      w[iypt + npt + i] = g[i] - w[i];
    }
    ++point;
    if (point == msize) point = 0;

    double gnorm = std::sqrt(ddot_(size, &v[1], &v[1]));
    double xnorm = max(1.0, std::sqrt(ddot_(size, &x[1], &x[1])));
    if (gnorm / xnorm <= eps) {
      *iflag = 0;  // OK terminated
      return;
    }
  }

  return;
}
}