// Copyright (C) 2008 Davis E. King (davis@dlib.net)
// License: Boost Software License.  See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
#define DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
#include <cmath>
#include <limits>
#include "../matrix.h"
#include "../algs.h"
#include "optimization_search_strategies_abstract.h"
#include "../sequence.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class cg_search_strategy
{
public:
cg_search_strategy() : been_used(false) {}
double get_wolfe_rho (
) const { return 0.001; }
double get_wolfe_sigma (
) const { return 0.01; }
unsigned long get_max_line_search_iterations (
) const { return 100; }
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& /*x*/,
const double /*funct_value*/,
const T& funct_derivative
)
{
if (been_used == false)
{
been_used = true;
prev_direction = -funct_derivative;
}
else
{
// Use the Polak-Ribiere conjugate gradient update, equation (4.1.12) on page 83
// of Fletcher's Practical Methods of Optimization.
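// That is, the new direction is -g_new + b*d_prev where
// b = g_new'*(g_new - g_old) / (g_old'*g_old).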
const double temp = trans(prev_derivative)*prev_derivative;
// If this value hits zero then just use the direction of steepest descent.
if (std::abs(temp) < std::numeric_limits<double>::epsilon())
{
prev_derivative = funct_derivative;
prev_direction = -funct_derivative;
return prev_direction;
}
double b = trans(funct_derivative-prev_derivative)*funct_derivative/(temp);
prev_direction = -funct_derivative + b*prev_direction;
}
prev_derivative = funct_derivative;
return prev_direction;
}
private:
bool been_used;
matrix<double,0,1> prev_derivative;
matrix<double,0,1> prev_direction;
};
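// Example use of cg_search_strategy (a sketch, not part of this header: find_min()
// and objective_delta_stop_strategy are declared elsewhere in dlib/optimization.h,
// while rosen() and rosen_derivative() stand in for any user-supplied objective
// function and its gradient):
//
//    matrix<double,0,1> starting_point(2);
//    starting_point = 0.8, 1.3;
//    find_min(cg_search_strategy(),
//             objective_delta_stop_strategy(1e-7),
//             rosen, rosen_derivative, starting_point, -1);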
// ----------------------------------------------------------------------------------------
class bfgs_search_strategy
{
public:
bfgs_search_strategy() : been_used(false), been_used_twice(false) {}
double get_wolfe_rho (
) const { return 0.01; }
double get_wolfe_sigma (
) const { return 0.9; }
unsigned long get_max_line_search_iterations (
) const { return 100; }
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& x,
const double /*funct_value*/,
const T& funct_derivative
)
{
if (been_used == false)
{
been_used = true;
H = identity_matrix<double>(x.size());
}
else
{
// update H with the BFGS formula from (3.2.12) on page 55 of Fletcher
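// That is, with delta = x - prev_x, gamma = g_new - g_old, and dg = delta'*gamma:
//   H <- H + (1 + gamma'*H*gamma/dg)*delta*trans(delta)/dg
//          - (delta*(trans(gamma)*H) + (H*gamma)*trans(delta))/dg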
delta = (x-prev_x);
gamma = funct_derivative-prev_derivative;
double dg = dot(delta,gamma);
// Try to set the initial value of the H matrix to something reasonable while we are
// still in the first few iterations.  The formula below is the scaling suggested in
// the book Numerical Optimization by Nocedal and Wright, in the chapter on
// Quasi-Newton methods.
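// That is, H_0 = ((delta'*gamma)/(gamma'*gamma))*I, with the scale factor
// clamped to the range [0.01, 100].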
if (been_used_twice == false)
{
double gg = trans(gamma)*gamma;
if (std::abs(gg) > std::numeric_limits<double>::epsilon())
{
const double temp = put_in_range(0.01, 100, dg/gg);
H = diagm(uniform_matrix<double>(x.size(),1, temp));
been_used_twice = true;
}
}
Hg = H*gamma;
gH = trans(trans(gamma)*H);
double gHg = trans(gamma)*H*gamma;
// Only apply the update if everything is finite and dg is nonzero.  Otherwise
// the division by dg would corrupt H, so instead we reset it to the identity
// and start accumulating curvature information again.
if (gHg < std::numeric_limits<double>::infinity() && dg < std::numeric_limits<double>::infinity() &&
dg != 0)
{
H += (1 + gHg/dg)*delta*trans(delta)/(dg) - (delta*trans(gH) + Hg*trans(delta))/(dg);
}
else
{
H = identity_matrix<double>(H.nr());
been_used_twice = false;
}
}
prev_x = x;
prev_direction = -H*funct_derivative;
prev_derivative = funct_derivative;
return prev_direction;
}
private:
bool been_used;
bool been_used_twice;
matrix<double,0,1> prev_x;
matrix<double,0,1> prev_derivative;
matrix<double,0,1> prev_direction;
matrix<double> H;
matrix<double,0,1> delta, gamma, Hg, gH;
};
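// Example use of bfgs_search_strategy (same sketch as the cg example above, with
// the BFGS strategy swapped in):
//
//    find_min(bfgs_search_strategy(),
//             objective_delta_stop_strategy(1e-7),
//             rosen, rosen_derivative, starting_point, -1);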
// ----------------------------------------------------------------------------------------
class lbfgs_search_strategy
{
public:
explicit lbfgs_search_strategy(unsigned long max_size_) : max_size(max_size_), been_used(false)
{
DLIB_ASSERT (
max_size > 0,
"\t lbfgs_search_strategy(max_size)"
<< "\n\t max_size can't be zero"
);
}
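// Note that this copy constructor does not copy the stored (s,y) data
// sequence (dlib sequence objects are not copyable), so a copied search
// strategy starts over with an empty history.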
lbfgs_search_strategy(const lbfgs_search_strategy& item)
{
max_size = item.max_size;
been_used = item.been_used;
prev_x = item.prev_x;
prev_derivative = item.prev_derivative;
prev_direction = item.prev_direction;
alpha = item.alpha;
dh_temp = item.dh_temp;
}
double get_wolfe_rho (
) const { return 0.01; }
double get_wolfe_sigma (
) const { return 0.9; }
unsigned long get_max_line_search_iterations (
) const { return 100; }
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& x,
const double /*funct_value*/,
const T& funct_derivative
)
{
prev_direction = -funct_derivative;
if (been_used == false)
{
been_used = true;
}
else
{
// add an element into the stored data sequence
dh_temp.s = x - prev_x;
dh_temp.y = funct_derivative - prev_derivative;
double temp = dot(dh_temp.s, dh_temp.y);
// only accept this pair if s'*y isn't essentially zero, since we have to divide by it
if (std::abs(temp) > std::numeric_limits<double>::epsilon())
{
dh_temp.rho = 1/temp;
data.add(data.size(), dh_temp);
}
else
{
// the curvature information in this pair is degenerate, so throw away
// the accumulated history and start over
data.clear();
}
if (data.size() > 0)
{
// This block of code is from algorithm 7.4 in the Nocedal book.
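// It implements the two-loop recursion: the first loop walks backwards through
// the stored (s,y) pairs and the second walks forwards.  Also note that the
// backwards loop terminates via unsigned wraparound: once i decrements past 0
// it wraps to a huge value and the i < data.size() test fails.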
alpha.resize(data.size());
for (unsigned long i = data.size()-1; i < data.size(); --i)
{
alpha[i] = data[i].rho*dot(data[i].s, prev_direction);
prev_direction -= alpha[i]*data[i].y;
}
// Take a guess at what the initial H matrix should be.  The formula below is the
// scaling suggested in the book Numerical Optimization by Nocedal and Wright, in
// the chapter on Large Scale Unconstrained Optimization (the L-BFGS section).
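// Since rho == 1/(s'*y), the expression below computes H_0 = (s'*y)/(y'*y)
// using the most recently stored pair.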
double H_0 = 1.0/data[data.size()-1].rho/dot(data[data.size()-1].y, data[data.size()-1].y);
H_0 = put_in_range(0.001, 1000.0, H_0);
prev_direction *= H_0;
for (unsigned long i = 0; i < data.size(); ++i)
{
double beta = data[i].rho*dot(data[i].y, prev_direction);
prev_direction += data[i].s * (alpha[i] - beta);
}
}
}
if (data.size() > max_size)
{
// remove the oldest element in the data sequence (dh_temp merely receives
// the removed item here and is reused as scratch space)
data.remove(0, dh_temp);
}
prev_x = x;
prev_derivative = funct_derivative;
return prev_direction;
}
private:
struct data_helper
{
matrix<double,0,1> s;
matrix<double,0,1> y;
double rho;
friend void swap(data_helper& a, data_helper& b)
{
a.s.swap(b.s);
a.y.swap(b.y);
std::swap(a.rho, b.rho);
}
};
sequence<data_helper>::kernel_2a data;
unsigned long max_size;
bool been_used;
matrix<double,0,1> prev_x;
matrix<double,0,1> prev_derivative;
matrix<double,0,1> prev_direction;
std::vector<double> alpha;
data_helper dh_temp;
};
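// Example use of lbfgs_search_strategy (a sketch; the constructor argument is the
// number of (s,y) pairs to remember, and 10 is a typical choice for large problems):
//
//    find_min(lbfgs_search_strategy(10),
//             objective_delta_stop_strategy(1e-7),
//             rosen, rosen_derivative, starting_point, -1);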
// ----------------------------------------------------------------------------------------
template <typename hessian_funct>
class newton_search_strategy_obj
{
public:
explicit newton_search_strategy_obj(
const hessian_funct& hess
) : hessian(hess) {}
double get_wolfe_rho (
) const { return 0.01; }
double get_wolfe_sigma (
) const { return 0.9; }
unsigned long get_max_line_search_iterations (
) const { return 100; }
template <typename T>
// Note that, unlike the strategies above, this returns the direction by value
// since it is recomputed from scratch on each call.
matrix<double,0,1> get_next_direction (
const T& x,
const double ,
const T& funct_derivative
)
{
return -inv(hessian(x))*funct_derivative;
}
private:
hessian_funct hessian;
};
template <typename hessian_funct>
newton_search_strategy_obj<hessian_funct> newton_search_strategy (
hessian_funct hessian
) { return newton_search_strategy_obj<hessian_funct>(hessian); }
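// Example use of newton_search_strategy (a sketch; rosen_hessian() stands in for
// any user-supplied function returning the Hessian matrix at a point):
//
//    find_min(newton_search_strategy(rosen_hessian),
//             objective_delta_stop_strategy(1e-7),
//             rosen, rosen_derivative, starting_point, -1);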
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_