My Project
s_gd.hpp
Go to the documentation of this file.
#pragma once

#include "../common.hpp"
#include "../seed.hpp"

#include <Eigen/Eigen>

#include <chrono>
#include <cmath>
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <random>
#include <stdexcept>
#include <vector>
10 
11 namespace cpu_mlp {
12 
17 template <typename V, typename M> class StochasticGradientDescent : public StochasticMinimizer<V, M> {
18  // Public inheritance needed!
20  using Base::_iters;
21  using Base::_max_iters;
22  using Base::_tol;
23  using Base::step_size;
24 
25 public:
27  using Base::setStepSize;
28  using Base::setTolerance;
29 
30  using S_VecFun = std::function<double(const V &, const V &, const V &)>;
31  using BatchGradFun = std::function<void(const V &, const std::vector<size_t> &, V &)>;
32 
37 
41  static std::vector<size_t> sample_minibatch_indices(const size_t N, size_t batch_size, std::mt19937 &rng);
42 
46  void setData(const M &inputs, const M &targets, const S_VecFun &f, const BatchGradFun &g) {
47  _inputs = inputs;
48  _targets = targets;
49  _sf = f;
50  _batch_g = g;
51  }
52 
63  V stochastic_solve(const V &init_w,
64  int m,
65  int b,
66  double step,
67  bool verbose = false,
68 
69  int print_every = 50) {
70  if (_inputs.cols() == 0 || _targets.cols() == 0) {
71  throw std::runtime_error("No data set for StochasticGradientDescent::stochastic_solve");
72  }
73 
74  int N = static_cast<int>(_inputs.cols());
75  std::vector<size_t> full_indices(static_cast<size_t>(N));
76  for (int i = 0; i < N; ++i) {
77  full_indices[static_cast<size_t>(i)] = static_cast<size_t>(i);
78  }
79 
80  V w = init_w;
81  int dim = static_cast<int>(w.size());
82  step_size = step;
83 
84  double passes = 0.0;
85  std::mt19937 rng(kDefaultSeed);
86 
87  const bool timing = (this->recorder_ != nullptr);
88  if (this->recorder_) this->recorder_->reset();
89  auto start_time = std::chrono::steady_clock::now();
90 
91  while (_iters < _max_iters) {
92  // one epoch of m minibatches
93  double last_grad_norm = 0.0;
94 
95  for (int t = 0; t < m; ++t) {
96  auto minibatch_indices = sample_minibatch_indices(N, b, rng);
97 
98  V grad_est = V::Zero(dim);
99 
100  _batch_g(w, minibatch_indices, grad_est);
101  last_grad_norm = grad_est.norm();
102 
103  w = w - step_size * grad_est;
104 
105  passes += static_cast<double>(b) / static_cast<double>(N);
106  }
107 
108  double loss = 0.0;
109  for (int i = 0; i < N; ++i) {
110  loss += _sf(w, _inputs.col(i), _targets.col(i));
111  }
112  loss /= static_cast<double>(N);
113 
114  if (verbose && ((_iters % print_every) == 0)) {
115  std::cout << "Epoch " << (_iters + 1) << " loss=" << loss << " passes=" << passes << std::endl;
116  }
117 
118  if (this->recorder_) {
119  double grad_norm = last_grad_norm;
120  if (N > 0) {
121  V full_grad = V::Zero(dim);
122  _batch_g(w, full_indices, full_grad);
123  grad_norm = full_grad.norm();
124  }
125  double elapsed_ms = 0.0;
126  if (timing) {
127  auto now = std::chrono::steady_clock::now();
128  elapsed_ms = std::chrono::duration<double, std::milli>(now - start_time).count();
129  }
130  this->recorder_->record(_iters, loss, grad_norm, elapsed_ms);
131  }
132 
133  _iters++;
134  }
135 
136  return w;
137  }
138 
139 private:
140  M _inputs;
141  M _targets;
142  S_VecFun _sf;
143  BatchGradFun _batch_g;
144 };
145 
146 template <typename V, typename M>
148  const size_t N, size_t batch_size, std::mt19937 &rng) {
149  if (N == 0 || batch_size == 0) {
150  std::cerr << "Warning: sampling minibatch from empty dataset or with batch_size=0." << std::endl;
151  return {};
152  }
153 
154  std::vector<size_t> idx(N);
155  for (size_t i = 0; i < N; ++i) {
156  idx[i] = i;
157  }
158 
159  if (batch_size >= N) {
160  return idx;
161  }
162 
163  for (size_t i = 0; i < batch_size; ++i) {
164  std::uniform_int_distribution<size_t> dist(i, N - 1);
165  size_t j = dist(rng);
166  std::swap(idx[i], idx[j]);
167  }
168  idx.resize(batch_size);
169  return idx;
170 }
171 
172 } // namespace cpu_mlp
void reset()
Reset recorded size without releasing memory.
Definition: iteration_recorder.hpp:31
void record(int idx, double loss, double grad_norm, double time_ms=0.0)
Record a loss/grad/time entry at iteration index.
Definition: iteration_recorder.hpp:40
Stochastic Gradient Descent (SGD) minimizer.
Definition: s_gd.hpp:17
V stochastic_solve(const V &init_w, int m, int b, double step, bool verbose=false, int print_every=50)
Execute Stochastic Solve.
Definition: s_gd.hpp:63
static std::vector< size_t > sample_minibatch_indices(const size_t N, size_t batch_size, std::mt19937 &rng)
Helper to sample minibatch indices without replacement (Fisher-Yates shuffle subset).
Definition: s_gd.hpp:147
std::function< void(const V &, const std::vector< size_t > &, V &)> BatchGradFun
Definition: s_gd.hpp:31
std::function< double(const V &, const V &, const V &)> S_VecFun
Definition: s_gd.hpp:30
void setData(const M &inputs, const M &targets, const S_VecFun &f, const BatchGradFun &g)
Configure data and callbacks for the solver.
Definition: s_gd.hpp:46
StochasticGradientDescent()=default
Default constructor.
Base class for Stochastic Minimizers.
Definition: stochastic_minimizer.hpp:16
void setMaxIterations(int max_iters)
Sets the maximum number of iterations.
Definition: stochastic_minimizer.hpp:24
unsigned int _iters
Definition: stochastic_minimizer.hpp:45
unsigned int _max_iters
Definition: stochastic_minimizer.hpp:44
void setTolerance(double tol)
Sets the tolerance for convergence (full gradient norm).
Definition: stochastic_minimizer.hpp:36
::IterationRecorder< CpuBackend > * recorder_
Optional recorder for diagnostics.
Definition: stochastic_minimizer.hpp:48
void setStepSize(double s)
Sets the step size (learning rate).
Definition: stochastic_minimizer.hpp:30
double step_size
Definition: stochastic_minimizer.hpp:47
double _tol
Definition: stochastic_minimizer.hpp:46
Definition: layer.hpp:13
constexpr unsigned int kDefaultSeed
Definition: seed.hpp:4