13 #include <type_traits>
27 std::string
name =
"Experiment";
62 std::string base = config.
name.empty() ?
"run" : config.
name;
63 return base +
"_history.csv";
68 if (log_interval <= 0)
return;
69 std::vector<double> loss_hist;
70 std::vector<double> grad_hist;
71 std::vector<double> time_hist;
73 if (loss_hist.empty())
return;
75 std::ofstream log_file(filename);
76 if (!log_file.is_open())
return;
77 log_file <<
"Iteration,Loss,GradNorm,TimeMs\n";
78 int stride = std::max(1, log_interval);
79 for (
size_t i = 0; i < loss_hist.size(); i +=
static_cast<size_t>(stride)) {
80 double loss = loss_hist[i];
81 double grad = (i < grad_hist.size()) ? grad_hist[i] : 0.0;
82 double time_ms = (i < time_hist.size()) ? time_hist[i] : 0.0;
83 log_file << i <<
"," << loss <<
"," << grad <<
"," << time_ms <<
"\n";
90 using Vec = Eigen::VectorXd;
91 using Mat = Eigen::MatrixXd;
94 size_t params_size = network.
getSize();
96 Vec weights(params_size);
97 std::copy(network.getParamsData(), network.getParamsData() + params_size, weights.data());
99 const double inv_samples = (data.
train_x.cols() > 0) ? (1.0 /
static_cast<double>(data.
train_x.cols())) : 0.0;
102 network.setParams(w);
103 const auto &output = network.forward(data.
train_x);
104 Mat diff = output - data.
train_y;
105 double loss = 0.5 * diff.squaredNorm();
106 if (inv_samples != 0.0) loss *= inv_samples;
111 network.setParams(w);
113 const auto &output = network.forward(data.
train_x);
114 Mat diff = output - data.
train_y;
115 network.backward(diff);
118 if (inv_samples != 0.0) g *= inv_samples;
122 Vec final_weights = minimizer.
solve(weights, f, grad);
123 network.setParams(final_weights);
170 using Vec = Eigen::VectorXd;
171 using Mat = Eigen::MatrixXd;
173 auto minimizer = std::make_shared<cpu_mlp::GradientDescent<Vec, Mat>>();
174 minimizer->setMaxIterations(config.
max_iters);
175 minimizer->setTolerance(config.
tolerance);
177 minimizer->useLineSearch(
false);
180 minimizer->setRecorder(&recorder);
200 using Vec = Eigen::VectorXd;
201 using Mat = Eigen::MatrixXd;
203 auto minimizer = std::make_shared<cpu_mlp::LBFGS<Vec, Mat>>();
204 minimizer->setMaxIterations(config.
max_iters);
205 minimizer->setTolerance(config.
tolerance);
206 minimizer->setHistorySize(config.
m_param > 0 ? config.
m_param : 10);
209 minimizer->setRecorder(&recorder);
229 using Vec = Eigen::VectorXd;
230 using Mat = Eigen::MatrixXd;
232 auto minimizer = std::make_shared<cpu_mlp::StochasticGradientDescent<Vec, Mat>>();
233 minimizer->setMaxIterations(config.
max_iters);
238 minimizer->setRecorder(&recorder);
240 std::cout <<
"Starting Batch SGD (CPU Optimized)..." << std::endl;
246 size_t params_size = network.
getSize();
248 Vec weights(params_size);
249 std::copy(network.getParamsData(), network.getParamsData() + params_size, weights.data());
251 long input_rows = data.
train_x.rows();
252 long output_rows = data.
train_y.rows();
254 Mat batch_x_buffer(input_rows, config.
batch_size);
255 Mat batch_y_buffer(output_rows, config.
batch_size);
257 auto batch_g = [&](
const Vec &w,
const std::vector<size_t> &indices, Vec &grad)
mutable {
258 network.setParams(w);
261 long current_bs = indices.size();
263 if (batch_x_buffer.cols() != current_bs) {
264 batch_x_buffer.resize(input_rows, current_bs);
265 batch_y_buffer.resize(output_rows, current_bs);
268 for (
long i = 0; i < current_bs; ++i) {
269 batch_x_buffer.col(i) = data.
train_x.col(indices[i]);
270 batch_y_buffer.col(i) = data.
train_y.col(indices[i]);
273 const auto &output = network.forward(batch_x_buffer);
275 Mat diff = output - batch_y_buffer;
276 network.backward(diff);
278 network.getGrads(grad);
279 grad /=
static_cast<double>(current_bs);
282 auto f_single = [&](
const Vec &w,
const Vec &x,
const Vec &y) ->
double {
283 network.setParams(w);
284 Eigen::MatrixXd input_mat(x.size(), 1);
285 input_mat.col(0) = x;
286 const auto &output = network.forward(input_mat);
287 return 0.5 * (output.col(0) - y).squaredNorm();
290 minimizer->setData(data.
train_x, data.
train_y, f_single, batch_g);
292 Vec final_weights = minimizer->stochastic_solve(weights,
315 using Vec = Eigen::VectorXd;
316 using Mat = Eigen::MatrixXd;
318 auto minimizer = std::make_shared<cpu_mlp::SLBFGS<Vec, Mat>>();
319 minimizer->setMaxIterations(config.
max_iters);
320 minimizer->setTolerance(config.
tolerance);
323 minimizer->setRecorder(&recorder);
330 size_t params_size = network.
getSize();
332 Vec weights(params_size);
333 std::copy(network.getParamsData(), network.getParamsData() + params_size, weights.data());
334 double lambda = 1e-4;
336 long input_rows = data.
train_x.rows();
337 long output_rows = data.
train_y.rows();
340 Mat batch_x_buffer(input_rows, std::max(config.
batch_size, 128));
341 Mat batch_y_buffer(output_rows, std::max(config.
batch_size, 128));
343 auto batch_g = [&](
const Vec &w,
const std::vector<size_t> &indices, Vec &grad)
mutable {
344 network.setParams(w);
347 long current_bs = indices.size();
349 if (batch_x_buffer.cols() < current_bs) {
350 batch_x_buffer.resize(input_rows, current_bs);
351 batch_y_buffer.resize(output_rows, current_bs);
354 bool is_full_batch = (current_bs == N);
357 const auto &output = network.forward(data.
train_x);
358 Mat diff = output - data.
train_y;
359 network.backward(diff);
361 for (
long i = 0; i < current_bs; ++i) {
362 batch_x_buffer.col(i) = data.
train_x.col(indices[i]);
363 batch_y_buffer.col(i) = data.
train_y.col(indices[i]);
365 auto x_view = batch_x_buffer.leftCols(current_bs);
366 auto y_view = batch_y_buffer.leftCols(current_bs);
368 const auto &output = network.forward(x_view);
369 Mat diff = output - y_view;
370 network.backward(diff);
373 network.getGrads(grad);
374 grad /=
static_cast<double>(current_bs);
375 grad.array() += lambda * w.array();
378 auto batch_f = [&](
const Vec &w,
const std::vector<size_t> &indices) ->
double {
379 network.setParams(w);
380 long current_bs = indices.size();
382 if (batch_x_buffer.cols() < current_bs) {
383 batch_x_buffer.resize(input_rows, current_bs);
384 batch_y_buffer.resize(output_rows, current_bs);
387 for (
long i = 0; i < current_bs; ++i) {
388 batch_x_buffer.col(i) = data.
train_x.col(indices[i]);
389 batch_y_buffer.col(i) = data.
train_y.col(indices[i]);
391 auto x_view = batch_x_buffer.leftCols(current_bs);
392 auto y_view = batch_y_buffer.leftCols(current_bs);
394 const auto &output = network.forward(x_view);
395 Vec diff_sq = (output - y_view).colwise().squaredNorm();
396 double loss = 0.5 * diff_sq.sum();
398 loss += 0.5 * lambda * w.squaredNorm();
402 minimizer->setData(batch_f, batch_g);
404 Vec final_weights = minimizer->stochastic_solve(
441 inline std::string cuda_log_filename(
const UnifiedConfig &config) {
442 std::string base = config.
name.empty() ?
"run" : config.
name;
443 return base +
"_history.csv";
446 inline void write_cuda_history_csv(
448 if (log_interval <= 0)
return;
449 std::vector<cuda_mlp::CudaScalar> loss_hist;
450 std::vector<cuda_mlp::CudaScalar> grad_hist;
451 std::vector<cuda_mlp::CudaScalar> time_hist;
452 recorder.copy_to_host(loss_hist, grad_hist, time_hist);
453 if (loss_hist.empty())
return;
455 std::ofstream log_file(filename);
456 if (!log_file.is_open())
return;
457 log_file <<
"Iteration,Loss,GradNorm,TimeMs\n";
458 int stride = std::max(1, log_interval);
459 for (
size_t i = 0; i < loss_hist.size(); i +=
static_cast<size_t>(stride)) {
463 log_file << i <<
"," << loss <<
"," << grad <<
"," << time_ms <<
"\n";
470 template <
typename SolverFactory>
471 inline void run_cuda_solver_once(SolverFactory make_solver,
483 auto loss_grad = [&](
const CudaScalar *params,
497 auto solver = make_solver();
500 solver->setRecorder(&recorder);
502 auto start_time = std::chrono::steady_clock::now();
507 static_cast<int>(dataset.
train_x.cols()),
509 cudaDeviceSynchronize();
510 auto end_time = std::chrono::steady_clock::now();
514 write_cuda_history_csv(cuda_log_filename(config), recorder, config.
log_interval);
539 run_cuda_solver_once(
541 auto solver = std::make_unique<CudaGD>(handle);
578 run_cuda_solver_once(
580 auto solver = std::make_unique<CudaLBFGS>(handle);
616 run_cuda_solver_once(
618 auto solver = std::make_unique<CudaSGD>(handle);
624 solver->setDimensions(
static_cast<int>(d.
train_x.rows()),
static_cast<int>(d.
train_y.rows()));
639 template <
typename T>
class UnavailableOptimizer {
640 static_assert(
sizeof(T) == 0,
"This Optimizer is NOT available on the current Backend (e.g. SLBFGS is CPU-only).");
648 template <
typename Backend>
649 using UnifiedGD =
typename std::conditional<std::is_same<Backend, CpuBackend>::value,
661 template <
typename Backend>
662 using UnifiedLBFGS =
typename std::conditional<std::is_same<Backend, CpuBackend>::value,
674 template <
typename Backend>
675 using UnifiedSGD =
typename std::conditional<std::is_same<Backend, CpuBackend>::value,
688 template <
typename Backend>
689 using UnifiedSLBFGS =
typename std::conditional<std::is_same<Backend, CpuBackend>::value,
692 UnavailableOptimizer<Backend>
CPU recorder that stores loss/gradient history on host.
Definition: iteration_recorder.hpp:18
void copy_to_host(std::vector< double > &loss_out, std::vector< double > &grad_norm_out) const
Copy recorded loss and gradient norm to output vectors.
Definition: iteration_recorder.hpp:50
void init(int capacity)
Allocate buffers for up to capacity iterations.
Definition: iteration_recorder.hpp:21
CPU specialization of the network wrapper.
Definition: network_wrapper.hpp:60
InternalNetwork & getInternal()
Access the underlying CPU network.
Definition: network_wrapper.hpp:72
Definition: network_wrapper.hpp:55
Standard Gradient Descent implementation for CPU.
Definition: unified_optimization.hpp:161
void optimize(NetworkWrapper< CpuBackend > &net, const UnifiedDataset &data, const UnifiedConfig &config) override
Optimize the network using Gradient Descent on CPU.
Definition: unified_optimization.hpp:169
L-BFGS implementation for CPU.
Definition: unified_optimization.hpp:191
void optimize(NetworkWrapper< CpuBackend > &net, const UnifiedDataset &data, const UnifiedConfig &config) override
Optimize the network using L-BFGS on CPU.
Definition: unified_optimization.hpp:199
virtual void optimize(NetworkWrapper< CpuBackend > &net, const UnifiedDataset &data, const UnifiedConfig &config)=0
Executes the optimization strategy.
virtual ~UnifiedOptimizer()=default
Abstract base class for backend-specific optimizer strategies.
Definition: unified_optimization.hpp:135
Stochastic Gradient Descent implementation for CPU (Optimized with Batch Matrix Ops).
Definition: unified_optimization.hpp:220
void optimize(NetworkWrapper< CpuBackend > &net, const UnifiedDataset &data, const UnifiedConfig &config) override
Optimize the network using Stochastic Gradient Descent on CPU.
Definition: unified_optimization.hpp:228
Stochastic L-BFGS implementation for CPU.
Definition: unified_optimization.hpp:306
void optimize(NetworkWrapper< CpuBackend > &net, const UnifiedDataset &data, const UnifiedConfig &config) override
Optimize the network using Stochastic L-BFGS on CPU.
Definition: unified_optimization.hpp:314
Base class for Full Batch Minimizers.
Definition: full_batch_minimizer.hpp:23
virtual V solve(V x, VecFun< V, double > &f, GradFun< V > &Gradient)=0
Performs optimization.
size_t getSize() const
Total number of parameters.
Definition: network.hpp:36
RAII-managed cuBLAS handle.
Definition: cublas_handle.cuh:22
Feed-forward dense network with GPU-backed parameters and gradients.
Definition: network.cuh:16
size_t params_size() const
Total number of parameters.
Definition: network.cuh:62
CudaScalar compute_loss_and_grad(const CudaScalar *input, const CudaScalar *target, int batch)
Compute MSE loss and gradients for a batch.
Definition: network.cuh:97
CudaScalar * grads_data()
Mutable device pointer to gradients.
Definition: network.cuh:69
CudaScalar * params_data()
Mutable device pointer to parameters.
Definition: network.cuh:67
Owning buffer for device memory.
Definition: device_buffer.cuh:7
T * data()
Mutable raw pointer to device memory.
Definition: device_buffer.cuh:68
std::function< T(T)> GradFun
Gradient function type alias (T -> T).
Definition: common.hpp:32
std::function< W(T)> VecFun
Objective function type alias (T -> W).
Definition: common.hpp:35
void device_copy(CudaScalar *dst, const CudaScalar *src, size_t n)
Copy device-to-device.
Definition: kernels.cuh:24
float CudaScalar
Scalar type used across CUDA kernels and optimizers.
Definition: common.cuh:11
Backend-agnostic wrapper for CPU/CUDA networks.
constexpr unsigned int kDefaultSeed
Definition: seed.hpp:4
Backend tag for CPU implementations.
Definition: network_wrapper.hpp:20
Backend tag for CUDA implementations.
Definition: network_wrapper.hpp:22
Configuration parameters for training experiments.
Definition: unified_optimization.hpp:26
int lr_decay_rate
Definition: unified_optimization.hpp:34
unsigned int seed
Definition: unified_optimization.hpp:47
double lr_decay
Definition: unified_optimization.hpp:33
int max_iters
Definition: unified_optimization.hpp:29
bool reset_params
Definition: unified_optimization.hpp:46
int log_interval
Definition: unified_optimization.hpp:43
int b_H_param
Definition: unified_optimization.hpp:40
int batch_size
Definition: unified_optimization.hpp:37
double tolerance
Definition: unified_optimization.hpp:30
int L_param
Definition: unified_optimization.hpp:39
double learning_rate
Definition: unified_optimization.hpp:31
double momentum
Definition: unified_optimization.hpp:32
int m_param
Definition: unified_optimization.hpp:38
std::string name
Definition: unified_optimization.hpp:27
Container for training and test data.
Definition: unified_optimization.hpp:54
Eigen::MatrixXd test_x
Definition: unified_optimization.hpp:57
Eigen::MatrixXd train_x
Definition: unified_optimization.hpp:55
Eigen::MatrixXd test_y
Definition: unified_optimization.hpp:58
Eigen::MatrixXd train_y
Definition: unified_optimization.hpp:56
typename std::conditional< std::is_same< Backend, CpuBackend >::value, UnifiedSLBFGS_CPU, void >::type UnifiedSLBFGS
Unified alias for Stochastic L-BFGS (CPU ONLY). Triggers compile-time error if used with CudaBackend.
Definition: unified_optimization.hpp:696
typename std::conditional< std::is_same< Backend, CpuBackend >::value, UnifiedLBFGS_CPU, void >::type UnifiedLBFGS
Unified alias for L-BFGS (CPU & CUDA).
Definition: unified_optimization.hpp:669
void write_cpu_history_csv(const std::string &filename, const IterationRecorder< CpuBackend > &recorder, int log_interval)
Definition: unified_optimization.hpp:66
typename std::conditional< std::is_same< Backend, CpuBackend >::value, UnifiedSGD_CPU, void >::type UnifiedSGD
Unified alias for Stochastic Gradient Descent (CPU & CUDA).
Definition: unified_optimization.hpp:682
std::string cpu_log_filename(const UnifiedConfig &config)
Definition: unified_optimization.hpp:61
void run_full_batch_cpu(NetworkWrapper< CpuBackend > &net, const UnifiedDataset &data, cpu_mlp::FullBatchMinimizer< Eigen::VectorXd, Eigen::MatrixXd > &minimizer)
Definition: unified_optimization.hpp:87
typename std::conditional< std::is_same< Backend, CpuBackend >::value, UnifiedGD_CPU, void >::type UnifiedGD
Unified alias for Gradient Descent (CPU & CUDA).
Definition: unified_optimization.hpp:656