00001
00005
00006
00007
00008
00009
00010
00011
00012 #define STUMP_DOUBLE_DIRECTIONS 0
00013
00014 #include <assert.h>
00015 #include <cmath>
00016 #include <numeric>
00017 #include "random.h"
00018 #include "utility.h"
00019 #include "stump.h"
00020 #include "perceptron.h"
00021
00022 REGISTER_CREATOR(lemga::Perceptron);
00023
00024 typedef std::vector<REAL> RVEC;
00025 typedef std::vector<RVEC> RMAT;
00026
00027 #define DOTPROD(x,y) std::inner_product(x.begin(), x.end(), y.begin(), .0)
00028
00029 #define DOTPROD_NB(x,y) std::inner_product(x.begin(),x.end()-1,y.begin(),.0)
00030
00031 inline RVEC randvec (UINT n) {
00032 RVEC x(n);
00033 for (UINT i = 0; i < n; ++i)
00034 x[i] = 2*randu() - 1;
00035 return x;
00036 }
00037
00038 inline RVEC coorvec (UINT n, UINT c) {
00039 RVEC x(n, 0);
00040 x[c] = 1;
00041 return x;
00042 }
00043
00044 inline void normalize (RVEC& v, REAL thr = 0) {
00045 REAL s = DOTPROD(v, v);
00046 assert(s > 0);
00047 if (s <= thr) return;
00048 s = 1 / std::sqrt(s);
00049 for (RVEC::iterator p = v.begin(); p != v.end(); ++p)
00050 *p *= s;
00051 }
00052
00053 RMAT randrot (UINT n, bool bias_row = false, bool conjugate = true) {
00054 RMAT m(n);
00055 if (bias_row)
00056 m.back() = coorvec(n, n-1);
00057
00058 for (UINT i = 0; i+bias_row < n; ++i) {
00059 m[i] = randvec(n);
00060 if (bias_row) m[i][n-1] = 0;
00061 if (!conjugate) continue;
00062
00063
00064 for (UINT k = 0; k < i; ++k) {
00065 REAL t = DOTPROD(m[i], m[k]);
00066 for (UINT j = 0; j < n; ++j)
00067 m[i][j] -= t * m[k][j];
00068 }
00069
00070 normalize(m[i]);
00071 }
00072 return m;
00073 }
00074
00075 namespace lemga {
00076
00079 void update_wgt (RVEC& wgt, const RVEC& dir, const RMAT& X, const RVEC& y) {
00080 const UINT dim = wgt.size();
00081 assert(dim == dir.size());
00082 const UINT n_samples = X.size();
00083 assert(n_samples == y.size());
00084
00085 RVEC xn(n_samples), yn(n_samples);
00086 #if STUMP_DOUBLE_DIRECTIONS
00087 REAL bias = 0;
00088 #endif
00089 for (UINT i = 0; i < n_samples; ++i) {
00090 assert(X[i].size() == dim);
00091 const REAL x_d = DOTPROD(X[i], dir);
00092 REAL x_new = 0, y_new = 0;
00093 if (x_d != 0) {
00094 y_new = (x_d > 0)? y[i] : -y[i];
00095 REAL x_w = DOTPROD(X[i], wgt);
00096 x_new = x_w / x_d;
00097 }
00098 #if STUMP_DOUBLE_DIRECTIONS
00099 else
00100 bias += (DOTPROD(X[i], wgt) > 0)? y[i] : -y[i];
00101 #endif
00102 xn[i] = x_new; yn[i] = y_new;
00103 }
00104
00105 #if STUMP_DOUBLE_DIRECTIONS
00106 REAL th1, th2;
00107 bool ir, pos;
00108 Stump::train_1d(xn, yn, bias, ir, pos, th1, th2);
00109 const REAL w_d_new = -(th1 + th2) / 2;
00110 #else
00111 const REAL w_d_new = - Stump::train_1d(xn, yn);
00112 #endif
00113
00114 for (UINT j = 0; j < dim; ++j)
00115 wgt[j] += w_d_new * dir[j];
00116
00117 #if STUMP_DOUBLE_DIRECTIONS
00118 if (!pos)
00119 for (UINT j = 0; j < dim; ++j)
00120 wgt[j] = -wgt[j];
00121 #endif
00122
00123
00124 normalize(wgt, 1e10);
00125 }
00126
00127 void dset_extract (const pDataSet& ptd, RMAT& X, RVEC& y) {
00128 UINT n = ptd->size();
00129 X.resize(n); y.resize(n);
00130 for (UINT i = 0; i < n; ++i) {
00131 X[i] = ptd->x(i);
00132 X[i].push_back(1);
00133 y[i] = ptd->y(i)[0];
00134 }
00135 }
00136
00137 inline void dset_mult_wgt (const pDataWgt& ptw, RVEC& y) {
00138 UINT n = y.size();
00139 assert(ptw->size() == n);
00140 for (UINT i = 0; i < n; ++i)
00141 y[i] *= (*ptw)[i];
00142 }
00143
00144 Perceptron::Perceptron (UINT n_in)
00145 : LearnModel(n_in, 1), wgt(n_in+1,0), resample(0),
00146 train_method(RCD_BIAS), learn_rate(0.002), min_cst(0),
00147 max_run(1000), with_fld(false), fixed_bias(false)
00148 {
00149 }
00150
00151 Perceptron::Perceptron (const SVM& s)
00152 : LearnModel(s), wgt(_n_in+1,0), resample(0),
00153 train_method(RCD_BIAS), learn_rate(0.002), min_cst(0),
00154 max_run(1000), with_fld(false), fixed_bias(false)
00155 {
00156 assert(s.kernel().id() == kernel::Linear().id());
00157
00158 const UINT nsv = s.n_support_vectors();
00159 for (UINT i = 0; i < nsv; ++i) {
00160 const Input sv = s.support_vector(i);
00161 const REAL coef = s.support_vector_coef(i);
00162 assert(sv.size() == _n_in);
00163 for (UINT k = 0; k < _n_in; ++k)
00164 wgt[k] += coef * sv[k];
00165 }
00166 wgt.back() = s.bias();
00167 }
00168
00169 bool Perceptron::serialize (std::ostream& os, ver_list& vl) const {
00170 SERIALIZE_PARENT(LearnModel, os, vl, 1);
00171 for (UINT i = 0; i <= _n_in; ++i)
00172 if (!(os << wgt[i] << ' ')) return false;
00173 return (os << '\n');
00174 }
00175
00176 bool Perceptron::unserialize (std::istream& is, ver_list& vl, const id_t& d) {
00177 if (d != id() && d != NIL_ID) return false;
00178 UNSERIALIZE_PARENT(LearnModel, is, vl, 1, v);
00179
00180 wgt.resize(_n_in+1);
00181 for (UINT i = 0; i <= _n_in; ++i)
00182 if (!(is >> wgt[i])) return false;
00183 return true;
00184 }
00185
00186 void Perceptron::set_weight (const WEIGHT& w) {
00187 assert(w.size() > 1);
00188 set_dimensions(w.size()-1, 1);
00189 wgt = w;
00190 }
00191
00192 void Perceptron::initialize () {
00193 if (ptd != 0) set_dimensions(*ptd);
00194 assert(_n_in > 0);
00195 wgt.resize(_n_in + 1);
00196 for (UINT i = 0; i < _n_in + (fixed_bias? 0:1); ++i)
00197 wgt[i] = randu() * 0.1 - 0.05;
00198 }
00199
00200 Perceptron::WEIGHT Perceptron::fld () const {
00201 assert(ptd != 0 && ptw != 0);
00202
00203
00204 Input m1(_n_in, 0), m2(_n_in, 0);
00205 REAL w1 = 0, w2 = 0;
00206 for (UINT i = 0; i < n_samples; ++i) {
00207 const Input& x = ptd->x(i);
00208 REAL y = ptd->y(i)[0];
00209 REAL w = (*ptw)[i];
00210
00211 if (y > 0) {
00212 w1 += w;
00213 for (UINT k = 0; k < _n_in; ++k)
00214 m1[k] += w * x[k];
00215 } else {
00216 w2 += w;
00217 for (UINT k = 0; k < _n_in; ++k)
00218 m2[k] += w * x[k];
00219 }
00220 }
00221 assert(w1 > 0 && w2 > 0 && std::fabs(w1+w2-1) < EPSILON);
00222 for (UINT k = 0; k < _n_in; ++k) {
00223 m1[k] /= w1; m2[k] /= w2;
00224 }
00225
00226
00227 RMAT A(_n_in, RVEC(_n_in, 0));
00228 RVEC diff(_n_in);
00229 for (UINT i = 0; i < n_samples; ++i) {
00230 const Input& x = ptd->x(i);
00231 REAL y = ptd->y(i)[0];
00232 REAL w = (*ptw)[i];
00233
00234 if (y > 0)
00235 for (UINT k = 0; k < _n_in; ++k)
00236 diff[k] = x[k] - m1[k];
00237 else
00238 for (UINT k = 0; k < _n_in; ++k)
00239 diff[k] = x[k] - m2[k];
00240
00241 for (UINT j = 0; j < _n_in; ++j)
00242 for (UINT k = j; k < _n_in; ++k)
00243 A[j][k] += w * diff[j] * diff[k];
00244 }
00245
00246 for (UINT k = 0; k < _n_in; ++k)
00247 diff[k] = m1[k] - m2[k];
00248
00249 RVEC w;
00250 bool not_reg = true;
00251 while (!ldivide(A, diff, w)) {
00252 assert(not_reg); not_reg = false;
00253 REAL tr = 0;
00254 for (UINT j = 0; j < _n_in; ++j)
00255 tr += A[j][j];
00256 const REAL gamma = 1e-10;
00257 const REAL adj = gamma/(1-gamma) * tr/_n_in;
00258 for (UINT j = 0; j < _n_in; ++j)
00259 A[j][j] += adj;
00260 #if VERBOSE_OUTPUT
00261 std::cerr << "LDA: The class covariance matrix estimate is "
00262 "regularized by\n\tan eigenvalue shinkage parameter "
00263 << gamma << '\n';
00264 #endif
00265 }
00266
00267 w.push_back(- (DOTPROD(m1,w) * w2 + DOTPROD(m2,w) * w1));
00268 return w;
00269 }
00270
00271 inline UINT randcdf (REAL r, const RVEC& cdf) {
00272 UINT b = 0, e = cdf.size()-1;
00273 while (b+1 < e) {
00274 UINT m = (b + e) / 2;
00275 if (r < cdf[m]) e = m;
00276 else b = m;
00277 }
00278 return b;
00279 }
00280
00281 void Perceptron::train () {
00282 assert(ptd != 0 && ptw != 0 && ptd->size() == n_samples);
00283 set_dimensions(*ptd);
00284 const UINT dim = _n_in+1;
00285
00286 const UINT udim = _n_in + (fixed_bias? 0:1);
00287
00288 RVEC cdf;
00289 if (resample) {
00290 cdf.resize(n_samples+1);
00291 cdf[0] = 0;
00292 for (UINT i = 0; i < n_samples; ++i)
00293 cdf[i+1] = cdf[i] + (*ptw)[i];
00294 assert(cdf[n_samples]-1 > -EPSILON && cdf[n_samples]-1 < EPSILON);
00295 }
00296
00297 wgt.resize(dim);
00298 const REAL bias_save = wgt.back();
00299 if (with_fld) wgt = fld();
00300 if (fixed_bias)
00301 wgt.back() = bias_save;
00302
00303 RMAT X; RVEC Y;
00304 dset_extract(ptd, X, Y);
00305
00306 #define RAND_IDX() (resample? randcdf(randu(),cdf) : UINT(randu()*n_samples))
00307 #define SAMPWGT(i) (resample? 1 : (*ptw)[i]*n_samples)
00308 #define GET_XYO(i) \
00309 const Input& x = X[i]; \
00310 const REAL y = Y[i]; \
00311 const REAL o = DOTPROD(wgt,x)
00312
00313 log_error(0);
00314 switch (train_method) {
00315 case PERCEPTRON:
00316 case ADALINE:
00317 for (UINT i = 0; i < max_run; ++i) {
00318 for (UINT j = 0; j < n_samples; ++j) {
00319 const UINT idx = RAND_IDX();
00320 GET_XYO(idx);
00321 if (y * o > 0) continue;
00322 REAL deriv = (train_method == PERCEPTRON? y : (y - o));
00323 REAL adj = learn_rate * SAMPWGT(idx) * deriv;
00324
00325 for (UINT k = 0; k < udim; ++k)
00326 wgt[k] += adj * x[k];
00327 }
00328 log_error(i+1);
00329 }
00330 break;
00331
00332 case POCKET_RATCHET:
00333 case POCKET: {
00334 bool ratchet = (train_method == POCKET_RATCHET);
00335 RVEC best_w(wgt);
00336 REAL run = 0, err = train_c_error();
00337 bool err_valid = true;
00338 REAL best_run = run, best_err = err;
00339 for (UINT i = 0; i < max_run; ++i) {
00340 for (UINT j = 0; j < n_samples; ++j) {
00341 const UINT idx = RAND_IDX();
00342 GET_XYO(idx);
00343
00344 if (y * o > 0) {
00345 run += SAMPWGT(idx);
00346 if (run > best_run) {
00347 if (!err_valid) err = train_c_error();
00348 err_valid = true;
00349 if (!ratchet || err < best_err) {
00350 best_run = run;
00351 best_err = err;
00352 best_w = wgt;
00353 }
00354 if (err <= 0) break;
00355 }
00356 } else {
00357 run = 0;
00358 err_valid = false;
00359
00360 const REAL adj = SAMPWGT(idx) * y;
00361 for (UINT k = 0; k < udim; ++k)
00362 wgt[k] += adj * x[k];
00363 }
00364 }
00365 wgt.swap(best_w); log_error(i+1, best_err); wgt.swap(best_w);
00366 }
00367 wgt.swap(best_w);
00368 }
00369 break;
00370
00371 case AVE_PERCEPTRON_RAND:
00372 case AVE_PERCEPTRON: {
00373 assert(train_method != AVE_PERCEPTRON || !resample);
00374 RVEC ave_wgt(dim, 0);
00375 REAL run = 0;
00376 for (UINT i = 0; i < max_run; ++i) {
00377 for (UINT j = 0; j < n_samples; ++j) {
00378 const UINT idx = (train_method == AVE_PERCEPTRON)?
00379 j : RAND_IDX();
00380 GET_XYO(idx);
00381 if (y * o > 0)
00382 run += SAMPWGT(idx);
00383 else {
00384 for (UINT k = 0; k < dim; ++k)
00385 ave_wgt[k] += run * wgt[k];
00386
00387 const REAL adj = SAMPWGT(idx) * y;
00388 for (UINT k = 0; k < udim; ++k)
00389 wgt[k] += adj * x[k];
00390 run = SAMPWGT(idx);
00391 }
00392 }
00393 RVEC tmp_wgt(ave_wgt);
00394 for (UINT k = 0; k < dim; ++k)
00395 tmp_wgt[k] += run * wgt[k];
00396 wgt.swap(tmp_wgt); log_error(i+1); wgt.swap(tmp_wgt);
00397 }
00398 for (UINT k = 0; k < dim; ++k)
00399 wgt[k] = ave_wgt[k] + run * wgt[k];
00400 }
00401 break;
00402
00403 case ROMMA_AGG_RAND:
00404 case ROMMA_AGG:
00405 case ROMMA_RAND:
00406 case ROMMA: {
00407 bool fixed = (train_method == ROMMA || train_method == ROMMA_AGG);
00408 assert(!fixed || !resample);
00409 REAL bnd = (train_method == ROMMA || train_method == ROMMA_RAND)?
00410 0 : (1-EPSILON);
00411 for (UINT i = 0; i < max_run; ++i) {
00412 for (UINT j = 0; j < n_samples; ++j) {
00413 const UINT idx = fixed? j : RAND_IDX();
00414 GET_XYO(idx); const REAL& w_x = o;
00415 if (y * w_x > bnd) continue;
00416
00417 REAL w_w = DOTPROD(wgt, wgt);
00418 REAL x_x = 1 + DOTPROD(x, x);
00419 REAL x2w2 = x_x * w_w;
00420 REAL deno = x2w2 - w_x*w_x;
00421 REAL c = (x2w2 - y*w_x) / deno;
00422 REAL d = w_w * (y - w_x) / deno;
00423
00424 wgt[0] = c*wgt[0] + d;
00425 for (UINT k = 0; k < _n_in; ++k)
00426 wgt[k+1] = c*wgt[k+1] + d*x[k];
00427 }
00428 log_error(i+1);
00429 }
00430 }
00431 break;
00432
00433 case SGD_HINGE:
00434 case SGD_MLSE: {
00435 const REAL C = 0;
00436
00437 for (UINT i = 0; i < max_run; ++i) {
00438 for (UINT j = 0; j < n_samples; ++j) {
00439 const UINT idx = RAND_IDX();
00440 GET_XYO(idx);
00441
00442 if (y*o < 1) {
00443 REAL shrink = 1 - C * learn_rate;
00444 REAL deriv = (train_method == SGD_HINGE? y : (y - o));
00445 REAL adj = learn_rate * SAMPWGT(idx) * deriv;
00446 for (UINT k = 0; k < udim; ++k)
00447 wgt[k] = shrink * wgt[k] + adj * x[k];
00448 }
00449 }
00450 log_error(i+1);
00451 }
00452 }
00453 break;
00454
00455 #undef RAND_IDX
00456 #undef SAMPWGT
00457 #define CYCLE(r) (((r)+dim-1) % udim)
00458 #define UPDATE_WGT(d) update_wgt(wgt, d, X, Y)
00459 #define INIT_RCD() { \
00460 dset_mult_wgt(ptw, Y); \
00461 }
00462
00463 case COORDINATE_DESCENT: {
00464 INIT_RCD();
00465 for (UINT r = 0; r < max_run; ++r) {
00466 UPDATE_WGT(coorvec(dim, CYCLE(r)));
00467 log_error(r+1);
00468 }
00469 }
00470 break;
00471
00472 case FIXED_RCD:
00473 case FIXED_RCD_CONJ:
00474 case FIXED_RCD_BIAS:
00475 case FIXED_RCD_CONJ_BIAS: {
00476 bool bias_row = (train_method == FIXED_RCD_BIAS ||
00477 train_method == FIXED_RCD_CONJ_BIAS);
00478 bool conjugate = (train_method == FIXED_RCD_CONJ ||
00479 train_method == FIXED_RCD_CONJ_BIAS);
00480 RMAT A = randrot(dim, bias_row || fixed_bias, conjugate);
00481
00482 INIT_RCD();
00483 for (UINT r = 0; r < max_run; ++r) {
00484 UPDATE_WGT(A[CYCLE(r)]);
00485 log_error(r+1);
00486 }
00487 }
00488 break;
00489
00490 case RCD:
00491 case RCD_CONJ:
00492 case RCD_BIAS:
00493 case RCD_CONJ_BIAS: {
00494 bool bias_row = (train_method == RCD_BIAS ||
00495 train_method == RCD_CONJ_BIAS);
00496 bool conjugate = (train_method == RCD_CONJ ||
00497 train_method == RCD_CONJ_BIAS);
00498 RMAT A;
00499
00500 INIT_RCD();
00501 for (UINT r = 0; r < max_run; ++r) {
00502 const UINT c = CYCLE(r);
00503 if (c == CYCLE(0))
00504 A = randrot(dim, bias_row || fixed_bias, conjugate);
00505 UPDATE_WGT(A[c]);
00506 log_error(r+1);
00507 }
00508 }
00509 break;
00510
00511 case RCD_GRAD_BATCH_RAND:
00512 case RCD_GRAD_BATCH:
00513 case RCD_GRAD_RAND:
00514 case RCD_GRAD: {
00515 bool online = (train_method == RCD_GRAD ||
00516 train_method == RCD_GRAD_RAND);
00517 bool wrand = (train_method == RCD_GRAD_RAND ||
00518 train_method == RCD_GRAD_BATCH_RAND);
00519
00520 INIT_RCD();
00521 for (UINT r = 0; r < max_run; ++r) {
00522 RVEC dir(dim, 0);
00523 if (r % 5 == 0 && wrand) {
00524 dir = randvec(dim);
00525 } else if (online) {
00526 UINT idx, cnt = 0;
00527 REAL o;
00528 do {
00529 ++cnt;
00530 idx = UINT(randu() * n_samples);
00531 o = DOTPROD(wgt, X[idx]);
00532 } while (Y[idx] * o > 0 && cnt < 2*n_samples);
00533
00534 dir = X[idx];
00535 } else {
00536 bool no_err = true;
00537 for (UINT j = 0; j < n_samples; ++j) {
00538 GET_XYO(j);
00539 if (y * o > 0) continue;
00540 no_err = false;
00541 for (UINT k = 0; k < udim; ++k)
00542 dir[k] += y * x[k];
00543 }
00544 if (no_err) break;
00545 }
00546
00547 if (fixed_bias)
00548 dir.back() = 0;
00549 UPDATE_WGT(dir);
00550 log_error(r+1);
00551 }
00552 }
00553 break;
00554
00555 case RCD_GRAD_MIXED_BATCH_INITRAND:
00556 case RCD_GRAD_MIXED_BATCH:
00557 case RCD_GRAD_MIXED_INITRAND:
00558 case RCD_GRAD_MIXED: {
00559 bool online = (train_method == RCD_GRAD_MIXED ||
00560 train_method == RCD_GRAD_MIXED_INITRAND);
00561 bool init_rand = (train_method == RCD_GRAD_MIXED_INITRAND ||
00562 train_method == RCD_GRAD_MIXED_BATCH_INITRAND);
00563
00564 INIT_RCD();
00565 for (UINT r = 0; r < max_run; ++r) {
00566 RVEC dir(dim, 0);
00567 if (init_rand)
00568 dir = randvec(dim);
00569 UINT cnt = 0;
00570
00571 for (UINT j = 0; j < n_samples; ++j) {
00572 UINT idx = (online? UINT(randu() * n_samples) : j);
00573 GET_XYO(idx);
00574 if (y * o > 0) continue;
00575 ++cnt;
00576 REAL adj = y*n_samples * randu();
00577 for (UINT k = 0; k < udim; ++k)
00578 dir[k] += adj * x[k];
00579 }
00580
00581
00582 if (cnt == 0 && !init_rand)
00583 dir = randvec(dim);
00584 if (fixed_bias)
00585 dir.back() = 0;
00586 UPDATE_WGT(dir);
00587 log_error(r+1);
00588 }
00589 }
00590 break;
00591
00592 case RCD_MIXED: {
00593 INIT_RCD();
00594 RMAT A;
00595 for (UINT r = 0; r < max_run; ++r) {
00596 UINT c = r % (2*udim);
00597 RVEC dir;
00598 if (c < udim)
00599 dir = coorvec(dim, CYCLE(c));
00600 else {
00601 if (c == udim) A = randrot(dim, fixed_bias, false);
00602 dir = A[c-udim];
00603 }
00604 UPDATE_WGT(dir);
00605 log_error(r+1);
00606 }
00607 }
00608 break;
00609
00610 default:
00611 assert(false);
00612 }
00613
00614 assert(!fixed_bias || train_method == AVE_PERCEPTRON_RAND ||
00615 train_method == AVE_PERCEPTRON || wgt.back() == bias_save);
00616 }
00617
00618 #define INPUT_SUM(w,x) \
00619 std::inner_product(x.begin(), x.end(), w.begin(), w.back())
00620
00621 Output Perceptron::operator() (const Input& x) const {
00622 assert(x.size() == n_input());
00623 REAL sum = INPUT_SUM(wgt, x);
00624 return Output(1, (sum>=0)? 1 : -1);
00625 }
00626
00627 REAL Perceptron::margin_of (const Input& x, const Output& y) const {
00628 assert(std::fabs(std::fabs(y[0]) - 1) < INFINITESIMAL);
00629 return INPUT_SUM(wgt, x) * y[0];
00630 }
00631
00632 REAL Perceptron::w_norm () const {
00633 REAL s = DOTPROD_NB(wgt, wgt);
00634 return std::sqrt(s);
00635 }
00636
00637 void Perceptron::log_error (UINT, REAL err) const {
00638 if (logf != NULL) {
00639 if (err < 0) err = train_c_error();
00640 fprintf(logf, "%g ", err);
00641 }
00642 }
00643
00644 }