00001
00005 #include <assert.h>
00006 #include <cmath>
00007 #include <algorithm>
00008 #include <vector>
00009 #include <map>
00010 #include "stump.h"
00011
00012 REGISTER_CREATOR(lemga::Stump);
00013
00014 namespace lemga {
00015
00016 bool Stump::serialize (std::ostream& os, ver_list& vl) const {
00017 SERIALIZE_PARENT(LearnModel, os, vl, 2);
00018 return (os << idx << ' ' << bd1 << ' ' << bd2 << ' '
00019 << (dir? 'P':'N') << '\n');
00020 }
00021
00022 bool Stump::unserialize (std::istream& is, ver_list& vl, const id_t& d) {
00023 if (d != id() && d != empty_id) return false;
00024 UNSERIALIZE_PARENT(LearnModel, is, vl, 2, v);
00025
00026 if (v == 0) {
00027 if (!(is >> _n_in)) return false;
00028 assert(_n_out == 1);
00029 }
00030
00031 if (!(is >> idx >> bd1)) return false;
00032 bd2 = bd1;
00033 if (v >= 2)
00034 if (!(is >> bd2) || bd1 > bd2) return false;
00035 char c;
00036 if (!(is >> c) || (c != 'P' && c != 'N')) return false;
00037 dir = (c == 'P');
00038 return true;
00039 }
00040
00041 typedef std::map<REAL,REAL>::iterator MI;
00042
00043 REAL Stump::train_1d (const std::vector<REAL>& x, const std::vector<REAL>& yw)
00044 {
00045 const UINT N = x.size();
00046 assert(yw.size() == N);
00047
00048
00049 std::map<REAL,REAL> xy;
00050 for (UINT i = 0; i < N; ++i)
00051 xy[x[i]] += yw[i];
00052 UINT n = xy.size();
00053 xy[xy.begin()->first-2] = 1;
00054 assert(xy.size() == n+1);
00055 for (MI p, pn = xy.begin(); p = pn++, p != xy.end();)
00056 if (p->second > -INFINITESIMAL && p->second < INFINITESIMAL)
00057 xy.erase(p);
00058
00059 n = xy.size();
00060 xy[xy.rbegin()->first+2] = 1;
00061 assert(n > 0 && xy.size() == n+1);
00062 const MI xyb = xy.begin();
00063
00064 REAL minthr = 0, mine = INFINITY, e = - xyb->second;
00065 REAL cur_x = xyb->first;
00066 for (MI p = xyb; n; --n) {
00067 e += p->second;
00068 assert(p != xyb || e == 0);
00069 REAL nxt_x = (++p)->first;
00070 REAL cur_thr = (cur_x + nxt_x) / 2;
00071 cur_x = nxt_x;
00072
00073 if (e < mine) {
00074 mine = e; minthr = cur_thr;
00075 }
00076 }
00077
00078 return minthr;
00079 }
00080
00081
00082 REAL Stump::train_1d (const std::vector<REAL>& x, const std::vector<REAL>& yw,
00083 REAL yw_inf, bool& ir, bool& mind, REAL& th1, REAL& th2)
00084 {
00085 const UINT N = x.size();
00086 assert(yw.size() == N);
00087
00088
00089 std::map<REAL,REAL> xy;
00090 for (UINT i = 0; i < N; ++i)
00091 xy[x[i]] += yw[i];
00092 #ifndef NDEBUG
00093 UINT n = xy.size();
00094 #endif
00095 xy[xy.begin()->first-2] = 1;
00096 assert(xy.size() == n+1);
00097 for (MI p, pn = xy.begin(); p = pn++, p != xy.end();)
00098 if (p->second > -INFINITESIMAL && p->second < INFINITESIMAL)
00099 xy.erase(p);
00100
00101 #ifndef NDEBUG
00102
00103 bool all_the_same = true;
00104 MI pb = xy.begin(); ++pb;
00105 for (MI p = pb; p != xy.end(); ++p)
00106 if (p->second < -INFINITESIMAL) { all_the_same = false; break; }
00107
00108 if (!all_the_same) {
00109 all_the_same = true;
00110 for (MI p = pb; p != xy.end(); ++p)
00111 if (p->second > INFINITESIMAL) { all_the_same = false; break; }
00112 }
00113
00114 if (all_the_same)
00115 std::cerr << "Stump: Warning: all y's are the same.\n";
00116 #endif
00117 const MI xyb = xy.begin(), xye = xy.end();
00118
00119 REAL mine = 0, maxe = 0, e = -1;
00120 MI mint, maxt, p;
00121 for (mint = maxt = p = xyb; p != xye; ++p) {
00122 e += p->second;
00123 assert(p != xyb || e == 0);
00124 if (e < mine) {
00125 mine = e; mint = p;
00126 } else if (e > maxe) {
00127 maxe = e; maxt = p;
00128 }
00129
00130 if (mint == xyb && e == 0) mint = p;
00131 if (maxt == xyb && e == 0) maxt = p;
00132 }
00133 e = (1-e-yw_inf) / 2;
00134 mine += e;
00135 maxe = 1 - (maxe+e);
00136
00137
00138 if (std::fabs(mine - maxe) < EPSILON) {
00139 MI nxtt = mint; ++nxtt;
00140 mind = (mint != xyb && nxtt != xye);
00141 }
00142 else mind = (mine < maxe);
00143 if (!mind) {
00144 mine = maxe; mint = maxt;
00145 }
00146
00147 MI nxtt = mint; ++nxtt;
00148 ir = (mint != xyb && nxtt != xye);
00149 assert(!ir || !all_the_same);
00150 th1 = mint->first;
00151 if (ir)
00152 th2 = nxtt->first;
00153 else
00154 th2 = th1 + 2;
00155
00156 return mine;
00157 }
00158
00159
00160 REAL Stump::train () {
00161 const UINT N = n_samples;
00162 assert(ptd != 0 && ptw != 0 && ptd->size() == N);
00163 assert(n_input() > 0 && n_output() == 1);
00164
00165
00166 std::vector<REAL> yw(N);
00167 for (UINT i = 0; i < N; ++i)
00168 yw[i] = ptd->y(i)[0] * (*ptw)[i];
00169
00170 REAL minerr = 2;
00171 bool minir = true;
00172
00173 std::vector<REAL> x(N);
00174 for (UINT d = 0; d < _n_in; ++d) {
00175 for (UINT i = 0; i < N; ++i)
00176 x[i] = ptd->x(i)[d];
00177
00178 bool mind, ir;
00179 REAL th1, th2;
00180 REAL mine = train_1d(x, yw, 0, ir, mind, th1, th2);
00181
00182 if (mine < minerr) {
00183 minerr = mine;
00184 minir = ir;
00185 dir = mind; idx = d;
00186 bd1 = th1; bd2 = th2;
00187 }
00188 }
00189
00190 if (!minir)
00191 std::cerr << "Stump: Warning: threshold out of range.\n";
00192 return minerr;
00193 }
00194
00195 Output Stump::operator() (const Input& x) const {
00196 assert(idx < n_input() && x.size() == n_input());
00197 assert(bd2 >= bd1);
00198 REAL y = x[idx]*2 - (bd1 + bd2);
00199 if (hard || bd1 == bd2)
00200 y = (y < 0)? -1 : 1;
00201 else {
00202 y /= bd2 - bd1;
00203 y = (y<-1)? -1 : (y>1)? 1 : y;
00204 }
00205 return Output(1, dir? y : -y);
00206 }
00207
00208 }