#include #include #include #include #include #include typedef boost::mt19937 random_type; random_type irnd; boost::uniform_01 uni(irnd); using namespace lemga; typedef std::pair,std::vector > PVEC; PVEC adaboost (const LearnModel& blm, const pDataSet& pd, UINT T) { const UINT n = pd->size(); std::vector cnt_misclassify(n, 0), cnt_out_boost(n, 0); std::vector wgt(n, 1); for (UINT t = 0; t < T; ++t) { // generate the training data // copied from lemga/dataset.h, random_sample(wgt, n) std::vector included(n, false); static std::vector cdf(n+1); cdf[0] = 0; for (UINT i = 0; i < n; ++i) cdf[i+1] = cdf[i]+wgt[i]; DataSet* rd = new DataSet(); for (UINT i = 0; i < n; ++i) { const REAL r = uni()*cdf[n]; UINT b = 0, e = n; while (b+1 < e) { UINT m = (b + e) / 2; if (r < cdf[m]) e = m; else b = m; } rd->append(pd->x(b), pd->y(b)); included[b] = true; } // train the base learner LearnModel *plm = blm.clone(); plm->set_train_data(rd); plm->initialize(); plm->train(); // record the errors static std::vector err(n); REAL errsum = 0; for (UINT i = 0; i < n; ++i) { err[i] = ((*plm)(pd->x(i))[0] * pd->y(i)[0] <= 0); if (err[i]) errsum += wgt[i]; if (!included[i]) { ++cnt_out_boost[i]; if (err[i]) ++cnt_misclassify[i]; } } delete plm; if (errsum == 0 || errsum*2 > cdf[n]) { std::cerr << "Early stopped after iteration " << t << " with error " << (errsum/cdf[n]*100) << "%\n"; break; } // update the weight const REAL beta = cdf[n]/errsum - 1; for (UINT i = 0; i < n; ++i) if (err[i]) wgt[i] *= beta; if (t % 20) { // normalize the weight to avoid numerical problems. REAL wsum = 0; for (UINT i = 0; i < n; ++i) wsum += wgt[i]; for (UINT i = 0; i < n; ++i) wgt[i] /= wsum; } } return std::make_pair(cnt_misclassify, cnt_out_boost); } int main (unsigned int argc, char *argv[]) { if (argc < 3) { std::cerr << "Usage: " << argv[0] << " trf in method n_agg n_run [output]\n"; return(-1); } const UINT in = atoi(argv[2]); std::ifstream fd(argv[1]); if (!fd.is_open()) { std::cerr << "Data file open error\n"; return(-2); } pDataSet dat = load_data(fd, (1L<<30)-1, in, 1); int method = 0; UINT n_agg = 1000; UINT n_run = 10; const char *outf = "misclass.dat"; if (argc > 3) method = atoi(argv[3]); if (argc > 4) n_agg = atoi(argv[4]); if (argc > 5) n_run = atoi(argv[5]); if (argc > 6) outf = argv[6]; std::cerr << n_run << " AdaBoost runs of " << n_agg << " base hypotheses.\n"; random_type::result_type rnd_seed = std::time(0); std::cerr << "seed = " << rnd_seed << std::endl; irnd.seed(rnd_seed); LearnModel *lm; if (method == 0) lm = new Stump(in); else { Stump st(in); st.set_log_file(NULL); AdaBoost* ab = new AdaBoost(false); ab->set_max_models(method); ab->set_base_model(st); lm = ab; } lm->set_log_file(NULL); const UINT n = dat->size(); std::vector mis(n, 0), out(n, 0); for (UINT i = 0; i < n_run; ++i) { PVEC mi = adaboost(*lm, dat, n_agg); for (UINT j = 0; j < n; ++j) { mis[j] += mi.first[j]; out[j] += mi.second[j]; } } delete lm; std::ofstream fm(outf); for (UINT i = 0; i < n; ++i) fm << (mis[i]/(REAL)out[i]) << ' ' << out[i] << '\n'; return(0); }