datafeeder.cpp

Go to the documentation of this file.
00001 
00005 #include <assert.h>
00006 #include <algorithm>
00007 #include <cmath>
00008 #include "random.h"
00009 #include "datafeeder.h"
00010 
00011 namespace lemga {
00012 
00013 DataFeeder::DataFeeder (const pDataSet& pd)
00014     : dat(pd), perms(0), _do_normalize(MIN_MAX), tr_size(0), tr_flip(0) {
00015     fsize = dat->size();
00016 }
00017 
00018 DataFeeder::DataFeeder (std::istream& ds)
00019     : perms(0), _do_normalize(MIN_MAX), tr_size(0), tr_flip(0) {
00020     /* load all the data */
00021     dat = load_data(ds, (1L<<30)-1);
00022     fsize = dat->size();
00023 }
00024 
00025 void DataFeeder::set_train_size (UINT trn) {
00026     assert(trn < fsize);
00027     tr_size = trn;
00028 }
00029 
00030 void DataFeeder::set_train_noise (REAL p) {
00031     assert(p >= 0 && p <= 1);
00032     tr_flip = p;
00033 }
00034 
00035 bool DataFeeder::next_train_test (pDataSet& ptr, pDataSet& pte) const {
00036     DataSet *p_tr = new DataSet();
00037     DataSet *p_te = new DataSet();
00038 
00039     std::vector<UINT> perm;
00040     if (!next_permutation(perm)) return false;
00041 
00042     for (UINT i = 0; i < tr_size; ++i) {
00043         bool flip = (tr_flip > 0) && (randu() < tr_flip);
00044         if (!flip)
00045             p_tr->append(dat->x(perm[i]), dat->y(perm[i]));
00046         else {
00047             const Output& y = dat->y(perm[i]);
00048             assert(y.size() == 1 &&
00049                    std::fabs(std::fabs(y[0]) - 1) < INFINITESIMAL);
00050             p_tr->append(dat->x(perm[i]), Output(1, -y[0]));
00051         }
00052     }
00053     for (UINT i = tr_size; i < fsize; ++i)
00054         p_te->append(dat->x(perm[i]), dat->y(perm[i]));
00055 
00056     if (_do_normalize != NONE) {
00057         LINEAR_SCALE_PARAMS lsp;
00058         switch (_do_normalize) {
00059             case MIN_MAX:  lsp = min_max(*p_tr); break;
00060             case MEAN_VAR: lsp = mean_var(*p_tr); break;
00061             default:       assert(false);
00062         }
00063         linear_scale(*p_tr, lsp);
00064         linear_scale(*p_te, lsp);
00065     }
00066 
00067     ptr = p_tr; pte = p_te;
00068     return true;
00069 }
00070 
00071 bool DataFeeder::next_permutation (std::vector<UINT>& perm) const {
00072     perm.resize(fsize);
00073 
00074     if (perms == 0) {
00075         for (UINT i = 0; i < fsize; ++i)
00076             perm[i] = i;
00077         std::random_shuffle(perm.begin(), perm.end());
00078         return true;
00079     }
00080 
00081     std::vector<bool> visited(fsize, false);
00082     for (UINT i = 0; i < fsize; ++i) {
00083         UINT idx; // starting from 0
00084         if (!((*perms) >> idx)) {
00085             if (i) std::cerr << "DataFeeder: "
00086                 "Permutation stream ends prematurely\n";
00087             return false;
00088         }
00089         if (idx >= fsize || visited[idx]) {
00090             std::cerr << "DataFeeder: "
00091                 "Permutation stream has errors\n";
00092             return false;
00093         }
00094         visited[idx] = true;
00095         perm[i] = idx;
00096     }
00097     return true;
00098 }
00099 
00100 DataFeeder::LINEAR_SCALE_PARAMS DataFeeder::min_max (DataSet& d) {
00101     assert(d.size() > 0);
00102 
00103     const Input& x0 = d.x(0);
00104     const UINT ls = x0.size();
00105     std::vector<REAL> dmin(x0), dmax(x0);
00106     for (UINT i = 1; i < d.size(); ++i) {
00107         const Input& x = d.x(i);
00108         for (UINT j = 0; j < ls; ++j) {
00109             if (dmin[j] > x[j])
00110                 dmin[j] = x[j];
00111             else if (dmax[j] < x[j])
00112                 dmax[j] = x[j];
00113         }
00114     }
00115 
00116     LINEAR_SCALE_PARAMS l(ls);
00117     for (UINT j = 0; j < ls; ++j) {
00118         l[j].center = (dmin[j] + dmax[j]) / 2;
00119         if (dmin[j] != dmax[j])
00120             l[j].scale = 2 / (dmax[j] - dmin[j]);
00121         else
00122             l[j].scale = 0;
00123     }
00124     return l;
00125 }
00126 
00127 DataFeeder::LINEAR_SCALE_PARAMS DataFeeder::mean_var (DataSet& d) {
00128     const UINT n = d.size();
00129     assert(n > 0);
00130     const UINT ls = d.x(0).size();
00131 
00132     std::vector<REAL> sum1(ls, 0), sum2(ls, 0);
00133     for (UINT i = 0; i < n; ++i) {
00134         const Input& x = d.x(i);
00135         for (UINT j = 0; j < ls; ++j) {
00136             sum1[j] += x[j];
00137             sum2[j] += x[j] * x[j];
00138         }
00139     }
00140 
00141     LINEAR_SCALE_PARAMS l(ls);
00142     for (UINT j = 0; j < ls; ++j) {
00143         l[j].center = sum1[j] / n;
00144         REAL n_1_var = sum2[j] - sum1[j] * l[j].center;
00145         if (n_1_var > INFINITESIMAL)
00146             l[j].scale = std::sqrt((n-1) / n_1_var);
00147         else
00148             l[j].scale = 0;
00149     }
00150     return l;
00151 }
00152 
00153 void DataFeeder::linear_scale (DataSet& d, const LINEAR_SCALE_PARAMS& l) {
00154     const UINT ls = l.size();
00155     for (UINT i = 0; i < d.size(); ++i) {
00156         Input x = d.x(i);
00157         assert(x.size() == ls);
00158         for (UINT j = 0; j < ls; ++j)
00159             x[j] = (x[j] - l[j].center) * l[j].scale;
00160         d.replace(i, x, d.y(i));
00161     }
00162 }
00163 
00164 } // namespace lemga

Generated on Wed Nov 8 08:15:21 2006 for LEMGA by  doxygen 1.4.6