00001
00005 #include <assert.h>
00006 #include <algorithm>
00007 #include <cmath>
00008 #include "random.h"
00009 #include "datafeeder.h"
00010
00011 namespace lemga {
00012
00013 DataFeeder::DataFeeder (const pDataSet& pd)
00014 : dat(pd), perms(0), _do_normalize(MIN_MAX), tr_size(0), tr_flip(0) {
00015 fsize = dat->size();
00016 }
00017
00018 DataFeeder::DataFeeder (std::istream& ds)
00019 : perms(0), _do_normalize(MIN_MAX), tr_size(0), tr_flip(0) {
00020
00021 dat = load_data(ds, (1L<<30)-1);
00022 fsize = dat->size();
00023 }
00024
00025 void DataFeeder::set_train_size (UINT trn) {
00026 assert(trn < fsize);
00027 tr_size = trn;
00028 }
00029
00030 void DataFeeder::set_train_noise (REAL p) {
00031 assert(p >= 0 && p <= 1);
00032 tr_flip = p;
00033 }
00034
00035 bool DataFeeder::next_train_test (pDataSet& ptr, pDataSet& pte) const {
00036 DataSet *p_tr = new DataSet();
00037 DataSet *p_te = new DataSet();
00038
00039 std::vector<UINT> perm;
00040 if (!next_permutation(perm)) return false;
00041
00042 for (UINT i = 0; i < tr_size; ++i) {
00043 bool flip = (tr_flip > 0) && (randu() < tr_flip);
00044 if (!flip)
00045 p_tr->append(dat->x(perm[i]), dat->y(perm[i]));
00046 else {
00047 const Output& y = dat->y(perm[i]);
00048 assert(y.size() == 1 &&
00049 std::fabs(std::fabs(y[0]) - 1) < INFINITESIMAL);
00050 p_tr->append(dat->x(perm[i]), Output(1, -y[0]));
00051 }
00052 }
00053 for (UINT i = tr_size; i < fsize; ++i)
00054 p_te->append(dat->x(perm[i]), dat->y(perm[i]));
00055
00056 if (_do_normalize != NONE) {
00057 LINEAR_SCALE_PARAMS lsp;
00058 switch (_do_normalize) {
00059 case MIN_MAX: lsp = min_max(*p_tr); break;
00060 case MEAN_VAR: lsp = mean_var(*p_tr); break;
00061 default: assert(false);
00062 }
00063 linear_scale(*p_tr, lsp);
00064 linear_scale(*p_te, lsp);
00065 }
00066
00067 ptr = p_tr; pte = p_te;
00068 return true;
00069 }
00070
00071 bool DataFeeder::next_permutation (std::vector<UINT>& perm) const {
00072 perm.resize(fsize);
00073
00074 if (perms == 0) {
00075 for (UINT i = 0; i < fsize; ++i)
00076 perm[i] = i;
00077 std::random_shuffle(perm.begin(), perm.end());
00078 return true;
00079 }
00080
00081 std::vector<bool> visited(fsize, false);
00082 for (UINT i = 0; i < fsize; ++i) {
00083 UINT idx;
00084 if (!((*perms) >> idx)) {
00085 if (i) std::cerr << "DataFeeder: "
00086 "Permutation stream ends prematurely\n";
00087 return false;
00088 }
00089 if (idx >= fsize || visited[idx]) {
00090 std::cerr << "DataFeeder: "
00091 "Permutation stream has errors\n";
00092 return false;
00093 }
00094 visited[idx] = true;
00095 perm[i] = idx;
00096 }
00097 return true;
00098 }
00099
00100 DataFeeder::LINEAR_SCALE_PARAMS DataFeeder::min_max (DataSet& d) {
00101 assert(d.size() > 0);
00102
00103 const Input& x0 = d.x(0);
00104 const UINT ls = x0.size();
00105 std::vector<REAL> dmin(x0), dmax(x0);
00106 for (UINT i = 1; i < d.size(); ++i) {
00107 const Input& x = d.x(i);
00108 for (UINT j = 0; j < ls; ++j) {
00109 if (dmin[j] > x[j])
00110 dmin[j] = x[j];
00111 else if (dmax[j] < x[j])
00112 dmax[j] = x[j];
00113 }
00114 }
00115
00116 LINEAR_SCALE_PARAMS l(ls);
00117 for (UINT j = 0; j < ls; ++j) {
00118 l[j].center = (dmin[j] + dmax[j]) / 2;
00119 if (dmin[j] != dmax[j])
00120 l[j].scale = 2 / (dmax[j] - dmin[j]);
00121 else
00122 l[j].scale = 0;
00123 }
00124 return l;
00125 }
00126
00127 DataFeeder::LINEAR_SCALE_PARAMS DataFeeder::mean_var (DataSet& d) {
00128 const UINT n = d.size();
00129 assert(n > 0);
00130 const UINT ls = d.x(0).size();
00131
00132 std::vector<REAL> sum1(ls, 0), sum2(ls, 0);
00133 for (UINT i = 0; i < n; ++i) {
00134 const Input& x = d.x(i);
00135 for (UINT j = 0; j < ls; ++j) {
00136 sum1[j] += x[j];
00137 sum2[j] += x[j] * x[j];
00138 }
00139 }
00140
00141 LINEAR_SCALE_PARAMS l(ls);
00142 for (UINT j = 0; j < ls; ++j) {
00143 l[j].center = sum1[j] / n;
00144 REAL n_1_var = sum2[j] - sum1[j] * l[j].center;
00145 if (n_1_var > INFINITESIMAL)
00146 l[j].scale = std::sqrt((n-1) / n_1_var);
00147 else
00148 l[j].scale = 0;
00149 }
00150 return l;
00151 }
00152
00153 void DataFeeder::linear_scale (DataSet& d, const LINEAR_SCALE_PARAMS& l) {
00154 const UINT ls = l.size();
00155 for (UINT i = 0; i < d.size(); ++i) {
00156 Input x = d.x(i);
00157 assert(x.size() == ls);
00158 for (UINT j = 0; j < ls; ++j)
00159 x[j] = (x[j] - l[j].center) * l[j].scale;
00160 d.replace(i, x, d.y(i));
00161 }
00162 }
00163
00164 }