1 /*M///////////////////////////////////////////////////////////////////////////////////////
\r
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
\r
5 // By downloading, copying, installing or using the software you agree to this license.
\r
6 // If you do not agree to this license, do not download, install,
\r
7 // copy or use the software.
\r
10 // Intel License Agreement
\r
12 // Copyright (C) 2000, Intel Corporation, all rights reserved.
\r
13 // Third party copyrights are property of their respective owners.
\r
15 // Redistribution and use in source and binary forms, with or without modification,
\r
16 // are permitted provided that the following conditions are met:
\r
18 // * Redistribution's of source code must retain the above copyright notice,
\r
19 // this list of conditions and the following disclaimer.
\r
21 // * Redistribution's in binary form must reproduce the above copyright notice,
\r
22 // this list of conditions and the following disclaimer in the documentation
\r
23 // and/or other materials provided with the distribution.
\r
25 // * The name of Intel Corporation may not be used to endorse or promote products
\r
26 // derived from this software without specific prior written permission.
\r
28 // This software is provided by the copyright holders and contributors "as is" and
\r
29 // any express or implied warranties, including, but not limited to, the implied
\r
30 // warranties of merchantability and fitness for a particular purpose are disclaimed.
\r
31 // In no event shall the Intel Corporation or contributors be liable for any direct,
\r
32 // indirect, incidental, special, exemplary, or consequential damages
\r
33 // (including, but not limited to, procurement of substitute goods or services;
\r
34 // loss of use, data, or profits; or business interruption) however caused
\r
35 // and on any theory of liability, whether in contract, strict liability,
\r
36 // or tort (including negligence or otherwise) arising in any way out of
\r
37 // the use of this software, even if advised of the possibility of such damage.
\r
// Sentinel written into the data in place of a value that is missing from the input.
44 #define MISS_VAL FLT_MAX
\r
// Type code reported by str_to_flt_elem() for a missing value. Because it is 0,
// OR-ing it into a column's accumulated type in read_csv() leaves that type unchanged.
45 #define CV_VAR_MISS 0
\r
// Default split description: count mode with a train sample count of -1.
// CvMLData::set_train_test_split() maps a non-positive count to "use every sample".
47 CvTrainTestSplit :: CvTrainTestSplit()
\r
49 train_sample_part_mode = CV_COUNT;
\r
50 train_sample_part.count = -1;
\r
// Builds a split description that requests an absolute number of training samples.
//   _train_sample_count  number of samples to train on; CvMLData::set_train_test_split()
//                        rejects values above the sample count and treats values <= 0
//                        as "use every sample".
//   _mix                 NOTE(review): presumably enables shuffling of the sample order - confirm.
55 CvTrainTestSplit :: CvTrainTestSplit( int _train_sample_count, bool _mix )
\r
57 train_sample_part_mode = CV_COUNT;
\r
58 train_sample_part.count = _train_sample_count;
\r
// Builds a split description that requests a fraction of the samples for training.
//   _train_sample_portion  fraction of samples to train on; CvMLData::set_train_test_split()
//                          rejects values above 1 and treats values within FLT_EPSILON of
//                          0 or 1 as "use every sample".
//   _mix                   NOTE(review): presumably enables shuffling of the sample order - confirm.
63 CvTrainTestSplit :: CvTrainTestSplit( float _train_sample_portion, bool _mix )
\r
65 train_sample_part_mode = CV_PORTION;
\r
66 train_sample_part.portion = _train_sample_portion;
\r
// Constructs an empty data set: no matrices, no train/test index lists, an empty
// label-to-index map and a random generator seeded from the tick counter.
73 CvMLData :: CvMLData()
\r
// All matrix pointers start out null (nothing has been loaded yet).
75 values = missing = var_types = var_idx_mask = response_out = var_idx_out = var_types_out = 0;
\r
76 train_sample_idx = test_sample_idx = 0;
\r
// -1 is the same "no split configured" value that clear() restores.
80 train_sample_count = -1;
\r
84 //flt_separator = '.';
\r
// Maps each class-label string to its integer index; filled by str_to_flt_elem().
// NOTE(review): allocated with a raw new - confirm the destructor deletes it.
86 class_map = new map<string, int>();
\r
// Seeds the generator used by mix_train_and_test_idx() from the negated tick count.
87 rng = cvRNG( -cvGetTickCount() );
\r
// Destructor. NOTE(review): expected to release the matrices and the heap-allocated
// class_map created in the constructor - confirm.
90 CvMLData :: ~CvMLData()
\r
// Releases the train and test sample index matrices.
// NOTE(review): set_train_test_split() builds both as headers over a single
// cvAlloc'ed sample_idx buffer - verify that the buffer itself is freed as well.
96 void CvMLData :: free_train_test_idx()
\r
98 cvReleaseMat( &train_sample_idx );
\r
99 cvReleaseMat( &test_sample_idx );
\r
// Drops all loaded data and every piece of state derived from it.
103 void CvMLData :: clear()
\r
// Forget every class label seen so far.
105 if ( !class_map->empty() )
\r
106 class_map->clear();
\r
// Release the matrices filled by read_csv().
108 cvReleaseMat( &values );
\r
109 cvReleaseMat( &missing );
\r
110 cvReleaseMat( &var_types );
\r
111 cvReleaseMat( &var_idx_mask );
\r
// Release the cached results built by get_responses(), get_var_idx() and get_var_types().
113 cvReleaseMat( &response_out );
\r
114 cvReleaseMat( &var_idx_out );
\r
115 cvReleaseMat( &var_types_out );
\r
// Release the train/test index lists.
117 free_train_test_idx();
\r
// Class indices are handed out from this counter, so numbering restarts after a clear.
119 total_class_count = 0;
\r
// Same "no split configured" value the constructor uses.
123 train_sample_count = -1;
\r
// Loads a delimiter-separated text file into the `values` matrix (one row per line,
// CV_32FC1), recording per-column variable types and a mask of missing entries.
//   filename  path of the file to read; opened in text mode.
// Returns an int status. NOTE(review): the status codes are not documented here - confirm.
//
// Parsing outline: the first line fixes the column layout, every line is tokenized with
// strtok() and converted by str_to_flt_elem(), rows are collected in a temporary CvSeq,
// and finally copied into freshly created matrices.
// NOTE(review): strtok() keeps hidden static state, so this routine is not safe to run
// concurrently with other strtok() users.
126 int CvMLData :: read_csv(const char* filename)
\r
// Maximum number of characters fgets() reads per line.
// NOTE(review): longer lines would be split across reads - confirm this limit is acceptable.
128 const int M = 10000;
\r
// Token separators handed to strtok(): a space and the configured delimiter.
129 const char str_delimiter[3] = { ' ', delimiter, '\0' };
\r
131 CvMemStorage* storage;
\r
136 CvSeqReader reader;
\r
137 int cols_count = 0;
\r
138 uchar *var_types_ptr = 0;
\r
// NOTE(review): confirm the FILE* is closed on every exit path.
142 file = fopen( filename, "rt" );
\r
147 // read the first line and determine the number of variables
\r
149 if( !fgets( buf, M, file ))
\r
// Count the delimiter characters in the first line.
154 for( ptr = buf; *ptr != '\0'; ptr++ )
\r
155 cols_count += (*ptr == delimiter);
\r
// A first line without any delimiter is not treated as tabular data.
157 if ( cols_count == 0)
\r
164 // create temporary memory storage to store the whole database
\r
// Scratch row that receives the converted values of one line.
// NOTE(review): allocated with new[] - confirm it is released with delete[].
165 el_ptr = new float[cols_count];
\r
166 storage = cvCreateMemStorage();
\r
// Each sequence element is one full row of cols_count floats.
167 seq = cvCreateSeq( 0, sizeof(*seq), cols_count*sizeof(float), storage );
\r
// Per-column type flags start at 0 and are OR-ed with the type of every parsed token.
169 var_types = cvCreateMat( 1, cols_count, CV_8U );
\r
170 cvZero( var_types );
\r
171 var_types_ptr = var_types->data.ptr;
\r
175 char *token = NULL;
\r
177 token = strtok(buf, str_delimiter);
\r
// Convert all but the last column, advancing to the next token after each one.
183 for (int i = 0; i < cols_count-1; i++)
\r
186 str_to_flt_elem( token, el_ptr[i], type);
\r
187 var_types_ptr[i] |= type;
\r
188 token = strtok(NULL, str_delimiter);
\r
// The last column is converted separately, without fetching another token afterwards.
195 str_to_flt_elem( token, el_ptr[cols_count-1], type);
\r
196 var_types_ptr[cols_count-1] |= type;
\r
// Append the finished row to the temporary sequence.
197 cvSeqPush( seq, el_ptr );
\r
// Reading stops at end of file or at the first line that contains no delimiter.
198 if( !fgets( buf, M, file ) || !strchr( buf, delimiter ) )
\r
// Create the final matrices now that the row count is known.
203 values = cvCreateMat( seq->total, cols_count, CV_32FC1 );
\r
204 missing = cvCreateMat( seq->total, cols_count, CV_8U );
\r
// Every variable is active by default.
205 var_idx_mask = cvCreateMat( 1, values->cols, CV_8UC1 );
\r
206 cvSet( var_idx_mask, cvRealScalar(1) );
\r
// Until a split is configured, all samples count as training samples.
207 train_sample_count = seq->total;
\r
// Copy the collected rows into `values` and flag the MISS_VAL entries in `missing`.
209 cvStartReadSeq( seq, &reader );
\r
210 for(int i = 0; i < seq->total; i++ )
\r
212 const float* sdata = (float*)reader.ptr;
\r
213 float* ddata = values->data.fl + cols_count*i;
\r
214 uchar* dm = missing->data.ptr + cols_count*i;
\r
216 for( int j = 0; j < cols_count; j++ )
\r
218 ddata[j] = sdata[j];
\r
// 1 where the stored value is the MISS_VAL sentinel, 0 otherwise.
219 dm[j] = ( fabs( MISS_VAL - sdata[j] ) <= FLT_EPSILON );
\r
221 CV_NEXT_SEQ_ELEM( seq->elem_size, reader );
\r
// With no missing entries at all, the mask is dropped and `missing` becomes null.
224 if ( cvNorm( missing, 0, CV_L1 ) <= FLT_EPSILON )
\r
225 cvReleaseMat( &missing );
\r
// The temporary sequence lives in this storage and is released with it.
227 cvReleaseMemStorage( &storage );
\r
// Converts one text token into a float value and reports what kind of value it was.
//   token     text of a single field.
//   flt_elem  receives the parsed number, MISS_VAL for a missing value, or the class
//             index for a non-numeric label.
//   type      receives CV_VAR_ORDERED, CV_VAR_MISS or CV_VAR_CATEGORICAL.
233 void CvMLData :: str_to_flt_elem( const char* token, float& flt_elem, int& type)
\r
236 char* stopstring = NULL;
\r
// strtod() leaves stopstring at the first character it could not parse as a number.
237 flt_elem = (float)strtod( token, &stopstring );
\r
238 assert( stopstring );
\r
// A plain number is an ordered value unless one of the cases below applies.
239 type = CV_VAR_ORDERED;
\r
// The unparsed remainder is exactly the missing-value character.
240 if ( *stopstring == miss_ch && strlen(stopstring) == 1 ) // missed value
\r
242 flt_elem = MISS_VAL;
\r
243 type = CV_VAR_MISS;
\r
// Any other remainder besides a line ending means the token is a class label.
247 if ( (*stopstring != 0) && (*stopstring != '\n') && (strcmp(stopstring, "\r\n") != 0) ) // class label
\r
// operator[] inserts a zero entry for an unseen label, so 0 can act as "not seen yet".
// NOTE(review): the new-index assignment below is presumably guarded by idx == 0 - confirm.
249 int idx = (*class_map)[token];
\r
// Class indices are handed out sequentially, starting at 1.
252 total_class_count++;
\r
253 idx = total_class_count;
\r
254 (*class_map)[token] = idx;
\r
// The label is stored in the data as its class index.
256 flt_elem = (float)idx;
\r
257 type = CV_VAR_CATEGORICAL;
\r
// Validates a new field delimiter character for read_csv().
//   ch  candidate delimiter; must differ from the missing-value character.
// Raises CV_StsBadArg when ch equals miss_ch.
// NOTE(review): ch is presumably stored as the delimiter after this check - confirm.
262 void CvMLData :: set_delimiter(char ch)
\r
// Name reported in error messages; it now matches the actual method name.
264 CV_FUNCNAME( "CvMLData :: set_delimiter" );
\r
267 if (ch == miss_ch /*|| ch == flt_separator*/)
\r
268 CV_ERROR(CV_StsBadArg, "delimited, miss_character and flt_separator must be different");
\r
// Validates a new missing-value marker character.
//   ch  candidate marker; must differ from the field delimiter.
// Raises CV_StsBadArg when ch equals the delimiter.
// NOTE(review): ch is presumably stored as miss_ch after this check - confirm.
275 void CvMLData :: set_miss_ch(char ch)
\r
277 CV_FUNCNAME( "CvMLData :: set_miss_ch" );
\r
280 if (ch == delimiter/* || ch == flt_separator*/)
\r
281 CV_ERROR(CV_StsBadArg, "delimited, miss_character and flt_separator must be different");
\r
// Selects which column of `values` is the response (target) variable.
//   idx  column index of the response.
// Raises CV_StsInternal ("data is empty") and CV_StsBadArg when idx is not below
// the column count.
// NOTE(review): the "data is empty" error is presumably guarded by a check on `values`,
// and the handling of a negative idx is not visible here - confirm both.
288 void CvMLData :: set_response_idx( int idx )
\r
290 CV_FUNCNAME( "CvMLData :: set_response_idx" );
\r
294 CV_ERROR( CV_StsInternal, "data is empty" );
\r
296 if ( idx >= values->cols)
\r
297 CV_ERROR( CV_StsBadArg, "idx value is not correct" );
\r
// The previous response column becomes an ordinary active variable again.
299 if ( response_idx >= 0 )
\r
300 chahge_var_idx( response_idx, true );
\r
// The new response column is removed from the set of active variables.
304 response_idx = idx;
\r
305 chahge_var_idx( response_idx, false );
\r
// Overrides the type of a single variable.
//   var_idx  column index, 0 <= var_idx < number of columns.
//   type     CV_VAR_ORDERED or CV_VAR_CATEGORICAL.
// Raises CV_StsInternal ("data is empty") and CV_StsBadArg for a bad index, an unknown
// type, or an attempt to turn a categorical variable into an ordered one.
// NOTE(review): the "data is empty" error is presumably guarded by a check on `values` - confirm.
311 void CvMLData :: change_var_type( int var_idx, int type )
\r
313 CV_FUNCNAME( "CvMLData :: change_var_type" );
\r
319 CV_ERROR( CV_StsInternal, "data is empty" );
\r
321 var_count = values->cols;
\r
323 if ( var_idx < 0 || var_idx >= var_count)
\r
324 CV_ERROR( CV_StsBadArg, "var_idx is not correct" );
\r
326 if ( type != CV_VAR_ORDERED && type != CV_VAR_CATEGORICAL)
\r
327 CV_ERROR( CV_StsBadArg, "type is not correct" );
\r
329 assert( var_types );
\r
// Categorical columns hold class indices assigned by str_to_flt_elem(), so they
// may not be relabelled as ordered.
330 if ( var_types->data.ptr[var_idx] == CV_VAR_CATEGORICAL && type == CV_VAR_ORDERED)
\r
331 CV_ERROR( CV_StsBadArg, "it`s impossible to assign CV_VAR_ORDERED type to categorical variable" );
\r
332 var_types->data.ptr[var_idx] = (uchar)type;
\r
// Sets the types of all variables from a textual description.
//   str  either "ord" or "cat" alone (every variable gets that type), or "ord[...]"
//        and/or "cat[...]" sections, each holding a comma-separated list of single
//        indices and "first-last" ranges.
// Raises CV_StsInternal ("data is empty") and CV_StsBadArg for a malformed string, for
// an attempt to mark a categorical variable as ordered, or when the number of
// variables named in the string differs from the number of columns.
// NOTE(review): the "data is empty" error is presumably guarded by a check on `values` - confirm.
// NOTE(review): the parsed indices b1 / b2 are used to index var_types with no range
// check visible here - confirm they are validated against var_count.
339 void CvMLData :: set_var_types( const char* str )
\r
341 CV_FUNCNAME( "CvMLData :: set_var_types" );
\r
344 const char* ord = 0, *cat = 0;
\r
// set_var_type_count tallies how many variables the string assigns a type to.
345 int var_count = 0, set_var_type_count = 0;
\r
347 CV_ERROR( CV_StsInternal, "data is empty" );
\r
349 var_count = values->cols;
\r
351 assert( var_types );
\r
// Locate the first occurrence of each section keyword.
353 ord = strstr( str, "ord" );
\r
354 cat = strstr( str, "cat" );
\r
355 if ( !ord && !cat )
\r
356 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
// Shorthand forms: the bare keyword applies one type to every variable.
358 if ( !ord && strlen(cat) == 3 ) // str == "cat"
\r
360 cvSet( var_types, cvScalarAll(CV_VAR_CATEGORICAL) );
\r
364 if ( !cat && strlen(ord) == 3 ) // str == "ord"
\r
366 cvSet( var_types, cvScalarAll(CV_VAR_ORDERED) );
\r
370 if ( ord ) // parse ord str
\r
372 char* stopstring = NULL;
\r
// The keyword must be followed directly by an opening bracket.
373 if ( ord[3] != '[')
\r
374 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
376 ord += 4; // pass "ord["
\r
// Read one index; the character after it must be ',', ']' or '-'.
379 int b1 = (int)strtod( ord, &stopstring );
\r
380 if ( *stopstring == 0 || (*stopstring != ',' && *stopstring != ']' && *stopstring != '-') )
\r
381 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
382 ord = stopstring + 1;
\r
// Single index.
383 if ( (stopstring[0] == ',') || (stopstring[0] == ']'))
\r
385 if ( var_types->data.ptr[b1] == CV_VAR_CATEGORICAL)
\r
386 CV_ERROR( CV_StsBadArg, "it`s impossible to assign CV_VAR_ORDERED type to categorical variable" );
\r
387 var_types->data.ptr[b1] = CV_VAR_ORDERED;
\r
388 set_var_type_count++;
\r
// Range "b1-b2", inclusive at both ends.
392 if ( stopstring[0] == '-')
\r
394 int b2 = (int)strtod( ord, &stopstring);
\r
395 if ( (*stopstring == 0) || (*stopstring != ',' && *stopstring != ']') )
\r
396 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
397 ord = stopstring + 1;
\r
398 for (int i = b1; i <= b2; i++)
\r
400 if ( var_types->data.ptr[i] == CV_VAR_CATEGORICAL)
\r
401 CV_ERROR( CV_StsBadArg, "it`s impossible to assign CV_VAR_ORDERED type to categorical variable" );
\r
402 var_types->data.ptr[i] = CV_VAR_ORDERED;
\r
404 set_var_type_count += b2 - b1 + 1;
\r
407 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
// Keep reading list items until the closing bracket has been consumed.
411 while (*stopstring != ']');
\r
// After the closing bracket only the end of the string or a ',' is accepted.
413 if ( stopstring[1] != '\0' && stopstring[1] != ',')
\r
414 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
// The "cat[...]" section is parsed the same way, without the categorical-to-ordered
// restriction.
417 if ( cat ) // parse cat str
\r
419 char* stopstring = NULL;
\r
420 if ( cat[3] != '[')
\r
421 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
423 cat += 4; // pass "cat["
\r
426 int b1 = (int)strtod( cat, &stopstring );
\r
427 if ( *stopstring == 0 || (*stopstring != ',' && *stopstring != ']' && *stopstring != '-') )
\r
428 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
429 cat = stopstring + 1;
\r
// Single index.
430 if ( (stopstring[0] == ',') || (stopstring[0] == ']'))
\r
432 var_types->data.ptr[b1] = CV_VAR_CATEGORICAL;
\r
433 set_var_type_count++;
\r
// Range "b1-b2", inclusive at both ends.
437 if ( stopstring[0] == '-')
\r
439 int b2 = (int)strtod( cat, &stopstring);
\r
440 if ( (*stopstring == 0) || (*stopstring != ',' && *stopstring != ']') )
\r
441 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
442 cat = stopstring + 1;
\r
443 for (int i = b1; i <= b2; i++)
\r
444 var_types->data.ptr[i] = CV_VAR_CATEGORICAL;
\r
445 set_var_type_count += b2 - b1 + 1;
\r
448 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
452 while (*stopstring != ']');
\r
454 if ( stopstring[1] != '\0' && stopstring[1] != ',')
\r
455 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
// The string must account for every column exactly; the tally is a plain count, so
// it does not detect an index that is listed twice.
458 if (set_var_type_count != var_count)
\r
459 CV_ERROR( CV_StsBadArg, "types string is not correct" );
\r
// Returns the types of the active variables, with the response variable's type
// appended as the last element when a response is set.
// Raises CV_StsInternal ("data is empty").
// NOTE(review): the "data is empty" error is presumably guarded by a check on `values` - confirm.
464 const CvMat* CvMLData :: get_var_types()
\r
466 CV_FUNCNAME( "CvMLData :: get_var_types" );
\r
469 uchar *var_types_out_ptr = 0;
\r
470 int avcount, vt_size;
\r
472 CV_ERROR( CV_StsInternal, "data is empty" );
\r
474 assert( var_idx_mask );
\r
// The mask holds 0/1 flags, so its L1 norm is the number of active variables.
476 avcount = cvFloor( cvNorm( var_idx_mask, 0, CV_L1 ) );
\r
// One extra slot for the response type when a response column is set.
477 vt_size = avcount + (response_idx >= 0);
\r
// Case where the output order would equal the stored order: every variable active,
// or all but a response that is already the last column.
// NOTE(review): presumably var_types is returned as-is in this case - confirm.
479 if ( avcount == values->cols || (avcount == values->cols-1 && response_idx == values->cols-1) )
\r
// (Re)allocate the cached output only when its size no longer fits.
482 if ( !var_types_out || ( var_types_out && var_types_out->cols != vt_size ) )
\r
484 cvReleaseMat( &var_types_out );
\r
485 var_types_out = cvCreateMat( 1, vt_size, CV_8UC1 );
\r
// Copy the types of the active, non-response variables in column order.
488 var_types_out_ptr = var_types_out->data.ptr;
\r
489 for( int i = 0; i < var_types->cols; i++)
\r
491 if (i == response_idx || !var_idx_mask->data.ptr[i]) continue;
\r
492 *var_types_out_ptr = var_types->data.ptr[i];
\r
493 var_types_out_ptr++;
\r
// The response type goes last.
495 if ( response_idx >= 0 )
\r
496 *var_types_out_ptr = var_types->data.ptr[response_idx];
\r
500 return var_types_out;
\r
// Returns the response column of `values` as a rows x 1 CV_32FC1 matrix header.
// Raises CV_StsInternal ("data is empty").
// NOTE(review): the "data is empty" error is presumably guarded by a check on `values`,
// and the outcome for an unset or out-of-range response_idx is not visible here - confirm.
503 const CvMat* CvMLData :: get_responses()
\r
// Name reported in error messages; it now matches the actual method name.
505 CV_FUNCNAME( "CvMLData :: get_responses" );
\r
511 CV_ERROR( CV_StsInternal, "data is empty" );
\r
512 var_count = values->cols;
\r
514 if ( response_idx < 0 || response_idx >= var_count )
\r
// response_out is only a header: it is created once and then pointed at the column.
516 if ( !response_out )
\r
517 response_out = cvCreateMatHeader( values->rows, 1, CV_32FC1 );
\r
519 cvInitMatHeader( response_out, values->rows, 1, CV_32FC1);
\r
// Makes response_out refer to column response_idx of `values`.
520 cvGetCol( values, response_out, response_idx );
\r
524 return response_out;
\r
// Divides the loaded samples into a training part and a test part.
//   spl  split description: either an absolute training sample count (CV_COUNT mode)
//        or a fraction of the samples (portion mode).
// Raises CV_StsBadArg when spl->class_part is set (not supported), when the requested
// count exceeds the number of samples, or when the requested portion exceeds 1.
// Raises CV_StsInternal ("data is empty").
// NOTE(review): the "data is empty" error is presumably guarded by a check on `values` - confirm.
527 void CvMLData :: set_train_test_split( const CvTrainTestSplit * spl)
\r
// Name reported in error messages; it now matches the actual method name.
529 CV_FUNCNAME( "CvMLData :: set_train_test_split" );
\r
532 int sample_count = 0;
\r
534 if ( spl->class_part )
\r
535 CV_ERROR( CV_StsBadArg, "this division type is not supported yet" );
\r
538 CV_ERROR( CV_StsInternal, "data is empty" );
\r
540 sample_count = values->rows;
\r
542 float train_sample_portion;
\r
544 if (spl->train_sample_part_mode == CV_COUNT)
\r
546 train_sample_count = spl->train_sample_part.count;
\r
547 if (train_sample_count > sample_count)
\r
548 CV_ERROR( CV_StsBadArg, "train samples count is not correct" );
\r
// A non-positive count means "train on every sample".
549 train_sample_count = train_sample_count<=0 ? sample_count : train_sample_count;
\r
551 else // dtype.train_sample_part_mode == CV_PORTION
\r
553 train_sample_portion = spl->train_sample_part.portion;
\r
554 if ( train_sample_portion > 1)
\r
555 CV_ERROR( CV_StsBadArg, "train samples count is not correct" );
\r
// Portions within FLT_EPSILON of 0 or 1 are treated as "train on every sample".
556 train_sample_portion = train_sample_portion <= FLT_EPSILON ||
\r
557 1 - train_sample_portion <= FLT_EPSILON ? 1 : train_sample_portion;
\r
558 train_sample_count = cvFloor( train_sample_portion * sample_count );
\r
// With every sample used for training no index lists are needed.
561 if ( train_sample_count == sample_count )
\r
563 free_train_test_idx();
\r
// Index lists of a different size cannot be reused.
567 if ( train_sample_idx && train_sample_idx->cols != train_sample_count )
\r
568 free_train_test_idx();
\r
572 int test_sample_count = sample_count- train_sample_count;
\r
// A single buffer holds all sample indices; the two matrices below are headers over
// its first train_sample_count entries and the remaining entries.
573 sample_idx = (int*)cvAlloc( sample_count * sizeof(sample_idx[0]) );
\r
574 for (int i = 0; i < sample_count; i++ )
\r
576 train_sample_idx = cvCreateMatHeader( 1, train_sample_count, CV_32SC1 );
\r
577 test_sample_idx = cvCreateMatHeader( 1, test_sample_count, CV_32SC1 );
\r
578 *train_sample_idx = cvMat( 1, train_sample_count, CV_32SC1, &sample_idx[0] );
\r
579 *test_sample_idx = cvMat( 1, test_sample_count, CV_32SC1, &sample_idx[train_sample_count] );
\r
// NOTE(review): presumably executed only when spl->mix is set - confirm.
584 mix_train_and_test_idx();
\r
// Shuffles the shared sample index buffer, which changes which samples fall into the
// training and the test part. Does nothing without data or without an index buffer.
589 void CvMLData :: mix_train_and_test_idx()
\r
591 if ( !values || !sample_idx) return;
\r
// Only meaningful when a real split exists (some, but not all, samples are for training).
593 if ( train_sample_count > 0 && train_sample_count < values->rows )
\r
595 int n = values->rows;
\r
// n swaps of two randomly chosen positions.
596 for (int i = 0; i < n; i++)
\r
598 int a = cvRandInt( &rng ) % n;
\r
599 int b = cvRandInt( &rng ) % n;
\r
601 CV_SWAP( sample_idx[a], sample_idx[b], t );
\r
// Returns the indices of the active variables as a 1 x avcount CV_32SC1 matrix.
// Raises CV_StsInternal ("data is empty").
// NOTE(review): the "data is empty" error is presumably guarded by a check on `values` - confirm.
606 const CvMat* CvMLData :: get_var_idx()
\r
608 CV_FUNCNAME( "CvMLData :: get_var_idx" );
\r
614 CV_ERROR( CV_StsInternal, "data is empty" );
\r
616 assert( var_idx_mask );
\r
// The mask holds 0/1 flags, so its L1 norm is the number of active variables.
618 avcount = cvFloor( cvNorm( var_idx_mask, 0, CV_L1 ) );
\r
// NOTE(review): with every variable active, presumably no index list is built - confirm.
621 if ( avcount == values->cols )
\r
// (Re)allocate the cached output only when its size no longer fits.
624 if ( !var_idx_out || ( var_idx_out && var_idx_out->cols != avcount ) )
\r
626 cvReleaseMat( &var_idx_out );
\r
627 var_idx_out = cvCreateMat( 1, avcount, CV_32SC1);
\r
// NOTE(review): this accessor writes to var_idx_mask after avcount was computed above;
// if the response column were still flagged active, fewer than avcount indices
// would be written below - confirm the mask is always cleared beforehand.
628 if ( response_idx >=0 )
\r
629 var_idx_mask->data.ptr[response_idx] = 0;
\r
632 vidx = var_idx_out->data.i;
\r
// Visit every column whose mask flag is set.
634 for(int i = 0; i < var_idx_mask->cols; i++)
\r
635 if ( var_idx_mask->data.ptr[i] )
\r
643 return var_idx_out;
\r
// Marks one variable as active or inactive in var_idx_mask.
//   vi     column index, 0 <= vi < number of columns.
//   state  true to make the variable active, false to exclude it.
// Raises CV_StsInternal ("data is empty") and CV_StsBadArg for an index out of range.
// NOTE(review): the "data is empty" error is presumably guarded by a check on `values` - confirm.
646 void CvMLData :: chahge_var_idx( int vi, bool state )
\r
// Name reported in error messages; it previously named a different function.
648 CV_FUNCNAME( "CvMLData :: chahge_var_idx" );
\r
654 CV_ERROR( CV_StsInternal, "data is empty" );
\r
656 var_count = values->cols;
\r
658 if ( vi < 0 || vi >= var_count)
\r
659 CV_ERROR( CV_StsBadArg, "variable index is not correct" );
\r
661 assert( var_idx_mask );
\r
// The bool is stored as a 0/1 byte, which keeps the mask's L1 norm equal to the
// number of active variables.
662 var_idx_mask->data.ptr[vi] = state;
\r