@@ -211,19 +211,21 @@ public static VectorBundle Load(CsvDataHolder csvData, int numOfOutputVariables)
211211 return bundle ;
212212 }
213213
214- //Methods
/// <summary>
/// Adds data from given bundle into this bundle.
/// </summary>
/// <remarks>
/// The input and output vectors of the given bundle are appended to the end of this bundle's
/// collections in their original order. The vectors are appended as they are (no copies are made here).
/// </remarks>
/// <param name="data">Data to be added</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="data"/> is null.</exception>
public void Add(VectorBundle data)
{
    //Fail fast with a meaningful exception instead of a NullReferenceException on the member access below
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }
    InputVectorCollection.AddRange(data.InputVectorCollection);
    OutputVectorCollection.AddRange(data.OutputVectorCollection);
}
224+
225+ //Methods
226+ public List < VectorBundle > Folderize ( double foldDataRatio , double binBorder = double . NaN )
225227 {
226- if ( OutputVectorCollection . Count < 2 )
228+ if ( OutputVectorCollection . Count < 2 )
227229 {
228230 throw new InvalidOperationException ( $ "Insufficient number of samples ({ OutputVectorCollection . Count . ToString ( CultureInfo . InvariantCulture ) } ).") ;
229231 }
@@ -268,11 +270,11 @@ public List<VectorBundle> CreateFolds(double foldDataRatio, double binBorder = d
268270 BinDistribution refBinDistr = new BinDistribution ( binBorder ) ;
269271 refBinDistr . Update ( OutputVectorCollection , 0 ) ;
270272 int min01 = Math . Min ( refBinDistr . NumOf [ 0 ] , refBinDistr . NumOf [ 1 ] ) ;
271- if ( min01 < 2 )
273+ if ( min01 < 2 )
272274 {
273275 throw new InvalidOperationException ( $ "Insufficient bin 0 or 1 samples (less than 2).") ;
274276 }
275- if ( numOfFolds > min01 )
277+ if ( numOfFolds > min01 )
276278 {
277279 numOfFolds = min01 ;
278280 }
@@ -334,17 +336,186 @@ public List<VectorBundle> CreateFolds(double foldDataRatio, double binBorder = d
334336
335337 return bundleCollection ;
336338 }
337-
338-
/// <summary>
/// Splits this bundle to a collection of smaller folds (sub-bundles) suitable for the cross-validation.
/// When the binBorder is specified then all output features are considered as binary
/// within the "one takes all" group and function then keeps balanced ratios of 0 and 1
/// for every output feature and fold.
/// </summary>
/// <remarks>
/// The requested ratio is capped by MaxRatioOfFoldData. The number of folds derived from the ratio
/// can be further reduced so that every fold can receive samples of each bin. Samples left over
/// after the even split are spread over the folds in round-robin order.
/// </remarks>
/// <param name="foldDataRatio">Requested ratio of the samples constituting one fold (sub-bundle).</param>
/// <param name="binBorder">When specified then method keeps balanced ratios of 0 and 1 values in each fold sub-bundle.</param>
/// <returns>Collection of created folds.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the bundle contains less than 2 samples, when the single binary output has less than
/// 2 samples in bin 0 or bin 1, or when an output feature of the "one takes all" group has no sample
/// having bin value 1.
/// </exception>
/// <exception cref="ArgumentException">
/// Thrown when binBorder is specified, there are more output features and some output vector
/// does not have exactly one feature having bin value 1.
/// </exception>
public List<VectorBundle> Folderize_new(double foldDataRatio, double binBorder = double.NaN)
{
    if (OutputVectorCollection.Count < 2)
    {
        throw new InvalidOperationException($"Insufficient number of samples ({OutputVectorCollection.Count.ToString(CultureInfo.InvariantCulture)}).");
    }
    //Fold data ratio basic correction
    if (foldDataRatio > MaxRatioOfFoldData)
    {
        foldDataRatio = MaxRatioOfFoldData;
    }
    //Preliminary fold size estimation
    int foldSize = Math.Max(1, (int)Math.Round(OutputVectorCollection.Count * foldDataRatio, 0));
    //Preliminary number of folds
    int numOfFolds = OutputVectorCollection.Count / foldSize;
    //Folds creation
    if (double.IsNaN(binBorder))
    {
        //No binary output -> simple split
        return FolderizeSequentially(numOfFolds, foldSize);
    }
    //Binary outputs -> keep balanced ratios of outputs
    if (OutputVectorCollection[0].Length == 1)
    {
        //Special case there is only one binary output
        return FolderizeSingleBinaryOutput(numOfFolds, binBorder);
    }
    //There is more than 1 binary output - "one takes all approach"
    return FolderizeOneTakesAll(numOfFolds, binBorder);
}

/// <summary>
/// Adds the sample (input and output vector) stored at the given index of this bundle into the given fold.
/// </summary>
private void AddSampleToFold(VectorBundle fold, int sampleIdx)
{
    fold.InputVectorCollection.Add(InputVectorCollection[sampleIdx]);
    fold.OutputVectorCollection.Add(OutputVectorCollection[sampleIdx]);
}

/// <summary>
/// Spreads the samples referenced by sampleIdxs from position startPos over the folds in round-robin order.
/// </summary>
private void SpreadRemainingSamples(List<VectorBundle> foldCollection, int[] sampleIdxs, int startPos)
{
    for (int i = 0; startPos + i < sampleIdxs.Length; i++)
    {
        AddSampleToFold(foldCollection[i % foldCollection.Count], sampleIdxs[startPos + i]);
    }
}

/// <summary>
/// Creates folds by cutting the samples sequentially into numOfFolds pieces of foldSize samples.
/// </summary>
private List<VectorBundle> FolderizeSequentially(int numOfFolds, int foldSize)
{
    List<VectorBundle> foldCollection = new List<VectorBundle>();
    int samplesPos = 0;
    for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
    {
        VectorBundle fold = new VectorBundle();
        for (int i = 0; i < foldSize && samplesPos < OutputVectorCollection.Count; i++)
        {
            AddSampleToFold(fold, samplesPos);
            ++samplesPos;
        }
        foldCollection.Add(fold);
    }
    //Remaining samples (round-robin)
    for (int i = 0; samplesPos + i < OutputVectorCollection.Count; i++)
    {
        AddSampleToFold(foldCollection[i % foldCollection.Count], samplesPos + i);
    }
    return foldCollection;
}

/// <summary>
/// Creates folds for the single binary output so that each fold gets the same base number of bin 0 and bin 1 samples.
/// </summary>
private List<VectorBundle> FolderizeSingleBinaryOutput(int numOfFolds, double binBorder)
{
    //Investigation of the output data metrics
    BinDistribution refBinDistr = new BinDistribution(binBorder);
    refBinDistr.Update(OutputVectorCollection, 0);
    int min01 = Math.Min(refBinDistr.NumOf[0], refBinDistr.NumOf[1]);
    if (min01 < 2)
    {
        throw new InvalidOperationException("Insufficient bin 0 or 1 samples (less than 2).");
    }
    //Every fold has to receive at least one sample of the less frequent bin
    if (numOfFolds > min01)
    {
        numOfFolds = min01;
    }
    //Scan data
    int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]];
    int bin0SamplesPos = 0;
    int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]];
    int bin1SamplesPos = 0;
    for (int i = 0; i < OutputVectorCollection.Count; i++)
    {
        if (OutputVectorCollection[i][0] >= refBinDistr.BinBorder)
        {
            bin1SampleIdxs[bin1SamplesPos++] = i;
        }
        else
        {
            bin0SampleIdxs[bin0SamplesPos++] = i;
        }
    }
    //Determine distributions of 0 and 1 for one fold
    int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfFolds);
    int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfFolds);
    //Bundles creation
    List<VectorBundle> foldCollection = new List<VectorBundle>();
    bin0SamplesPos = 0;
    bin1SamplesPos = 0;
    for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
    {
        VectorBundle fold = new VectorBundle();
        //Bin 0
        for (int i = 0; i < bundleBin0Count; i++)
        {
            AddSampleToFold(fold, bin0SampleIdxs[bin0SamplesPos++]);
        }
        //Bin 1
        for (int i = 0; i < bundleBin1Count; i++)
        {
            AddSampleToFold(fold, bin1SampleIdxs[bin1SamplesPos++]);
        }
        foldCollection.Add(fold);
    }
    //Remaining samples
    SpreadRemainingSamples(foldCollection, bin0SampleIdxs, bin0SamplesPos);
    SpreadRemainingSamples(foldCollection, bin1SampleIdxs, bin1SamplesPos);
    return foldCollection;
}

/// <summary>
/// Creates folds for more binary outputs ("one takes all" group) by spreading bin 1 samples of every output feature over the folds.
/// </summary>
private List<VectorBundle> FolderizeOneTakesAll(int numOfFolds, double binBorder)
{
    int numOfOutputs = OutputVectorCollection[0].Length;
    //Investigation of the output data metrics
    //Collect bin 1 sample indexes and check "one takes all" consistency for every output feature
    List<int>[] outBin1SampleIdxs = new List<int>[numOfOutputs];
    for (int i = 0; i < numOfOutputs; i++)
    {
        outBin1SampleIdxs[i] = new List<int>();
    }
    for (int sampleIdx = 0; sampleIdx < OutputVectorCollection.Count; sampleIdx++)
    {
        int numOf1 = 0;
        for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
        {
            if (OutputVectorCollection[sampleIdx][outFeatureIdx] >= binBorder)
            {
                outBin1SampleIdxs[outFeatureIdx].Add(sampleIdx);
                ++numOf1;
            }
        }
        if (numOf1 != 1)
        {
            throw new ArgumentException($"Data are inconsistent on data index {sampleIdx.ToString(CultureInfo.InvariantCulture)}. Output vector has {numOf1.ToString(CultureInfo.InvariantCulture)} feature(s) having bin value 1.", nameof(binBorder));
        }
    }
    //Determine max possible number of folds
    int maxNumOfFolds = OutputVectorCollection.Count;
    for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
    {
        int outFeatureMaxFolds = Math.Min(outBin1SampleIdxs[outFeatureIdx].Count, OutputVectorCollection.Count - outBin1SampleIdxs[outFeatureIdx].Count);
        maxNumOfFolds = Math.Min(outFeatureMaxFolds, maxNumOfFolds);
    }
    //An output feature without any bin 1 sample allows no fold at all. Without this check
    //no fold would be created and the distribution below would fail on division by zero.
    if (maxNumOfFolds < 1)
    {
        throw new InvalidOperationException("Insufficient bin 1 samples (at least one output feature has no sample having bin value 1).");
    }
    //Correct the number of folds to be created
    if (numOfFolds > maxNumOfFolds)
    {
        numOfFolds = maxNumOfFolds;
    }
    //Create the folds
    List<VectorBundle> foldCollection = new List<VectorBundle>();
    for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
    {
        foldCollection.Add(new VectorBundle());
    }
    //Samples distribution (round-robin, restarted from the first fold for every output feature)
    for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
    {
        for (int bin1SampleRefIdx = 0; bin1SampleRefIdx < outBin1SampleIdxs[outFeatureIdx].Count; bin1SampleRefIdx++)
        {
            int foldIdx = bin1SampleRefIdx % foldCollection.Count;
            int dataIdx = outBin1SampleIdxs[outFeatureIdx][bin1SampleRefIdx];
            foldCollection[foldIdx].AddPair(InputVectorCollection[dataIdx], OutputVectorCollection[dataIdx]);
        }
    }
    return foldCollection;
}
349520
350521 } //VectorBundle
0 commit comments