@@ -120,22 +120,14 @@ FCANN<ACTF, ACTFLast>::FCANN(
 			// we have a fully connected layer
 			m_params.weightMasks.push_back(NNMatrix::Ones(in_size, out_size));
 		}
-
-		m_params.outputBiasLastUpdate.push_back(NNVector::Zero(out_size));
-		m_params.weightsLastUpdate.push_back(NNMatrix::Zero(in_size, out_size));
-
-		m_params.outputBiasEg2.push_back(NNVector::Zero(out_size));
-		m_params.weightsEg2.push_back(NNMatrix::Zero(in_size, out_size));
-
-		m_params.outputBiasRMSd2.push_back(NNVector::Zero(out_size));
-		m_params.weightsRMSd2.push_back(NNMatrix::Zero(in_size, out_size));
 	}
 
 	m_params.evalTmp.resize(hiddenLayers.size() + 2);
 	m_params.evalSingleTmp.resize(hiddenLayers.size() + 2);
 
 	UpdateWeightMasksRegions_();
 	UpdateWeightSemiSparse_();
+	InitializeOptimizationState_();
 }
 
 template <ActivationFunc ACTF, ActivationFunc ACTFLast>
@@ -400,20 +392,21 @@ float FCANN<ACTF, ACTFLast>::TrainGDM(const MatrixBase<Derived1> &x, const Matri
 }
 
 template <ActivationFunc ACTF, ActivationFunc ACTFLast>
-void FCANN<ACTF, ACTFLast>::ApplyWeightUpdates(const Gradients &grad, float learningRate, float reg)
+void FCANN<ACTF, ACTFLast>::ApplyWeightUpdates(const Gradients &grad, float /*learningRate*/, float reg)
 {
 	assert(grad.weightGradients.size() == m_params.weights.size());
 	assert(grad.biasGradients.size() == m_params.outputBias.size());
 	assert(grad.weightGradients.size() == grad.biasGradients.size());
 
+	/* // for SGD + M
 	m_params.weightsLastUpdate.resize(m_params.weights.size());
 	m_params.outputBiasLastUpdate.resize(m_params.outputBias.size());
+	*/
 
-	m_params.weightsEg2.resize(m_params.weights.size());
-	m_params.outputBiasEg2.resize(m_params.outputBias.size());
-
-	m_params.weightsRMSd2.resize(m_params.weights.size());
-	m_params.outputBiasRMSd2.resize(m_params.outputBias.size());
+	if (m_params.weightsEg2.size() != m_params.weights.size())
+	{
+		InitializeOptimizationState_();
+	}
 
 	for (size_t layer = 0; layer < m_params.weights.size(); ++layer)
 	{
@@ -484,8 +477,8 @@ void FCANN<ACTF, ACTFLast>::ApplyWeightUpdates(const Gradients &grad, float lear
 #endif
 
 		// update Eg2 (ADADELTA)
-		float decay = 0.99f;
-		float e = 1e-8f;
+		float decay = 0.95f;
+		float e = 1e-6f;
 		weightsEg2Block.array() *= decay;
 		weightsEg2Block.array() += (weightsGradientsBlock.array() * weightsGradientsBlock.array()) * (1.0f - decay);
 		biasEg2Block.array() *= decay;
@@ -498,9 +491,9 @@ void FCANN<ACTF, ACTFLast>::ApplyWeightUpdates(const Gradients &grad, float lear
 		// NNMatrix weightDelta = -weightsGradientsBlock.array() * learningRate /*+ weightReg.array()*/;
 		// NNVector biasDelta = -biasGradientsBlock.array() * learningRate;
 
-		weightsBlock += weightDelta * learningRate;
+		weightsBlock += weightDelta;
 		weightsBlock.array() *= weightMaskBlock.array();
-		biasBlock += biasDelta * learningRate;
+		biasBlock += biasDelta;
 
 		FP weightMax = std::max(std::max(weightsBlock.maxCoeff(), -weightsBlock.minCoeff()), std::max(biasBlock.maxCoeff(), -biasBlock.minCoeff()));
 		if (weightMax > MAX_WEIGHT)
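
The two hunks above follow the standard ADADELTA rule (Zeiler, 2012): Eg2 holds a decayed running average of squared gradients, RMSd2 a decayed running average of squared parameter deltas, and each delta is scaled by sqrt(RMSd2 + e) / sqrt(Eg2 + e), which is why the global learningRate no longer enters the weight update. The delta computation itself is not visible in this diff, so the following is a minimal, self-contained scalar sketch of that rule, using the same decay and epsilon as above but illustrative variable names and a toy objective f(w) = w^2; it is not the FCANN code itself.

// Minimal scalar ADADELTA sketch (illustrative names, toy objective).
#include <cmath>
#include <cstdio>

int main()
{
	const float decay = 0.95f;   // same decay as the hunk above
	const float e     = 1e-6f;   // same epsilon as the hunk above

	float eg2   = 0.0f;          // running average of squared gradients (Eg2)
	float rmsd2 = 0.0f;          // running average of squared deltas (RMSd2)
	float w     = 0.5f;          // a single weight

	for (int step = 0; step < 5; ++step)
	{
		float g = 2.0f * w;      // gradient of the toy objective f(w) = w^2

		// accumulate squared gradient
		eg2 = decay * eg2 + (1.0f - decay) * g * g;

		// ADADELTA step: no global learning rate; the ratio of the two
		// running RMS values sets the per-coefficient step size
		float delta = -std::sqrt(rmsd2 + e) / std::sqrt(eg2 + e) * g;

		// accumulate squared delta
		rmsd2 = decay * rmsd2 + (1.0f - decay) * delta * delta;

		w += delta;              // corresponds to "weightsBlock += weightDelta"
		std::printf("step %d: w = %f\n", step, w);
	}

	return 0;
}

With the optimization state initialized to zero, the first steps are on the order of sqrt(e), which is the usual ADADELTA warm-up behaviour.
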
@@ -779,6 +772,27 @@ void FCANN<ACTF, ACTFLast>::UpdateWeightSemiSparse_()
 	m_params.weightsSemiSparseCurrent = true;
 }
 
+
+template <ActivationFunc ACTF, ActivationFunc ACTFLast>
+void FCANN<ACTF, ACTFLast>::InitializeOptimizationState_()
+{
+	m_params.weightsEg2.resize(m_params.weights.size());
+	m_params.outputBiasEg2.resize(m_params.outputBias.size());
+
+	m_params.weightsRMSd2.resize(m_params.weights.size());
+	m_params.outputBiasRMSd2.resize(m_params.outputBias.size());
+
+	for (size_t i = 0; i < m_params.weights.size(); ++i)
+	{
+		m_params.outputBiasEg2[i] = NNVector::Zero(m_params.outputBias[i].cols());
+		m_params.weightsEg2[i] = NNMatrix::Zero(m_params.weights[i].rows(), m_params.weights[i].cols());
+
+		m_params.outputBiasRMSd2[i] = NNVector::Zero(m_params.outputBias[i].cols());
+		m_params.weightsRMSd2[i] = NNMatrix::Zero(m_params.weights[i].rows(), m_params.weights[i].cols());
+	}
+}
+
+
 /* serialization format:
  * numLayers
  * for each layer: