Feat/adam optimizer (#140)
nathanielsimard authored Dec 30, 2022
1 parent 248039d commit eea5a26
Showing 17 changed files with 405 additions and 28 deletions.
26 changes: 26 additions & 0 deletions burn-autodiff/src/ops/tensor.rs
@@ -1062,6 +1062,32 @@ impl<B: Backend> TensorOps<ADBackendDecorator<B>> for ADBackendDecorator<B> {
unary_ops_wrapper(tensor.node.clone(), output, ops)
}

fn sqrt<const D: usize>(
tensor: &<ADBackendDecorator<B> as Backend>::TensorPrimitive<D>,
) -> <ADBackendDecorator<B> as Backend>::TensorPrimitive<D> {
#[derive(new, Debug)]
struct Backward<B: Backend, const D: usize> {
_b: B,
}

impl<B: Backend, const D: usize> UnaryOps<B::TensorPrimitive<D>, B::TensorPrimitive<D>>
for Backward<B, D>
{
fn partial(
&self,
state: &UnaryOpsNodeState<B::TensorPrimitive<D>, B::TensorPrimitive<D>>,
) -> B::TensorPrimitive<D> {
let value = B::div_scalar(&B::powf(&state.input.value(), -0.5), &2.to_elem());
B::mul(&state.output.grad(), &value)
}
}

let output = B::sqrt(tensor.tensor_ref());
let ops = Backward::<B, D>::new(B::default());

unary_ops_wrapper(tensor.node.clone(), output, ops)
}

fn erf<const D: usize>(
tensor: &<ADBackendDecorator<B> as Backend>::TensorPrimitive<D>,
) -> <ADBackendDecorator<B> as Backend>::TensorPrimitive<D> {
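For reference, the backward step above is the usual square-root derivative combined with the chain rule (with \(\bar{y}\) the gradient arriving from the output):

\[
\frac{\partial \sqrt{x}}{\partial x} = \frac{1}{2\sqrt{x}} = \frac{x^{-1/2}}{2},
\qquad
\bar{x} = \bar{y} \odot \frac{x^{-1/2}}{2},
\]

which is exactly what the code computes: B::powf(&state.input.value(), -0.5) divided by 2 via div_scalar, then multiplied by state.output.grad().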
2 changes: 2 additions & 0 deletions burn-autodiff/src/tests/mod.rs
@@ -18,6 +18,7 @@ mod pow;
mod relu;
mod reshape;
mod softmax;
mod sqrt;
mod sub;
mod transpose;

@@ -43,6 +44,7 @@ macro_rules! testgen_all {
burn_autodiff::testgen_ad_mul!();
burn_autodiff::testgen_ad_neg!();
burn_autodiff::testgen_ad_powf!();
burn_autodiff::testgen_ad_sqrt!();
burn_autodiff::testgen_ad_relu!();
burn_autodiff::testgen_ad_reshape!();
burn_autodiff::testgen_ad_softmax!();
28 changes: 28 additions & 0 deletions burn-autodiff/src/tests/sqrt.rs
@@ -0,0 +1,28 @@
#[burn_tensor_testgen::testgen(ad_sqrt)]
mod tests {
use super::*;
use burn_tensor::Data;

#[test]
fn should_diff_sqrt() {
let data_1 = Data::<f32, 2>::from([[0.0, 1.0], [3.0, 4.0]]);
let data_2 = Data::<f32, 2>::from([[6.0, 7.0], [9.0, 10.0]]);

let tensor_1 = TestADTensor::from_data(data_1);
let tensor_2 = TestADTensor::from_data(data_2);

let tensor_3 = tensor_1.matmul(&tensor_2.sqrt());
let tensor_4 = tensor_3.matmul(&tensor_2);
let grads = tensor_4.backward();

let grad_1 = tensor_1.grad(&grads).unwrap();
let grad_2 = tensor_2.grad(&grads).unwrap();

grad_1
.to_data()
.assert_approx_eq(&Data::from([[82.1126, 99.0832], [82.1126, 99.0832]]), 3);
grad_2
.to_data()
.assert_approx_eq(&Data::from([[30.3093, 33.1204], [34.5819, 38.7694]]), 3);
}
}
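The expected gradients can be checked by hand from the matmul and sqrt backward rules. A sketch of the chain rule, assuming backward() seeds the output gradient with ones (an assumption about the test harness, consistent with the asserted values):

\[
\bar{T_4} = \mathbf{1}, \qquad
\bar{T_3} = \bar{T_4}\, T_2^{\top}, \qquad
\bar{T_1} = \bar{T_3}\, \bigl(\sqrt{T_2}\bigr)^{\top}, \qquad
\bar{T_2} = T_3^{\top}\, \bar{T_4} + \bigl(T_1^{\top}\, \bar{T_3}\bigr) \odot \frac{1}{2\sqrt{T_2}},
\]

where \(T_3 = T_1 \sqrt{T_2}\) and \(T_4 = T_3 T_2\); plugging in the data above reproduces the asserted values to the stated precision.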
8 changes: 8 additions & 0 deletions burn-ndarray/src/element.rs
@@ -9,6 +9,7 @@ pub(crate) trait ExpElement {
fn exp_elem(self) -> Self;
fn log_elem(self) -> Self;
fn pow_elem(self, value: f32) -> Self;
fn sqrt_elem(self) -> Self;
}

macro_rules! impl_exp_elem {
@@ -23,6 +24,9 @@ macro_rules! impl_exp_elem {
fn pow_elem(self, value: f32) -> Self {
$elem::powf(self, value.into())
}
fn sqrt_elem(self) -> Self {
$elem::sqrt(self)
}
}
};
($elem:ident, $tmp:ident) => {
@@ -39,6 +43,10 @@
let tmp = $tmp::powf(self as $tmp, value as $tmp);
tmp as $elem
}
fn sqrt_elem(self) -> Self {
let tmp = $tmp::sqrt(self as $tmp);
tmp as $elem
}
}
};
}
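A standalone sketch of what the two macro branches generate for a concrete float and integer element type (the trait name and the types picked here are illustrative, not the crate's actual expansion):

trait SqrtElemSketch {
    fn sqrt_elem(self) -> Self;
}

// First branch ($elem:ident): delegate to the element type's own sqrt.
impl SqrtElemSketch for f32 {
    fn sqrt_elem(self) -> Self {
        f32::sqrt(self)
    }
}

// Second branch ($elem:ident, $tmp:ident): round-trip through a wider float,
// since integer element types have no sqrt of their own.
impl SqrtElemSketch for i64 {
    fn sqrt_elem(self) -> Self {
        let tmp = f64::sqrt(self as f64);
        tmp as i64
    }
}

fn main() {
    assert_eq!(4.0f32.sqrt_elem(), 2.0);
    assert_eq!(9i64.sqrt_elem(), 3);
}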
7 changes: 7 additions & 0 deletions burn-ndarray/src/ops/tensor.rs
@@ -531,6 +531,13 @@ impl<E: NdArrayElement> TensorOps<NdArrayBackend<E>> for NdArrayBackend<E> {
NdArrayTensor { array, shape }
}

fn sqrt<const D: usize>(tensor: &NdArrayTensor<E, D>) -> NdArrayTensor<E, D> {
let array = tensor.array.mapv(|a| a.sqrt_elem()).into_shared();
let shape = tensor.shape;

NdArrayTensor { array, shape }
}

fn erf<const D: usize>(tensor: &NdArrayTensor<E, D>) -> NdArrayTensor<E, D> {
let array = tensor
.array
4 changes: 4 additions & 0 deletions burn-tch/src/ops/tensor.rs
@@ -447,6 +447,10 @@ impl<E: TchElement> TensorOps<TchBackend<E>> for TchBackend<E> {
to_tensor(tensor.tensor.pow_tensor_scalar(value as f64))
}

fn sqrt<const D: usize>(tensor: &TchTensor<E, D>) -> TchTensor<E, D> {
to_tensor(tensor.tensor.sqrt())
}

fn erf<const D: usize>(tensor: &TchTensor<E, D>) -> TchTensor<E, D> {
to_tensor(tensor.tensor.erf())
}
15 changes: 15 additions & 0 deletions burn-tensor/src/tensor/base.rs
@@ -27,6 +27,16 @@ where
}
}

impl<B> Tensor<B, 1>
where
B: Backend,
{
/// Returns the first value of the tensor.
pub fn single_value(&self) -> B::Elem {
self.to_data().value[0]
}
}

impl<const D: usize, B> Tensor<B, D>
where
B: Backend,
@@ -90,6 +100,11 @@ where
Self::new(B::powf(&self.value, value))
}

/// Applies the element-wise square root operation.
pub fn sqrt(&self) -> Self {
Self::new(B::sqrt(&self.value))
}

/// Returns the shape of the current tensor.
pub fn shape(&self) -> &Shape<D> {
B::shape(&self.value)
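A minimal usage sketch of the new public sqrt method, written in the style of the crate's own test modules (the TestADTensor alias and the expected values are assumptions, not part of this commit):

#[test]
fn sqrt_usage_sketch() {
    let tensor = TestADTensor::from_data(Data::<f32, 2>::from([[1.0, 4.0], [9.0, 16.0]]));

    // Element-wise square root of every value.
    tensor
        .sqrt()
        .to_data()
        .assert_approx_eq(&Data::from([[1.0, 2.0], [3.0, 4.0]]), 3);
}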
1 change: 1 addition & 0 deletions burn-tensor/src/tensor/ops/base.rs
@@ -226,6 +226,7 @@ pub trait TensorOps<B: Backend> {
fn exp<const D: usize>(tensor: &B::TensorPrimitive<D>) -> B::TensorPrimitive<D>;
fn log<const D: usize>(tensor: &B::TensorPrimitive<D>) -> B::TensorPrimitive<D>;
fn powf<const D: usize>(tensor: &B::TensorPrimitive<D>, value: f32) -> B::TensorPrimitive<D>;
fn sqrt<const D: usize>(tensor: &B::TensorPrimitive<D>) -> B::TensorPrimitive<D>;
fn erf<const D: usize>(tensor: &B::TensorPrimitive<D>) -> B::TensorPrimitive<D>;
fn cat<const D: usize>(tensors: &[B::TensorPrimitive<D>], dim: usize) -> B::TensorPrimitive<D>;
fn relu<const D: usize>(tensor: &B::TensorPrimitive<D>) -> B::TensorPrimitive<D>;
4 changes: 1 addition & 3 deletions burn/src/nn/layer_norm.rs
@@ -48,9 +48,7 @@ impl<B: Backend> LayerNorm<B> {
pub fn forward<const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
let (var, mean) = input.var_mean_bias(D - 1);

let input_normalized = input
.sub(&mean)
.div(&var.powf(0.5).add_scalar(self.epsilon));
let input_normalized = input.sub(&mean).div(&var.sqrt().add_scalar(self.epsilon));

input_normalized
.mul(&self.gamma.unsqueeze())
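Written out, the normalization step now reads (with epsilon added outside the square root, exactly as the code does):

\[
\hat{x} = \frac{x - \mu}{\sqrt{\sigma^{2}} + \epsilon},
\]

so replacing var.powf(0.5) with var.sqrt() expresses the same quantity through the new dedicated op; the normalized result is then scaled by gamma as shown above.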