From 7b01b3d4fad472e13d4ab3284ab95763f37eaf10 Mon Sep 17 00:00:00 2001
From: Yuncheng Li <raingomm@gmail.com>
Date: Tue, 15 Sep 2015 09:11:01 -0400
Subject: [PATCH 1/5] merged pull#1271; and passed the tests

---
 examples/mnist/lenet_local_solver.prototxt    |  25 +
 .../mnist/lenet_local_train_test.prototxt     | 171 +++++++
 examples/mnist/train_lenet.sh                 |   2 +-
 examples/mnist/train_lenet_local.sh           |   3 +
 .../mnist_siamese_local_solver.prototxt       |  25 +
 .../mnist_siamese_local_train_test.prototxt   | 435 ++++++++++++++++++
 examples/siamese/train_mnist_siamese_local.sh |   5 +
 include/caffe/filler.hpp                      |  25 +
 include/caffe/vision_layers.hpp               |  50 ++
 src/caffe/layers/local_layer.cpp              | 212 +++++++++
 src/caffe/layers/local_layer.cu               | 185 ++++++++
 src/caffe/proto/caffe.proto                   |  14 +-
 src/caffe/test/test_local_layer.cpp           | 121 +++++
 13 files changed, 1271 insertions(+), 2 deletions(-)
 create mode 100644 examples/mnist/lenet_local_solver.prototxt
 create mode 100644 examples/mnist/lenet_local_train_test.prototxt
 create mode 100644 examples/mnist/train_lenet_local.sh
 create mode 100644 examples/siamese/mnist_siamese_local_solver.prototxt
 create mode 100644 examples/siamese/mnist_siamese_local_train_test.prototxt
 create mode 100644 examples/siamese/train_mnist_siamese_local.sh
 create mode 100644 src/caffe/layers/local_layer.cpp
 create mode 100644 src/caffe/layers/local_layer.cu
 create mode 100644 src/caffe/test/test_local_layer.cpp

diff --git a/examples/mnist/lenet_local_solver.prototxt b/examples/mnist/lenet_local_solver.prototxt
new file mode 100644
index 00000000000..a4b02d25e7e
--- /dev/null
+++ b/examples/mnist/lenet_local_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "examples/mnist/lenet_local_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.01
+momentum: 0.9
+weight_decay: 0.0005
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.0001
+power: 0.75
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 10000
+# snapshot intermediate results (own prefix, distinct from "examples/mnist/lenet")
+snapshot: 5000
+snapshot_prefix: "examples/mnist/lenet_local"
+# solver mode: CPU or GPU
+solver_mode: GPU
diff --git a/examples/mnist/lenet_local_train_test.prototxt b/examples/mnist/lenet_local_train_test.prototxt
new file mode 100644
index 00000000000..ff88b5f4919
--- /dev/null
+++ b/examples/mnist/lenet_local_train_test.prototxt
@@ -0,0 +1,171 @@
+name: "LeNet"
+layers {
+  name: "mnist"
+  type: DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "examples/mnist/mnist_train_lmdb"
+    backend: LMDB
+    batch_size: 64
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  include: { phase: TRAIN }
+}
+layers {
+  name: "mnist"
+  type: DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "examples/mnist/mnist_test_lmdb"
+    backend: LMDB
+    batch_size: 100
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  include: { phase: TEST }
+}
+
+layers {
+  name: "conv1"
+  type: CONVOLUTION
+  bottom: "data"
+  top: "conv1"
+  blobs_lr: 1
+  blobs_lr: 2
+  convolution_param {
+    num_output: 20
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layers {
+  name: "pool1"
+  type: POOLING
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layers {
+  name: "local1"
+  type: LOCAL
+  bottom: "pool1"
+  top: "local1"
+  blobs_lr: 1
+  blobs_lr: 1
+  local_param {
+    num_output: 5
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layers {
+  name: "relu1"
+  type: RELU
+  bottom: "local1"
+  top: "local1"
+}
+layers {
+  name: "local2"
+  type: LOCAL
+  bottom: "local1"
+  top: "local2"
+  blobs_lr: 1
+  blobs_lr: 1
+  local_param {
+    num_output: 10
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layers {
+  name: "relu2"
+  type: RELU
+  bottom: "local2"
+  top: "local2"
+}
+layers {
+  name: "ip1"
+  type: INNER_PRODUCT
+  bottom: "local2"
+  top: "ip1"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 500
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layers {
+  name: "relu3"
+  type: RELU
+  bottom: "ip1"
+  top: "ip1"
+}
+layers {
+  name: "ip2"
+  type: INNER_PRODUCT
+  bottom: "ip1"
+  top: "ip2"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layers {
+  name: "accuracy"
+  type: ACCURACY
+  bottom: "ip2"
+  bottom: "label"
+  top: "accuracy"
+  include: { phase: TEST }
+}
+layers {
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "ip2"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/examples/mnist/train_lenet.sh b/examples/mnist/train_lenet.sh
index 1b6bf7d978d..1f718825ded 100755
--- a/examples/mnist/train_lenet.sh
+++ b/examples/mnist/train_lenet.sh
@@ -1,3 +1,3 @@
 #!/usr/bin/env sh
 
-./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt
+GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt
diff --git a/examples/mnist/train_lenet_local.sh b/examples/mnist/train_lenet_local.sh
new file mode 100644
index 00000000000..b9e29e5ceb8
--- /dev/null
+++ b/examples/mnist/train_lenet_local.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env sh
+# Trains the locally-connected LeNet variant; glog output goes to examples/mnist/.
+GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_local_solver.prototxt
diff --git a/examples/siamese/mnist_siamese_local_solver.prototxt b/examples/siamese/mnist_siamese_local_solver.prototxt
new file mode 100644
index 00000000000..c85bb90fc87
--- /dev/null
+++ b/examples/siamese/mnist_siamese_local_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "examples/siamese/mnist_siamese_local_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.01
+momentum: 0.9
+weight_decay: 0.0000
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.0001
+power: 0.75
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 50000
+# snapshot intermediate results (own prefix, distinct from "mnist_siamese")
+snapshot: 5000
+snapshot_prefix: "examples/siamese/mnist_siamese_local"
+# solver mode: CPU or GPU
+solver_mode: GPU
diff --git a/examples/siamese/mnist_siamese_local_train_test.prototxt b/examples/siamese/mnist_siamese_local_train_test.prototxt
new file mode 100644
index 00000000000..4212217df15
--- /dev/null
+++ b/examples/siamese/mnist_siamese_local_train_test.prototxt
@@ -0,0 +1,435 @@
+name: "mnist_siamese_train_test"
+layers {
+  name: "pair_data"
+  type: DATA
+  top: "pair_data"
+  top: "sim"
+  data_param {
+    source: "examples/siamese/mnist_siamese_train_leveldb"
+    scale: 0.00390625
+    batch_size: 64
+  }
+  include: { phase: TRAIN }
+}
+layers {
+  name: "pair_data"
+  type: DATA
+  top: "pair_data"
+  top: "sim"
+  data_param {
+    source: "examples/siamese/mnist_siamese_test_leveldb"
+    scale: 0.00390625
+    batch_size: 100
+  }
+  include: { phase: TEST }
+}
+layers {
+    name: "slice_pair"
+    type: SLICE
+    bottom: "pair_data"
+    top: "data"
+    top: "data_p"
+    slice_param {
+        slice_dim: 1
+        slice_point: 1
+    }
+}
+
+
+
+
+layers {
+  name: "conv1"
+  type: CONVOLUTION
+  bottom: "data"
+  top: "conv1"
+  blobs_lr: 1
+  blobs_lr: 2
+  convolution_param {
+    num_output: 20
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "conv1_w"
+  param: "conv1_b"
+}
+layers {
+  name: "pool1"
+  type: POOLING
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+
+
+layers {
+  name: "local1"
+  type: LOCAL
+  bottom: "pool1"
+  top: "local1"
+  blobs_lr: 1
+  blobs_lr: 2
+  local_param {
+    num_output: 5
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+  param: "local1_w"
+  param: "local1_b"
+}
+layers {
+  name: "relu1"
+  type: RELU
+  bottom: "local1"
+  top: "local1"
+}
+layers {
+  name: "local2"
+  type: LOCAL
+  bottom: "local1"
+  top: "local2"
+  blobs_lr: 1
+  blobs_lr: 2
+  local_param {
+    num_output: 10
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+  param: "local2_w"
+  param: "local2_b"
+}
+layers {
+  name: "relu2"
+  type: RELU
+  bottom: "local2"
+  top: "local2"
+}
+
+
+
+
+
+
+
+layers {
+  name: "ip1"
+  type: INNER_PRODUCT
+  bottom: "local2"
+  top: "ip1"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 500
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "ip1_w"
+  param: "ip1_b"
+}
+layers {
+  name: "relu3"
+  type: RELU
+  bottom: "ip1"
+  top: "ip1"
+}
+layers {
+  name: "ip2"
+  type: INNER_PRODUCT
+  bottom: "ip1"
+  top: "ip2"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "ip2_w"
+  param: "ip2_b"
+}
+
+layers {
+  name: "feat2"
+  type: INNER_PRODUCT
+  bottom: "ip2"
+  top: "feat2"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "feat2_w"
+  param: "feat2_b"
+}
+
+layers {
+  name: "feat1"
+  type: INNER_PRODUCT
+  bottom: "local1"
+  top: "feat1"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "feat1_w"
+  param: "feat1_b"
+}
+
+
+
+layers {
+  name: "conv1_p"
+  type: CONVOLUTION
+  bottom: "data_p"
+  top: "conv1_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  convolution_param {
+    num_output: 20
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "conv1_w"
+  param: "conv1_b"
+}
+layers {
+  name: "pool1_p"
+  type: POOLING
+  bottom: "conv1_p"
+  top: "pool1_p"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+
+
+layers {
+  name: "local1_p"
+  type: LOCAL
+  bottom: "pool1_p"
+  top: "local1_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  local_param {
+    num_output: 5
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+  param: "local1_w"
+  param: "local1_b"
+}
+layers {
+  name: "relu1_p"
+  type: RELU
+  bottom: "local1_p"
+  top: "local1_p"
+}
+layers {
+  name: "local2_p"
+  type: LOCAL
+  bottom: "local1_p"
+  top: "local2_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  local_param {
+    num_output: 10
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+  param: "local2_w"
+  param: "local2_b"
+}
+layers {
+  name: "relu2_p"
+  type: RELU
+  bottom: "local2_p"
+  top: "local2_p"
+}
+
+
+
+
+
+layers {
+  name: "ip1_p"
+  type: INNER_PRODUCT
+  bottom: "local2_p"
+  top: "ip1_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 500
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "ip1_w"
+  param: "ip1_b"
+}
+layers {
+  name: "relu3_p"
+  type: RELU
+  bottom: "ip1_p"
+  top: "ip1_p"
+}
+layers {
+  name: "ip2_p"
+  type: INNER_PRODUCT
+  bottom: "ip1_p"
+  top: "ip2_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "ip2_w"
+  param: "ip2_b"
+}
+
+layers {
+  name: "feat2_p"
+  type: INNER_PRODUCT
+  bottom: "ip2_p"
+  top: "feat2_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "feat2_w"
+  param: "feat2_b"
+}
+
+
+layers {
+  name: "feat1_p"
+  type: INNER_PRODUCT
+  bottom: "local1_p"
+  top: "feat1_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "feat1_w"
+  param: "feat1_b"
+}
+
+
+layers {
+    name: "loss2"
+    type: CONTRASTIVE_LOSS
+    contrastive_loss_param {
+        margin: 1.0
+    }
+    bottom: "feat2"
+    bottom: "feat2_p"
+    bottom: "sim"
+    top: "loss2"
+}
+
+
+layers {
+    name: "loss1"
+    type: CONTRASTIVE_LOSS
+    contrastive_loss_param {
+        margin: 1.0
+    }
+    bottom: "feat1"
+    bottom: "feat1_p"
+    bottom: "sim"
+    top: "loss1"
+}
+
diff --git a/examples/siamese/train_mnist_siamese_local.sh b/examples/siamese/train_mnist_siamese_local.sh
new file mode 100644
index 00000000000..83035bf2ccc
--- /dev/null
+++ b/examples/siamese/train_mnist_siamese_local.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env sh
+
+TOOLS=./build/tools
+
+GLOG_logtostderr=0 GLOG_log_dir=examples/siamese/ $TOOLS/caffe train --solver=examples/siamese/mnist_siamese_local_solver.prototxt
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 888f4a4ba3b..3f762920f87 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -166,6 +166,29 @@ class XavierFiller : public Filler<Dtype> {
   }
 };
 
+/// @brief Fills a single-channel Blob so that every element equals its width
+///        index, i.e. blob(n, 0, h, w) = w. Registered in GetFiller() under
+///        the type name "test_local".
+template <typename Dtype>
+class TestLocalFiller : public Filler<Dtype> {
+ public:
+  explicit TestLocalFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    // The fill pattern is only defined for single-channel blobs.
+    CHECK_EQ(blob->channels(), 1);
+    Dtype* data = blob->mutable_cpu_data();
+    for (int n = 0; n < blob->num(); ++n) {
+      for (int j = 0; j < blob->height(); ++j) {
+        for (int i = 0; i < blob->width(); ++i) {
+          data[blob->offset(n, 0, j, i)] = i;
+        }
+      }
+    }
+  }
+};
+
+
 /**
  * @brief Fills a Blob with values @f$ x \sim N(0, \sigma^2) @f$ where
  *        @f$ \sigma^2 @f$ is set inversely proportional to number of incoming
@@ -281,6 +304,8 @@ Filler<Dtype>* GetFiller(const FillerParameter& param) {
     return new UniformFiller<Dtype>(param);
   } else if (type == "xavier") {
     return new XavierFiller<Dtype>(param);
+  } else if (type == "test_local") {
+    return new TestLocalFiller<Dtype>(param);
   } else if (type == "msra") {
     return new MSRAFiller<Dtype>(param);
   } else if (type == "bilinear") {
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 211e3d9042d..a2adf43c9ba 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -211,6 +211,56 @@ class DeconvolutionLayer : public BaseConvolutionLayer<Dtype> {
   virtual void compute_output_shape();
 };
 
+/// @brief Locally-connected layer: applies filters over sliding windows like a
+///        convolution, but keeps a separate set of weights per output location.
+template <typename Dtype>
+class LocalLayer : public Layer<Dtype> {
+ public:
+  explicit LocalLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "Local"; }
+
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  // LayerSetUp() CHECKs for exactly one bottom and one top blob, so declare
+  // exact counts instead of a minimum of one.
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int kernel_size_;
+  int stride_;
+  int num_;
+  int channels_;
+  int pad_;
+  int height_, width_;
+  int height_out_, width_out_;
+  int num_output_;
+  bool bias_term_;
+
+  int M_;  // num_output_
+  int K_;  // channels_ * kernel_size_ * kernel_size_
+  int N_;  // height_out_ * width_out_
+
+  Blob<Dtype> col_buffer_;
+};
+
+
+
+
 #ifdef USE_CUDNN
 /*
  * @brief cuDNN implementation of ConvolutionLayer.
diff --git a/src/caffe/layers/local_layer.cpp b/src/caffe/layers/local_layer.cpp
new file mode 100644
index 00000000000..2c45e1765f9
--- /dev/null
+++ b/src/caffe/layers/local_layer.cpp
@@ -0,0 +1,212 @@
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+// Reads LocalParameter, derives the output geometry and allocates the blobs.
+template <typename Dtype>
+void LocalLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(bottom.size(), 1) << "Local Layer takes a single blob as input.";
+  CHECK_EQ(top.size(), 1) << "Local Layer takes a single blob as output.";
+
+  kernel_size_ = this->layer_param_.local_param().kernel_size();
+  stride_ = this->layer_param_.local_param().stride();
+  pad_ = this->layer_param_.local_param().pad();
+  num_output_ = this->layer_param_.local_param().num_output();
+  bias_term_ = this->layer_param_.local_param().bias_term();
+  num_ = bottom[0]->num();
+  channels_ = bottom[0]->channels();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+
+  // Validate before deriving the output size: kernel_size has no default in
+  // LocalParameter, and stride_ is used as a divisor just below.
+  CHECK_GT(num_output_, 0);
+  CHECK_GT(kernel_size_, 0) << "kernel_size must be positive";
+  CHECK_GT(stride_, 0) << "stride must be positive";
+  CHECK_GE(height_, kernel_size_) << "height smaller than kernel size";
+  CHECK_GE(width_, kernel_size_) << "width smaller than kernel size";
+
+  height_out_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+  width_out_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+
+  M_ = num_output_;
+  K_ = channels_ * kernel_size_ * kernel_size_;
+  N_ = height_out_ * width_out_;
+
+  // Check if we need to set up the weights
+  if (this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else {
+    this->blobs_.resize(bias_term_ ? 2 : 1);
+    // Initialize the weight: a K_ x N_ matrix for each of the num_output_ maps
+    this->blobs_[0].reset(new Blob<Dtype>(
+        num_output_, 1, K_, N_));
+    // fill the weights
+    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
+        this->layer_param_.local_param().weight_filler()));
+    weight_filler->Fill(this->blobs_[0].get());
+    // If necessary, initialize and fill the bias term
+    if (bias_term_) {
+      this->blobs_[1].reset(new Blob<Dtype>(1, 1, M_, N_));
+      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
+          this->layer_param_.local_param().bias_filler()));
+      bias_filler->Fill(this->blobs_[1].get());
+    }
+  }
+}
+
+template <typename Dtype>
+void LocalLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
+    " weights.";
+  // The weights hold a separate filter for every output location, so the
+  // spatial size fixed in LayerSetUp() must not change; the batch size may.
+  CHECK_EQ(bottom[0]->height(), height_) << "Input height must not change.";
+  CHECK_EQ(bottom[0]->width(), width_) << "Input width must not change.";
+  num_ = bottom[0]->num();
+  // TODO: generalize to handle inputs of different shapes.
+  for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
+    CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
+    CHECK_EQ(channels_, bottom[bottom_id]->channels())
+        << "Inputs must have same channels.";
+    CHECK_EQ(height_, bottom[bottom_id]->height())
+        << "Inputs must have same height.";
+    CHECK_EQ(width_, bottom[bottom_id]->width())
+        << "Inputs must have same width.";
+  }
+  // Shape the tops.
+  for (int top_id = 0; top_id < top.size(); ++top_id) {
+    top[top_id]->Reshape(num_, num_output_, height_out_, width_out_);
+  }
+
+  // The im2col result buffer would only hold one image at a time to avoid
+  // overly large memory usage.
+  col_buffer_.Reshape(
+      1, channels_ * kernel_size_ * kernel_size_, height_out_, width_out_);
+}
+
+template <typename Dtype>
+void LocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  // top[n][m][p] = sum_k x[k][p] * W[m][k][p] (+ bias[m][p]), x = im2col(bottom[n])
+  Dtype* x_data = col_buffer_.mutable_cpu_data();
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  // E is a 1 x K_ blob filled by ConstantFiller with value 1; E * X sums X over K_.
+  Blob<Dtype> E;
+  E.Reshape(1, 1, 1, K_);
+  FillerParameter filler_param;
+  filler_param.set_value(1);
+  ConstantFiller<Dtype> filler(filler_param);
+  filler.Fill(&E);
+
+  Blob<Dtype> intermediate;
+  intermediate.Reshape(1, 1, K_, N_);
+  for (int n=0; n<num_; n++) {
+    im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
+               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+    // x_data is consumed below as a K_ x N_ matrix (K_*N_ elements).
+    for (int m=0; m<num_output_; m++) {
+      caffe_mul(K_*N_, x_data, weight+this->blobs_[0]->offset(m),
+                intermediate.mutable_cpu_data());
+      // Reduce the element-wise products over K_ into the N_ outputs of map m.
+      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, 1, N_, K_,
+                            (Dtype)1., E.cpu_data(),
+                            intermediate.cpu_data(),
+                            (Dtype)0., top_data + top[0]->offset(n, m));
+    }
+
+    if (bias_term_) {
+      caffe_add(M_ * N_, this->blobs_[1]->cpu_data(),
+                top_data + top[0]->offset(n),
+                top_data + top[0]->offset(n));
+    }
+  }
+}
+
+template <typename Dtype>
+void LocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  // Parameter diffs are accumulated into, never reset, by this function.
+  const Dtype* top_diff = top[0]->cpu_diff();
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  Dtype* x_data = col_buffer_.mutable_cpu_data();
+  Dtype* x_diff = col_buffer_.mutable_cpu_diff();
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
+  Dtype* bias_diff = NULL;
+
+  Blob<Dtype> intermediate;
+  intermediate.Reshape(1, 1, 1, N_);
+
+  Blob<Dtype> xt;
+  xt.Reshape(1, 1, K_, N_);
+  Dtype* xt_data = xt.mutable_cpu_data();
+
+  if (bias_term_) {
+    bias_diff = this->blobs_[1]->mutable_cpu_diff();
+    // Add to the existing bias diff; the solver clears parameter diffs.
+    for (int n = 0; n < num_; ++n) {
+      caffe_add(M_ * N_, bias_diff,
+                top_diff + top[0]->offset(n),
+                bias_diff);
+    }
+  }
+
+  // weight_diff is not zeroed either: shared weights / iter_size > 1 rely on it.
+  for (int n=0; n<num_; n++) {
+    im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
+               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+
+    // gradient wrt weight
+    for (int m=0; m<num_output_; m++) {
+      Dtype* filter_weight_diff = weight_diff+this->blobs_[0]->offset(m);
+      for (int k=0; k<K_; k++) {
+        caffe_mul(N_, top_diff+top[0]->offset(n, m),
+                  x_data+col_buffer_.offset(0,k), xt_data+xt.offset(0,0,k));
+      }
+      caffe_cpu_axpby(K_*N_, Dtype(1.0), xt_data, Dtype(1.0), filter_weight_diff);
+    }
+
+    // gradient wrt bottom data
+    if (propagate_down[0]) {
+      memset(x_diff, 0, col_buffer_.count() * sizeof(Dtype));
+      for (int m=0; m<num_output_; m++) {
+        for (int k=0; k<K_; k++) {
+          caffe_mul(N_, top_diff+top[0]->offset(n, m),
+                    weight+this->blobs_[0]->offset(m,0,k),
+                    intermediate.mutable_cpu_data());
+
+          caffe_cpu_axpby(N_, Dtype(1.0),
+                          intermediate.cpu_data(), Dtype(1.0),
+                          x_diff+col_buffer_.offset(0,k));
+        }
+      }
+
+      // col2im back to the data
+      col2im_cpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_,
+                 pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
+
+    }
+  }
+
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(LocalLayer);
+#endif
+
+INSTANTIATE_CLASS(LocalLayer);
+REGISTER_LAYER_CLASS(Local);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/local_layer.cu b/src/caffe/layers/local_layer.cu
new file mode 100644
index 00000000000..f393ac6e4d7
--- /dev/null
+++ b/src/caffe/layers/local_layer.cu
@@ -0,0 +1,185 @@
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+
+
+template <typename Dtype>
+__global__ void local_update1_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
+                                    Dtype* data_R, const int filter_num,
+                                    const int location_num, const int output_num) {
+  // Flat index into R decomposes as (out * filter_num + flt) * location_num + loc.
+  CUDA_KERNEL_LOOP(index, filter_num * location_num * output_num) {
+    const int loc = index % location_num;
+    const int flt = (index / location_num) % filter_num;
+    const int out = (index / location_num) / filter_num;
+    data_R[index] += data_A[out*location_num+loc] * data_B[flt*location_num+loc];
+  }
+}
+
+template <typename Dtype>
+void local_update1_gpu(const Dtype* data_A, const Dtype* data_B,
+                       Dtype* data_R, const int filter_num,
+                       const int location_num, const int output_num) {
+  // Shapes: data_A is output_num x location_num, data_B is filter_num x
+  // location_num and data_R is output_num x filter_num x location_num.
+  // The update performed is R[q][n][p] += A[q][p] * B[n][p].
+  const int total = filter_num * location_num * output_num;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  local_update1_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(total),
+                             CAFFE_CUDA_NUM_THREADS>>>(data_A, data_B, data_R, filter_num, location_num, output_num);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiation
+template void local_update1_gpu<float>(const float* data_A, const float* data_B,
+                                float* data_R, const int filter_num,
+                                const int location_num, const int output_num);
+template void local_update1_gpu<double>(const double* data_A, const double* data_B,
+                                double* data_R, const int filter_num,
+                                const int location_num, const int output_num);
+
+
+template <typename Dtype>
+__global__ void local_update2_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
+                                Dtype* data_R, const int filter_num,
+                                const int location_num, const int output_num) {
+  // R[flt][loc] += sum over out of A[out][loc] * B[out][flt][loc].
+  CUDA_KERNEL_LOOP(index, filter_num * location_num) {
+    const int loc = index % location_num;
+    const int flt = index / location_num;
+    for (int out = 0; out < output_num; out++) {
+      data_R[index] += data_A[out*location_num+loc] * data_B[(out*filter_num+flt)*location_num+loc];
+    }
+  }
+}
+
+template <typename Dtype>
+void local_update2_gpu(const Dtype* data_A, const Dtype* data_B,
+                       Dtype* data_R, const int filter_num,
+                       const int location_num, const int output_num) {
+  // Shapes: data_A is output_num x location_num, data_B is output_num x
+  // filter_num x location_num and data_R is filter_num x location_num.
+  // The update performed is R[n][p] += \sum_q(A[q][p] * B[q][n][p]).
+  const int total = filter_num * location_num;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  local_update2_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(total),
+                             CAFFE_CUDA_NUM_THREADS>>>(data_A, data_B, data_R, filter_num, location_num, output_num);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiation
+template void local_update2_gpu<float>(const float* data_A, const float* data_B,
+                       float* data_R, const int filter_num,
+                       const int location_num, const int output_num);
+template void local_update2_gpu<double>(const double* data_A, const double* data_B,
+                       double* data_R, const int filter_num,
+                       const int location_num, const int output_num);
+
+
+
+
+/// @brief refer to CPU forward -- the BLAS implementation is the same.
+template <typename Dtype>
+void LocalLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  // top[n][m][p] = sum_k x[k][p] * W[m][k][p] (+ bias[m][p]), x = im2col(bottom[n])
+  Dtype* x_data = col_buffer_.mutable_gpu_data();
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  // E is a 1 x K_ blob filled by ConstantFiller with value 1; E * X sums X over K_.
+  Blob<Dtype> E;
+  E.Reshape(1, 1, 1, K_);
+  FillerParameter filler_param;
+  filler_param.set_value(1);
+  ConstantFiller<Dtype> filler(filler_param);
+  filler.Fill(&E);
+
+  Blob<Dtype> intermediate;
+  intermediate.Reshape(1, 1, K_, N_);
+  for (int n=0; n<num_; n++) {
+    im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
+               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+    // x_data is consumed below as a K_ x N_ matrix (K_*N_ elements).
+    for (int m=0; m<num_output_; m++) {
+      caffe_gpu_mul(K_*N_, x_data, weight+this->blobs_[0]->offset(m),
+                    intermediate.mutable_gpu_data());
+      // Reduce the element-wise products over K_ into the N_ outputs of map m.
+      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, 1, N_, K_,
+                            (Dtype)1., E.gpu_data(), intermediate.gpu_data(),
+                            (Dtype)0., top_data + top[0]->offset(n, m));
+    }
+
+    if (bias_term_) {
+      caffe_gpu_add(M_ * N_, this->blobs_[1]->gpu_data(),
+                    top_data + top[0]->offset(n),
+                    top_data + top[0]->offset(n));
+    }
+  }
+
+}
+
+/// @brief refer to CPU backward; the per-location products are computed by
+///        the local_update1_gpu / local_update2_gpu helpers defined above.
+///        Only propagate_down[0] is consulted for the bottom gradient.
+template <typename Dtype>
+void LocalLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  // Parameter gradients are accumulated into the existing diffs instead of
+  // being zeroed first: the solver clears parameter diffs before each
+  // iteration, and resetting them here would throw away gradients already
+  // written by a layer sharing these weights or by an earlier pass when
+  // iter_size > 1.
+
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  Dtype* x_data = col_buffer_.mutable_gpu_data();
+  Dtype* x_diff = col_buffer_.mutable_gpu_diff();
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+
+  // Bias gradient: add the top diff of every image in the batch.
+  if (bias_term_) {
+    Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+    for (int n = 0; n < num_; ++n) {
+      caffe_gpu_add(M_ * N_, bias_diff,
+                    top_diff + top[0]->offset(n),
+                    bias_diff);
+    }
+  }
+
+  for (int n=0; n<num_; n++) {
+    // Unroll image n into col_buffer_, used below as a K_ x N_ matrix.
+    im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
+               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+
+    // Weight gradient: dW[m][k][p] += top_diff[n][m][p] * x[k][p].
+    local_update1_gpu(top_diff+top[0]->offset(n), x_data, weight_diff, K_, N_, M_);
+
+    if (propagate_down[0]) {
+      // x_diff is per-image scratch space, so it is cleared before
+      // dx[k][p] += sum_m top_diff[n][m][p] * W[m][k][p] is accumulated.
+      CUDA_CHECK(cudaMemset(x_diff, 0, col_buffer_.count() * sizeof(Dtype)));
+      local_update2_gpu(top_diff+top[0]->offset(n), weight, x_diff, K_, N_, M_);
+
+      // col2im back to the data
+      col2im_gpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_,
+                 pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
+    }
+  }
+}
+
+
+
+
+INSTANTIATE_LAYER_GPU_FUNCS(LocalLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index aa299f8660b..ae8b8e6a65a 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -301,7 +301,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 139 (last added: tile_param)
+// LayerParameter next available layer-specific ID: 140 (last added: local_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -385,6 +385,7 @@ message LayerParameter {
   optional ThresholdParameter threshold_param = 128;
   optional TileParameter tile_param = 138;
   optional WindowDataParameter window_data_param = 129;
+  optional LocalParameter local_param = 139;
 }
 
 // Message that stores parameters used to apply transformation
@@ -1179,3 +1180,14 @@ message PReLUParameter {
   // Whether or not slope paramters are shared across channels.
   optional bool channel_shared = 2 [default = false];
 }
+
+// Message that stores parameters used by LocalLayer
+message LocalParameter {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional uint32 pad = 3 [default = 0]; // The padding size
+  optional uint32 kernel_size = 4; // The kernel size
+  optional uint32 stride = 6 [default = 1]; // The stride (tag 5 is unused)
+  optional FillerParameter weight_filler = 7; // The filler for the weight
+  optional FillerParameter bias_filler = 8; // The filler for the bias
+}
diff --git a/src/caffe/test/test_local_layer.cpp b/src/caffe/test/test_local_layer.cpp
new file mode 100644
index 00000000000..880c1bb49aa
--- /dev/null
+++ b/src/caffe/test/test_local_layer.cpp
@@ -0,0 +1,121 @@
+#include <cstring>
+#include <vector>
+
+#include "cuda_runtime.h"
+#include "gtest/gtest.h"
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/vision_layers.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+
+template <typename TypeParam>
+class LocalLayerTest: public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+ protected:
+	LocalLayerTest()
+      : blob_bottom_(new Blob<Dtype>()),
+        blob_top_(new Blob<Dtype>()) {}
+  virtual void SetUp() {
+    blob_bottom_->Reshape(2, 3, 6, 4);
+    // fill the values
+    FillerParameter filler_param;
+    filler_param.set_value(1.);
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+
+  virtual ~LocalLayerTest() { delete blob_bottom_; delete blob_top_; }
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(LocalLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(LocalLayerTest, TestSetup) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  LocalParameter* convolution_param =
+      layer_param.mutable_local_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(2);
+  convolution_param->set_num_output(4);
+  shared_ptr<Layer<Dtype> > layer(
+      new LocalLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), 2);  // bottom blob is 2 x 3 x 6 x 4
+  EXPECT_EQ(this->blob_top_->channels(), 4);  // matches num_output
+  EXPECT_EQ(this->blob_top_->height(), 2);  // presumably (6 - 3) / 2 + 1
+  EXPECT_EQ(this->blob_top_->width(), 1);  // presumably (4 - 3) / 2 + 1
+  convolution_param->set_num_output(3);  // repeat with another num_output
+  layer.reset(new LocalLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), 2);
+  EXPECT_EQ(this->blob_top_->channels(), 3);  // only channels should change
+  EXPECT_EQ(this->blob_top_->height(), 2);
+  EXPECT_EQ(this->blob_top_->width(), 1);
+}
+
+
+TYPED_TEST(LocalLayerTest, TestSimpleConvolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  // With an all-ones input, output location idx should equal idx * 27 + 0.1.
+  FillerParameter filler_param;
+  filler_param.set_value(1.);
+  ConstantFiller<Dtype> filler(filler_param);
+  filler.Fill(this->blob_bottom_);
+  LayerParameter layer_param;
+  LocalParameter* convolution_param =
+      layer_param.mutable_local_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(1);
+  convolution_param->set_num_output(1);
+  convolution_param->mutable_weight_filler()->set_type("test_local");
+  convolution_param->mutable_weight_filler()->set_value(1);
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new LocalLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // After the convolution, the output should all have output values 27.1
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int n=0; n<this->blob_top_->num(); n++) {
+    for (int k=0; k<this->blob_top_->channels(); k++) {
+      for (int j=0; j<this->blob_top_->height(); j++) {
+        for (int i=0; i<this->blob_top_->width(); i++) {
+          int idx = j*this->blob_top_->width()+i;
+          EXPECT_NEAR(*(top_data+this->blob_top_->offset(n, k, j, i)), idx*27+0.1, 1e-4);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(LocalLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  LocalParameter* convolution_param =
+      layer_param.mutable_local_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(2);
+  convolution_param->set_num_output(2);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  LocalLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+}  // namespace caffe

From 00d2cae8d88442aaf9ecd3d0e81885cf14979334 Mon Sep 17 00:00:00 2001
From: Yuncheng Li <raingomm@gmail.com>
Date: Tue, 15 Sep 2015 09:29:01 -0400
Subject: [PATCH 2/5] fix cpu only compile

---
 src/caffe/test/test_local_layer.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/caffe/test/test_local_layer.cpp b/src/caffe/test/test_local_layer.cpp
index 880c1bb49aa..49685697829 100644
--- a/src/caffe/test/test_local_layer.cpp
+++ b/src/caffe/test/test_local_layer.cpp
@@ -1,20 +1,18 @@
 #include <cstring>
 #include <vector>
 
-#include "cuda_runtime.h"
 #include "gtest/gtest.h"
+
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
 #include "caffe/filler.hpp"
 #include "caffe/vision_layers.hpp"
-#include "caffe/test/test_gradient_check_util.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
 
 namespace caffe {
 
-extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
-
 template <typename TypeParam>
 class LocalLayerTest: public MultiDeviceTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;

From 53e146609abfe7a56c0a3138878f7c5267321bb9 Mon Sep 17 00:00:00 2001
From: Yuncheng Li <raingomm@gmail.com>
Date: Tue, 15 Sep 2015 13:25:27 -0400
Subject: [PATCH 3/5] fix lint errors

---
 include/caffe/filler.hpp            |   6 +-
 src/caffe/layers/local_layer.cpp    |  82 +++++++++---------
 src/caffe/layers/local_layer.cu     | 127 +++++++++++++++-------------
 src/caffe/test/test_local_layer.cpp |  22 ++---
 4 files changed, 127 insertions(+), 110 deletions(-)

diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 3f762920f87..ad4b88e32ac 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -178,9 +178,9 @@ class TestLocalFiller : public Filler<Dtype> {
     LOG(INFO) << "Done Doing mutable cpu";
     CHECK_EQ(blob->channels(), 1);
 
-    for (int n=0; n<blob->num(); n++) {
-      for (int j=0; j<blob->height(); j++) {
-        for (int i=0; i<blob->width(); i++) {
+    for (int n = 0; n < blob->num(); n++) {
+      for (int j = 0; j < blob->height(); j++) {
+        for (int i = 0; i < blob->width(); i++) {
           *(data+blob->offset(n, 0, j, i)) = i;
         }
       }
diff --git a/src/caffe/layers/local_layer.cpp b/src/caffe/layers/local_layer.cpp
index 2c45e1765f9..1261fc67b9d 100644
--- a/src/caffe/layers/local_layer.cpp
+++ b/src/caffe/layers/local_layer.cpp
@@ -10,7 +10,7 @@ namespace caffe {
 
 template <typename Dtype>
 void LocalLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
+    const vector<Blob<Dtype>*>& top) {
   CHECK_EQ(bottom.size(), 1) << "Conv Layer takes a single blob as input.";
   CHECK_EQ(top.size(), 1) << "Conv Layer takes a single blob as output.";
 
@@ -47,16 +47,16 @@ void LocalLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     }
     // Intialize the weight
     this->blobs_[0].reset(new Blob<Dtype>(
-        num_output_, 1, K_, N_));
+          num_output_, 1, K_, N_));
     // fill the weights
     shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
-        this->layer_param_.local_param().weight_filler()));
+          this->layer_param_.local_param().weight_filler()));
     weight_filler->Fill(this->blobs_[0].get());
     // If necessary, intiialize and fill the bias term
     if (bias_term_) {
       this->blobs_[1].reset(new Blob<Dtype>(1, 1, M_, N_));
       shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
-          this->layer_param_.local_param().bias_filler()));
+            this->layer_param_.local_param().bias_filler()));
       bias_filler->Fill(this->blobs_[1].get());
     }
   }
@@ -64,18 +64,18 @@ void LocalLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 
 template <typename Dtype>
 void LocalLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
+    const vector<Blob<Dtype>*>& top) {
   CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
     " weights.";
   // TODO: generalize to handle inputs of different shapes.
   for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
     CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
     CHECK_EQ(channels_, bottom[bottom_id]->channels())
-        << "Inputs must have same channels.";
+      << "Inputs must have same channels.";
     CHECK_EQ(height_, bottom[bottom_id]->height())
-        << "Inputs must have same height.";
+      << "Inputs must have same height.";
     CHECK_EQ(width_, bottom[bottom_id]->width())
-        << "Inputs must have same width.";
+      << "Inputs must have same width.";
   }
 
   // Shape the tops.
@@ -95,7 +95,7 @@ void LocalLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 
 template <typename Dtype>
 void LocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
+    const vector<Blob<Dtype>*>& top) {
 
   Dtype* x_data = col_buffer_.mutable_cpu_data();
   const Dtype* weight = this->blobs_[0]->cpu_data();
@@ -111,31 +111,32 @@ void LocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 
   Blob<Dtype> intermediate;
   intermediate.Reshape(1, 1, K_, N_);
-  for (int n=0; n<num_; n++) {
+  for (int n = 0; n < num_; n++) {
     im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+        width_, kernel_size_, kernel_size_,
+        pad_, pad_, stride_, stride_, x_data);
 
-    for (int m=0; m<num_output_; m++) {
+    for (int m = 0; m < num_output_; m++) {
       caffe_mul(K_*N_, x_data, weight+this->blobs_[0]->offset(m),
-                intermediate.mutable_cpu_data());
+          intermediate.mutable_cpu_data());
 
       caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, 1, N_, K_,
-                            (Dtype)1., E.cpu_data(),
-                            intermediate.cpu_data(),
-                            (Dtype)0., top_data + top[0]->offset(n, m));
+          (Dtype)1., E.cpu_data(),
+          intermediate.cpu_data(),
+          (Dtype)0., top_data + top[0]->offset(n, m));
     }
 
     if (bias_term_) {
       caffe_add(M_ * N_, this->blobs_[1]->cpu_data(),
-                top_data + top[0]->offset(n),
-                top_data + top[0]->offset(n));
+          top_data + top[0]->offset(n),
+          top_data + top[0]->offset(n));
     }
   }
 }
 
 template <typename Dtype>
 void LocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 
   const Dtype* top_diff = top[0]->cpu_diff();
   const Dtype* bottom_data = bottom[0]->cpu_data();
@@ -155,51 +156,52 @@ void LocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
   if (bias_term_) {
     bias_diff = this->blobs_[1]->mutable_cpu_diff();
-    memset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count());
+    caffe_set(this->blobs_[1]->count(), Dtype(0.0), bias_diff);
     for (int n = 0; n < num_; ++n) {
       caffe_add(M_ * N_, bias_diff,
-                top_diff + top[0]->offset(n),
-                bias_diff);
+          top_diff + top[0]->offset(n),
+          bias_diff);
     }
   }
 
-  memset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count());
-  for (int n=0; n<num_; n++) {
+  caffe_set(this->blobs_[0]->count(), Dtype(0.0), weight_diff);
+  for (int n = 0; n < num_; n++) {
     im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+        width_, kernel_size_, kernel_size_,
+        pad_, pad_, stride_, stride_, x_data);
 
     // gradient wrt weight
-    for (int m=0; m<num_output_; m++) {
+    for (int m = 0; m < num_output_; m++) {
       Dtype* filter_weight_diff = weight_diff+this->blobs_[0]->offset(m);
-      for (int k=0; k<K_; k++) {
+      for (int k = 0; k < K_; k++) {
         caffe_mul(N_, top_diff+top[0]->offset(n, m),
-                  x_data+col_buffer_.offset(0,k), xt_data+xt.offset(0,0,k));
+            x_data+col_buffer_.offset(0, k), xt_data+xt.offset(0, 0, k));
       }
-      caffe_cpu_axpby(K_*N_, Dtype(1.0), xt_data, Dtype(1.0), filter_weight_diff);
+      caffe_cpu_axpby(K_*N_, Dtype(1.0), xt_data,
+          Dtype(1.0), filter_weight_diff);
     }
 
     // gradient wrt bottom data
     if (propagate_down[0]) {
-      memset(x_diff, 0, col_buffer_.count() * sizeof(Dtype));
-      for (int m=0; m<num_output_; m++) {
-        for (int k=0; k<K_; k++) {
+      caffe_set(col_buffer_.count(), Dtype(0.0), x_diff);
+      for (int m = 0; m < num_output_; m++) {
+        for (int k = 0; k < K_; k++) {
           caffe_mul(N_, top_diff+top[0]->offset(n, m),
-                    weight+this->blobs_[0]->offset(m,0,k),
-                    intermediate.mutable_cpu_data());
+              weight+this->blobs_[0]->offset(m, 0, k),
+              intermediate.mutable_cpu_data());
 
           caffe_cpu_axpby(N_, Dtype(1.0),
-                          intermediate.cpu_data(), Dtype(1.0),
-                          x_diff+col_buffer_.offset(0,k));
+              intermediate.cpu_data(), Dtype(1.0),
+              x_diff+col_buffer_.offset(0, k));
         }
       }
 
       // col2im back to the data
-      col2im_cpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_,
-                 pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
-
+      col2im_cpu(x_diff, channels_, height_,
+          width_, kernel_size_, kernel_size_,
+          pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
     }
   }
-
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/local_layer.cu b/src/caffe/layers/local_layer.cu
index f393ac6e4d7..da739c68e6a 100644
--- a/src/caffe/layers/local_layer.cu
+++ b/src/caffe/layers/local_layer.cu
@@ -11,9 +11,10 @@ namespace caffe {
 
 
 template <typename Dtype>
-__global__ void local_update1_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
-                                    Dtype* data_R, const int filter_num,
-                                    const int location_num, const int output_num) {
+__global__ void local_update1_gpu_kernel(
+    const Dtype* data_A, const Dtype* data_B,
+    Dtype* data_R, const int filter_num,
+    const int location_num, const int output_num) {
   int total = filter_num * location_num * output_num;
   CUDA_KERNEL_LOOP(index, total) {
     int p = index % location_num;
@@ -24,63 +25,75 @@ __global__ void local_update1_gpu_kernel(const Dtype* data_A, const Dtype* data_
 }
 
 template <typename Dtype>
-void local_update1_gpu(const Dtype* data_A, const Dtype* data_B,
-                       Dtype* data_R, const int filter_num,
-                       const int location_num, const int output_num) {
+void local_update1_gpu(
+    const Dtype* data_A, const Dtype* data_B,
+    Dtype* data_R, const int filter_num,
+    const int location_num, const int output_num) {
   // data_A is output_num x location_num
   // data_B is filter_num x location_num
-  // data_R is output_num x filter_num x location_num, the update performed is Rqnp += Aqp * Bnp
+  // data_R is output_num x filter_num x location_num,
+  //      the update performed is Rqnp += Aqp * Bnp
+  const int nthreads = filter_num * location_num * output_num;
 
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  local_update1_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(filter_num * location_num * output_num),
-                             CAFFE_CUDA_NUM_THREADS>>>(data_A, data_B, data_R, filter_num, location_num, output_num);
+  local_update1_gpu_kernel<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+    <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
+      data_A, data_B, data_R, filter_num, location_num, output_num);
   CUDA_POST_KERNEL_CHECK;
 }
 
 // Explicit instantiation
-template void local_update1_gpu<float>(const float* data_A, const float* data_B,
-                                float* data_R, const int filter_num,
-                                const int location_num, const int output_num);
-template void local_update1_gpu<double>(const double* data_A, const double* data_B,
-                                double* data_R, const int filter_num,
-                                const int location_num, const int output_num);
-
+template void local_update1_gpu<float>(
+    const float* data_A, const float* data_B,
+    float* data_R, const int filter_num,
+    const int location_num, const int output_num);
+template void local_update1_gpu<double>(
+    const double* data_A, const double* data_B,
+    double* data_R, const int filter_num,
+    const int location_num, const int output_num);
 
 template <typename Dtype>
-__global__ void local_update2_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
-                                Dtype* data_R, const int filter_num,
-                                const int location_num, const int output_num) {
+__global__ void local_update2_gpu_kernel(
+    const Dtype* data_A, const Dtype* data_B,
+    Dtype* data_R, const int filter_num,
+    const int location_num, const int output_num) {
   int total = filter_num * location_num;
   CUDA_KERNEL_LOOP(index, total) {
     int p = index % location_num;
     int n = (index / location_num);
-    for (int q=0; q<output_num; q++) {
-      data_R[index] += data_A[q*location_num+p] * data_B[(q*filter_num+n)*location_num+p];
+    for (int q = 0; q < output_num; q++) {
+      data_R[index] +=
+        data_A[q*location_num+p] * data_B[(q*filter_num+n)*location_num+p];
     }
   }
 }
 
 template <typename Dtype>
 void local_update2_gpu(const Dtype* data_A, const Dtype* data_B,
-                       Dtype* data_R, const int filter_num,
-                       const int location_num, const int output_num) {
+    Dtype* data_R, const int filter_num,
+    const int location_num, const int output_num) {
   // data_A is output_num x location_num
   // data_B is output_num x filter_num x location_num
-  // data_R is filter_num x location_num, the update performed is Rnp += \sum_q(Aqp * Bqnp)
+  // data_R is filter_num x location_num,
+  //    the update performed is Rnp += \sum_q(Aqp * Bqnp)
+  int nthreads = filter_num * location_num;
+
 
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  local_update2_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(filter_num * location_num),
-                             CAFFE_CUDA_NUM_THREADS>>>(data_A, data_B, data_R, filter_num, location_num, output_num);
+  local_update2_gpu_kernel<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+    <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
+        data_A, data_B, data_R, filter_num,
+        location_num, output_num);
   CUDA_POST_KERNEL_CHECK;
 }
 
 // Explicit instantiation
-template void local_update2_gpu<float>(const float* data_A, const float* data_B,
-                       float* data_R, const int filter_num,
-                       const int location_num, const int output_num);
-template void local_update2_gpu<double>(const double* data_A, const double* data_B,
-                       double* data_R, const int filter_num,
-                       const int location_num, const int output_num);
+template void local_update2_gpu<float>(
+    const float* data_A, const float* data_B,
+    float* data_R, const int filter_num,
+    const int location_num, const int output_num);
+template void local_update2_gpu<double>(
+    const double* data_A, const double* data_B,
+    double* data_R, const int filter_num,
+    const int location_num, const int output_num);
 
 
 
@@ -88,7 +101,7 @@ template void local_update2_gpu<double>(const double* data_A, const double* data
 /// @brief refer to CPU forward -- the BLAS implementation is the same.
 template <typename Dtype>
 void LocalLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
+    const vector<Blob<Dtype>*>& top) {
 
   Dtype* x_data = col_buffer_.mutable_gpu_data();
   const Dtype* weight = this->blobs_[0]->gpu_data();
@@ -104,32 +117,32 @@ void LocalLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 
   Blob<Dtype> intermediate;
   intermediate.Reshape(1, 1, K_, N_);
-  for (int n=0; n<num_; n++) {
+  for (int n = 0; n < num_; n++) {
     im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+        width_, kernel_size_, kernel_size_,
+        pad_, pad_, stride_, stride_, x_data);
 
-    for (int m=0; m<num_output_; m++) {
+    for (int m = 0; m < num_output_; m++) {
       caffe_gpu_mul(K_*N_, x_data, weight+this->blobs_[0]->offset(m),
-                    intermediate.mutable_gpu_data());
+          intermediate.mutable_gpu_data());
 
       caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, 1, N_, K_,
-                            (Dtype)1., E.gpu_data(), intermediate.gpu_data(),
-                            (Dtype)0., top_data + top[0]->offset(n, m));
+          (Dtype)1., E.gpu_data(), intermediate.gpu_data(),
+          (Dtype)0., top_data + top[0]->offset(n, m));
     }
 
     if (bias_term_) {
       caffe_gpu_add(M_ * N_, this->blobs_[1]->gpu_data(),
-                    top_data + top[0]->offset(n),
-                    top_data + top[0]->offset(n));
+          top_data + top[0]->offset(n),
+          top_data + top[0]->offset(n));
     }
   }
-
 }
 
 /// @brief refer to CPU backward -- the BLAS implementation is the same.
 template <typename Dtype>
 void LocalLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 
   const Dtype* top_diff = top[0]->gpu_diff();
   const Dtype* bottom_data = bottom[0]->gpu_data();
@@ -148,38 +161,38 @@ void LocalLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   Dtype* xt_data = xt.mutable_gpu_data();
   if (bias_term_) {
     bias_diff = this->blobs_[1]->mutable_gpu_diff();
-    CUDA_CHECK(cudaMemset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count()));
+    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0.), bias_diff);
     for (int n = 0; n < num_; ++n) {
       caffe_gpu_add(M_ * N_, bias_diff,
-                    top_diff + top[0]->offset(n),
-                    bias_diff);
+          top_diff + top[0]->offset(n),
+          bias_diff);
     }
   }
 
   Blob<Dtype> buf;
   buf.Reshape(1, 1, K_, N_);
   Dtype* buf_data = buf.mutable_gpu_data();
-  CUDA_CHECK(cudaMemset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count()));
-  for (int n=0; n<num_; n++) {
+  caffe_gpu_set(this->blobs_[0]->count(), Dtype(0.), weight_diff);
+  for (int n = 0; n < num_; n++) {
     im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+        width_, kernel_size_, kernel_size_,
+        pad_, pad_, stride_, stride_, x_data);
 
-    local_update1_gpu(top_diff+top[0]->offset(n), x_data, weight_diff, K_, N_, M_);
+    local_update1_gpu(
+        top_diff+top[0]->offset(n), x_data,
+        weight_diff, K_, N_, M_);
 
     if (propagate_down[0]) {
-      CUDA_CHECK(cudaMemset(x_diff, 0, col_buffer_.count() * sizeof(Dtype)));
+      caffe_gpu_set(col_buffer_.count(), Dtype(0.), x_diff);
       local_update2_gpu(top_diff+top[0]->offset(n), weight, x_diff, K_, N_, M_);
 
       // col2im back to the data
       col2im_gpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_,
-                 pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
+          pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
     }
   }
 }
 
-
-
-
 INSTANTIATE_LAYER_GPU_FUNCS(LocalLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/test/test_local_layer.cpp b/src/caffe/test/test_local_layer.cpp
index 49685697829..1e16c117655 100644
--- a/src/caffe/test/test_local_layer.cpp
+++ b/src/caffe/test/test_local_layer.cpp
@@ -17,7 +17,7 @@ template <typename TypeParam>
 class LocalLayerTest: public MultiDeviceTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
  protected:
-	LocalLayerTest()
+  LocalLayerTest()
       : blob_bottom_(new Blob<Dtype>()),
         blob_top_(new Blob<Dtype>()) {}
   virtual void SetUp() {
@@ -74,7 +74,7 @@ TYPED_TEST(LocalLayerTest, TestSimpleConvolution) {
   filler.Fill(this->blob_bottom_);
   LayerParameter layer_param;
   LocalParameter* convolution_param =
-      layer_param.mutable_local_param();
+    layer_param.mutable_local_param();
   convolution_param->set_kernel_size(3);
   convolution_param->set_stride(1);
   convolution_param->set_num_output(1);
@@ -88,12 +88,13 @@ TYPED_TEST(LocalLayerTest, TestSimpleConvolution) {
   layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
   // After the convolution, the output should all have output values 27.1
   const Dtype* top_data = this->blob_top_->cpu_data();
-  for (int n=0; n<this->blob_top_->num(); n++) {
-    for (int k=0; k<this->blob_top_->channels(); k++) {
-      for (int j=0; j<this->blob_top_->height(); j++) {
-        for (int i=0; i<this->blob_top_->width(); i++) {
-          int idx = j*this->blob_top_->width()+i;
-          EXPECT_NEAR(*(top_data+this->blob_top_->offset(n, k, j, i)), idx*27+0.1, 1e-4);
+  for (int n = 0; n < this->blob_top_->num(); n++) {
+    for (int k = 0; k < this->blob_top_->channels(); k++) {
+      for (int j = 0; j < this->blob_top_->height(); j++) {
+        for (int i = 0; i < this->blob_top_->width(); i++) {
+          int idx = j * this->blob_top_->width() + i;
+          EXPECT_NEAR(*(top_data + this->blob_top_->offset(n, k, j, i)),
+              idx * 27 + 0.1, 1e-4);
         }
       }
     }
@@ -104,7 +105,7 @@ TYPED_TEST(LocalLayerTest, TestGradient) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
   LocalParameter* convolution_param =
-      layer_param.mutable_local_param();
+    layer_param.mutable_local_param();
   convolution_param->set_kernel_size(3);
   convolution_param->set_stride(2);
   convolution_param->set_num_output(2);
@@ -112,7 +113,8 @@ TYPED_TEST(LocalLayerTest, TestGradient) {
   convolution_param->mutable_bias_filler()->set_type("gaussian");
   LocalLayer<Dtype> layer(layer_param);
   GradientChecker<Dtype> checker(1e-2, 1e-3);
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+  checker.CheckGradientExhaustive(&layer,
+      this->blob_bottom_vec_,
       this->blob_top_vec_);
 }
 

From 255d2d760b5f8edb7b6ebde06bbdf44e14a4e239 Mon Sep 17 00:00:00 2001
From: Yuncheng Li <yli@cs.rochester.edu>
Date: Wed, 13 Jan 2016 18:19:34 -0500
Subject: [PATCH 4/5] merge with master; use new format for the example;

---
 .../mnist/lenet_local_train_test.prototxt     | 137 +++++++++++-------
 examples/mnist/train_lenet_local.sh           |   0
 include/caffe/layers/local_layer.hpp          |   4 +-
 src/caffe/layers/local_layer.cpp              |  10 +-
 src/caffe/layers/local_layer.cu               |  10 +-
 src/caffe/proto/caffe.proto                   |   2 +-
 src/caffe/test/test_local_layer.cpp           |   2 +-
 7 files changed, 106 insertions(+), 59 deletions(-)
 mode change 100644 => 100755 examples/mnist/train_lenet_local.sh

diff --git a/examples/mnist/lenet_local_train_test.prototxt b/examples/mnist/lenet_local_train_test.prototxt
index ff88b5f4919..81bb4787a58 100644
--- a/examples/mnist/lenet_local_train_test.prototxt
+++ b/examples/mnist/lenet_local_train_test.prototxt
@@ -1,42 +1,49 @@
 name: "LeNet"
-layers {
+layer {
   name: "mnist"
-  type: DATA
+  type: "Data"
   top: "data"
   top: "label"
-  data_param {
-    source: "examples/mnist/mnist_train_lmdb"
-    backend: LMDB
-    batch_size: 64
+  include {
+    phase: TRAIN
   }
   transform_param {
     scale: 0.00390625
   }
-  include: { phase: TRAIN }
+  data_param {
+    source: "examples/mnist/mnist_train_lmdb"
+    batch_size: 64
+    backend: LMDB
+  }
 }
-layers {
+layer {
   name: "mnist"
-  type: DATA
+  type: "Data"
   top: "data"
   top: "label"
-  data_param {
-    source: "examples/mnist/mnist_test_lmdb"
-    backend: LMDB
-    batch_size: 100
+  include {
+    phase: TEST
   }
   transform_param {
     scale: 0.00390625
   }
-  include: { phase: TEST }
+  data_param {
+    source: "examples/mnist/mnist_test_lmdb"
+    batch_size: 100
+    backend: LMDB
+  }
 }
-
-layers {
+layer {
   name: "conv1"
-  type: CONVOLUTION
+  type: "Convolution"
   bottom: "data"
   top: "conv1"
-  blobs_lr: 1
-  blobs_lr: 2
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
   convolution_param {
     num_output: 20
     kernel_size: 5
@@ -49,9 +56,9 @@ layers {
     }
   }
 }
-layers {
+layer {
   name: "pool1"
-  type: POOLING
+  type: "Pooling"
   bottom: "conv1"
   top: "pool1"
   pooling_param {
@@ -60,13 +67,17 @@ layers {
     stride: 2
   }
 }
-layers {
+layer {
   name: "local1"
-  type: LOCAL
+  type: "Local"
   bottom: "pool1"
   top: "local1"
-  blobs_lr: 1
-  blobs_lr: 1
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 1
+  }
   local_param {
     num_output: 5
     kernel_size: 5
@@ -81,19 +92,23 @@ layers {
     }
   }
 }
-layers {
+layer {
   name: "relu1"
-  type: RELU
+  type: "ReLU"
   bottom: "local1"
   top: "local1"
 }
-layers {
+layer {
   name: "local2"
-  type: LOCAL
+  type: "Local"
   bottom: "local1"
   top: "local2"
-  blobs_lr: 1
-  blobs_lr: 1
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 1
+  }
   local_param {
     num_output: 10
     kernel_size: 5
@@ -108,19 +123,35 @@ layers {
     }
   }
 }
-layers {
+layer {
   name: "relu2"
-  type: RELU
+  type: "ReLU"
   bottom: "local2"
   top: "local2"
 }
-layers {
-  name: "ip1"
-  type: INNER_PRODUCT
+
+layer {
+  name: "pool2"
+  type: "Pooling"
   bottom: "local2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool2"
   top: "ip1"
-  blobs_lr: 1
-  blobs_lr: 2
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
   inner_product_param {
     num_output: 500
     weight_filler {
@@ -131,19 +162,23 @@ layers {
     }
   }
 }
-layers {
-  name: "relu2"
-  type: RELU
+layer {
+  name: "relu3"
+  type: "ReLU"
   bottom: "ip1"
   top: "ip1"
 }
-layers {
+layer {
   name: "ip2"
-  type: INNER_PRODUCT
+  type: "InnerProduct"
   bottom: "ip1"
   top: "ip2"
-  blobs_lr: 1
-  blobs_lr: 2
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
   inner_product_param {
     num_output: 10
     weight_filler {
@@ -154,17 +189,19 @@ layers {
     }
   }
 }
-layers {
+layer {
   name: "accuracy"
-  type: ACCURACY
+  type: "Accuracy"
   bottom: "ip2"
   bottom: "label"
   top: "accuracy"
-  include: { phase: TEST }
+  include {
+    phase: TEST
+  }
 }
-layers {
+layer {
   name: "loss"
-  type: SOFTMAX_LOSS
+  type: "SoftmaxWithLoss"
   bottom: "ip2"
   bottom: "label"
   top: "loss"
diff --git a/examples/mnist/train_lenet_local.sh b/examples/mnist/train_lenet_local.sh
old mode 100644
new mode 100755
diff --git a/include/caffe/layers/local_layer.hpp b/include/caffe/layers/local_layer.hpp
index 80cd1244c02..e2e4d526772 100644
--- a/include/caffe/layers/local_layer.hpp
+++ b/include/caffe/layers/local_layer.hpp
@@ -13,7 +13,7 @@ template <typename Dtype>
 class LocalLayer : public Layer<Dtype> {
  public:
   explicit LocalLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
+      : Layer<Dtype>(param), dilation_(1) {}
 
   virtual inline const char* type() const { return "Local"; }
 
@@ -51,6 +51,8 @@ class LocalLayer : public Layer<Dtype> {
   int K_;
   int N_;
 
+  const int dilation_;
+
   Blob<Dtype> col_buffer_;
 };
 
diff --git a/src/caffe/layers/local_layer.cpp b/src/caffe/layers/local_layer.cpp
index c28ea065a5a..17e9c63de4a 100644
--- a/src/caffe/layers/local_layer.cpp
+++ b/src/caffe/layers/local_layer.cpp
@@ -2,6 +2,7 @@
 
 #include "caffe/filler.hpp"
 #include "caffe/layers/local_layer.hpp"
+#include "caffe/util/im2col.hpp"
 
 namespace caffe {
 
@@ -111,7 +112,8 @@ void LocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   for (int n = 0; n < num_; n++) {
     im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
         width_, kernel_size_, kernel_size_,
-        pad_, pad_, stride_, stride_, x_data);
+        pad_, pad_, stride_, stride_,
+        dilation_, dilation_, x_data);
 
     for (int m = 0; m < num_output_; m++) {
       caffe_mul(K_*N_, x_data, weight+this->blobs_[0]->offset(m),
@@ -165,7 +167,8 @@ void LocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   for (int n = 0; n < num_; n++) {
     im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
         width_, kernel_size_, kernel_size_,
-        pad_, pad_, stride_, stride_, x_data);
+        pad_, pad_, stride_, stride_,
+        dilation_, dilation_, x_data);
 
     // gradient wrt weight
     for (int m = 0; m < num_output_; m++) {
@@ -196,7 +199,8 @@ void LocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       // col2im back to the data
       col2im_cpu(x_diff, channels_, height_,
           width_, kernel_size_, kernel_size_,
-          pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
+          pad_, pad_, stride_, stride_,
+          dilation_, dilation_, bottom_diff + bottom[0]->offset(n));
     }
   }
 }
diff --git a/src/caffe/layers/local_layer.cu b/src/caffe/layers/local_layer.cu
index 817076e5cc0..396a4ed0325 100644
--- a/src/caffe/layers/local_layer.cu
+++ b/src/caffe/layers/local_layer.cu
@@ -2,6 +2,7 @@
 
 #include "caffe/filler.hpp"
 #include "caffe/layers/local_layer.hpp"
+#include "caffe/util/im2col.hpp"
 
 namespace caffe {
 
@@ -115,7 +116,8 @@ void LocalLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   for (int n = 0; n < num_; n++) {
     im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
         width_, kernel_size_, kernel_size_,
-        pad_, pad_, stride_, stride_, x_data);
+        pad_, pad_, stride_, stride_,
+        dilation_, dilation_, x_data);
 
     for (int m = 0; m < num_output_; m++) {
       caffe_gpu_mul(K_*N_, x_data, weight+this->blobs_[0]->offset(m),
@@ -171,7 +173,8 @@ void LocalLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   for (int n = 0; n < num_; n++) {
     im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
         width_, kernel_size_, kernel_size_,
-        pad_, pad_, stride_, stride_, x_data);
+        pad_, pad_, stride_, stride_,
+        dilation_, dilation_, x_data);
 
     local_update1_gpu(
         top_diff+top[0]->offset(n), x_data,
@@ -183,7 +186,8 @@ void LocalLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 
       // col2im back to the data
       col2im_gpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_,
-          pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
+          pad_, pad_, stride_, stride_,
+          dilation_, dilation_, bottom_diff + bottom[0]->offset(n));
     }
   }
 }
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index e2cee07e0f6..a1ca42a399b 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -391,7 +391,7 @@ message LayerParameter {
   optional ThresholdParameter threshold_param = 128;
   optional TileParameter tile_param = 138;
   optional WindowDataParameter window_data_param = 129;
-  optional LocalParameter local_param = 134;
+  optional LocalParameter local_param = 140;
 }
 
 // Message that stores parameters used to apply transformation
diff --git a/src/caffe/test/test_local_layer.cpp b/src/caffe/test/test_local_layer.cpp
index 1e16c117655..a1e3a8ea1f8 100644
--- a/src/caffe/test/test_local_layer.cpp
+++ b/src/caffe/test/test_local_layer.cpp
@@ -6,7 +6,7 @@
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
 #include "caffe/filler.hpp"
-#include "caffe/vision_layers.hpp"
+#include "caffe/layers/local_layer.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
 #include "caffe/test/test_gradient_check_util.hpp"

From 96b20185f77d17e81efead841a18aa509f9f7c4f Mon Sep 17 00:00:00 2001
From: Yuncheng Li <yli@cs.rochester.edu>
Date: Wed, 13 Jan 2016 18:32:10 -0500
Subject: [PATCH 5/5] remove unnecessary diff; upgrade the siamese examples

---
 examples/mnist/train_lenet.sh                 |   2 +-
 examples/mnist/train_lenet_local.sh           |   2 +-
 .../mnist_siamese_local_train_test.prototxt   | 400 +++++++++---------
 examples/siamese/train_mnist_siamese_local.sh |   2 +-
 4 files changed, 201 insertions(+), 205 deletions(-)
 mode change 100644 => 100755 examples/siamese/train_mnist_siamese_local.sh

diff --git a/examples/mnist/train_lenet.sh b/examples/mnist/train_lenet.sh
index 1f718825ded..1b6bf7d978d 100755
--- a/examples/mnist/train_lenet.sh
+++ b/examples/mnist/train_lenet.sh
@@ -1,3 +1,3 @@
 #!/usr/bin/env sh
 
-GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt
+./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt
diff --git a/examples/mnist/train_lenet_local.sh b/examples/mnist/train_lenet_local.sh
index b9e29e5ceb8..c9b969c55e1 100755
--- a/examples/mnist/train_lenet_local.sh
+++ b/examples/mnist/train_lenet_local.sh
@@ -1,3 +1,3 @@
 #!/usr/bin/env sh
 
-GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_local_solver.prototxt --gpu=1
+./build/tools/caffe train --solver=examples/mnist/lenet_local_solver.prototxt "$@"
diff --git a/examples/siamese/mnist_siamese_local_train_test.prototxt b/examples/siamese/mnist_siamese_local_train_test.prototxt
index 4212217df15..fc91301f457 100644
--- a/examples/siamese/mnist_siamese_local_train_test.prototxt
+++ b/examples/siamese/mnist_siamese_local_train_test.prototxt
@@ -1,50 +1,60 @@
 name: "mnist_siamese_train_test"
-layers {
+layer {
   name: "pair_data"
-  type: DATA
+  type: "Data"
   top: "pair_data"
   top: "sim"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    scale: 0.00390625
+  }
   data_param {
     source: "examples/siamese/mnist_siamese_train_leveldb"
-    scale: 0.00390625
     batch_size: 64
   }
-  include: { phase: TRAIN }
 }
-layers {
+layer {
   name: "pair_data"
-  type: DATA
+  type: "Data"
   top: "pair_data"
   top: "sim"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    scale: 0.00390625
+  }
   data_param {
     source: "examples/siamese/mnist_siamese_test_leveldb"
-    scale: 0.00390625
     batch_size: 100
   }
-  include: { phase: TEST }
 }
-layers {
-    name: "slice_pair"
-    type: SLICE
-    bottom: "pair_data"
-    top: "data"
-    top: "data_p"
-    slice_param {
-        slice_dim: 1
-        slice_point: 1
-    }
+layer {
+  name: "slice_pair"
+  type: "Slice"
+  bottom: "pair_data"
+  top: "data"
+  top: "data_p"
+  slice_param {
+    slice_dim: 1
+    slice_point: 1
+  }
 }
-
-
-
-
-layers {
+layer {
   name: "conv1"
-  type: CONVOLUTION
+  type: "Convolution"
   bottom: "data"
   top: "conv1"
-  blobs_lr: 1
-  blobs_lr: 2
+  param {
+    name: "conv1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv1_b"
+    lr_mult: 2
+  }
   convolution_param {
     num_output: 20
     kernel_size: 5
@@ -56,12 +66,10 @@ layers {
       type: "constant"
     }
   }
-  param: "conv1_w"
-  param: "conv1_b"
 }
-layers {
+layer {
   name: "pool1"
-  type: POOLING
+  type: "Pooling"
   bottom: "conv1"
   top: "pool1"
   pooling_param {
@@ -70,15 +78,11 @@ layers {
     stride: 2
   }
 }
-
-
-layers {
+layer {
   name: "local1"
-  type: LOCAL
+  type: "Local"
   bottom: "pool1"
   top: "local1"
-  blobs_lr: 1
-  blobs_lr: 2
   local_param {
     num_output: 5
     kernel_size: 5
@@ -92,22 +96,26 @@ layers {
       value: 0.1
     }
   }
-  param: "local1_w"
-  param: "local1_b"
+  param {
+    name: "local1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "local1_b"
+    lr_mult: 2
+  }
 }
-layers {
+layer {
   name: "relu1"
-  type: RELU
+  type: "ReLU"
   bottom: "local1"
   top: "local1"
 }
-layers {
+layer {
   name: "local2"
-  type: LOCAL
+  type: "Local"
   bottom: "local1"
   top: "local2"
-  blobs_lr: 1
-  blobs_lr: 2
   local_param {
     num_output: 10
     kernel_size: 5
@@ -121,29 +129,45 @@ layers {
       value: 0.1
     }
   }
-  param: "local2_w"
-  param: "local2_b"
+  param {
+    name: "local2_w"
+    lr_mult: 1
+  }
+  param {
+    name: "local2_b"
+    lr_mult: 2
+  }
 }
-layers {
+layer {
   name: "relu2"
-  type: RELU
+  type: "ReLU"
   bottom: "local2"
   top: "local2"
 }
-
-
-
-
-
-
-
-layers {
-  name: "ip1"
-  type: INNER_PRODUCT
+layer {
+  name: "pool2"
+  type: "Pooling"
   bottom: "local2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool2"
   top: "ip1"
-  blobs_lr: 1
-  blobs_lr: 2
+  param {
+    name: "ip1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "ip1_b"
+    lr_mult: 2
+  }
   inner_product_param {
     num_output: 500
     weight_filler {
@@ -153,22 +177,26 @@ layers {
       type: "constant"
     }
   }
-  param: "ip1_w"
-  param: "ip1_b"
 }
-layers {
+layer {
   name: "relu1"
-  type: RELU
+  type: "ReLU"
   bottom: "ip1"
   top: "ip1"
 }
-layers {
+layer {
   name: "ip2"
-  type: INNER_PRODUCT
+  type: "InnerProduct"
   bottom: "ip1"
   top: "ip2"
-  blobs_lr: 1
-  blobs_lr: 2
+  param {
+    name: "ip2_w"
+    lr_mult: 1
+  }
+  param {
+    name: "ip2_b"
+    lr_mult: 2
+  }
   inner_product_param {
     num_output: 10
     weight_filler {
@@ -178,37 +206,20 @@ layers {
       type: "constant"
     }
   }
-  param: "ip2_w"
-  param: "ip2_b"
 }
-
-layers {
-  name: "feat2"
-  type: INNER_PRODUCT
+layer {
+  name: "feat"
+  type: "InnerProduct"
   bottom: "ip2"
-  top: "feat2"
-  blobs_lr: 1
-  blobs_lr: 2
-  inner_product_param {
-    num_output: 2
-    weight_filler {
-      type: "xavier"
-    }
-    bias_filler {
-      type: "constant"
-    }
+  top: "feat"
+  param {
+    name: "feat_w"
+    lr_mult: 1
+  }
+  param {
+    name: "feat_b"
+    lr_mult: 2
   }
-  param: "feat2_w"
-  param: "feat2_b"
-}
-
-layers {
-  name: "feat1"
-  type: INNER_PRODUCT
-  bottom: "local1"
-  top: "feat1"
-  blobs_lr: 1
-  blobs_lr: 2
   inner_product_param {
     num_output: 2
     weight_filler {
@@ -218,19 +229,20 @@ layers {
       type: "constant"
     }
   }
-  param: "feat1_w"
-  param: "feat1_b"
 }
-
-
-
-layers {
+layer {
   name: "conv1_p"
-  type: CONVOLUTION
+  type: "Convolution"
   bottom: "data_p"
   top: "conv1_p"
-  blobs_lr: 1
-  blobs_lr: 2
+  param {
+    name: "conv1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv1_b"
+    lr_mult: 2
+  }
   convolution_param {
     num_output: 20
     kernel_size: 5
@@ -242,12 +254,10 @@ layers {
       type: "constant"
     }
   }
-  param: "conv1_w"
-  param: "conv1_b"
 }
-layers {
+layer {
   name: "pool1_p"
-  type: POOLING
+  type: "Pooling"
   bottom: "conv1_p"
   top: "pool1_p"
   pooling_param {
@@ -256,15 +266,11 @@ layers {
     stride: 2
   }
 }
-
-
-layers {
+layer {
   name: "local1_p"
-  type: LOCAL
+  type: "Local"
   bottom: "pool1_p"
   top: "local1_p"
-  blobs_lr: 1
-  blobs_lr: 2
   local_param {
     num_output: 5
     kernel_size: 5
@@ -278,22 +284,26 @@ layers {
       value: 0.1
     }
   }
-  param: "local1_w"
-  param: "local1_b"
+  param {
+    name: "local1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "local1_b"
+    lr_mult: 2
+  }
 }
-layers {
+layer {
   name: "relu1_p"
-  type: RELU
+  type: "ReLU"
   bottom: "local1_p"
   top: "local1_p"
 }
-layers {
+layer {
   name: "local2_p"
-  type: LOCAL
+  type: "Local"
   bottom: "local1_p"
   top: "local2_p"
-  blobs_lr: 1
-  blobs_lr: 2
   local_param {
     num_output: 10
     kernel_size: 5
@@ -307,27 +317,45 @@ layers {
       value: 0.1
     }
   }
-  param: "local2_w"
-  param: "local2_b"
+  param {
+    name: "local2_w"
+    lr_mult: 1
+  }
+  param {
+    name: "local2_b"
+    lr_mult: 2
+  }
 }
-layers {
+layer {
   name: "relu2_p"
-  type: RELU
+  type: "ReLU"
   bottom: "local2_p"
   top: "local2_p"
 }
-
-
-
-
-
-layers {
-  name: "ip1_p"
-  type: INNER_PRODUCT
+layer {
+  name: "pool2_p"
+  type: "Pooling"
   bottom: "local2_p"
+  top: "pool2_p"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "ip1_p"
+  type: "InnerProduct"
+  bottom: "pool2_p"
   top: "ip1_p"
-  blobs_lr: 1
-  blobs_lr: 2
+  param {
+    name: "ip1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "ip1_b"
+    lr_mult: 2
+  }
   inner_product_param {
     num_output: 500
     weight_filler {
@@ -337,22 +365,26 @@ layers {
       type: "constant"
     }
   }
-  param: "ip1_w"
-  param: "ip1_b"
 }
-layers {
+layer {
   name: "relu1_p"
-  type: RELU
+  type: "ReLU"
   bottom: "ip1_p"
   top: "ip1_p"
 }
-layers {
+layer {
   name: "ip2_p"
-  type: INNER_PRODUCT
+  type: "InnerProduct"
   bottom: "ip1_p"
   top: "ip2_p"
-  blobs_lr: 1
-  blobs_lr: 2
+  param {
+    name: "ip2_w"
+    lr_mult: 1
+  }
+  param {
+    name: "ip2_b"
+    lr_mult: 2
+  }
   inner_product_param {
     num_output: 10
     weight_filler {
@@ -362,38 +394,20 @@ layers {
       type: "constant"
     }
   }
-  param: "ip2_w"
-  param: "ip2_b"
 }
-
-layers {
-  name: "feat2_p"
-  type: INNER_PRODUCT
+layer {
+  name: "feat_p"
+  type: "InnerProduct"
   bottom: "ip2_p"
-  top: "feat2_p"
-  blobs_lr: 1
-  blobs_lr: 2
-  inner_product_param {
-    num_output: 2
-    weight_filler {
-      type: "xavier"
-    }
-    bias_filler {
-      type: "constant"
-    }
+  top: "feat_p"
+  param {
+    name: "feat_w"
+    lr_mult: 1
+  }
+  param {
+    name: "feat_b"
+    lr_mult: 2
   }
-  param: "feat2_w"
-  param: "feat2_b"
-}
-
-
-layers {
-  name: "feat1_p"
-  type: INNER_PRODUCT
-  bottom: "local1_p"
-  top: "feat1_p"
-  blobs_lr: 1
-  blobs_lr: 2
   inner_product_param {
     num_output: 2
     weight_filler {
@@ -403,33 +417,15 @@ layers {
       type: "constant"
     }
   }
-  param: "feat1_w"
-  param: "feat1_b"
 }
-
-
-layers {
-    name: "loss2"
-    type: CONTRASTIVE_LOSS
-    contrastive_loss_param {
-        margin: 1.0
-    }
-    bottom: "feat2"
-    bottom: "feat2_p"
-    bottom: "sim"
-    top: "loss2"
-}
-
-
-layers {
-    name: "loss1"
-    type: CONTRASTIVE_LOSS
-    contrastive_loss_param {
-        margin: 1.0
-    }
-    bottom: "feat1"
-    bottom: "feat1_p"
-    bottom: "sim"
-    top: "loss1"
+layer {
+  name: "loss"
+  type: "ContrastiveLoss"
+  bottom: "feat"
+  bottom: "feat_p"
+  bottom: "sim"
+  top: "loss"
+  contrastive_loss_param {
+    margin: 1
+  }
 }
-
diff --git a/examples/siamese/train_mnist_siamese_local.sh b/examples/siamese/train_mnist_siamese_local.sh
old mode 100644
new mode 100755
index 83035bf2ccc..aa84bcb718a
--- a/examples/siamese/train_mnist_siamese_local.sh
+++ b/examples/siamese/train_mnist_siamese_local.sh
@@ -2,4 +2,4 @@
 
 TOOLS=./build/tools
 
-GLOG_logtostderr=0 GLOG_log_dir=examples/siamese/ $TOOLS/caffe train --solver=examples/siamese/mnist_siamese_local_solver.prototxt
+$TOOLS/caffe train --solver=examples/siamese/mnist_siamese_local_solver.prototxt