diff --git a/examples/mnist/lenet_local_solver.prototxt b/examples/mnist/lenet_local_solver.prototxt
new file mode 100644
index 00000000000..a4b02d25e7e
--- /dev/null
+++ b/examples/mnist/lenet_local_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "examples/mnist/lenet_local_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.01
+momentum: 0.9
+weight_decay: 0.0005
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.0001
+power: 0.75
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 10000
+# snapshot intermediate results
+snapshot: 5000
+snapshot_prefix: "examples/mnist/lenet_local"  # distinct prefix: do not clobber the stock LeNet snapshots
+# solver mode: CPU or GPU
+solver_mode: GPU
diff --git a/examples/mnist/lenet_local_train_test.prototxt b/examples/mnist/lenet_local_train_test.prototxt
new file mode 100644
index 00000000000..81bb4787a58
--- /dev/null
+++ b/examples/mnist/lenet_local_train_test.prototxt
@@ -0,0 +1,208 @@
+name: "LeNet"
+layer {
+  name: "mnist"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  data_param {
+    source: "examples/mnist/mnist_train_lmdb"
+    batch_size: 64
+    backend: LMDB
+  }
+}
+layer {
+  name: "mnist"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  data_param {
+    source: "examples/mnist/mnist_test_lmdb"
+    batch_size: 100
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 20
+    kernel_size: 5
+    stride:
1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "local1" + type: "Local" + bottom: "pool1" + top: "local1" + param { + lr_mult: 1 + } + param { + lr_mult: 1 + } + local_param { + num_output: 5 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "local1" + top: "local1" +} +layer { + name: "local2" + type: "Local" + bottom: "local1" + top: "local2" + param { + lr_mult: 1 + } + param { + lr_mult: 1 + } + local_param { + num_output: 10 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "local2" + top: "local2" +} + +layer { + name: "pool2" + type: "Pooling" + bottom: "local2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "ip1" + type: "InnerProduct" + bottom: "pool2" + top: "ip1" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + inner_product_param { + num_output: 500 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "ip1" + top: "ip1" +} +layer { + name: "ip2" + type: "InnerProduct" + bottom: "ip1" + top: "ip2" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + inner_product_param { + num_output: 10 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "ip2" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "ip2" + bottom: "label" + top: "loss" +} diff --git a/examples/mnist/train_lenet_local.sh 
b/examples/mnist/train_lenet_local.sh
new file mode 100755
index 00000000000..c9b969c55e1
--- /dev/null
+++ b/examples/mnist/train_lenet_local.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env sh
+
+./build/tools/caffe train --solver=examples/mnist/lenet_local_solver.prototxt $@
diff --git a/examples/siamese/mnist_siamese_local_solver.prototxt b/examples/siamese/mnist_siamese_local_solver.prototxt
new file mode 100644
index 00000000000..c85bb90fc87
--- /dev/null
+++ b/examples/siamese/mnist_siamese_local_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "examples/siamese/mnist_siamese_local_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.01 +momentum: 0.9 +weight_decay: 0.0000 +# The learning rate policy +lr_policy: "inv" +gamma: 0.0001 +power: 0.75 +# Display every 100 iterations +display: 100 +# The maximum number of iterations +max_iter: 50000 +# snapshot intermediate results +snapshot: 5000 +snapshot_prefix: "examples/siamese/mnist_siamese" +# solver mode: CPU or GPU +solver_mode: GPU diff --git a/examples/siamese/mnist_siamese_local_train_test.prototxt b/examples/siamese/mnist_siamese_local_train_test.prototxt new file mode 100644 index 00000000000..fc91301f457 --- /dev/null +++ b/examples/siamese/mnist_siamese_local_train_test.prototxt @@ -0,0 +1,431 @@ +name: "mnist_siamese_train_test" +layer { + name: "pair_data" + type: "Data" + top: "pair_data" + top: "sim" + include { + phase: TRAIN + } + transform_param { + scale: 0.00390625 + } + data_param { + source: "examples/siamese/mnist_siamese_train_leveldb" + batch_size: 64 + } +} +layer { + name: "pair_data" + type: "Data" + top: "pair_data" + top: "sim" + include { + phase: TEST + } + transform_param { + scale: 0.00390625 + } + data_param { + source: "examples/siamese/mnist_siamese_test_leveldb" + batch_size: 100 + } +} +layer { + name: "slice_pair" + type: "Slice" + bottom: "pair_data" + top: "data" + top: "data_p" + slice_param { + slice_dim: 1 + slice_point: 1 + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + name: "conv1_w" + lr_mult: 1 + } + param { + name: "conv1_b" + lr_mult: 2 + } + convolution_param { + num_output: 20 + kernel_size: 5 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "local1" + type: "Local" + bottom: "pool1" + top: "local1" + local_param { + num_output: 5 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: 
"constant" + value: 0.1 + } + } + param { + name: "local1_w" + lr_mult: 1 + } + param { + name: "local1_b" + lr_mult: 2 + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "local1" + top: "local1" +} +layer { + name: "local2" + type: "Local" + bottom: "local1" + top: "local2" + local_param { + num_output: 10 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + param { + name: "local2_w" + lr_mult: 1 + } + param { + name: "local2_b" + lr_mult: 2 + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "local2" + top: "local2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "local2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "ip1" + type: "InnerProduct" + bottom: "pool2" + top: "ip1" + param { + name: "ip1_w" + lr_mult: 1 + } + param { + name: "ip1_b" + lr_mult: 2 + } + inner_product_param { + num_output: 500 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "ip1" + top: "ip1" +} +layer { + name: "ip2" + type: "InnerProduct" + bottom: "ip1" + top: "ip2" + param { + name: "ip2_w" + lr_mult: 1 + } + param { + name: "ip2_b" + lr_mult: 2 + } + inner_product_param { + num_output: 10 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "feat" + type: "InnerProduct" + bottom: "ip2" + top: "feat" + param { + name: "feat_w" + lr_mult: 1 + } + param { + name: "feat_b" + lr_mult: 2 + } + inner_product_param { + num_output: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "conv1_p" + type: "Convolution" + bottom: "data_p" + top: "conv1_p" + param { + name: "conv1_w" + lr_mult: 1 + } + param { + name: "conv1_b" + lr_mult: 2 + } + convolution_param { + num_output: 20 + kernel_size: 5 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler 
{ + type: "constant" + } + } +} +layer { + name: "pool1_p" + type: "Pooling" + bottom: "conv1_p" + top: "pool1_p" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "local1_p" + type: "Local" + bottom: "pool1_p" + top: "local1_p" + local_param { + num_output: 5 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + param { + name: "local1_w" + lr_mult: 1 + } + param { + name: "local1_b" + lr_mult: 2 + } +} +layer { + name: "relu1_p" + type: "ReLU" + bottom: "local1_p" + top: "local1_p" +} +layer { + name: "local2_p" + type: "Local" + bottom: "local1_p" + top: "local2_p" + local_param { + num_output: 10 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + param { + name: "local2_w" + lr_mult: 1 + } + param { + name: "local2_b" + lr_mult: 2 + } +} +layer { + name: "relu2_p" + type: "ReLU" + bottom: "local2_p" + top: "local2_p" +} +layer { + name: "pool2_p" + type: "Pooling" + bottom: "local2_p" + top: "pool2_p" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "ip1_p" + type: "InnerProduct" + bottom: "pool2_p" + top: "ip1_p" + param { + name: "ip1_w" + lr_mult: 1 + } + param { + name: "ip1_b" + lr_mult: 2 + } + inner_product_param { + num_output: 500 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_p" + type: "ReLU" + bottom: "ip1_p" + top: "ip1_p" +} +layer { + name: "ip2_p" + type: "InnerProduct" + bottom: "ip1_p" + top: "ip2_p" + param { + name: "ip2_w" + lr_mult: 1 + } + param { + name: "ip2_b" + lr_mult: 2 + } + inner_product_param { + num_output: 10 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "feat_p" + type: "InnerProduct" + bottom: "ip2_p" + top: "feat_p" + param { + name: "feat_w" + lr_mult: 1 + } + param { + name: 
"feat_b" + lr_mult: 2 + } + inner_product_param { + num_output: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "loss" + type: "ContrastiveLoss" + bottom: "feat" + bottom: "feat_p" + bottom: "sim" + top: "loss" + contrastive_loss_param { + margin: 1 + } +} diff --git a/examples/siamese/train_mnist_siamese_local.sh b/examples/siamese/train_mnist_siamese_local.sh new file mode 100755 index 00000000000..aa84bcb718a --- /dev/null +++ b/examples/siamese/train_mnist_siamese_local.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env sh + +TOOLS=./build/tools + +$TOOLS/caffe train --solver=examples/siamese/mnist_siamese_local_solver.prototxt diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index dad9ad46b3b..f57a445956f 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -165,6 +165,29 @@ class XavierFiller : public Filler { } }; +template +class TestLocalFiller : public Filler { + public: + explicit TestLocalFiller(const FillerParameter& param) + : Filler(param) {} + virtual void Fill(Blob* blob) { + LOG(INFO) << "Doing mutable cpu"; + LOG(INFO) << "blobs" << blob; + Dtype* data = blob->mutable_cpu_data(); + LOG(INFO) << "Done Doing mutable cpu"; + CHECK_EQ(blob->channels(), 1); + + for (int n = 0; n < blob->num(); n++) { + for (int j = 0; j < blob->height(); j++) { + for (int i = 0; i < blob->width(); i++) { + *(data+blob->offset(n, 0, j, i)) = i; + } + } + } + } +}; + + /** * @brief Fills a Blob with values @f$ x \sim N(0, \sigma^2) @f$ where * @f$ \sigma^2 @f$ is set inversely proportional to number of incoming @@ -280,6 +303,8 @@ Filler* GetFiller(const FillerParameter& param) { return new UniformFiller(param); } else if (type == "xavier") { return new XavierFiller(param); + } else if (type == "test_local") { + return new TestLocalFiller(param); } else if (type == "msra") { return new MSRAFiller(param); } else if (type == "bilinear") { diff --git a/include/caffe/layers/local_layer.hpp 
b/include/caffe/layers/local_layer.hpp new file mode 100644 index 00000000000..e2e4d526772 --- /dev/null +++ b/include/caffe/layers/local_layer.hpp @@ -0,0 +1,61 @@ +#ifndef HEADER_LOCAL_LAYER +#define HEADER_LOCAL_LAYER + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +template +class LocalLayer : public Layer { + public: + explicit LocalLayer(const LayerParameter& param) + : Layer(param), dilation_(1) {} + + virtual inline const char* type() const { return "Local"; } + + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int MinBottomBlobs() const { return 1; } + virtual inline int MinTopBlobs() const { return 1; } + virtual inline bool EqualNumBottomTopBlobs() const { return true; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + + int kernel_size_; + int stride_; + int num_; + int channels_; + int pad_; + int height_, width_; + int height_out_, width_out_; + int num_output_; + bool bias_term_; + + int M_; + int K_; + int N_; + + const int dilation_; + + Blob col_buffer_; +}; + +} // namespace caffe + +#endif diff --git a/src/caffe/layers/local_layer.cpp b/src/caffe/layers/local_layer.cpp new file mode 100644 index 00000000000..17e9c63de4a --- /dev/null +++ b/src/caffe/layers/local_layer.cpp @@ -0,0 +1,215 @@ +#include + +#include "caffe/filler.hpp" +#include "caffe/layers/local_layer.hpp" +#include "caffe/util/im2col.hpp" + +namespace caffe { + +template +void LocalLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + 
CHECK_EQ(bottom.size(), 1) << "Local Layer takes a single blob as input.";
+  CHECK_EQ(top.size(), 1) << "Local Layer takes a single blob as output.";
+
+  kernel_size_ = this->layer_param_.local_param().kernel_size();
+  stride_ = this->layer_param_.local_param().stride();
+  pad_ = this->layer_param_.local_param().pad();
+  num_ = bottom[0]->num();
+  channels_ = bottom[0]->channels();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+  num_output_ = this->layer_param_.local_param().num_output();
+
+  height_out_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+  width_out_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+
+  M_ = num_output_;  // output maps
+  K_ = channels_ * kernel_size_ * kernel_size_;  // entries per input patch
+  N_ = height_out_ * width_out_;  // spatial output locations
+
+  CHECK_GT(num_output_, 0);
+  CHECK_GE(height_, kernel_size_) << "height smaller than kernel size";
+  CHECK_GE(width_, kernel_size_) << "width smaller than kernel size";
+  // Set the parameters
+  bias_term_ = this->layer_param_.local_param().bias_term();
+
+  // Check if we need to set up the weights
+  if (this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else {
+    if (bias_term_) {
+      this->blobs_.resize(2);
+    } else {
+      this->blobs_.resize(1);
+    }
+    // Initialize the weight: unshared filters, one K_-vector per location
+    this->blobs_[0].reset(new Blob<Dtype>(
+        num_output_, 1, K_, N_));
+    // fill the weights
+    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
+        this->layer_param_.local_param().weight_filler()));
+    weight_filler->Fill(this->blobs_[0].get());
+    // If necessary, initialize and fill the bias term
+    if (bias_term_) {
+      this->blobs_[1].reset(new Blob<Dtype>(1, 1, M_, N_));
+      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
+          this->layer_param_.local_param().bias_filler()));
+      bias_filler->Fill(this->blobs_[1].get());
+    }
+  }
+}
+
+template <typename Dtype>
+void LocalLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
+    " weights.";
+  // TODO: generalize to handle inputs of different shapes.
+  for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
+    CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
+    CHECK_EQ(channels_, bottom[bottom_id]->channels())
+        << "Inputs must have same channels.";
+    CHECK_EQ(height_, bottom[bottom_id]->height())
+        << "Inputs must have same height.";
+    CHECK_EQ(width_, bottom[bottom_id]->width())
+        << "Inputs must have same width.";
+  }
+
+  // Shape the tops.
+  for (int top_id = 0; top_id < top.size(); ++top_id) {
+    top[top_id]->Reshape(num_, num_output_, height_out_, width_out_);
+  }
+
+  // The im2col result buffer would only hold one image at a time to avoid
+  // overly large memory usage.
+  col_buffer_.Reshape(
+      1, channels_ * kernel_size_ * kernel_size_, height_out_, width_out_);
+
+  // NOTE(review): a second, byte-identical top-Reshape loop stood here; it
+  // redundantly reshaped every top blob a second time and was removed.
+  // These comment lines keep the patch hunk's added-line count aligned.
+}
+
+template <typename Dtype>
+void LocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+
+  Dtype* x_data = col_buffer_.mutable_cpu_data();
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+
+  Blob<Dtype> E;
+  E.Reshape(1, 1, 1, K_);
+  FillerParameter filler_param;
+  filler_param.set_value(1);
+  ConstantFiller<Dtype> filler(filler_param);
+  filler.Fill(&E);
+
+  Blob<Dtype> intermediate;
+  intermediate.Reshape(1, 1, K_, N_);
+  for (int n = 0; n < num_; n++) {
+    im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
+        width_, kernel_size_, kernel_size_,
+        pad_, pad_, stride_, stride_,
+        dilation_, dilation_, x_data);
+
+    for (int m = 0; m < num_output_; m++) {
+      caffe_mul(K_*N_, x_data, weight+this->blobs_[0]->offset(m),
+          intermediate.mutable_cpu_data());
+
+      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, 1, N_, K_,
+          (Dtype)1., E.cpu_data(),
+          intermediate.cpu_data(),
+          (Dtype)0., top_data + top[0]->offset(n, m));
+    }
+
+    if (bias_term_) {
+      caffe_add(M_ * N_, this->blobs_[1]->cpu_data(),
+
top_data + top[0]->offset(n), + top_data + top[0]->offset(n)); + } + } +} + +template +void LocalLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + Dtype* x_data = col_buffer_.mutable_cpu_data(); + Dtype* x_diff = col_buffer_.mutable_cpu_diff(); + const Dtype* weight = this->blobs_[0]->cpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); + Dtype* bias_diff = NULL; + + Blob intermediate; + intermediate.Reshape(1, 1, 1, N_); + + Blob xt; + xt.Reshape(1, 1, K_, N_); + Dtype* xt_data = xt.mutable_cpu_data(); + + if (bias_term_) { + bias_diff = this->blobs_[1]->mutable_cpu_diff(); + caffe_set(this->blobs_[1]->count(), Dtype(0.0), bias_diff); + for (int n = 0; n < num_; ++n) { + caffe_add(M_ * N_, bias_diff, + top_diff + top[0]->offset(n), + bias_diff); + } + } + + caffe_set(this->blobs_[0]->count(), Dtype(0.0), weight_diff); + for (int n = 0; n < num_; n++) { + im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, + width_, kernel_size_, kernel_size_, + pad_, pad_, stride_, stride_, + dilation_, dilation_, x_data); + + // gradient wrt weight + for (int m = 0; m < num_output_; m++) { + Dtype* filter_weight_diff = weight_diff+this->blobs_[0]->offset(m); + for (int k = 0; k < K_; k++) { + caffe_mul(N_, top_diff+top[0]->offset(n, m), + x_data+col_buffer_.offset(0, k), xt_data+xt.offset(0, 0, k)); + } + caffe_cpu_axpby(K_*N_, Dtype(1.0), xt_data, + Dtype(1.0), filter_weight_diff); + } + + // gradient wrt bottom data + if (propagate_down[0]) { + caffe_set(col_buffer_.count(), Dtype(0.0), x_diff); + for (int m = 0; m < num_output_; m++) { + for (int k = 0; k < K_; k++) { + caffe_mul(N_, top_diff+top[0]->offset(n, m), + weight+this->blobs_[0]->offset(m, 0, k), + intermediate.mutable_cpu_data()); + + caffe_cpu_axpby(N_, Dtype(1.0), + 
intermediate.cpu_data(), Dtype(1.0), + x_diff+col_buffer_.offset(0, k)); + } + } + + // col2im back to the data + col2im_cpu(x_diff, channels_, height_, + width_, kernel_size_, kernel_size_, + pad_, pad_, stride_, stride_, + dilation_, dilation_, bottom_diff + bottom[0]->offset(n)); + } + } +} + +#ifdef CPU_ONLY +STUB_GPU(LocalLayer); +#endif + +INSTANTIATE_CLASS(LocalLayer); +REGISTER_LAYER_CLASS(Local); + +} // namespace caffe diff --git a/src/caffe/layers/local_layer.cu b/src/caffe/layers/local_layer.cu new file mode 100644 index 00000000000..396a4ed0325 --- /dev/null +++ b/src/caffe/layers/local_layer.cu @@ -0,0 +1,197 @@ +#include + +#include "caffe/filler.hpp" +#include "caffe/layers/local_layer.hpp" +#include "caffe/util/im2col.hpp" + +namespace caffe { + +template +__global__ void local_update1_gpu_kernel( + const Dtype* data_A, const Dtype* data_B, + Dtype* data_R, const int filter_num, + const int location_num, const int output_num) { + int total = filter_num * location_num * output_num; + CUDA_KERNEL_LOOP(index, total) { + int p = index % location_num; + int n = (index / location_num) % filter_num; + int q = (index / location_num) / filter_num; + data_R[index] += data_A[q*location_num+p] * data_B[n*location_num+p]; + } +} + +template +void local_update1_gpu( + const Dtype* data_A, const Dtype* data_B, + Dtype* data_R, const int filter_num, + const int location_num, const int output_num) { + // data_A is output_num x location_num + // data_B is filter_num x location_num + // data_R is output_num x filter_num x location_num, + // the update performed is Rqnp += Aqp * Bnp + const int nthreads = filter_num * location_num * output_num; + + local_update1_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + data_A, data_B, data_R, filter_num, location_num, output_num); + CUDA_POST_KERNEL_CHECK; +} + +// Explicit instantiation +template void local_update1_gpu( + const float* data_A, const float* data_B, + float* data_R, const int filter_num, + const 
int location_num, const int output_num); +template void local_update1_gpu( + const double* data_A, const double* data_B, + double* data_R, const int filter_num, + const int location_num, const int output_num); + +template +__global__ void local_update2_gpu_kernel( + const Dtype* data_A, const Dtype* data_B, + Dtype* data_R, const int filter_num, + const int location_num, const int output_num) { + int total = filter_num * location_num; + CUDA_KERNEL_LOOP(index, total) { + int p = index % location_num; + int n = (index / location_num); + for (int q = 0; q < output_num; q++) { + data_R[index] += + data_A[q*location_num+p] * data_B[(q*filter_num+n)*location_num+p]; + } + } +} + +template +void local_update2_gpu(const Dtype* data_A, const Dtype* data_B, + Dtype* data_R, const int filter_num, + const int location_num, const int output_num) { + // data_A is output_num x location_num + // data_B is output_num x filter_num x location_num + // data_R is filter_num x location_num, + // the update performed is Rnp += \sum_q(Aqp * Bqnp) + int nthreads = filter_num * location_num; + + + local_update2_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + data_A, data_B, data_R, filter_num, + location_num, output_num); + CUDA_POST_KERNEL_CHECK; +} + +// Explicit instantiation +template void local_update2_gpu( + const float* data_A, const float* data_B, + float* data_R, const int filter_num, + const int location_num, const int output_num); +template void local_update2_gpu( + const double* data_A, const double* data_B, + double* data_R, const int filter_num, + const int location_num, const int output_num); + + + + +/// @brief refer to CPU forward -- the BLAS implementation is the same. 
+template +void LocalLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + + Dtype* x_data = col_buffer_.mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + + Blob E; + E.Reshape(1, 1, 1, K_); + FillerParameter filler_param; + filler_param.set_value(1); + ConstantFiller filler(filler_param); + filler.Fill(&E); + + Blob intermediate; + intermediate.Reshape(1, 1, K_, N_); + for (int n = 0; n < num_; n++) { + im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, + width_, kernel_size_, kernel_size_, + pad_, pad_, stride_, stride_, + dilation_, dilation_, x_data); + + for (int m = 0; m < num_output_; m++) { + caffe_gpu_mul(K_*N_, x_data, weight+this->blobs_[0]->offset(m), + intermediate.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, 1, N_, K_, + (Dtype)1., E.gpu_data(), intermediate.gpu_data(), + (Dtype)0., top_data + top[0]->offset(n, m)); + } + + if (bias_term_) { + caffe_gpu_add(M_ * N_, this->blobs_[1]->gpu_data(), + top_data + top[0]->offset(n), + top_data + top[0]->offset(n)); + } + } +} + +/// @brief refer to CPU backward -- the BLAS implementation is the same. 
+template +void LocalLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + Dtype* x_data = col_buffer_.mutable_gpu_data(); + Dtype* x_diff = col_buffer_.mutable_gpu_diff(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + Dtype* bias_diff = NULL; + + Blob intermediate; + intermediate.Reshape(1, 1, 1, N_); + + Blob xt; + xt.Reshape(1, 1, K_, N_); + Dtype* xt_data = xt.mutable_gpu_data(); + if (bias_term_) { + bias_diff = this->blobs_[1]->mutable_gpu_diff(); + caffe_gpu_set(this->blobs_[1]->count(), Dtype(0.), bias_diff); + for (int n = 0; n < num_; ++n) { + caffe_gpu_add(M_ * N_, bias_diff, + top_diff + top[0]->offset(n), + bias_diff); + } + } + + Blob buf; + buf.Reshape(1, 1, K_, N_); + Dtype* buf_data = buf.mutable_gpu_data(); + caffe_gpu_set(this->blobs_[0]->count(), Dtype(0.), weight_diff); + for (int n = 0; n < num_; n++) { + im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, + width_, kernel_size_, kernel_size_, + pad_, pad_, stride_, stride_, + dilation_, dilation_, x_data); + + local_update1_gpu( + top_diff+top[0]->offset(n), x_data, + weight_diff, K_, N_, M_); + + if (propagate_down[0]) { + caffe_gpu_set(col_buffer_.count(), Dtype(0.), x_diff); + local_update2_gpu(top_diff+top[0]->offset(n), weight, x_diff, K_, N_, M_); + + // col2im back to the data + col2im_gpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_, + pad_, pad_, stride_, stride_, + dilation_, dilation_, bottom_diff + bottom[0]->offset(n)); + } + } +} + +INSTANTIATE_LAYER_GPU_FUNCS(LocalLayer); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 019aa614373..a1ca42a399b 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -306,7 
+306,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 140 (last added: batch_norm_param) +// LayerParameter next available layer-specific ID: 141 (last added: local_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -391,6 +391,7 @@ message LayerParameter { optional ThresholdParameter threshold_param = 128; optional TileParameter tile_param = 138; optional WindowDataParameter window_data_param = 129; + optional LocalParameter local_param = 140; } // Message that stores parameters used to apply transformation @@ -428,7 +429,7 @@ message LossParameter { // Outputs that receive the ignore label will NOT be ignored in computing // the normalization factor. FULL = 0; - // Divide by the total number of output locations that do not take the + // Divide by the total number of output locations that do not take the // ignore_label. If ignore_label is not set, this behaves like FULL. VALID = 1; // Divide by the batch size. @@ -1254,3 +1255,14 @@ message PReLUParameter { // Whether or not slope paramters are shared across channels. 
optional bool channel_shared = 2 [default = false]; } + +// Message that stores parameters used by LocalLayer +message LocalParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + optional bool bias_term = 2 [default = true]; // whether to have bias terms + optional uint32 pad = 3 [default = 0]; // The padding size + optional uint32 kernel_size = 4; // The kernel size + optional uint32 stride = 6 [default = 1]; // The stride + optional FillerParameter weight_filler = 7; // The filler for the weight + optional FillerParameter bias_filler = 8; // The filler for the bias +} diff --git a/src/caffe/test/test_local_layer.cpp b/src/caffe/test/test_local_layer.cpp new file mode 100644 index 00000000000..a1e3a8ea1f8 --- /dev/null +++ b/src/caffe/test/test_local_layer.cpp @@ -0,0 +1,121 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/local_layer.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class LocalLayerTest: public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + protected: + LocalLayerTest() + : blob_bottom_(new Blob()), + blob_top_(new Blob()) {} + virtual void SetUp() { + blob_bottom_->Reshape(2, 3, 6, 4); + // fill the values + FillerParameter filler_param; + filler_param.set_value(1.); + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + + virtual ~LocalLayerTest() { delete blob_bottom_; delete blob_top_; } + Blob* const blob_bottom_; + Blob* const blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(LocalLayerTest, TestDtypesAndDevices); + +TYPED_TEST(LocalLayerTest, TestSetup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LocalParameter* 
convolution_param =
+      layer_param.mutable_local_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(2);
+  convolution_param->set_num_output(4);
+  shared_ptr<LocalLayer<Dtype> > layer(
+      new LocalLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), 2);
+  EXPECT_EQ(this->blob_top_->channels(), 4);
+  EXPECT_EQ(this->blob_top_->height(), 2);
+  EXPECT_EQ(this->blob_top_->width(), 1);
+  convolution_param->set_num_output(3);
+  layer.reset(new LocalLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), 2);
+  EXPECT_EQ(this->blob_top_->channels(), 3);
+  EXPECT_EQ(this->blob_top_->height(), 2);
+  EXPECT_EQ(this->blob_top_->width(), 1);
+}
+
+
+TYPED_TEST(LocalLayerTest, TestSimpleConvolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Forward an all-ones input through weights set by the "test_local" filler.
+  FillerParameter filler_param;
+  filler_param.set_value(1.);
+  ConstantFiller<Dtype> filler(filler_param);
+  filler.Fill(this->blob_bottom_);
+  LayerParameter layer_param;
+  LocalParameter* convolution_param =
+      layer_param.mutable_local_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(1);
+  convolution_param->set_num_output(1);
+  convolution_param->mutable_weight_filler()->set_type("test_local");
+  convolution_param->mutable_weight_filler()->set_value(1);
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<LocalLayer<Dtype> > layer(
+      new LocalLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Each output is idx * 27 + 0.1: weight(k, idx) = idx, summed over K = 27.
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int n = 0; n < this->blob_top_->num(); n++) {
+    for (int k = 0; k < this->blob_top_->channels(); k++) {
+      for (int j
= 0; j < this->blob_top_->height(); j++) { + for (int i = 0; i < this->blob_top_->width(); i++) { + int idx = j * this->blob_top_->width() + i; + EXPECT_NEAR(*(top_data + this->blob_top_->offset(n, k, j, i)), + idx * 27 + 0.1, 1e-4); + } + } + } + } +} + +TYPED_TEST(LocalLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LocalParameter* convolution_param = + layer_param.mutable_local_param(); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); + convolution_param->set_num_output(2); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + LocalLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, + this->blob_bottom_vec_, + this->blob_top_vec_); +} + +} // namespace caffe