diff --git a/examples/mnist/lenet_import_conv_pool.prototxt b/examples/mnist/lenet_import_conv_pool.prototxt new file mode 100644 index 00000000000..5e2b7886e22 --- /dev/null +++ b/examples/mnist/lenet_import_conv_pool.prototxt @@ -0,0 +1,30 @@ +layers { + name: "conv" + type: CONVOLUTION + bottom: "${bottom}" + top: "conv" + blobs_lr: 1 + blobs_lr: 2 + convolution_param { + num_output: ${num_output} + kernel_size: 5 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layers { + name: "pool" + type: POOLING + bottom: "conv" + top: "pool" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} diff --git a/examples/mnist/lenet_import_solver.prototxt b/examples/mnist/lenet_import_solver.prototxt new file mode 100644 index 00000000000..c332567f37b --- /dev/null +++ b/examples/mnist/lenet_import_solver.prototxt @@ -0,0 +1,25 @@ +# The train/test net protocol buffer definition +net: "examples/mnist/lenet_import_train_test.prototxt" +# test_iter specifies how many forward passes the test should carry out. +# In the case of MNIST, we have test batch size 100 and 100 test iterations, +# covering the full 10,000 testing images. +test_iter: 100 +# Carry out testing every 500 training iterations. +test_interval: 500 +# The base learning rate, momentum and the weight decay of the network. 
+base_lr: 0.01 +momentum: 0.9 +weight_decay: 0.0005 +# The learning rate policy +lr_policy: "inv" +gamma: 0.0001 +power: 0.75 +# Display every 100 iterations +display: 100 +# The maximum number of iterations +max_iter: 10000 +# snapshot intermediate results +snapshot: 5000 +snapshot_prefix: "examples/mnist/lenet" +# solver mode: CPU or GPU +solver_mode: GPU diff --git a/examples/mnist/lenet_import_train_test.prototxt b/examples/mnist/lenet_import_train_test.prototxt new file mode 100644 index 00000000000..4ab86e0dc49 --- /dev/null +++ b/examples/mnist/lenet_import_train_test.prototxt @@ -0,0 +1,104 @@ +name: "LeNet" +layers { + name: "mnist" + type: DATA + top: "data" + top: "label" + data_param { + source: "examples/mnist/mnist_train_lmdb" + backend: LMDB + batch_size: 64 + } + transform_param { + scale: 0.00390625 + } + include: { phase: TRAIN } +} +layers { + name: "mnist" + type: DATA + top: "data" + top: "label" + data_param { + source: "examples/mnist/mnist_test_lmdb" + backend: LMDB + batch_size: 100 + } + transform_param { + scale: 0.00390625 + } + include: { phase: TEST } +} +layers { + name: "cp1" + type: IMPORT + import_param { + net: "examples/mnist/lenet_import_conv_pool.prototxt" + var { name: "bottom" value: "/data" } + var { name: "num_output" value: "20" } + } +} +layers { + name: "cp2" + type: IMPORT + import_param { + net: "examples/mnist/lenet_import_conv_pool.prototxt" + var { name: "bottom" value: "../cp1/pool" } + var { name: "num_output" value: "50" } + } +} +layers { + name: "ip1" + type: INNER_PRODUCT + bottom: "cp2/pool" + top: "ip1" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 500 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layers { + name: "relu1" + type: RELU + bottom: "ip1" + top: "ip1" +} +layers { + name: "ip2" + type: INNER_PRODUCT + bottom: "ip1" + top: "ip2" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 10 + weight_filler { + type: "xavier" + } + 
bias_filler { + type: "constant" + } + } +} +layers { + name: "accuracy" + type: ACCURACY + bottom: "ip2" + bottom: "label" + top: "accuracy" + include: { phase: TEST } +} +layers { + name: "loss" + type: SOFTMAX_LOSS + bottom: "ip2" + bottom: "label" + top: "loss" +} diff --git a/examples/mnist/lenet_local_solver.prototxt b/examples/mnist/lenet_local_solver.prototxt new file mode 100644 index 00000000000..a4b02d25e7e --- /dev/null +++ b/examples/mnist/lenet_local_solver.prototxt @@ -0,0 +1,25 @@ +# The train/test net protocol buffer definition +net: "examples/mnist/lenet_local_train_test.prototxt" +# test_iter specifies how many forward passes the test should carry out. +# In the case of MNIST, we have test batch size 100 and 100 test iterations, +# covering the full 10,000 testing images. +test_iter: 100 +# Carry out testing every 500 training iterations. +test_interval: 500 +# The base learning rate, momentum and the weight decay of the network. +base_lr: 0.01 +momentum: 0.9 +weight_decay: 0.0005 +# The learning rate policy +lr_policy: "inv" +gamma: 0.0001 +power: 0.75 +# Display every 100 iterations +display: 100 +# The maximum number of iterations +max_iter: 10000 +# snapshot intermediate results +snapshot: 5000 +snapshot_prefix: "examples/mnist/lenet" +# solver mode: CPU or GPU +solver_mode: GPU diff --git a/examples/mnist/lenet_local_train_test.prototxt b/examples/mnist/lenet_local_train_test.prototxt new file mode 100644 index 00000000000..ff88b5f4919 --- /dev/null +++ b/examples/mnist/lenet_local_train_test.prototxt @@ -0,0 +1,171 @@ +name: "LeNet" +layers { + name: "mnist" + type: DATA + top: "data" + top: "label" + data_param { + source: "examples/mnist/mnist_train_lmdb" + backend: LMDB + batch_size: 64 + } + transform_param { + scale: 0.00390625 + } + include: { phase: TRAIN } +} +layers { + name: "mnist" + type: DATA + top: "data" + top: "label" + data_param { + source: "examples/mnist/mnist_test_lmdb" + backend: LMDB + batch_size: 100 + } + 
transform_param { + scale: 0.00390625 + } + include: { phase: TEST } +} + +layers { + name: "conv1" + type: CONVOLUTION + bottom: "data" + top: "conv1" + blobs_lr: 1 + blobs_lr: 2 + convolution_param { + num_output: 20 + kernel_size: 5 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layers { + name: "pool1" + type: POOLING + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layers { + name: "local1" + type: LOCAL + bottom: "pool1" + top: "local1" + blobs_lr: 1 + blobs_lr: 1 + local_param { + num_output: 5 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu1" + type: RELU + bottom: "local1" + top: "local1" +} +layers { + name: "local2" + type: LOCAL + bottom: "local1" + top: "local2" + blobs_lr: 1 + blobs_lr: 1 + local_param { + num_output: 10 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu2" + type: RELU + bottom: "local2" + top: "local2" +} +layers { + name: "ip1" + type: INNER_PRODUCT + bottom: "local2" + top: "ip1" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 500 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layers { + name: "relu2" + type: RELU + bottom: "ip1" + top: "ip1" +} +layers { + name: "ip2" + type: INNER_PRODUCT + bottom: "ip1" + top: "ip2" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 10 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layers { + name: "accuracy" + type: ACCURACY + bottom: "ip2" + bottom: "label" + top: "accuracy" + include: { phase: TEST } +} +layers { + name: "loss" + type: SOFTMAX_LOSS + bottom: "ip2" + bottom: "label" + top: "loss" +} diff --git a/examples/mnist/train_lenet.sh 
b/examples/mnist/train_lenet.sh index 1b6bf7d978d..1f718825ded 100755 --- a/examples/mnist/train_lenet.sh +++ b/examples/mnist/train_lenet.sh @@ -1,3 +1,3 @@ #!/usr/bin/env sh -./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt +GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt diff --git a/examples/mnist/train_lenet_import.sh b/examples/mnist/train_lenet_import.sh new file mode 100755 index 00000000000..6387228d368 --- /dev/null +++ b/examples/mnist/train_lenet_import.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_import_solver.prototxt --gpu=1 diff --git a/examples/mnist/train_lenet_local.sh b/examples/mnist/train_lenet_local.sh new file mode 100755 index 00000000000..b9e29e5ceb8 --- /dev/null +++ b/examples/mnist/train_lenet_local.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_local_solver.prototxt --gpu=1 diff --git a/examples/siamese/mnist_siamese_local_solver.prototxt b/examples/siamese/mnist_siamese_local_solver.prototxt new file mode 100644 index 00000000000..c85bb90fc87 --- /dev/null +++ b/examples/siamese/mnist_siamese_local_solver.prototxt @@ -0,0 +1,25 @@ +# The train/test net protocol buffer definition +net: "examples/siamese/mnist_siamese_local_train_test.prototxt" +# test_iter specifies how many forward passes the test should carry out. +# In the case of MNIST, we have test batch size 100 and 100 test iterations, +# covering the full 10,000 testing images. +test_iter: 100 +# Carry out testing every 500 training iterations. +test_interval: 500 +# The base learning rate, momentum and the weight decay of the network. 
+base_lr: 0.01 +momentum: 0.9 +weight_decay: 0.0000 +# The learning rate policy +lr_policy: "inv" +gamma: 0.0001 +power: 0.75 +# Display every 100 iterations +display: 100 +# The maximum number of iterations +max_iter: 50000 +# snapshot intermediate results +snapshot: 5000 +snapshot_prefix: "examples/siamese/mnist_siamese" +# solver mode: CPU or GPU +solver_mode: GPU diff --git a/examples/siamese/mnist_siamese_local_train_test.prototxt b/examples/siamese/mnist_siamese_local_train_test.prototxt new file mode 100644 index 00000000000..4212217df15 --- /dev/null +++ b/examples/siamese/mnist_siamese_local_train_test.prototxt @@ -0,0 +1,435 @@ +name: "mnist_siamese_train_test" +layers { + name: "pair_data" + type: DATA + top: "pair_data" + top: "sim" + data_param { + source: "examples/siamese/mnist_siamese_train_leveldb" + scale: 0.00390625 + batch_size: 64 + } + include: { phase: TRAIN } +} +layers { + name: "pair_data" + type: DATA + top: "pair_data" + top: "sim" + data_param { + source: "examples/siamese/mnist_siamese_test_leveldb" + scale: 0.00390625 + batch_size: 100 + } + include: { phase: TEST } +} +layers { + name: "slice_pair" + type: SLICE + bottom: "pair_data" + top: "data" + top: "data_p" + slice_param { + slice_dim: 1 + slice_point: 1 + } +} + + + + +layers { + name: "conv1" + type: CONVOLUTION + bottom: "data" + top: "conv1" + blobs_lr: 1 + blobs_lr: 2 + convolution_param { + num_output: 20 + kernel_size: 5 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "conv1_w" + param: "conv1_b" +} +layers { + name: "pool1" + type: POOLING + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} + + +layers { + name: "local1" + type: LOCAL + bottom: "pool1" + top: "local1" + blobs_lr: 1 + blobs_lr: 2 + local_param { + num_output: 5 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + param: 
"local1_w" + param: "local1_b" +} +layers { + name: "relu1" + type: RELU + bottom: "local1" + top: "local1" +} +layers { + name: "local2" + type: LOCAL + bottom: "local1" + top: "local2" + blobs_lr: 1 + blobs_lr: 2 + local_param { + num_output: 10 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + param: "local2_w" + param: "local2_b" +} +layers { + name: "relu2" + type: RELU + bottom: "local2" + top: "local2" +} + + + + + + + +layers { + name: "ip1" + type: INNER_PRODUCT + bottom: "local2" + top: "ip1" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 500 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "ip1_w" + param: "ip1_b" +} +layers { + name: "relu1" + type: RELU + bottom: "ip1" + top: "ip1" +} +layers { + name: "ip2" + type: INNER_PRODUCT + bottom: "ip1" + top: "ip2" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 10 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "ip2_w" + param: "ip2_b" +} + +layers { + name: "feat2" + type: INNER_PRODUCT + bottom: "ip2" + top: "feat2" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "feat2_w" + param: "feat2_b" +} + +layers { + name: "feat1" + type: INNER_PRODUCT + bottom: "local1" + top: "feat1" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "feat1_w" + param: "feat1_b" +} + + + +layers { + name: "conv1_p" + type: CONVOLUTION + bottom: "data_p" + top: "conv1_p" + blobs_lr: 1 + blobs_lr: 2 + convolution_param { + num_output: 20 + kernel_size: 5 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "conv1_w" + param: "conv1_b" +} +layers { + name: "pool1_p" 
+ type: POOLING + bottom: "conv1_p" + top: "pool1_p" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} + + +layers { + name: "local1_p" + type: LOCAL + bottom: "pool1_p" + top: "local1_p" + blobs_lr: 1 + blobs_lr: 2 + local_param { + num_output: 5 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + param: "local1_w" + param: "local1_b" +} +layers { + name: "relu1_p" + type: RELU + bottom: "local1_p" + top: "local1_p" +} +layers { + name: "local2_p" + type: LOCAL + bottom: "local1_p" + top: "local2_p" + blobs_lr: 1 + blobs_lr: 2 + local_param { + num_output: 10 + kernel_size: 5 + stride: 1 + pad: 0 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + param: "local2_w" + param: "local2_b" +} +layers { + name: "relu2_p" + type: RELU + bottom: "local2_p" + top: "local2_p" +} + + + + + +layers { + name: "ip1_p" + type: INNER_PRODUCT + bottom: "local2_p" + top: "ip1_p" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 500 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "ip1_w" + param: "ip1_b" +} +layers { + name: "relu1_p" + type: RELU + bottom: "ip1_p" + top: "ip1_p" +} +layers { + name: "ip2_p" + type: INNER_PRODUCT + bottom: "ip1_p" + top: "ip2_p" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 10 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "ip2_w" + param: "ip2_b" +} + +layers { + name: "feat2_p" + type: INNER_PRODUCT + bottom: "ip2_p" + top: "feat2_p" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "feat2_w" + param: "feat2_b" +} + + +layers { + name: "feat1_p" + type: INNER_PRODUCT + bottom: "local1_p" + top: "feat1_p" + blobs_lr: 1 + blobs_lr: 2 + inner_product_param { + num_output: 2 + 
weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } + param: "feat1_w" + param: "feat1_b" +} + + +layers { + name: "loss2" + type: CONTRASTIVE_LOSS + contrastive_loss_param { + margin: 1.0 + } + bottom: "feat2" + bottom: "feat2_p" + bottom: "sim" + top: "loss2" +} + + +layers { + name: "loss1" + type: CONTRASTIVE_LOSS + contrastive_loss_param { + margin: 1.0 + } + bottom: "feat1" + bottom: "feat1_p" + bottom: "sim" + top: "loss1" +} + diff --git a/examples/siamese/train_mnist_siamese_local.sh b/examples/siamese/train_mnist_siamese_local.sh new file mode 100755 index 00000000000..83035bf2ccc --- /dev/null +++ b/examples/siamese/train_mnist_siamese_local.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env sh + +TOOLS=./build/tools + +GLOG_logtostderr=0 GLOG_log_dir=examples/siamese/ $TOOLS/caffe train --solver=examples/siamese/mnist_siamese_local_solver.prototxt diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 136ce958aed..291c122fa7d 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -158,6 +158,29 @@ class XavierFiller : public Filler { }; +template +class TestLocalFiller : public Filler { + public: + explicit TestLocalFiller(const FillerParameter& param) + : Filler(param) {} + virtual void Fill(Blob* blob) { + LOG(INFO) << "Doing mutable cpu"; + LOG(INFO) << "blobs" << blob; + Dtype* data = blob->mutable_cpu_data(); + LOG(INFO) << "Done Doing mutable cpu"; + CHECK_EQ(blob->channels(), 1); + + for (int n=0; nnum(); n++) { + for (int j=0; jheight(); j++) { + for (int i=0; iwidth(); i++) { + *(data+blob->offset(n, 0, j, i)) = i; + } + } + } + } +}; + + /** * @brief Get a specific filler from the specification given in FillerParameter. 
* @@ -177,6 +200,8 @@ Filler* GetFiller(const FillerParameter& param) { return new UniformFiller(param); } else if (type == "xavier") { return new XavierFiller(param); + } else if (type == "test_local") { + return new TestLocalFiller(param); } else { CHECK(false) << "Unknown filler name: " << param.type(); } diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 9fe58cd97bc..7be24fd56e2 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -355,6 +355,13 @@ class HingeLossLayer : public LossLayer { return LayerParameter_LayerType_HINGE_LOSS; } + // HingeLossLayer takes 2-3 bottom Blobs; if there are 3 the second and third + // are compared to compute a 0/1 label. (Otherwise the label comes directly + // from the second.) + virtual inline int ExactNumBottomBlobs() const { return -1; } + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 3; } + protected: /// @copydoc HingeLossLayer virtual void Forward_cpu(const vector*>& bottom, @@ -389,6 +396,9 @@ class HingeLossLayer : public LossLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + + private: + int ComputeLabel(const vector*>& bottom, int i); }; /** diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 1d06dc45533..a5229f1df34 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -182,6 +182,13 @@ class Net { /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. void GetLearningRateAndWeightDecay(); + // @brief Loads imports, for modular network definitions + static void LoadImports(const NetParameter& source, NetParameter* target); + static void LoadImports(const NetParameter& source, NetParameter* target, + const string& pwd); + // @brief Resolves a layer or blob name, e.g. 
"../data" + static string ResolveImportName(const string& path, const string& pwd); + /// @brief Individual layers in the net vector > > layers_; vector layer_names_; diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index e518979a75b..9ca84d1582d 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -52,6 +52,8 @@ inline void MakeTempDir(string* temp_dirname) { delete temp_dirname_cstr; } +string ReadFile(const string& filename); + bool ReadProtoFromTextFile(const char* filename, Message* proto); inline bool ReadProtoFromTextFile(const string& filename, Message* proto) { diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index c803cd72449..8465750af18 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -112,6 +112,56 @@ class ConvolutionLayer : public Layer { Blob bias_multiplier_; }; + + +template +class LocalLayer : public Layer { + public: + explicit LocalLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline LayerParameter_LayerType type() const { + return LayerParameter_LayerType_LOCAL; + } + virtual inline int MinBottomBlobs() const { return 1; } + virtual inline int MinTopBlobs() const { return 1; } + virtual inline bool EqualNumBottomTopBlobs() const { return true; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + + int kernel_size_; + int stride_; + int num_; + int channels_; + int pad_; + int height_, width_; + int height_out_, width_out_; + int num_output_; + bool 
bias_term_; + + int M_; + int K_; + int N_; + + Blob col_buffer_; +}; + + + + #ifdef USE_CUDNN /* * @brief cuDNN implementation of ConvolutionLayer. diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 69863543c30..b17396b51b9 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -185,5 +185,6 @@ REGISTER_LAYER_CLASS(SOFTMAX_LOSS, SoftmaxWithLossLayer); REGISTER_LAYER_CLASS(SPLIT, SplitLayer); REGISTER_LAYER_CLASS(THRESHOLD, ThresholdLayer); REGISTER_LAYER_CLASS(WINDOW_DATA, WindowDataLayer); +REGISTER_LAYER_CLASS(LOCAL, LocalLayer); } // namespace caffe diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp index f09916e2556..e7a3d5387c5 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ b/src/caffe/layers/hinge_loss_layer.cpp @@ -10,19 +10,34 @@ namespace caffe { +template +int HingeLossLayer::ComputeLabel(const vector*>& bottom, int i) { + int label; + if (bottom.size() == 2) { + label = static_cast(bottom[1]->cpu_data()[i]); + } else { // bottom.size() == 3 + // label == 1 if bottom[1] == bottom[2] (same) + // label == 0 if bottom[1] != bottom[2] (not same) + label = (bottom[1]->cpu_data()[i] == + bottom[2]->cpu_data()[i]) ? 
1 : 0; + } + return label; +} + template void HingeLossLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* label = bottom[1]->cpu_data(); int num = bottom[0]->num(); int count = bottom[0]->count(); int dim = count / num; + int label; caffe_copy(count, bottom_data, bottom_diff); for (int i = 0; i < num; ++i) { - bottom_diff[i * dim + static_cast(label[i])] *= -1; + label = ComputeLabel(bottom, i); + bottom_diff[i * dim + label] *= -1; } for (int i = 0; i < num; ++i) { for (int j = 0; j < dim; ++j) { @@ -52,13 +67,14 @@ void HingeLossLayer::Backward_cpu(const vector*>& top, } if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* label = bottom[1]->cpu_data(); int num = bottom[0]->num(); int count = bottom[0]->count(); int dim = count / num; + int label; for (int i = 0; i < num; ++i) { - bottom_diff[i * dim + static_cast(label[i])] *= -1; + label = ComputeLabel(bottom, i); + bottom_diff[i * dim + label] *= -1; } const Dtype loss_weight = top[0]->cpu_diff()[0]; diff --git a/src/caffe/layers/local_layer.cpp b/src/caffe/layers/local_layer.cpp new file mode 100644 index 00000000000..a00b934174e --- /dev/null +++ b/src/caffe/layers/local_layer.cpp @@ -0,0 +1,211 @@ +#include + +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/im2col.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + +template +void LocalLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + CHECK_EQ(bottom.size(), 1) << "Conv Layer takes a single blob as input."; + CHECK_EQ(top.size(), 1) << "Conv Layer takes a single blob as output."; + + kernel_size_ = this->layer_param_.local_param().kernel_size(); + stride_ = this->layer_param_.local_param().stride(); + pad_ = this->layer_param_.local_param().pad(); + num_ = bottom[0]->num(); + channels_ = 
bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + num_output_ = this->layer_param_.local_param().num_output(); + + height_out_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1; + width_out_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1; + + M_ = num_output_; + K_ = channels_ * kernel_size_ * kernel_size_; + N_ = height_out_ * width_out_; + + CHECK_GT(num_output_, 0); + CHECK_GE(height_, kernel_size_) << "height smaller than kernel size"; + CHECK_GE(width_, kernel_size_) << "width smaller than kernel size"; + // Set the parameters + bias_term_ = this->layer_param_.local_param().bias_term(); + + // Check if we need to set up the weights + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + if (bias_term_) { + this->blobs_.resize(2); + } else { + this->blobs_.resize(1); + } + // Intialize the weight + this->blobs_[0].reset(new Blob( + num_output_, 1, K_, N_)); + // fill the weights + shared_ptr > weight_filler(GetFiller( + this->layer_param_.local_param().weight_filler())); + weight_filler->Fill(this->blobs_[0].get()); + // If necessary, intiialize and fill the bias term + if (bias_term_) { + this->blobs_[1].reset(new Blob(1, 1, M_, N_)); + shared_ptr > bias_filler(GetFiller( + this->layer_param_.local_param().bias_filler())); + bias_filler->Fill(this->blobs_[1].get()); + } + } +} + +template +void LocalLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with" + " weights."; + // TODO: generalize to handle inputs of different shapes. 
+ for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { + CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num."; + CHECK_EQ(channels_, bottom[bottom_id]->channels()) + << "Inputs must have same channels."; + CHECK_EQ(height_, bottom[bottom_id]->height()) + << "Inputs must have same height."; + CHECK_EQ(width_, bottom[bottom_id]->width()) + << "Inputs must have same width."; + } + + // Shape the tops. + for (int top_id = 0; top_id < top.size(); ++top_id) { + top[top_id]->Reshape(num_, num_output_, height_out_, width_out_); + } + + // The im2col result buffer would only hold one image at a time to avoid + // overly large memory usage. + col_buffer_.Reshape( + 1, channels_ * kernel_size_ * kernel_size_, height_out_, width_out_); + + for (int top_id = 0; top_id < top.size(); ++top_id) { + top[top_id]->Reshape(num_, num_output_, height_out_, width_out_); + } +} + +template +void LocalLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + + Dtype* x_data = col_buffer_.mutable_cpu_data(); + const Dtype* weight = this->blobs_[0]->cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + + Blob E; + E.Reshape(1, 1, 1, K_); + FillerParameter filler_param; + filler_param.set_value(1); + ConstantFiller filler(filler_param); + filler.Fill(&E); + + Blob intermediate; + intermediate.Reshape(1, 1, K_, N_); + for (int n=0; noffset(n), channels_, height_, + width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data); + + for (int m=0; mblobs_[0]->offset(m), + intermediate.mutable_cpu_data()); + + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, 1, N_, K_, + (Dtype)1., E.cpu_data(), + intermediate.cpu_data(), + (Dtype)0., top_data + top[0]->offset(n, m)); + } + + if (bias_term_) { + caffe_add(M_ * N_, this->blobs_[1]->cpu_data(), + top_data + top[0]->offset(n), + top_data + top[0]->offset(n)); + } + } +} + +template +void LocalLayer::Backward_cpu(const vector*>& top, + const 
vector& propagate_down, const vector*>& bottom) { + + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + Dtype* x_data = col_buffer_.mutable_cpu_data(); + Dtype* x_diff = col_buffer_.mutable_cpu_diff(); + const Dtype* weight = this->blobs_[0]->cpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); + Dtype* bias_diff = NULL; + + Blob intermediate; + intermediate.Reshape(1, 1, 1, N_); + + Blob xt; + xt.Reshape(1, 1, K_, N_); + Dtype* xt_data = xt.mutable_cpu_data(); + + if (bias_term_) { + bias_diff = this->blobs_[1]->mutable_cpu_diff(); + memset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count()); + for (int n = 0; n < num_; ++n) { + caffe_add(M_ * N_, bias_diff, + top_diff + top[0]->offset(n), + bias_diff); + } + } + + memset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count()); + for (int n=0; noffset(n), channels_, height_, + width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data); + + // gradient wrt weight + for (int m=0; mblobs_[0]->offset(m); + for (int k=0; koffset(n, m), + x_data+col_buffer_.offset(0,k), xt_data+xt.offset(0,0,k)); + } + caffe_cpu_axpby(K_*N_, Dtype(1.0), xt_data, Dtype(1.0), filter_weight_diff); + } + + // gradient wrt bottom data + if (propagate_down[0]) { + memset(x_diff, 0, col_buffer_.count() * sizeof(Dtype)); + for (int m=0; moffset(n, m), + weight+this->blobs_[0]->offset(m,0,k), + intermediate.mutable_cpu_data()); + + caffe_cpu_axpby(N_, Dtype(1.0), + intermediate.cpu_data(), Dtype(1.0), + x_diff+col_buffer_.offset(0,k)); + } + } + + // col2im back to the data + col2im_cpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_, + pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n)); + + } + } + +} + +#ifdef CPU_ONLY +STUB_GPU(LocalLayer); +#endif + +INSTANTIATE_CLASS(LocalLayer); + +} // namespace caffe diff --git a/src/caffe/layers/local_layer.cu 
b/src/caffe/layers/local_layer.cu new file mode 100644 index 00000000000..1abd9ed4ac7 --- /dev/null +++ b/src/caffe/layers/local_layer.cu @@ -0,0 +1,179 @@ +#include + +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/im2col.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + +/// @brief refer to CPU forward -- the BLAS implementation is the same. +template +void LocalLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + + Dtype* x_data = col_buffer_.mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + + Blob E; + E.Reshape(1, 1, 1, K_); + FillerParameter filler_param; + filler_param.set_value(1); + ConstantFiller filler(filler_param); + filler.Fill(&E); + + Blob intermediate; + intermediate.Reshape(1, 1, K_, N_); + for (int n=0; noffset(n), channels_, height_, + width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data); + + for (int m=0; mblobs_[0]->offset(m), + intermediate.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, 1, N_, K_, + (Dtype)1., E.gpu_data(), intermediate.gpu_data(), + (Dtype)0., top_data + top[0]->offset(n, m)); + } + + if (bias_term_) { + caffe_gpu_add(M_ * N_, this->blobs_[1]->gpu_data(), + top_data + top[0]->offset(n), + top_data + top[0]->offset(n)); + } + } + +} + +/// @brief refer to CPU backward -- the BLAS implementation is the same. 
+template <typename Dtype>
+void LocalLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  Dtype* x_data = col_buffer_.mutable_gpu_data();
+  Dtype* x_diff = col_buffer_.mutable_gpu_diff();
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+
+  // Gradient w.r.t. bias: accumulate the top diff over the batch.
+  if (bias_term_) {
+    Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+    CUDA_CHECK(cudaMemset(bias_diff, 0,
+        sizeof(Dtype) * this->blobs_[1]->count()));
+    for (int n = 0; n < num_; ++n) {
+      caffe_gpu_add(M_ * N_, bias_diff, top_diff + top[0]->offset(n),
+          bias_diff);
+    }
+  }
+
+  // Gradient w.r.t. weights (accumulated over the batch) and, when
+  // requested, gradient w.r.t. the bottom data.
+  CUDA_CHECK(cudaMemset(weight_diff, 0,
+      sizeof(Dtype) * this->blobs_[0]->count()));
+  for (int n = 0; n < num_; ++n) {
+    // Unroll this image into columns so every output location sees its
+    // own input patch.
+    im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
+        width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_,
+        x_data);
+
+    local_update1_gpu(top_diff + top[0]->offset(n), x_data, weight_diff,
+        K_, N_, M_);
+
+    if (propagate_down[0]) {
+      CUDA_CHECK(cudaMemset(x_diff, 0, col_buffer_.count() * sizeof(Dtype)));
+      local_update2_gpu(top_diff + top[0]->offset(n), weight, x_diff,
+          K_, N_, M_);
+
+      // col2im back to the data.
+      col2im_gpu(x_diff, channels_, height_, width_, kernel_size_,
+          kernel_size_, pad_, pad_, stride_, stride_,
+          bottom_diff + bottom[0]->offset(n));
+    }
+  }
+}
+
+template <typename Dtype>
+__global__ void local_update1_gpu_kernel(const Dtype* data_A,
+    const Dtype* data_B, Dtype* data_R, const int filter_num,
+    const int location_num, const int output_num) {
+  int total = filter_num * location_num * output_num;
+  CUDA_KERNEL_LOOP(index, total) {
+    int p = index % location_num;
+    int n = (index / location_num) % filter_num;
+    int q = (index / location_num) / filter_num;
+    // R[q][n][p] += A[q][p] * B[n][p]
+    data_R[index] += data_A[q * location_num + p]
+        * data_B[n * location_num + p];
+  }
+}
+
+template <typename Dtype>
+void local_update1_gpu(const Dtype* data_A, const Dtype* data_B,
+    Dtype* data_R, const int filter_num, const int location_num,
+    const int output_num) {
+  // data_A is output_num x location_num
+  // data_B is filter_num x location_num
+  // data_R is output_num x filter_num x location_num;
+  // the update performed is Rqnp += Aqp * Bnp.
+  const int total = filter_num * location_num * output_num;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  local_update1_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(total),
+      CAFFE_CUDA_NUM_THREADS>>>(
+      data_A, data_B, data_R, filter_num, location_num, output_num);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiation
+template void local_update1_gpu<float>(const float* data_A,
+    const float* data_B, float* data_R, const int filter_num,
+    const int location_num, const int output_num);
+template void local_update1_gpu<double>(const double* data_A,
+    const double* data_B, double* data_R, const int filter_num,
+    const int location_num, const int output_num);
+
+template <typename Dtype>
+__global__ void local_update2_gpu_kernel(const Dtype* data_A,
+    const Dtype* data_B, Dtype* data_R, const int filter_num,
+    const int location_num, const int output_num) {
+  int total = filter_num * location_num;
+  CUDA_KERNEL_LOOP(index, total) {
+    int p = index % location_num;
+    int n = (index / location_num);
+    // R[n][p] += sum_q A[q][p] * B[q][n][p]
+    for (int q = 0; q < output_num; ++q) {
+      data_R[index] += data_A[q * location_num + p]
+          * data_B[(q * filter_num + n) * location_num + p];
+    }
+  }
+}
+
+template <typename Dtype>
+void local_update2_gpu(const Dtype* data_A, const Dtype* data_B,
+    Dtype* data_R, const int filter_num, const int location_num,
+    const int output_num) {
+  // data_A is output_num x location_num
+  // data_B is output_num x filter_num x location_num
+  // data_R is filter_num x location_num;
+  // the update performed is Rnp += \sum_q(Aqp * Bqnp).
+  const int total = filter_num * location_num;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  local_update2_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(total),
+      CAFFE_CUDA_NUM_THREADS>>>(
+      data_A, data_B, data_R, filter_num, location_num, output_num);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiation
+template void local_update2_gpu<float>(const float* data_A, const float* data_B,
+ float* data_R, const int filter_num, + const int location_num, const int output_num); +template void local_update2_gpu(const double* data_A, const double* data_B, + double* data_R, const int filter_num, + const int location_num, const int output_num); + + +INSTANTIATE_LAYER_GPU_FUNCS(LocalLayer); + +} // namespace caffe diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 21ab15fd31b..b011676f979 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -1,3 +1,6 @@ +#include +#include + #include #include #include @@ -16,6 +19,8 @@ #include "caffe/test/test_caffe_main.hpp" +using boost::replace_all; + namespace caffe { template @@ -32,10 +37,14 @@ Net::Net(const string& param_file) { template void Net::Init(const NetParameter& in_param) { + // Load import layers + NetParameter expanded(in_param); + LoadImports(in_param, &expanded); + // Filter layers based on their include/exclude rules and // the current NetState. NetParameter filtered_param; - FilterNet(in_param, &filtered_param); + FilterNet(expanded, &filtered_param); LOG(INFO) << "Initializing net from parameters: " << std::endl << filtered_param.DebugString(); // Create a copy of filtered_param with splits added where necessary. 
@@ -462,6 +471,66 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, } } + +template +void Net::LoadImports(const NetParameter& source, NetParameter* target) { + target->CopyFrom(source); + target->clear_layers(); + LoadImports(source, target, ""); +} + +template +void Net::LoadImports(const NetParameter& source, NetParameter* target, + const string& pwd) { + for (int i = 0; i < source.layers_size(); ++i) { + if (source.layers(i).type() == LayerParameter_LayerType_IMPORT) { + const LayerParameter& layer = source.layers(i); + CHECK(layer.has_import_param()) << "Missing import_param"; + const ImportParameter& import = layer.import_param(); + string proto = ReadFile(import.net()); + // Replace variables and references + for (int j = 0; j < import.var_size(); ++j) { + const Pair& p = import.var(j); + replace_all(proto, "${" + p.name() + "}", p.value()); + } + NetParameter net; + bool parse = google::protobuf::TextFormat::ParseFromString(proto, &net); + CHECK(parse) << "Failed to parse NetParameter file: " << import.net(); + CHECK(layer.has_name() && layer.name().length() > 0) + << "Import layer must have a name"; + LoadImports(net, target, ResolveImportName(layer.name(), pwd)); + } else { + LayerParameter *t = target->add_layers(); + t->CopyFrom(source.layers(i)); + t->set_name(ResolveImportName(t->name(), pwd)); + for (int j = 0; j < source.layers(i).top_size(); ++j) + t->set_top(j, ResolveImportName(source.layers(i).top(j), pwd)); + for (int j = 0; j < source.layers(i).bottom_size(); ++j) + t->set_bottom(j, ResolveImportName(source.layers(i).bottom(j), pwd)); + } + } +} + +template +string Net::ResolveImportName(const string& path, const string& pwd) { + CHECK(!boost::starts_with(pwd, "/") && !boost::ends_with(pwd, "/")); + if (boost::starts_with(path, "/")) + return path.substr(1, path.size() - 1); + string cpath = path; + string cpwd = pwd; + while (boost::starts_with(cpath, "../")) { + cpath = cpath.substr(3, cpath.size() - 3); + size_t i = 
cpwd.find_last_of('/'); + cpwd = i == string::npos ? "" : cpwd.substr(0, i); + } + if (!cpwd.size()) + return cpath; + if (!cpath.size() || cpath == ".") + return cpwd; + return cpwd + '/' + cpath; +} + + template void Net::GetLearningRateAndWeightDecay() { LOG(INFO) << "Collecting Learning Rate and Weight Decay."; diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index f0404a09b90..f5bd5ca6928 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -221,7 +221,7 @@ message LayerParameter { // line above the enum. Update the next available ID when you add a new // LayerType. // - // LayerType next available ID: 39 (last added: EXP) + // LayerType next available ID: 40 (last added: LOCAL) enum LayerType { // "NONE" layer type is 0th enum element so that we don't cause confusion // by defaulting to an existent LayerType (instead, should usually error if @@ -246,8 +246,10 @@ message LayerParameter { HINGE_LOSS = 28; IM2COL = 11; IMAGE_DATA = 12; + IMPORT = 51; INFOGAIN_LOSS = 13; INNER_PRODUCT = 14; + LOCAL = 50; LRN = 15; MEMORY_DATA = 29; MULTINOMIAL_LOGISTIC_LOSS = 16; @@ -307,8 +309,10 @@ message LayerParameter { optional HDF5OutputParameter hdf5_output_param = 14; optional HingeLossParameter hinge_loss_param = 29; optional ImageDataParameter image_data_param = 15; + optional ImportParameter import_param = 103; optional InfogainLossParameter infogain_loss_param = 16; optional InnerProductParameter inner_product_param = 17; + optional LocalParameter local_param = 102; optional LRNParameter lrn_param = 18; optional MemoryDataParameter memory_data_param = 22; optional MVNParameter mvn_param = 34; @@ -410,6 +414,17 @@ message ConvolutionParameter { optional Engine engine = 15 [default = DEFAULT]; } +// Message that stores parameters used by LocalLayer +message LocalParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + optional bool bias_term = 2 [default = true]; // whether to have bias terms 
+ optional uint32 pad = 3 [default = 0]; // The padding size + optional uint32 kernel_size = 4; // The kernel size + optional uint32 stride = 6 [default = 1]; // The stride + optional FillerParameter weight_filler = 7; // The filler for the weight + optional FillerParameter bias_filler = 8; // The filler for the bias +} + // Message that stores parameters used by DataLayer message DataParameter { enum DB { @@ -540,6 +555,20 @@ message ImageDataParameter { optional bool mirror = 6 [default = false]; } +message Pair { + required string name = 1; + required string value = 2; +} + +// Message that stores parameters used by ImportLayer +message ImportParameter { + // Proto file to import + required string net = 1; + // Variable names to replace before importing the file. Variables can + // be used in the file in this format: ${name} + repeated Pair var = 2; +} + // Message that stores parameters InfogainLossLayer message InfogainLossParameter { // Specify the infogain matrix source. diff --git a/src/caffe/test/test_data/module.prototxt b/src/caffe/test/test_data/module.prototxt new file mode 100644 index 00000000000..6c2d5359360 --- /dev/null +++ b/src/caffe/test/test_data/module.prototxt @@ -0,0 +1,21 @@ +layers: { + name: 'innerproduct' + type: INNER_PRODUCT + inner_product_param { + num_output: ${num_output} + weight_filler { + type: 'gaussian' + std: 0.01 + } + bias_filler { + type: 'constant' + value: 0 + } + } + blobs_lr: 1. + blobs_lr: 2. + weight_decay: 1. + weight_decay: 0. 
+ bottom: '../data' + top: 'innerproduct' +} diff --git a/src/caffe/test/test_imports.cpp b/src/caffe/test/test_imports.cpp new file mode 100644 index 00000000000..023ae6dce7f --- /dev/null +++ b/src/caffe/test/test_imports.cpp @@ -0,0 +1,87 @@ +#include +#include +#include + +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/io.hpp" +#include "caffe/vision_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +template +class ImportsTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + virtual void InitNetFromProtoString(const string& proto) { + NetParameter param; + CHECK(google::protobuf::TextFormat::ParseFromString(proto, ¶m)); + net_.reset(new Net(param)); + } + + virtual void InitNet() { + string file = CMAKE_SOURCE_DIR "caffe/test/test_data/module.prototxt"; + string proto = + "name: 'TestNetwork' " + "layers: { " + " name: 'data' " + " type: DUMMY_DATA " + " dummy_data_param { " + " num: 5 " + " channels: 2 " + " height: 3 " + " width: 4 " + " num: 5 " + " channels: 1 " + " height: 1 " + " width: 1 " + " data_filler { " + " type: 'gaussian' " + " std: 0.01 " + " } " + " } " + " top: 'data' " + " top: 'label' " + "} " + "layers: { " + " name: 'import' " + " type: IMPORT " + " import_param { " + " net: '" + file + "' " + " var { name: 'num_output' value: '1000' } " + " } " + "} " + "layers: { " + " name: 'loss' " + " type: SOFTMAX_LOSS " + " bottom: 'import/innerproduct' " + " bottom: 'label' " + " top: 'top_loss' " + "} "; + InitNetFromProtoString(proto); + } + + shared_ptr > net_; +}; + +TYPED_TEST_CASE(ImportsTest, TestDtypesAndDevices); + +TYPED_TEST(ImportsTest, ConvPool) { + this->InitNet(); + EXPECT_TRUE(this->net_->has_blob("data")); + EXPECT_TRUE(this->net_->has_blob("label")); + 
EXPECT_TRUE(this->net_->has_blob("import/innerproduct")); + EXPECT_FALSE(this->net_->has_blob("loss")); +} +} // namespace caffe + diff --git a/src/caffe/test/test_local_layer.cpp b/src/caffe/test/test_local_layer.cpp new file mode 100644 index 00000000000..d5b09d8488f --- /dev/null +++ b/src/caffe/test/test_local_layer.cpp @@ -0,0 +1,174 @@ +// Copyright 2014 BVLC and contributors. + +#include +#include + +#include "cuda_runtime.h" +#include "gtest/gtest.h" +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; + +template +class LocalLayerTest : public ::testing::Test { + protected: + LocalLayerTest() + : blob_bottom_(new Blob()), + blob_top_(new Blob()) {} + virtual void SetUp() { + blob_bottom_->Reshape(2, 3, 6, 4); + // fill the values + FillerParameter filler_param; + filler_param.set_value(1.); + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + + virtual ~LocalLayerTest() { delete blob_bottom_; delete blob_top_; } + Blob* const blob_bottom_; + Blob* const blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +typedef ::testing::Types Dtypes; +TYPED_TEST_CASE(LocalLayerTest, Dtypes); + +TYPED_TEST(LocalLayerTest, TestSetup) { + LayerParameter layer_param; + LocalParameter* convolution_param = + layer_param.mutable_local_param(); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); + convolution_param->set_num_output(4); + shared_ptr > layer( + new LocalLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 4); + EXPECT_EQ(this->blob_top_->height(), 2); + 
EXPECT_EQ(this->blob_top_->width(), 1); + convolution_param->set_num_output(3); + layer.reset(new LocalLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 3); + EXPECT_EQ(this->blob_top_->height(), 2); + EXPECT_EQ(this->blob_top_->width(), 1); +} + + +TYPED_TEST(LocalLayerTest, TestCPUSimpleConvolution) { + // We will simply see if the convolution layer carries out averaging well. + FillerParameter filler_param; + filler_param.set_value(1.); + ConstantFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + LayerParameter layer_param; + LocalParameter* convolution_param = + layer_param.mutable_local_param(); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(1); + convolution_param->set_num_output(1); + convolution_param->mutable_weight_filler()->set_type("test_local"); + convolution_param->mutable_weight_filler()->set_value(1); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + shared_ptr > layer( + new LocalLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + Caffe::set_mode(Caffe::CPU); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // After the convolution, the output should all have output values 27.1 + const TypeParam* top_data = this->blob_top_->cpu_data(); + for (int n=0; nblob_top_->num(); n++) { + for (int k=0; kblob_top_->channels(); k++) { + for (int j=0; jblob_top_->height(); j++) { + for (int i=0; iblob_top_->width(); i++) { + int idx = j*this->blob_top_->width()+i; + EXPECT_NEAR(*(top_data+this->blob_top_->offset(n, k, j, i)), idx*27+0.1, 1e-4); + } + } + } + } +} + + +TYPED_TEST(LocalLayerTest, TestGPUSimpleConvolution) { + // We will simply see if the convolution layer carries out averaging well. 
+ FillerParameter filler_param; + filler_param.set_value(1.); + ConstantFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + LayerParameter layer_param; + LocalParameter* convolution_param = + layer_param.mutable_local_param(); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("test_local"); + convolution_param->mutable_weight_filler()->set_value(1); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + shared_ptr > layer( + new LocalLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + Caffe::set_mode(Caffe::GPU); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // After the convolution, the output should all have output values 27.1 + const TypeParam* top_data = this->blob_top_->cpu_data(); + for (int n=0; nblob_top_->num(); n++) { + for (int k=0; kblob_top_->channels(); k++) { + for (int j=0; jblob_top_->height(); j++) { + for (int i=0; iblob_top_->width(); i++) { + int idx = j*this->blob_top_->width()+i; + EXPECT_NEAR(*(top_data+this->blob_top_->offset(n, k, j, i)), idx*27+0.1, 1e-4); + } + } + } + } +} + +TYPED_TEST(LocalLayerTest, TestCPUGradient) { + LayerParameter layer_param; + LocalParameter* convolution_param = + layer_param.mutable_local_param(); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); + convolution_param->set_num_output(2); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + Caffe::set_mode(Caffe::CPU); + LocalLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(LocalLayerTest, TestGPUGradient) { + LayerParameter layer_param; + LocalParameter* convolution_param = + 
layer_param.mutable_local_param(); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); + convolution_param->set_num_output(2); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + Caffe::set_mode(Caffe::GPU); + LocalLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +} // namespace caffe diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 36510d61d40..09d4472c7bc 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -28,6 +28,18 @@ using google::protobuf::io::ZeroCopyOutputStream; using google::protobuf::io::CodedOutputStream; using google::protobuf::Message; +std::string ReadFile(const string& filename) { + std::ifstream in(filename.c_str(), std::ios::in | std::ios::binary); + CHECK(in) << "Failed to read file: " << filename; + std::string contents; + in.seekg(0, std::ios::end); + contents.resize(in.tellg()); + in.seekg(0, std::ios::beg); + in.read(&contents[0], contents.size()); + in.close(); + return contents; +} + bool ReadProtoFromTextFile(const char* filename, Message* proto) { int fd = open(filename, O_RDONLY); CHECK_NE(fd, -1) << "File not found: " << filename;